Skip Headers

SAS  SPSS  R-PROJECT  S-PLUS  PROC-SQL

통 계
프 로 그 램 비 교

Go to Documentation Home
HOME
Go to Book List
PRO_home
Go to Table of Contents
연구회
Go to Index
자료실
Go to Master Index
R-PROJECT
Go to Feedback page
MAIL

Go to previous page
Previous
Go to next page
Next

8. By 또는 Split 파일 프로세싱.


1. SAS
2. SPSS
3. R-PROJECT
4. S-PLUS
5. PROC SQL

 


1. SAS

MAIN

* By 또는 Split 파일 프로세싱을 위한 SAS 프로그램.;

* 성별을 기준으로 하여 데이터 소트;

PROC SORT DATA=BACK.mydata;

     BY gender;

run;

* 성별을 기준으로 하여 데이터 분석; 

PROC MEANS DATA=BACK.mydata;

     BY gender;

run;

---------------------------------- gender=f ----------------------------------

                                MEANS 프로시저

 

변수        N          평균값        표준편차          최소값          최대값

-----------------------------------------------------------------------------

id          4       2.5000000       1.2909944       1.0000000       4.0000000

workshop    4       1.5000000       0.5773503       1.0000000       2.0000000

q1          4       2.0000000       0.8164966       1.0000000       3.0000000

q2          4       1.2500000       0.5000000       1.0000000       2.0000000

q3          3       4.3333333       0.5773503       4.0000000       5.0000000

q4          4       2.0000000       1.1547005       1.0000000       3.0000000

-----------------------------------------------------------------------------

---------------------------------- gender=m ----------------------------------

변수        N          평균값        표준편차          최소값          최대값

-----------------------------------------------------------------------------

id          4       6.5000000       1.2909944       5.0000000       8.0000000

workshop    4       1.5000000       0.5773503       1.0000000       2.0000000

q1          4       4.5000000       0.5773503       4.0000000       5.0000000

q2          4       4.2500000       0.9574271       3.0000000       5.0000000

q3          4       4.0000000       1.4142136       2.0000000       5.0000000

q4          4       4.5000000       0.5773503       4.0000000       5.0000000

-----------------------------------------------------------------------------

 

* Class를 사용하여 변수 소트 없이 데이터 분석 가능.;

PROC MEANS DATA=BACK.mydata;

     CLASS gender;

run;

관측치

gender          수    변수        N          평균값        표준편차          최소값          최대값

---------------------------------------------------------------------------------------------------

f                4    id          4       2.5000000       1.2909944       1.0000000       4.0000000

                      workshop    4       1.5000000       0.5773503       1.0000000       2.0000000

                      q1          4       2.0000000       0.8164966       1.0000000       3.0000000

                      q2          4       1.2500000       0.5000000       1.0000000       2.0000000

                      q3          3       4.3333333       0.5773503       4.0000000       5.0000000

                      q4          4       2.0000000       1.1547005       1.0000000       3.0000000

 

m                4    id          4       6.5000000       1.2909944       5.0000000       8.0000000

                      workshop    4       1.5000000       0.5773503       1.0000000       2.0000000

                      q1          4       4.5000000       0.5773503       4.0000000       5.0000000

                      q2          4       4.2500000       0.9574271       3.0000000       5.0000000

                      q3          4       4.0000000       1.4142136       2.0000000       5.0000000

                      q4          4       4.5000000       0.5773503       4.0000000       5.0000000

---------------------------------------------------------------------------------------------------

 



2. SPSS

MAIN

* By 또는 Split 파일 프로세싱을 위한 SPSS 프로그램.

GET FILE="C:\mydata.sav".

SORT CASES BY gender .

SPLIT FILE

  SEPARATE BY gender .

DESCRIPTIVES

  VARIABLES=q1 q2 q3 q4

  /STATISTICS=MEAN STDDEV MIN MAX .

 

 


3. R-PROJECT

MAIN

# By 또는 Split 파일 프로세싱을 위한 R 프로그램.

load(file="c:\\mydata.Rdata")

print(mydata)

attach(mydata) # mydata를 기본 데이터로 지정.

 

# 관측치와 모든 변수의 요약 통계 구하기.

summary(mydata)

workshop   gender       q1             q2             q3              q4     

 Min.   :1.0   f:4    Min.   :1.00   Min.   :1.00   Min.   :2.000   Min.   :1.00 

 1st Qu.:1.0   m:4    1st Qu.:2.00   1st Qu.:1.00   1st Qu.:4.000   1st Qu.:2.50 

 Median :1.5          Median :3.50   Median :2.50   Median :4.000   Median :3.50 

 Mean   :1.5          Mean   :3.25   Mean   :2.75   Mean   :4.143   Mean   :3.25 

 3rd Qu.:2.0          3rd Qu.:4.25   3rd Qu.:4.25   3rd Qu.:5.000   3rd Qu.:4.25  

 Max.   :2.0          Max.   :5.00   Max.   :5.00   Max.   :5.000   Max.   :5.00

                                                    NA's   :1.000 

 

# 성별을 기준으로 하여 각 변수에 대한 요약 통계 구하기.

by(mydata, gender, summary)

gender: f

    workshop   gender       q1             q2             q3              q4  

 Min.   :1.0   f:4    Min.   :1.00   Min.   :1.00   Min.   :4.000   Min.   :1 

 1st Qu.:1.0   m:0    1st Qu.:1.75   1st Qu.:1.00   1st Qu.:4.000   1st Qu.:1 

 Median :1.5          Median :2.00   Median :1.00   Median :4.000   Median :2 

 Mean   :1.5          Mean   :2.00   Mean   :1.25   Mean   :4.333   Mean   :2 

 3rd Qu.:2.0          3rd Qu.:2.25   3rd Qu.:1.25   3rd Qu.:4.500   3rd Qu.:3 

 Max.   :2.0          Max.   :3.00   Max.   :2.00   Max.   :5.000   Max.   :3 

                                                    NA's   :1.000             

-------------------------------------------------------------

gender: m

    workshop   gender       q1            q2             q3            q4    

 Min.   :1.0   f:0    Min.   :4.0   Min.   :3.00   Min.   :2.0   Min.   :4.0 

 1st Qu.:1.0   m:4    1st Qu.:4.0   1st Qu.:3.75   1st Qu.:3.5   1st Qu.:4.0 

 Median :1.5          Median :4.5   Median :4.50   Median :4.5   Median :4.5 

 Mean   :1.5          Mean   :4.5   Mean   :4.25   Mean   :4.0   Mean   :4.5 

 3rd Qu.:2.0          3rd Qu.:5.0   3rd Qu.:5.00   3rd Qu.:5.0   3rd Qu.:5.0 

 Max.   :2.0          Max.   :5.0   Max.   :5.00   Max.   :5.0   Max.   :5.0

 

# 열 이름으로 선택한 변수에 대하여 성별의 각 값에 대한 요약 통계.

by( mydata[c("q1","q2","q3","q4")] , gender, summary)

gender: f

       q1             q2             q3              q4  

 Min.   :1.00   Min.   :1.00   Min.   :4.000   Min.   :1 

 1st Qu.:1.75   1st Qu.:1.00   1st Qu.:4.000   1st Qu.:1 

 Median :2.00   Median :1.00   Median :4.000   Median :2 

 Mean   :2.00   Mean   :1.25   Mean   :4.333   Mean   :2 

 3rd Qu.:2.25   3rd Qu.:1.25   3rd Qu.:4.500   3rd Qu.:3 

 Max.   :3.00   Max.   :2.00   Max.   :5.000   Max.   :3 

                               NA's   :1.000             

-------------------------------------------------------------

gender: m

       q1            q2             q3            q4    

 Min.   :4.0   Min.   :3.00   Min.   :2.0   Min.   :4.0 

 1st Qu.:4.0   1st Qu.:3.75   1st Qu.:3.5   1st Qu.:4.0 

 Median :4.5   Median :4.50   Median :4.5   Median :4.5 

 Mean   :4.5   Mean   :4.25   Mean   :4.0   Mean   :4.5 

 3rd Qu.:5.0   3rd Qu.:5.00   3rd Qu.:5.0   3rd Qu.:5.0 

 Max.   :5.0   Max.   :5.00   Max.   :5.0   Max.   :5.0

 

# 여러 범주형 변수를 기준으로 할 때는 리스트로 지정해야 하며, data.frame 함수로 만든 데이터 프레임도 리스트로 사용할 수 있다.

# 데이터는 workshop과 gender로 정렬될 필요가 없다.

# workshop과 gender 변수를 기준으로 각 값에 대한 요약 통계.

by(mydata[c("q1","q2","q3","q4")],

  data.frame(workshop,gender), summary)

workshop: 1  gender: f

       q1             q2             q3             q4    

 Min.   :1.00   Min.   :1.00   Min.   :4.00   Min.   :1.0 

 1st Qu.:1.25   1st Qu.:1.25   1st Qu.:4.25   1st Qu.:1.5 

 Median :1.50   Median :1.50   Median :4.50   Median :2.0 

 Mean   :1.50   Mean   :1.50   Mean   :4.50   Mean   :2.0 

 3rd Qu.:1.75   3rd Qu.:1.75   3rd Qu.:4.75   3rd Qu.:2.5 

 Max.   :2.00   Max.   :2.00   Max.   :5.00   Max.   :3.0 

-------------------------------------------------------------

workshop: 2  gender: f

       q1             q2          q3          q4    

 Min.   :2.00   Min.   :1   Min.   :4   Min.   :1.0 

 1st Qu.:2.25   1st Qu.:1   1st Qu.:4   1st Qu.:1.5 

 Median :2.50   Median :1   Median :4   Median :2.0 

 Mean   :2.50   Mean   :1   Mean   :4   Mean   :2.0 

 3rd Qu.:2.75   3rd Qu.:1   3rd Qu.:4   3rd Qu.:2.5 

 Max.   :3.00   Max.   :1   Max.   :4   Max.   :3.0 

                            NA's   :1               

-------------------------------------------------------------

workshop: 1  gender: m

       q1             q2            q3            q4  

 Min.   :4.00   Min.   :3.0   Min.   :2.0   Min.   :4 

 1st Qu.:4.25   1st Qu.:3.5   1st Qu.:2.5   1st Qu.:4 

 Median :4.50   Median :4.0   Median :3.0   Median :4 

 Mean   :4.50   Mean   :4.0   Mean   :3.0   Mean   :4 

 3rd Qu.:4.75   3rd Qu.:4.5   3rd Qu.:3.5   3rd Qu.:4 

 Max.   :5.00   Max.   :5.0   Max.   :4.0   Max.   :4 

-------------------------------------------------------------

workshop: 2  gender: m

       q1             q2             q3          q4  

 Min.   :4.00   Min.   :4.00   Min.   :5   Min.   :5 

 1st Qu.:4.25   1st Qu.:4.25   1st Qu.:5   1st Qu.:5 

 Median :4.50   Median :4.50   Median :5   Median :5 

 Mean   :4.50   Mean   :4.50   Mean   :5   Mean   :5 

 3rd Qu.:4.75   3rd Qu.:4.75   3rd Qu.:5   3rd Qu.:5 

 Max.   :5.00   Max.   :5.00   Max.   :5   Max.   :5

 

# 위 예제에서 by 문 안의 옵션을 사전에 정의하여 처리.

myVars <- c("q1","q2","q3","q4")

myBys <- data.frame(workshop,gender)

by( mydata[myVars], myBys, summary)

workshop: 1  gender: f

       q1             q2             q3             q4    

 Min.   :1.00   Min.   :1.00   Min.   :4.00   Min.   :1.0 

 1st Qu.:1.25   1st Qu.:1.25   1st Qu.:4.25   1st Qu.:1.5 

 Median :1.50   Median :1.50   Median :4.50   Median :2.0 

 Mean   :1.50   Mean   :1.50   Mean   :4.50   Mean   :2.0 

 3rd Qu.:1.75   3rd Qu.:1.75   3rd Qu.:4.75   3rd Qu.:2.5 

 Max.   :2.00   Max.   :2.00   Max.   :5.00   Max.   :3.0 

-------------------------------------------------------------

workshop: 2  gender: f

       q1             q2          q3          q4    

 Min.   :2.00   Min.   :1   Min.   :4   Min.   :1.0 

 1st Qu.:2.25   1st Qu.:1   1st Qu.:4   1st Qu.:1.5 

 Median :2.50   Median :1   Median :4   Median :2.0 

 Mean   :2.50   Mean   :1   Mean   :4   Mean   :2.0 

 3rd Qu.:2.75   3rd Qu.:1   3rd Qu.:4   3rd Qu.:2.5 

 Max.   :3.00   Max.   :1   Max.   :4   Max.   :3.0 

                            NA's   :1               

-------------------------------------------------------------

workshop: 1  gender: m

       q1             q2            q3            q4  

 Min.   :4.00   Min.   :3.0   Min.   :2.0   Min.   :4 

 1st Qu.:4.25   1st Qu.:3.5   1st Qu.:2.5   1st Qu.:4 

 Median :4.50   Median :4.0   Median :3.0   Median :4 

 Mean   :4.50   Mean   :4.0   Mean   :3.0   Mean   :4 

 3rd Qu.:4.75   3rd Qu.:4.5   3rd Qu.:3.5   3rd Qu.:4 

 Max.   :5.00   Max.   :5.0   Max.   :4.0   Max.   :4 

-------------------------------------------------------------

workshop: 2  gender: m

       q1             q2             q3          q4  

 Min.   :4.00   Min.   :4.00   Min.   :5   Min.   :5 

 1st Qu.:4.25   1st Qu.:4.25   1st Qu.:5   1st Qu.:5 

 Median :4.50   Median :4.50   Median :5   Median :5 

 Mean   :4.50   Mean   :4.50   Mean   :5   Mean   :5 

 3rd Qu.:4.75   3rd Qu.:4.75   3rd Qu.:5   3rd Qu.:5 

 Max.   :5.00   Max.   :5.00   Max.   :5   Max.   :5

 

 

4. S-PLUS

MAIN

# By 또는 Split 파일 프로세싱을 위한 S-PLUS 프로그램.

mydata<-read.table ("c:/data/mydata.csv",header=TRUE,

  sep=",",row.names="id")

print(mydata)

attach(mydata) # mydata를 기본 데이터로 지정.

 

# 관측치와 모든 변수의 요약 통계 구하기.

summary(mydata)

workshop   gender          q1             q2             q3               q4     

    Min.:1.0   f:4         Min.:1.00      Min.:1.00      Min.:2.000      Min.:1.00 

 1st Qu.:1.0   m:4      1st Qu.:2.00   1st Qu.:1.00   1st Qu.:4.000   1st Qu.:2.50 

  Median:1.5             Median:3.50    Median:2.50    Median:4.000    Median:3.50 

    Mean:1.5               Mean:3.25      Mean:2.75      Mean:4.143      Mean:3.25 

 3rd Qu.:2.0            3rd Qu.:4.25   3rd Qu.:4.25   3rd Qu.:5.000   3rd Qu.:4.25 

    Max.:2.0               Max.:5.00      Max.:5.00      Max.:5.000      Max.:5.00

                                                         NA's:1.000 

 

# 모든 변수에 대하여 각 성별에 대한 요약 통계 구하기.

by(mydata, gender, summary)

gender:f

    workshop   gender          q1             q2             q3               q4  

    Min.:1.0   f:4         Min.:1.00      Min.:1.00      Min.:4.000      Min.:1 

 1st Qu.:1.0   m:0      1st Qu.:1.75   1st Qu.:1.00   1st Qu.:4.000   1st Qu.:1 

  Median:1.5             Median:2.00    Median:1.00    Median:4.000    Median:2 

    Mean:1.5               Mean:2.00      Mean:1.25      Mean:4.333      Mean:2 

 3rd Qu.:2.0            3rd Qu.:2.25   3rd Qu.:1.25   3rd Qu.:4.500   3rd Qu.:3 

    Max.:2.0               Max.:3.00      Max.:2.00      Max.:5.000      Max.:3 

                                                         NA's:1.000 

-----------------------------------------------------------

gender:m

    workshop   gender          q1            q2             q3             q4    

    Min.:1.0   f:0         Min.:4.0      Min.:3.00      Min.:2.0      Min.:4.0 

 1st Qu.:1.0   m:4      1st Qu.:4.0   1st Qu.:3.75   1st Qu.:3.5   1st Qu.:4.0 

  Median:1.5             Median:4.5    Median:4.50    Median:4.5    Median:4.5 

    Mean:1.5               Mean:4.5      Mean:4.25      Mean:4.0      Mean:4.5 

 3rd Qu.:2.0            3rd Qu.:5.0   3rd Qu.:5.00   3rd Qu.:5.0   3rd Qu.:5.0 

    Max.:2.0               Max.:5.0      Max.:5.00      Max.:5.0      Max.:5.0

 

# 열 이름에 의해 선택된 변수에 대하여 성별의 각 값에 대한 요약 통계.

by( mydata[c("q1","q2","q3","q4")] , gender, summary)

gender:f

        q1             q2             q3              q4  

    Min.:1.00      Min.:1.00      Min.:4.000      Min.:1 

 1st Qu.:1.75   1st Qu.:1.00   1st Qu.:4.000   1st Qu.:1 

  Median:2.00    Median:1.00    Median:4.000    Median:2 

    Mean:2.00      Mean:1.25      Mean:4.333      Mean:2 

 3rd Qu.:2.25   3rd Qu.:1.25   3rd Qu.:4.500   3rd Qu.:3 

    Max.:3.00      Max.:2.00      Max.:5.000      Max.:3 

                                  NA's:1.000             

-----------------------------------------------------------

gender:m

        q1            q2             q3            q4    

    Min.:4.0      Min.:3.00      Min.:2.0      Min.:4.0 

 1st Qu.:4.0   1st Qu.:3.75   1st Qu.:3.5   1st Qu.:4.0 

  Median:4.5    Median:4.50    Median:4.5    Median:4.5 

    Mean:4.5      Mean:4.25      Mean:4.0      Mean:4.5 

 3rd Qu.:5.0   3rd Qu.:5.00   3rd Qu.:5.0   3rd Qu.:5.0 

    Max.:5.0      Max.:5.00      Max.:5.0      Max.:5.0

 

# 여러 범주형 변수를 기준으로 할 때는 리스트로 지정해야 하며, data.frame 함수로 만든 데이터 프레임도 리스트로 사용할 수 있다.

# 데이터는 workshop과 gender로 정렬될 필요가 없다.

# workshop과 gender 변수를 기준으로 각 값에 대한 요약 통계.

by(mydata[c("q1","q2","q3","q4")],

   data.frame(workshop,gender), summary)

workshop:1  gender:f

        q1             q2             q3             q4    

    Min.:1.00      Min.:1.00      Min.:4.00      Min.:1.0 

 1st Qu.:1.25   1st Qu.:1.25   1st Qu.:4.25   1st Qu.:1.5 

  Median:1.50    Median:1.50    Median:4.50    Median:2.0 

    Mean:1.50      Mean:1.50      Mean:4.50      Mean:2.0 

 3rd Qu.:1.75   3rd Qu.:1.75   3rd Qu.:4.75   3rd Qu.:2.5 

    Max.:2.00      Max.:2.00      Max.:5.00      Max.:3.0 

-----------------------------------------------------------

workshop:2  gender:f

        q1             q2          q3          q4    

    Min.:2.00      Min.:1      Min.:4      Min.:1.0 

 1st Qu.:2.25   1st Qu.:1   1st Qu.:4   1st Qu.:1.5 

  Median:2.50    Median:1    Median:4    Median:2.0 

    Mean:2.50      Mean:1      Mean:4      Mean:2.0 

 3rd Qu.:2.75   3rd Qu.:1   3rd Qu.:4   3rd Qu.:2.5 

    Max.:3.00      Max.:1      Max.:4      Max.:3.0 

                               NA's:1               

-----------------------------------------------------------

workshop:1  gender:m

        q1             q2            q3            q4  

    Min.:4.00      Min.:3.0      Min.:2.0      Min.:4 

 1st Qu.:4.25   1st Qu.:3.5   1st Qu.:2.5   1st Qu.:4 

  Median:4.50    Median:4.0    Median:3.0    Median:4 

    Mean:4.50      Mean:4.0      Mean:3.0      Mean:4 

 3rd Qu.:4.75   3rd Qu.:4.5   3rd Qu.:3.5   3rd Qu.:4 

    Max.:5.00      Max.:5.0      Max.:4.0      Max.:4 

-----------------------------------------------------------

workshop:2  gender:m

        q1             q2             q3          q4  

    Min.:4.00      Min.:4.00      Min.:5      Min.:5 

 1st Qu.:4.25   1st Qu.:4.25   1st Qu.:5   1st Qu.:5 

  Median:4.50    Median:4.50    Median:5    Median:5 

    Mean:4.50      Mean:4.50      Mean:5      Mean:5 

 3rd Qu.:4.75   3rd Qu.:4.75   3rd Qu.:5   3rd Qu.:5 

    Max.:5.00      Max.:5.00      Max.:5      Max.:5 

 

# 위 예제에서 by 문 안의 옵션을 사전에 정의.

myVars <- c("q1","q2","q3","q4")

myBys <- data.frame(workshop,gender)

by( mydata[myVars], myBys, summary)

workshop:1  gender:f

        q1             q2             q3             q4    

    Min.:1.00      Min.:1.00      Min.:4.00      Min.:1.0 

 1st Qu.:1.25   1st Qu.:1.25   1st Qu.:4.25   1st Qu.:1.5 

  Median:1.50    Median:1.50    Median:4.50    Median:2.0 

    Mean:1.50      Mean:1.50      Mean:4.50      Mean:2.0 

 3rd Qu.:1.75   3rd Qu.:1.75   3rd Qu.:4.75   3rd Qu.:2.5 

    Max.:2.00      Max.:2.00      Max.:5.00      Max.:3.0 

-----------------------------------------------------------

workshop:2  gender:f

        q1             q2          q3          q4    

    Min.:2.00      Min.:1      Min.:4      Min.:1.0 

 1st Qu.:2.25   1st Qu.:1   1st Qu.:4   1st Qu.:1.5 

  Median:2.50    Median:1    Median:4    Median:2.0 

    Mean:2.50      Mean:1      Mean:4      Mean:2.0 

 3rd Qu.:2.75   3rd Qu.:1   3rd Qu.:4   3rd Qu.:2.5 

    Max.:3.00      Max.:1      Max.:4      Max.:3.0 

                               NA's:1               

-----------------------------------------------------------

workshop:1  gender:m

        q1             q2            q3            q4  

    Min.:4.00      Min.:3.0      Min.:2.0      Min.:4 

 1st Qu.:4.25   1st Qu.:3.5   1st Qu.:2.5   1st Qu.:4 

  Median:4.50    Median:4.0    Median:3.0    Median:4 

    Mean:4.50      Mean:4.0      Mean:3.0      Mean:4 

 3rd Qu.:4.75   3rd Qu.:4.5   3rd Qu.:3.5   3rd Qu.:4 

    Max.:5.00      Max.:5.0      Max.:4.0      Max.:4  

-----------------------------------------------------------

workshop:2  gender:m

        q1             q2             q3          q4  

    Min.:4.00      Min.:4.00      Min.:5      Min.:5 

 1st Qu.:4.25   1st Qu.:4.25   1st Qu.:5   1st Qu.:5 

  Median:4.50    Median:4.50    Median:5    Median:5 

    Mean:4.50      Mean:4.50      Mean:5      Mean:5 

 3rd Qu.:4.75   3rd Qu.:4.75   3rd Qu.:5   3rd Qu.:5 

    Max.:5.00      Max.:5.00      Max.:5      Max.:5 

 

 


5. PROC SQL

MAIN

* By 또는 Split 파일 프로세싱을 위한 PROC SQL 프로그램.;

* Group by 옵션을 이용하여, 성별을 기준으로 지정된 각 함수 처리;

proc sql;

  create table mydata as

    select gender,

           sum(q1)  as q1_sum,

           mean(q2) as q2_mean,

           min(q3)  as q3_min,

           max(q4)  as q4_max

    from   BACK.mydata a

    group by gender;

  select * from mydata;

quit;

gender      q1_sum   q2_mean    q3_min    q4_max

------------------------------------------------

f                8      1.25         4         3

m               18      4.25         2         5

 

proc sql;

  create table mydata as

    select workshop,

           gender,

           sum(q1)  as q1_sum,

           mean(q2) as q2_mean,

           min(q3)  as q3_min,

           max(q4)  as q4_max

    from   BACK.mydata a

    group by workshop,

             gender;

  select * from mydata;

quit;

workshop  gender      q1_sum   q2_mean    q3_min    q4_max

----------------------------------------------------------

       1  f                3       1.5         4         3

       1  m                9         4         2         4

       2  f                5         1         4         3

       2  m                9       4.5         5         5