You are on page 1of 11

R Working Manuals

Shankar MM
Mentor – Research & Training
mmshankar@gmail.com
CARES, Bangalore

1. Set the working directory


# Replace "Pathname" with the folder that holds the data files.
# R string literals need straight quotes; curly quotes are a syntax error.
setwd("Pathname")

2. Import the emp dataset into R

emp<-read.csv('emp.csv')

3. Exports in R

Export to txt
write.table(emp, file = 'emp.txt', sep="\t")
Export to CSV
# R is case-sensitive: the function is write.csv(), and strings need straight
# quotes. emp is the data frame imported above.
write.csv(emp, file = "newdataset.csv")

install.packages("foreign")
library(foreign)

Export to SPSS
# The second argument is the data file and the third is the code (syntax) file.
# Each path must sit on one line: a line break inside the quotes would become
# part of the file name.
write.foreign(emp, "Pathname\\emp.txt", "Pathname\\emp.sps",
package = "SPSS")
Export to SAS
write.foreign(emp, "Pathname\\emp.txt", "Pathname\\emp.sas",
package="SAS")

To know the names of variables in dataset


names(emp)
[1] "id" "gender" "bdate" "educ" "jobcat" "salary" "salbegin" "jobtime"
"prevexp" "minority"

To view the head (the first 6 rows) of the dataset


head(emp)
id gender bdate educ jobcat salary salbegin jobtime
prevexp minority
2 m 5/23/1958 16 1 40200 18750 98
36 0
3 f 7/26/1929 12 1 21450 12000 98
381 0
4 f 4/15/1947 8 1 21900 13200 98
190 0
5 m 2/9/1955 15 1 45000 21000 98
138 0
6 m 8/22/1958 15 1 32100 13500 98
67 0
7 m 4/26/1956 15 1 36000 18750 98
114 0

To view the tail (the last 6 rows) of the dataset


tail(emp)
id gender bdate educ jobcat salary salbegin jobtime
prevexp minority
455 m 1/17/1964 16 3 43650 19500
65 19 0
456 m 10/17/1959 19 3 75000 42510 65 54
0
458 m 7/6/1965 19 3 61875 28740 65 26
0
462 f 10/18/1963 16 3 34410 19500 65 79
0
464 m 3/20/1962 19 3 47550 33000 64 27
0
468 f 11/28/1965 16 3 55750 19980 64 36
0

1. To know structure of the dataset


str(emp)
'data.frame': 474 obs. of 10 variables:
$ id : int 2 3 4 5 6 7 8 9 10 11 ...
$ gender : Factor w/ 2 levels "f","m": 2 1 1 2 2 2 1 1 1 1 ...
$ bdate : Factor w/ 462 levels " ","1/10/1964",..: 275 3 240
290 36 1...
$ educ : int 16 12 8 15 15 15 12 15 12 16 ...
$ jobcat : int 1 1 1 1 1 1 1 1 1 1 ...
$ salary : int 40200 21450 21900 45000 32100 36000 21900 27900
30300 ...
$ salbegin: int 18750 12000 13200 21000 13500 18750 9750 13500
16500 ...
$ jobtime : int 98 98 98 98 98 98 98 98 98 98 ...
$ prevexp : int 36 381 190 138 67 114 0 115 244 143 ...
$ minority: int 0 0 0 0 0 0 0 0 0 0 ...

2. To know the dimension of the set

> dim(emp)
[1] 474 10

3. Class of the dataset


> class(emp)
[1] "data.frame"

4. Summary of the data

One variable: summary(emp$salary)
More than one variable: summary(emp[, c("salary", "salbegin")])


5. Selecting the variables and observations
 To select the 10 obs:
> emp10obs<-emp[1:10,]
> emp10obs
id gender bdate educ jobcat salary salbegin jobtime
prevexp minority
1 2 m 5/23/1958 16 1 40200 18750 98
36 0
2 3 f 7/26/1929 12 1 21450 12000 98
381 0
3 4 f 4/15/1947 8 1 21900 13200 98
190 0
4 5 m 2/9/1955 15 1 45000 21000 98
138 0
5 6 m 8/22/1958 15 1 32100 13500 98
67 0
6 7 m 4/26/1956 15 1 36000 18750 98
114 0
7 8 f 5/6/1966 12 1 21900 9750 98
0 0
8 9 f 1/23/1946 15 1 27900 12750 98
115 0
9 10 f 2/13/1946 12 1 24000 13500 98
244 0
10 11 f 2/7/1950 16 1 30300 16500 98
143 0
 To select the 5 variable
> emp5var<-emp[,1:5]
> emp5var
id gender bdate educ jobcat
1 2 m 5/23/1958 16 1
2 3 f 7/26/1929 12 1
3 4 f 4/15/1947 8 1
4 5 m 2/9/1955 15 1
5 6 m 8/22/1958 15 1
6 7 m 4/26/1956 15 1
7 8 f 5/6/1966 12 1
8 9 f 1/23/1946 15 1
9 10 f 2/13/1946 12 1
10 11 f 2/7/1950 16 1
11 12 m 1/11/1966 8 1
6. Select the continuous variables
> emp_count<-emp[,c(4,6,7,8,9)]
> emp_count

Summary by using the "psych" library


install.packages("psych")
library(psych)
7. Describe the dataset: describe(emp_count)
8. Describe by group: describeBy(emp_count, group = emp$gender)

Table function frequency distribution


Table statement one dimension
> table(emp$jobcat)

1 2 3
363 27 84

> prop.table(table(emp$jobcat))

1 2 3
0.76582278 0.05696203 0.17721519

> round(prop.table(table(emp$jobcat)), digits = 2)

1 2 3
0.77 0.06 0.18

Table statement two dimension


> table(emp$gender, emp$jobcat)

1 2 3
f 206 0 10
m 157 27 74

# cross table
# install.packages() is lower-case and takes the package name as a quoted
# string; the package that provides CrossTable() is "gmodels".
install.packages("gmodels")
library(gmodels)

> CrossTable(emp$jobcat,emp$gender)

Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|

Total Observations in Table: 474

| emp$gender
emp$jobcat | f | m | Row Total |
-------------|-----------|-----------|-----------|
clerical | 206 | 157 | 363 |
| 9.956 | 8.335 | |
| 0.567 | 0.433 | 0.766 |
| 0.954 | 0.609 | |
| 0.435 | 0.331 | |
-------------|-----------|-----------|-----------|
custodian | 0 | 27 | 27 |
| 12.304 | 10.301 | |
| 0.000 | 1.000 | 0.057 |
| 0.000 | 0.105 | |
| 0.000 | 0.057 | |
-------------|-----------|-----------|-----------|
manager | 10 | 74 | 84 |
| 20.891 | 17.490 | |
| 0.119 | 0.881 | 0.177 |
| 0.046 | 0.287 | |
| 0.021 | 0.156 | |
-------------|-----------|-----------|-----------|
Column Total | 216 | 258 | 474 |
| 0.456 | 0.544 | |
-------------|-----------|-----------|-----------|
Data Management using Cut, breaks and Add labels

> label<-c('0-50000','50000-100000','100000-150000')
> emp_label<-data.frame(emp, label=cut(emp$salary,
c(0,50000,100000,150000), labels = label))

Loop Function using various apply


apply

> apply(emp[,c(6,7)], 2, mean)


salary salbegin
34419.57 17016.09
sapply

> sapply(emp[,c(6,7)], sum)


salary salbegin
16314875 8065625

lapply

> lapply(emp[,c(6,7)],sum)
$salary
[1] 16314875

$salbegin
[1] 8065625

tapply
> em<-as.list(emp)
> tapply(em$salary, em$gender, mean)
f m
26031.92 41441.78
> tapply(em$jobcat, em$gender, sum)
f m
236 433

Aggregate function

# aggregate salary by gender


> agg_gen<-aggregate(salary~gender, emp, mean)
> agg_gen
gender salary
1 f 26031.92
2 m 41441.78

# one way anova

# One-way ANOVA needs a grouping factor. str(emp) shows jobcat stored as
# integer codes; left that way, aov() would fit a single slope (1 Df) and
# TukeyHSD() would fail. Convert once, using the category labels that appear
# in the output (1 = clerical, 2 = custodian, 3 = manager). The guard keeps
# an already-converted factor untouched.
if (!is.factor(emp$jobcat)) {
  emp$jobcat <- factor(emp$jobcat, levels = 1:3,
                       labels = c("clerical", "custodian", "manager"))
}
anova_emp <- aov(emp$salary ~ emp$jobcat)

summary(anova_emp)
Df Sum Sq Mean Sq F value Pr(>F)
emp$jobcat 2 8.944e+10 4.472e+10 434.5 <2e-16 ***
Residuals 471 4.848e+10 1.029e+08
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

TukeyHSD(anova_emp)
Tukey multiple comparisons of means
95% family-wise confidence level

Fit: aov(formula = emp$salary ~ emp$jobcat)

$`emp$jobcat`
diff lwr upr p adj
custodian-clerical 3100.349 -1657.805 7858.503 0.2768689
manager-clerical 36139.258 33251.225 39027.291 0.0000000
manager-custodian 33038.909 27761.979 38315.839 0.0000000
Simple Linear regression
Data set used : women - R inbuilt dataset

Ind. Var. = Height

Dep. Var. = Weight

To apply linear model on data

# Fit weight as a linear function of height on the built-in women dataset.
# R is case-sensitive, so the object is named womenreg everywhere it is used,
# and the summary function is summary(), not Summary().
womenreg <- lm(weight ~ height, data = women)

summary(womenreg)

Check R-squared, the residual standard error, the F value, the p-value and the coefficient values to assess the model's goodness of fit and
significance.

To predict the Dep. Var. weight,

Womenpred <- fitted(womenreg)

Multiple linear regression with Emp data

Dep. Var. = salary; Ind. Var. = educ, jobcat

Split the data into training and testing

# Split the rows at random into 350 training rows and the remaining test rows.
# emp appears to be ordered by jobcat (head() shows only jobcat 1, tail() only
# jobcat 3), so taking the first 350 rows could leave job categories out of
# the training data. set.seed() makes the split reproducible.
set.seed(123)
train_rows <- sample(nrow(emp), 350)
emptrain <- emp[train_rows, ]

# The comma is required: emp[rows, ] selects rows, emp[cols] selects columns.
emptest <- emp[-train_rows, ]

# Model training
empreg <- lm(salary ~ educ + jobcat, data = emptrain)

# Model testing: predicted salaries for the held-out rows
emppred <- predict(empreg, newdata = emptest)

#################### Data Reduction Technique using PCA in Factor Analysis ####################

use taj.csv data


# Principal-component data reduction on the taj survey data.
# psych supplies describe(), pairs.panels() and principal(), so it is loaded
# before the first of those calls.
library(psych)

# Replace "Path" with the location of taj.csv (straight quotes are required).
taj <- read.csv("Path")
dim(taj)
describe(taj)

# Missing values: first for one column, then for every column.
missvar <- sum(is.na(taj$MenuSelection))
missvar
colSums(is.na(taj))

# Keep complete rows only.
taj1 <- na.omit(taj)
dim(taj1)

# Drop columns 1, 2 and 14 so that only the items to be reduced remain.
# NOTE(review): presumably identifiers and the outcome rating; confirm
# against taj.csv.
taj2 <- taj1[, -c(1, 2, 14)]
colMeans(taj2)

# Inspect pairwise relationships and the correlation matrix.
pairs.panels(taj2[1:6])
cor(taj2)

# Eigenvalues of the correlation matrix.
ev <- eigen(cor(taj2))
ev$values

# Extract three unrotated components with regression-based scores.
fitfa <- principal(taj2, nfactors = 3,
                   rotate = "none", scores = TRUE,
                   method = "regression")
fitfa$rot.mat
fitfa$communality
fitfa$scores
summary(fitfa)
fitfa$loadings

# Name the component scores and attach the outcome rating as the first column.
factscore <- fitfa$scores
class(factscore)
colnames(factscore) <- c("service", "cuisine", "ambience")
head(factscore)
factscore <- as.data.frame(factscore)
factscore <- cbind(Experience_MoneyValue = taj1$Experience_MoneyValue,
                   factscore)
head(factscore)
dim(factscore)
colMeans(factscore)

# Correlations among the three component scores (outcome column excluded).
corfact <- cor(factscore[, -1])

You might also like