You are on page 1 of 4

SecB Q1 (FA20028)

library(readxl)
# Load the survey workbook into Q_1.
# The path is assembled with file.path() so that the "Network Shortcuts"
# folder name stays on one line; a string literal wrapped across two source
# lines would embed a newline in the path and the file would not be found.
Q_1 <- read_excel(
  file.path(
    "C:/Users/gitan/AppData/Roaming/Microsoft/Windows",
    "Network Shortcuts",
    "Q_167026_2.xlsx"
  )
)
# Open the imported data in the spreadsheet-style viewer for a quick check.
View(Q_1)

a) Mean salary of females


# Mean EstimatedSalary for each Gender level (formula: response ~ group).
# The result has one row per gender; the "Female" row answers part a).
# Because the columns are written as Q_1$..., the printed headers keep
# that prefix.
aggregate(Q_1$EstimatedSalary ~ Q_1$Gender, FUN = mean)

## Q_1$Gender Q_1$EstimatedSalary
## 1 Female 71759.80
## 2 Male 67642.86

b) Mean salaries of people who have and have not purchased a house
# Mean EstimatedSalary for each value of Purchased_house (0 / 1 flag;
# presumably 1 = purchased, 0 = not purchased -- confirm against the data
# dictionary). One row is returned per flag value.
aggregate(Q_1$EstimatedSalary ~ Q_1$Purchased_house, FUN = mean)

## Q_1$Purchased_house Q_1$EstimatedSalary
## 1 0 60544.75
## 2 1 86272.73

c) Mean age of males who have purchased the house


# Mean Age for every Gender x Purchased_house combination.
# Part c) asks only about males who purchased, i.e. the row with
# Gender == "Male" and Purchased_house == 1 in the resulting table.
aggregate(Q_1$Age ~ Q_1$Gender + Q_1$Purchased_house, FUN = mean)

## Q_1$Gender Q_1$Purchased_house Q_1$Age


## 1 Female 0 33.11024
## 2 Male 0 32.48462
## 3 Female 1 47.15584
## 4 Male 1 45.50000

d) Is there any relationship between gender and estimated salary?


# Two-sample t-test comparing mean EstimatedSalary between the two Gender
# groups (formula: numeric response ~ two-level grouping variable).
# With the default arguments this is the Welch (unequal-variance) test and
# the alternative hypothesis is two-sided.
t.test(Q_1$EstimatedSalary ~ Q_1$Gender)

##
## Welch Two Sample t-test
##
## data: Q_1$EstimatedSalary by Q_1$Gender
## t = 1.2101, df = 396.88, p-value = 0.2269
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -2571.367 10805.261
## sample estimates:
## mean in group Female mean in group Male
## 71759.80 67642.86

As the p-value (0.2269) is greater than 0.05, we fail to reject the null hypothesis:
there is no statistically significant relationship between gender and estimated salary.

e) Dataset of female who have purchased house


# Keep only the rows for female respondents whose Purchased_house flag is 1.
# which() turns the logical mask into row positions and drops any NA
# comparisons, so rows with missing values are excluded.
Fem_house <- Q_1[
  which(Q_1$Gender == "Female" & Q_1$Purchased_house == 1),
]
# Preview the first six rows of the filtered data.
head(Fem_house, n = 6L)

## # A tibble: 6 x 5
## `User ID` Gender Age EstimatedSalary Purchased_house
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 15694829 Female 32 150000 1
## 2 15621083 Female 48 29000 1
## 3 15736760 Female 47 49000 1
## 4 15599081 Female 45 22000 1
## 5 15633531 Female 47 30000 1
## 6 15729054 Female 27 137000 1

f) Barchart of Gender and House purchased


# Cross-tabulate Gender (rows) against Purchased_house (columns).
tQ1 <- table(Q_1$Gender, Q_1$Purchased_house)

# Grouped bar chart: one cluster per Purchased_house value, one bar per gender.
# TRUE is spelled out rather than T, because T is an ordinary variable that
# can be reassigned, whereas TRUE is a reserved constant.
bQ1 <- barplot(
  tQ1,
  beside = TRUE,       # draw the genders side by side instead of stacked
  col = c(3, 4),       # palette colours 3 and 4, one per gender
  legend.text = TRUE,  # legend labels are taken from the row names of tQ1
  ylim = c(0, 150)
)
g) Histogram of age

# Histogram of Age on the density scale (freq = FALSE) so that the kernel
# density curve added below shares the same y-axis.
hist(Q_1$Age, freq = FALSE, col = 3)
# Overlay the estimated density of Age as a thick line in palette colour 7.
lines(density(Q_1$Age), col=7, lwd=3)

As we can see in the histogram above, the bulk of the distribution lies slightly
towards the left (lower ages), which suggests that the respondents in the survey
were, on the whole, relatively young.

You might also like