You are on page 1of 11

資科三 A 10173143 柯建


2023-12-12
# Part 1: Exploring the Dataset
data(airquality) 導入資料集,也是 R 內建的資料集
str(airquality) 顯示資料集結構

# Load the airquality dataset


data(airquality)
# Part 1: Exploring the Dataset
str(airquality)

## 'data.frame': 153 obs. of 6 variables:
## $ Ozone  : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind   : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp   : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month  : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day    : int 1 2 3 4 5 6 7 8 9 10 ...
head(airquality)

## Ozone Solar.R Wind Temp Month Day
head(airquality) 顯示前六筆資料

summary(airquality)

summary(airquality) 顯示各變數結

boxplot(airquality)

boxplot(airquality) 顯示變數箱形圖

2
5
0

1
5
0
5
0
0
Ozo Sola Wi Te Mo D
ne r.R nd mp nth a
y

data <- airquality[, c("Ozone", "Solar.R", "Wind", "Temp")]


boxplot(data)
data <- airquality[, c("Ozone", "Solar.R", "Wind", "Temp")] 後續只取其中四個變數分析

boxplot(data) 顯示變數箱形圖
2
5
0

1
5
0
5
0
0
Ozo Sola Wi Te
ne r.R nd mp
# Calculate correlation matrix
cor_matrix <- cor(data) 兩兩變數間的相關性矩陣

# Calculate correlation matrix
# Ozone and Solar.R contain NAs; without `use`, cor() returns NA for every
# pair involving them (corrplot then draws those cells as "?"). Restricting
# the computation to complete rows yields a fully populated matrix.
cor_matrix <- cor(data, use = "complete.obs")
# Visualize correlations
library(corrplot)

## corrplot 0.92 loaded
corrplot(cor_matrix, method = "color")

# Visualize correlations
library(corrplot)
corrplot(cor_matrix, method = "color") 把相關性矩陣畫成圖
1
O S W T
Ozo z o i e 0
? ? ? .
ne o l n m
8
n a d p 0
e r .
6
. 0
Sola ? R ? ? .
0
4
r.R .
02

−0
Wi ? ? .2
nd
−0
.4
−0
.6
Te ? ?
−0
mp .8

1
# Part 2: Descriptive Statistics and Plotting
# Base Plotting System with regression line and labels
# Scatter plot of Ozone against Temp (pch = 2 draws open triangles).
plot(x = airquality$Temp, y = airquality$Ozone, pch = 2)
# Overlay the least-squares line of Ozone on Temp in red.
abline(lm(Ozone ~ Temp, data = airquality), col = "red")
# Label each point with its row name, placed above the point (pos = 3).
text(airquality$Temp, airquality$Ozone, labels = row.names(airquality), pos = 3)
# Lattice with regression line
library(lattice)
# Part 2: Descriptive Statistics and Plotting
# Base Plotting System with regression line and labels
plot(x = airquality$Temp, y = airquality$Ozone, pch = 2) 畫 temp 與 ozone 的各數據點
abline(lm(Ozone ~ Temp, data = airquality), col = "red") 畫 temp 與 ozone 的迴歸線
text(airquality$Temp, airquality$Ozone, labels = row.names(airquality), pos = 3) 畫數據編號到各點

6
2 9 1
1 3 9
8 2
5 0 1
6 12674 1
0 0
8956808 123
681 10927 1
1 118
971100 122 2
1 0 6
0 10 9 9 2 12 5 0
1 13 1 16 72 97 40
69617981 10 1 26
1 21 1 3 98467
0 2 1 7 19149 428 7 8
4 6 4 2 1363891028
1 964
714
3 1 44
a 5 1 4
414 18 6
79 218 5 3 13091 7
27
i 0 15 920 1 4 4
13133 147 31
13171 611 76 42
404
9 1
88 21 23 14 0 145
1 412
13123 14
8 1251 15
1 011783 31 8 945
r 13850 332 5 41 94 5 3
4 11
q 2 736
0 1
u 6 7 8 9
a 0 0 0 0
l airquality$Te
i mp
t
y
xyplot(Ozone ~ Temp, data = airquality, type = c("p", "r"))
$
O
z
o
n
e
以另個畫法呈現上

1
5
0

1
0
0

O
5
z 0
o
n
e
0

6 7 8 9
0 0 0 0
Te
mp

# ggplot2
library(ggplot2)
# Scatter plot of Ozone against Temp with the black-and-white theme.
# `+` must end a line: a line that starts with `+` is parsed as a separate
# expression, so the layers would never be added to the ggplot() object.
ggplot(data = airquality) +
  geom_point(aes(x = Temp, y = Ozone)) +
  theme_bw()

## Warning: Removed 37 rows containing missing values (`geom_point()`).
再以另個畫法呈現上

1
5
0

1
0
0

O
z
5
o 0
n
e

6 7 8 9
0 0 Te 0 0
將資料遺失值,用平均值取代 m
p
# Part 3: Data Preprocessing
# Impute missing values with the mean of the column
data_imputed <- airquality
# Column means are computed once, ignoring NAs. Filling one column never
# changes another column's mean, so hoisting this out of the loop gives the
# same result as recomputing it on every iteration.
col_means <- colMeans(airquality, na.rm = TRUE)
# Iterate over however many columns the data frame has instead of a fixed 1:6.
for (i in seq_along(data_imputed)) {
  missing_rows <- is.na(data_imputed[[i]])
  data_imputed[missing_rows, i] <- col_means[[i]]
}
# Part 4: Regression Analysis
# Linear regression
進行 ozone 對於其他三個變數的迴歸分析, p<0.05 代表此效果顯著
# Fit a linear model of Ozone on Solar.R, Wind and Temp.
# NOTE(review): this fits on the raw `airquality` data, so rows with missing
# values are dropped by lm(); the mean-imputed `data_imputed` built earlier is
# not used here -- confirm that is intended.
model <- lm(formula = Ozone ~ Solar.R + Wind + Temp, data = airquality)
# Print coefficient estimates, standard errors and overall fit statistics.
summary(model)

##
## Call:
## lm(formula = Ozone ~ Solar.R + Wind + Temp, data = airquality)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -40.485 -14.219  -3.551  10.097  95.619
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -64.34208   23.05472  -2.791  0.00623 **
## Solar.R       0.05982    0.02319   2.580  0.01124 *
## Wind         -3.33359    0.65441  -5.094 1.52e-06 ***
## Temp          1.65209    0.25353   6.516 2.42e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.18 on 107 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.5948
## F-statistic: 54.83 on 3 and 107 DF, p-value: < 2.2e-16
# Residual plot
plot(model, which = 1)

建立迴歸分析的殘差圖,可以透過分析殘差趨勢,判斷分析方式適不適合

Residuals vs Fitted
117

1
0 6
3
0 0 2

5
0

R
0
e
s
i
d
u − − 0 20 40 60 8 1
a 5 2 Fitted values 0 0
l 0 0 lm(Ozone ~ Solar.R + Wind + 0
s Temp)

# Influence plot
library(car)

## Loading required package: carData
influencePlot(model)
點越大,顏色越深,表示此筆資料對於迴歸結果影響越大,因此第 117 筆資料

對於迴歸結果影響越大的資料

Cook's 0.2
D: 0 61
1
1
5 7
4 6
3 2
9 4
2 8

S
t
u
d − 0. 0. 0. 0. 0. 0.
e 2 0 0 0 0 1 1
n 2 4 6
Hat−Valu 8 0 2
t − es
i 1
zed Residuals
##      StudRes        Hat      CookD
## 9   1.887975 0.10273618 0.09964369
## 48  1.744348 0.11615763 0.09809937
## 62  2.896262 0.03966374 0.08101924
## 117 5.143983 0.04651548 0.26068742
# Shapiro-Wilk test for normality of residuals
shapiro.test(model$residual)
此檢定測試殘差有沒有符合常態假設, p<0.05 ,表示殘差沒有符合常態假設
##
## Shapiro-Wilk normality test
##
## data: model$residual
## W = 0.91709, p-value = 3.618e-06
# Durbin-Watson test for autocorrelation of residuals
durbinWatsonTest(model)

## lag Autocorrelation D-W Statistic p-value
##   1      0.03150895      1.935476   0.652
## Alternative hypothesis: rho != 0
此檢定測試殘差是否存在自相關, p<0.05 ,表示殘差存在自相關(違反獨立性假設);此處 p = 0.652 ,未發現顯著自相關

此檢定測試殘差有沒有符合同質假設, p<0.05 ,表示殘差沒有符合同質假設

# NCV test for non-constant variance of residuals


ncvTest(model)

## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 5.375405, Df = 1, p = 0.020423
# Prediction: use the fitted regression model to predict the Ozone value
# for one new observation.
new.airquality <- data.frame(Solar.R = 190, Wind = 7.4, Temp = 80)
# predict() evaluates `model` at the predictor values supplied above.
predicted_values <- predict(model, newdata = new.airquality)

# Predicted vs. Observed plot
# Observed Ozone against temperature, drawn as triangles (pch = 2).
with(
  airquality,
  plot(Temp, Ozone, pch = 2, xlab = "Temperature", ylab = "Ozone")
)
# Overlay the model's prediction for the new observation as a red plus sign.
points(new.airquality$Temp, predicted_values, pch = 3, col = "red")
# Legend symbols and colours mirror the two layers drawn above.
legend(
  "topleft",
  legend = c("Observed", "Predicted"),
  col = c("black", "red"),
  pch = c(2, 3)
)

將實際值與預測 ozone 值畫在圖上比



Observ
ed
Predict
1 ed
5
0

1
0
O 0
z
o 5
n 0
e
0
6 7 8 9
0 0 0 0
Temperat
ure
以 anova 檢定測試 ozone 是否能被 day 與 month 顯著地預測,由結果可知兩變數效應都未達統計顯著
# Perform ANOVA for different models
# One simple linear regression per candidate predictor (Month, then Day),
# each summarised by its ANOVA table. Every statement sits on its own line;
# two statements on one line without a separator do not parse.
a.lm <- lm(Ozone ~ Month, data = airquality)
anova_result_a <- anova(a.lm)

b.lm <- lm(Ozone ~ Day, data = airquality)
anova_result_b <- anova(b.lm)
# Check statistical significance
# Read the p-value through the exact column name "Pr(>F)"; `$Pr` only worked
# because `$` partially matches column names on a data frame.
month_p_value <- anova_result_a[["Pr(>F)"]][1]
if (month_p_value < 0.05) {
  cat("Model with Month is statistically significant\n")
} else {
  cat("Model with Month is not statistically significant\n")
}

## Model with Month is not statistically


significant
# Report whether Day is a significant predictor of Ozone at the 5% level.
# Read the p-value through the exact column name "Pr(>F)"; `$Pr` only worked
# because `$` partially matches column names on a data frame.
day_p_value <- anova_result_b[["Pr(>F)"]][1]
if (day_p_value < 0.05) {
  cat("Model with Day is statistically significant\n")
} else {
  cat("Model with Day is not statistically significant\n")
}

## Model with Day is not statistically significant
# Variance Inflation Factor (VIF)
# Computed with car::vif() on the fitted `model`, then printed.
vif_values <- car::vif(model)
print(vif_values)

##  Solar.R     Wind     Temp
## 1.095253 1.329070 1.431367
vif 為測試變數共線性問題的指標,所有值皆 <5 ,表示沒有共線性問題

You might also like