You are on page 1of 20

Spatial Modeling

Calling libraries:
library(spdep)
library(spatialreg)
library(tmap)
library(sf)
library(rgdal)
library(car)

# Print numbers with 4 significant digits, penalise scientific notation
# (scipen = 7) and use "R> " as the console prompt.
options(
  prompt = "R> ",
  digits = 4,
  scipen = 7
)

Merging all three data files on their common field (LSOA).


library(dplyr)

# The three input tables are picked interactively, in this order:
# crimes, LSOA profile, deprivation indices.
crimes <- read.csv(file.choose())
lsoa <- read.csv(file.choose())
# "UTF-8-BOM" strips the byte-order mark at the start of this file; without it
# the first column name is read as the mangled "ï..LSOA" instead of "LSOA".
dep <- read.csv(file.choose(), fileEncoding = "UTF-8-BOM")

# No `by` is given here, so dplyr joins on every column name the two tables
# share (NOTE(review): presumably just LSOA - confirm against the join message).
london <- left_join(crimes, lsoa)
# Every row of the crime table is kept; deprivation columns are added by LSOA.
london <- left_join(london, dep, by = "LSOA")

All three data files have now been merged into one.


# NOTE(review): readOGR() comes from rgdal, which has been retired from CRAN -
# consider sf::st_read() when this script is next revised.
shape<-readOGR(dsn=file.choose()) # Reading the shapefile of London (picked interactively)

## OGR data source with driver: ESRI Shapefile


## Source: "C:\Users\j\Desktop\Course Work\LSOA\LSOA.shp", layer: "LSOA"
## with 4835 features
## It has 1 fields

# One coordinate pair per polygon, in the row order of the shapefile.
coords <- coordinates(shape)
colnames(coords) <- c("long", "lat")

# Pair every data row with its own polygon through the LSOA code instead of
# relying on the CSV rows and the shapefile features being in the same order.
shape_row <- match(london$LSOA, shape$LSOA)
# Fail loudly if a data row has no polygon rather than attaching NA coordinates.
stopifnot(!anyNA(shape_row))

london <- cbind(coords[shape_row, , drop = FALSE], london)
long <- london[, 1]
lat <- london[, 2]

#Joining the coordinates of shape file with the data

Now we have the coordinates of each LSOA in the data file.


Saving the merged data file:
# row.names = FALSE keeps the automatic row index out of the file, so reading
# it back does not produce an extra unnamed first column.
write.csv(london, file = "london.csv", row.names = FALSE)

The resulting file looks like this.


# Reload the merged table from a file picked interactively and list its columns.
london <- read.csv(file = file.choose())
names(london)

## [1] "LSOA" "NAME"


## [3] "long" "lat"
## [5] "AVERAGE_CRIME" "POPULATION"
## [7] "RATE_per_1000" "Pop_Estimate_2013"
## [9] "Age_0_15" "Age_16_23"
## [11] "Age_30_44" "Age_45_64"
## [13] "Age_65_Plus" "Working_Age"
## [15] "Households_with_people_2011" "Dependent_Children"
## [17] "No_Dependent_Children" "Lone_Parent"
## [19] "One_Person" "Multi_Person"
## [21] "White" "Mixed"
## [23] "Asian" "Black"
## [25] "Other" "BAME"
## [27] "UK_Born" "Not_UK_Born"
## [29] "HH_English_Speaking" "HH_No_English_Speaking"
## [31] "Christian" "Buddhist"
## [33] "Hindu" "Jewish"
## [35] "Muslim" "Sikh"
## [37] "Other_Religion" "No_Religion"
## [39] "Religion_not_stated" "Owned_outright"
## [41] "Owned_Mortgage" "Social_Rented"
## [43] "Private_Rented" "All_Households_2011"
## [45] "Detached_2011" "Semi_2011"
## [47] "Terrace_2011" "Flat_2011"
## [49] "POP_DEN" "AV_HH_SIZE"
## [51] "Median_House_Price_2014" "MEAN_INCOME_2013"
## [53] "MEDIAN_INCOME_2013" "IMD_SCORE"
## [55] "IMD_RANK" "IMD_DECILE"
## [57] "INCOME_SCORE" "INCOME_RANK"
## [59] "INCOME_DECILE" "EMPLOYMENT_SCORE"
## [61] "EMPLOYMENT_RANK" "EMPLOYMENT_DECILE"
## [63] "EDUCATION_SCORE" "EDUCATION_RANK"
## [65] "EDUCATION_DECILE" "HEALTH_SCORE"
## [67] "HEALTH_RANK" "HEALTH_DECILE."
## [69] "CRIME_SCORE" "CRIME_RANK"
## [71] "CRIME_DECILE" "BARRIERS_SCORE"
## [73] "BARRIERS_RANK" "BARRIERS_DECILE"
## [75] "ENVIRONMENT_SCORE" "ENVIRONMENT_RANK"
## [77] "ENVIRONMENT_DECILE" "IDACI_SCORE"
## [79] "IDACI_RANK" "IDACI_DECILE"

Creating a shapefile which contains all the variables for making maps.
Saving the shape file:
# Attach the attribute table to the polygons with the merge() method for
# Spatial*DataFrame objects. Merging into shape2@data with base merge() leaves
# the row order unspecified, so attributes could end up on the wrong polygons.
shape2 <- merge(shape, london, by = "LSOA", all.x = TRUE)

# Write the enriched layer to ./londonshape.shp, replacing any earlier copy.
writeOGR(
  shape2,
  dsn = ".",
  layer = "londonshape",
  driver = "ESRI Shapefile",
  overwrite_layer = TRUE
)

Our dependent variable for modeling is AVERAGE_CRIME.

Map of the London with Average Crime in each LSOA:


library(RColorBrewer)
# Seven-colour "BrBG" palette; it was previously created but never handed to
# the plot, so the map fell back to the default colours.
my.palette <- brewer.pal(n = 7, name = "BrBG")
# cuts is set to one less than the number of colours so that each class of
# AVERAGE_CRIME gets its own palette entry; polygon borders are not drawn.
spplot(
  shape2, "AVERAGE_CRIME",
  main = "AVERAGE CRIMES",
  col.regions = my.palette,
  cuts = length(my.palette) - 1,
  col = "transparent"
)

Shows the number of crimes reported in each LSOA.


Bar Graph:
library(mosaic)
library(tidyverse)
# Bar chart counting how many LSOAs share each AVERAGE_CRIME value.
gf_bar(~ AVERAGE_CRIME, data = london)

We can see from this bar graph and scatter point graph that the bulk of LSOAs have reported crimes in
the 20-100 range and a relatively small number above 250.
Testing for spatial dependence: building the spatial weights matrix.
# Spatial weights from each LSOA's three nearest neighbours, measured between
# polygon centroids. nb2listw() is left at its default weighting style.
sf <- st_as_sf(shape)
cent <- st_centroid(st_geometry(sf))
knn <- knearneigh(cent, k = 3)
nn <- knn2nb(knn)
listw1 <- nb2listw(nn)

Moran test for spatial autocorrelation:


# Global Moran's I of the raw response under the k = 3 neighbour weights.
moran.test(x = london$AVERAGE_CRIME, listw = listw1)

##
## Moran I test under randomisation
##
## data: london$AVERAGE_CRIME
## weights: listw1
##
## Moran I statistic standard deviate = 29, p-value <2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.3131067 -0.0002069 0.0001205
# Moran scatterplot: each LSOA's value against the average of its neighbours.
moran.plot(x = london$AVERAGE_CRIME, listw = listw1)

The positive sign of Moran’s I (0.313) suggests that there is positive spatial autocorrelation. The test
statistic is 29 and the p-value is below 0.05, so we can reject the null hypothesis of no spatial autocorrelation.

Variogram:
library(gstat)
# Binned (cloud = FALSE) empirical variogram of AVERAGE_CRIME using the polygon
# coordinates; FALSE is spelled out because the shorthand F can be reassigned.
plot(
  variogram(
    shape2$AVERAGE_CRIME ~ 1,
    locations = coordinates(shape2),
    data = shape2,
    cloud = FALSE
  ),
  type = "b", pch = 16, main = "Variogram of Average Crimes"
)
# Regress the spatially lagged response on the response itself; the slope of
# this line is the same quantity as the Moran's I statistic reported above.
summary(lm(lag.listw(listw1, london$AVERAGE_CRIME) ~ london$AVERAGE_CRIME))

##
## Call:
## lm(formula = lag.listw(listw1, london$AVERAGE_CRIME) ~ london$AVERAGE_CRIME)
##
## Residuals:
## Min 1Q Median 3Q Max
## -133.7 -12.8 -3.8 7.8 534.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.00094 0.51228 56.6 <2e-16 ***
## london$AVERAGE_CRIME 0.31311 0.00822 38.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.8 on 4833 degrees of freedom
## Multiple R-squared: 0.231, Adjusted R-squared: 0.231
## F-statistic: 1.45e+03 on 1 and 4833 DF, p-value: <2e-16

# Scatter of the lagged response against the response, with points ("p") and a
# fitted regression line ("r").
xyplot(
  lag.listw(listw1, london$AVERAGE_CRIME) ~ london$AVERAGE_CRIME,
  type = c("r", "p")
)

There is statistically significant positive spatial autocorrelation among the values of the dependent variable.
The variogram and Moran’s I both indicate that there is spatial autocorrelation in Average Crime, but this
spatial pattern could be caused by the spatial patterning of the covariates of Average Crime. A full analysis
involves fitting an OLS model and comparing that model to spatial models.
Selection of significant independent variables: with the top-down method it is easy to eliminate
insignificant variables.
# Full "top-down" starting model: AVERAGE_CRIME on all 75 candidate predictors.
# The formula is laid out one group per line so that no variable name is split
# across a line break, and EDUCATION_SCORE is listed once only (a repeated term
# is dropped by lm() and does not change the fit).
# NOTE(review): RATE_per_1000 and POPULATION look like the ingredients of
# AVERAGE_CRIME itself - confirm they are legitimate predictors.
mod1 <- lm(
  AVERAGE_CRIME ~ POPULATION + RATE_per_1000 + Pop_Estimate_2013 +
    Age_0_15 + Age_16_23 + Age_30_44 + Age_45_64 + Age_65_Plus + Working_Age +
    Households_with_people_2011 + Dependent_Children + No_Dependent_Children +
    Lone_Parent + One_Person + Multi_Person +
    White + Mixed + Asian + Black + Other + BAME +
    UK_Born + Not_UK_Born + HH_English_Speaking + HH_No_English_Speaking +
    Christian + Buddhist + Hindu + Jewish + Muslim + Sikh +
    Other_Religion + No_Religion + Religion_not_stated +
    Owned_outright + Owned_Mortgage + Social_Rented + Private_Rented +
    All_Households_2011 + Detached_2011 + Semi_2011 + Terrace_2011 + Flat_2011 +
    POP_DEN + AV_HH_SIZE +
    Median_House_Price_2014 + MEAN_INCOME_2013 + MEDIAN_INCOME_2013 +
    IMD_SCORE + IMD_RANK + IMD_DECILE +
    INCOME_SCORE + INCOME_RANK + INCOME_DECILE +
    EMPLOYMENT_SCORE + EMPLOYMENT_RANK + EMPLOYMENT_DECILE +
    EDUCATION_SCORE + EDUCATION_RANK + EDUCATION_DECILE +
    HEALTH_SCORE + HEALTH_RANK + HEALTH_DECILE. +
    CRIME_SCORE + CRIME_RANK + CRIME_DECILE +
    BARRIERS_SCORE + BARRIERS_RANK + BARRIERS_DECILE +
    ENVIRONMENT_SCORE + ENVIRONMENT_RANK + ENVIRONMENT_DECILE +
    IDACI_SCORE + IDACI_RANK + IDACI_DECILE,
  data = london
)

summary(mod1)

##
## Call:
## lm(formula = AVERAGE_CRIME ~ POPULATION + RATE_per_1000 + Pop_Estimate_2013 +
## Age_0_15 + Age_16_23 + Age_30_44 + Age_45_64 + Age_65_Plus +
## Working_Age + Households_with_people_2011 + Dependent_Children +
## No_Dependent_Children + Lone_Parent + One_Person + Multi_Person +
## White + Mixed + Asian + Black + Other + BAME + UK_Born +
## Not_UK_Born + HH_English_Speaking + HH_No_English_Speaking +
## Christian + Buddhist + Hindu + Jewish + Muslim + Sikh + Other_Religion +
## No_Religion + Religion_not_stated + Owned_outright + Owned_Mortgage +
## Social_Rented + Private_Rented + All_Households_2011 + Detached_2011 +
## Semi_2011 + Terrace_2011 + Flat_2011 + POP_DEN + AV_HH_SIZE +
## Median_House_Price_2014 + MEAN_INCOME_2013 + MEDIAN_INCOME_2013 +
## IMD_SCORE + IMD_RANK + IMD_DECILE + INCOME_SCORE + INCOME_RANK +
## INCOME_DECILE + EMPLOYMENT_SCORE + EMPLOYMENT_RANK + EMPLOYMENT_DECILE +
## EDUCATION_SCORE + EDUCATION_RANK + EDUCATION_SCORE + EDUCATION_DECILE +
## HEALTH_SCORE + HEALTH_RANK + HEALTH_DECILE. + CRIME_SCORE +
## CRIME_RANK + CRIME_DECILE + BARRIERS_SCORE + BARRIERS_RANK +
## BARRIERS_DECILE + ENVIRONMENT_SCORE + ENVIRONMENT_RANK +
## ENVIRONMENT_DECILE + IDACI_SCORE + IDACI_RANK + IDACI_DECILE,
## data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -184.57 -2.37 -0.14 2.26 189.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4511.220172281 7399.386167000 0.61 0.54211
## POPULATION 0.033915949 0.000576861 58.79 < 2e-16 ***
## RATE_per_1000 1.864238083 0.008158732 228.50 < 2e-16 ***
## Pop_Estimate_2013 -0.010208808 0.001297960 -7.87 4.5e-15 ***
## Age_0_15 -47.913150579 27.872821582 -1.72 0.08568 .
## Age_16_23 -44.306225896 24.204857721 -1.83 0.06724 .
## Age_30_44 -44.443065557 24.204583552 -1.84 0.06640 .
## Age_45_64 -44.305839134 24.206932705 -1.83 0.06727 .
## Age_65_Plus -47.924740235 27.870179262 -1.72 0.08558 .
## Working_Age -3.536066383 31.075196355 -0.11 0.90941
## Households_with_people_2011 0.021789687 0.007042933 3.09 0.00199 **
## Dependent_Children -0.156969642 2.028067748 -0.08 0.93831
## No_Dependent_Children -0.353119728 2.027754252 -0.17 0.86176
## Lone_Parent -0.446119653 2.028430522 -0.22 0.82593
## One_Person -0.561028787 2.028271659 -0.28 0.78210
## Multi_Person -0.389627452 2.027523809 -0.19 0.84762
## White 14.280489894 37.955215645 0.38 0.70675
## Mixed 5.849352531 2.097594005 2.79 0.00531 **
## Asian 5.643046463 2.097486633 2.69 0.00716 **
## Black 5.695237612 2.098070319 2.71 0.00666 **
## Other 5.431226495 2.098257925 2.59 0.00967 **
## BAME 8.572004074 37.971285772 0.23 0.82141
## UK_Born -3.544207029 38.002329106 -0.09 0.92570
## Not_UK_Born -3.554422648 38.003785338 -0.09 0.92549
## HH_English_Speaking -7.869273955 41.671014260 -0.19 0.85022
## HH_No_English_Speaking -7.808891265 41.670211369 -0.19 0.85136
## Christian -0.310091314 1.546685431 -0.20 0.84111
## Buddhist -0.247618027 1.563007666 -0.16 0.87413
## Hindu -0.205712693 1.547810119 -0.13 0.89427
## Jewish -0.372788170 1.547713835 -0.24 0.80967
## Muslim -0.285231613 1.547689533 -0.18 0.85379
## Sikh -0.286677913 1.547304395 -0.19 0.85302
## Other_Religion -0.348303895 1.560857279 -0.22 0.82343
## No_Religion -0.402366872 1.548200333 -0.26 0.79496
## Religion_not_stated -0.334402909 1.546931315 -0.22 0.82886
## Owned_outright -0.077741152 0.085601409 -0.91 0.36383
## Owned_Mortgage -0.203581475 0.078287706 -2.60 0.00934 **
## Social_Rented -0.138014257 0.076719255 -1.80 0.07209 .
## Private_Rented -0.148662809 0.076522216 -1.94 0.05211 .
## All_Households_2011 -0.004927303 0.006056727 -0.81 0.41596
## Detached_2011 0.231040188 0.238068964 0.97 0.33186
## Semi_2011 0.218695638 0.236589680 0.92 0.35534
## Terrace_2011 0.214988717 0.236856186 0.91 0.36410
## Flat_2011 0.236631284 0.236881672 1.00 0.31787
## POP_DEN 0.003892026 0.003466086 1.12 0.26154
## AV_HH_SIZE -1.713605572 1.758465005 -0.97 0.32986
## Median_House_Price_2014 0.000001286 0.000000897 1.43 0.15168
## MEAN_INCOME_2013 -0.000031059 0.000036932 -0.84 0.40041
## MEDIAN_INCOME_2013 -0.000109121 0.000088661 -1.23 0.21847
## IMD_SCORE -0.389172688 0.097221672 -4.00 6.4e-05 ***
## IMD_RANK 0.000271692 0.000247207 1.10 0.27180
## IMD_DECILE -0.032017508 0.462882984 -0.07 0.94486
## INCOME_SCORE 7.437382448 4.556747179 1.63 0.10271
## INCOME_RANK 0.000318501 0.000182071 1.75 0.08030 .
## INCOME_DECILE -0.407728262 0.459689415 -0.89 0.37514
## EMPLOYMENT_SCORE 4.529250798 4.593043345 0.99 0.32413
## EMPLOYMENT_RANK -0.000338206 0.000170755 -1.98 0.04769 *
## EMPLOYMENT_DECILE -0.002504766 0.468437639 -0.01 0.99573
## EDUCATION_SCORE 0.120246302 0.075613512 1.59 0.11184
## EDUCATION_RANK 0.000247636 0.000172663 1.43 0.15158
## EDUCATION_DECILE -0.537425852 0.467178573 -1.15 0.25005
## HEALTH_SCORE 0.357880204 1.038157059 0.34 0.73032
## HEALTH_RANK -0.000069876 0.000184152 -0.38 0.70437
## HEALTH_DECILE. -0.409960258 0.476972103 -0.86 0.39010
## CRIME_SCORE 7.768756057 1.476918325 5.26 1.5e-07 ***
## CRIME_RANK 0.000632616 0.000187721 3.37 0.00076 ***
## CRIME_DECILE -0.826026637 0.464041139 -1.78 0.07513 .
## BARRIERS_SCORE 0.184794568 0.055698505 3.32 0.00091 ***
## BARRIERS_RANK 0.000317953 0.000176435 1.80 0.07159 .
## BARRIERS_DECILE -0.719649017 0.487549328 -1.48 0.14000
## ENVIRONMENT_SCORE 0.094131293 0.052478871 1.79 0.07292 .
## ENVIRONMENT_RANK -0.000031184 0.000174796 -0.18 0.85842
## ENVIRONMENT_DECILE 0.164048318 0.468227192 0.35 0.72608
## IDACI_SCORE 6.344043476 4.032765779 1.57 0.11576
## IDACI_RANK 0.000044053 0.000160615 0.27 0.78388
## IDACI_DECILE 0.179767906 0.467162642 0.38 0.70040
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.24 on 4757 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.956, Adjusted R-squared: 0.955
## F-statistic: 1.36e+03 on 75 and 4757 DF, p-value: <2e-16

The residuals are roughly symmetric and R-squared is high (0.956), but many variables are not
significant and should be dropped from the model.
Fitting the second model:
# Second model: a reduced set of 17 predictors carried over from mod1. The
# formula is re-wrapped so that no variable name is split across a line break.
mod2 <- lm(
  AVERAGE_CRIME ~ POPULATION + RATE_per_1000 + Pop_Estimate_2013 +
    POP_DEN + IMD_SCORE + Mixed + BARRIERS_RANK + Private_Rented +
    Other + Black + Asian + Owned_Mortgage + CRIME_SCORE + BARRIERS_SCORE +
    EMPLOYMENT_RANK + CRIME_RANK + Households_with_people_2011,
  data = london
)

summary(mod2)

##
## Call:
## lm(formula = AVERAGE_CRIME ~ POPULATION + RATE_per_1000 + Pop_Estimate_2013 +
## POP_DEN + IMD_SCORE + Mixed + BARRIERS_RANK + Private_Rented +
## Other + Black + Asian + Owned_Mortgage + CRIME_SCORE + BARRIERS_SCORE +
## EMPLOYMENT_RANK + CRIME_RANK + Households_with_people_2011,
## data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -187.65 -2.22 -0.18 2.03 192.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -61.16634891 3.05096653 -20.05 < 2e-16 ***
## POPULATION 0.03309899 0.00055049 60.13 < 2e-16 ***
## RATE_per_1000 1.86543606 0.00771709 241.73 < 2e-16 ***
## Pop_Estimate_2013 -0.00588259 0.00097299 -6.05 0.0000000016 ***
## POP_DEN -0.00038154 0.00301496 -0.13 0.89930
## IMD_SCORE -0.18568610 0.05027886 -3.69 0.00022 ***
## Mixed 0.00140521 0.10114837 0.01 0.98892
## BARRIERS_RANK 0.00017166 0.00006202 2.77 0.00566 **
## Private_Rented -0.08204661 0.01622919 -5.06 0.0000004452 ***
## Other -0.10107796 0.05612466 -1.80 0.07177 .
## Black 0.01353304 0.02068812 0.65 0.51305
## Asian 0.02368851 0.01227511 1.93 0.05369 .
## Owned_Mortgage 0.00972244 0.02050129 0.47 0.63535
## CRIME_SCORE 7.16089657 1.42622834 5.02 0.0000005330 ***
## BARRIERS_SCORE 0.14441299 0.04374314 3.30 0.00097 ***
## EMPLOYMENT_RANK -0.00000767 0.00004622 -0.17 0.86826
## CRIME_RANK 0.00041223 0.00011030 3.74 0.00019 ***
## Households_with_people_2011 0.00542901 0.00190491 2.85 0.00439 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.35 on 4817 degrees of freedom
## Multiple R-squared: 0.954, Adjusted R-squared: 0.954
## F-statistic: 5.86e+03 on 17 and 4817 DF, p-value: <2e-16

The residuals are roughly symmetric and R-squared is almost unchanged (0.954 versus 0.956), which is good:
about 95% of the variation in Average Crime is accounted for by the variables in the second model. Holding
the other predictors fixed, a one-unit increase in Crime Score is associated with an increase of about 7 units
in Average Crime, which is understandable. Similarly, a one-unit increase in Population is associated with an
increase of 0.033 units in Average Crime, i.e. about 3.3 more crimes for 100 more people in an area and about
16.5 for 500 more. The p-value of the overall F-test is very small, so we reject the null hypothesis that all
coefficients are zero. The model using these variables as the independent variables predicts Average Crime
quite well, but we should try dropping the insignificant variables for the next model.
The Variance Inflation Factor (vif):
# Variance inflation factor of every predictor in the second model.
car::vif(mod2)

## POPULATION RATE_per_1000
## 3.373 1.416
## Pop_Estimate_2013 POP_DEN
## 4.852 1.883
## IMD_SCORE Mixed
## 16.609 2.135
## BARRIERS_RANK Private_Rented
## 8.589 2.394
## Other Black
## 1.383 2.959
## Asian Owned_Mortgage
## 2.171 3.197
## CRIME_SCORE BARRIERS_SCORE
## 36.761 10.040
## EMPLOYMENT_RANK CRIME_RANK
## 8.553 36.057
## Households_with_people_2011
## 3.579

Several variables show strong multicollinearity (VIF above 10 for IMD_SCORE, CRIME_SCORE, BARRIERS_SCORE
and CRIME_RANK). This confirms our decision to drop some of these variables.
Fitted values of the dependent variable from the second model:
# Fitted AVERAGE_CRIME values from the second model; show the first ten.
fitd2 <- fitted(mod2)
head(fitd2, n = 10)

## 1 2 3 4 5 6 7 8 9 10
## 267.17 270.35 253.73 236.32 38.93 117.56 56.42 83.96 199.97 43.24
Diagnostic Plots:
# Draw the lm diagnostic plots for mod2 on a 2 x 2 grid, then put the previous
# layout back so that later plots are not squeezed into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(mod2)
par(old_par)

There are some outliers affecting the model, but we might instead consider dropping more insignificant
variables for the third model.

The Breusch-Pagan test for the second model:


library(lmtest)
# Studentized Breusch-Pagan test for heteroskedasticity in the second model.
lmtest::bptest(mod2)

##
## studentized Breusch-Pagan test
##
## data: mod2
## BP = 1161, df = 17, p-value <2e-16

The p-value is below 0.05, so we reject the null hypothesis of homoskedasticity: the residuals are heteroskedastic.
Fitting the third model with only significant variables:
# Third model: only the five predictors kept after pruning mod2. The call is
# re-wrapped so that the `data` argument is no longer split across two lines.
mod3 <- lm(
  AVERAGE_CRIME ~ POPULATION + RATE_per_1000 + IMD_SCORE +
    Private_Rented + CRIME_RANK,
  data = london
)
summary(mod3)

##
## Call:
## lm(formula = AVERAGE_CRIME ~ POPULATION + RATE_per_1000 + IMD_SCORE +
## Private_Rented + CRIME_RANK, data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -189.07 -2.31 -0.26 2.04 190.62
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -49.0863554 0.8538049 -57.49 < 2e-16 ***
## POPULATION 0.0308280 0.0003190 96.63 < 2e-16 ***
## RATE_per_1000 1.8721676 0.0072969 256.57 < 2e-16 ***
## IMD_SCORE -0.1671292 0.0168293 -9.93 < 2e-16 ***
## Private_Rented -0.0759282 0.0114841 -6.61 0.000000000042 ***
## CRIME_RANK -0.0001250 0.0000244 -5.13 0.000000303265 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.42 on 4829 degrees of freedom
## Multiple R-squared: 0.953, Adjusted R-squared: 0.953
## F-statistic: 1.96e+04 on 5 and 4829 DF, p-value: <2e-16

Now every variable is significant and we can do further testing. The linear model equation is: Average
Crime = -49.0863554 + 0.0308280(Population) + 1.8721676(Rate per 1000) - 0.1671292(IMD Score) -
0.0759282(Private Rented) - 0.0001250(Crime Rank)
Partial regression Plots of third model:
# Added-variable (partial regression) plot for every predictor in mod3.
car::avPlots(mod3)
Comparing second and third model:
# F-test of the nested pair: the 17-predictor mod2 against the 5-predictor mod3.
stats::anova(mod2, mod3)

## Analysis of Variance Table


##
## Model 1: AVERAGE_CRIME ~ POPULATION + RATE_per_1000 + Pop_Estimate_2013 +
## POP_DEN + IMD_SCORE + Mixed + BARRIERS_RANK + Private_Rented +
## Other + Black + Asian + Owned_Mortgage + CRIME_SCORE + BARRIERS_SCORE +
## EMPLOYMENT_RANK + CRIME_RANK + Households_with_people_2011
## Model 2: AVERAGE_CRIME ~ POPULATION + RATE_per_1000 + IMD_SCORE + Private_Rented +
## CRIME_RANK
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4817 421440
## 2 4829 428587 -12 -7146 6.81 2.6e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

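The F-test is significant (F = 6.81, p = 2.6e-12), so the twelve dropped terms do jointly explain some extra variation; however, the loss of fit is negligible (R-squared 0.953 versus 0.954) and the third model is far simpler, so we select the third model.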
The Variance Inflation Factor (vif) of the variables in third model:
# Variance inflation factor of every predictor in the third model.
car::vif(mod3)

## POPULATION RATE_per_1000 IMD_SCORE Private_Rented CRIME_RANK


## 1.117 1.248 1.834 1.182 1.734

All VIF values in the 3rd model are below 2, so multicollinearity is no longer a concern. The comparison
between the 2nd and 3rd models shows a statistically significant difference, but the 3rd is preferred for its
simplicity. We will settle on the final model after comparing mod3 with an even simpler model.
Fitted values of third model:
# Fitted AVERAGE_CRIME values from the third model; show the first twenty.
Fitd3 <- fitted(mod3)
head(Fitd3, n = 20)

## 1 2 3 4 5 6 7 8 9 10 11
## 272.49 277.72 252.21 236.56 36.80 114.09 56.06 82.32 200.40 41.79 56.71
## 12 13 14 15 16 17 18 19 20
## 77.22 52.38 48.12 31.78 35.17 56.11 19.83 40.73 51.38

Fitting a simple model:


# Simpler candidate: mod3 without Private_Rented and CRIME_RANK, i.e. only the
# three most significant predictors.
mod4 <- lm(
  AVERAGE_CRIME ~ POPULATION + RATE_per_1000 + IMD_SCORE,
  data = london
)
summary(mod4)

##
## Call:
## lm(formula = AVERAGE_CRIME ~ POPULATION + RATE_per_1000 + IMD_SCORE,
## data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -189.09 -2.24 -0.21 2.12 190.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -52.772844 0.608323 -86.75 < 2e-16 ***
## POPULATION 0.030307 0.000308 98.37 < 2e-16 ***
## RATE_per_1000 1.866238 0.007147 261.11 < 2e-16 ***
## IMD_SCORE -0.105617 0.013571 -7.78 8.6e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.48 on 4831 degrees of freedom
## Multiple R-squared: 0.952, Adjusted R-squared: 0.952
## F-statistic: 3.23e+04 on 3 and 4831 DF, p-value: <2e-16

R-squared is essentially unchanged (0.952 versus 0.953 for the third model).


Comparison:
# F-test of the nested pair: the 5-predictor mod3 against the 3-predictor mod4.
stats::anova(mod3, mod4)

## Analysis of Variance Table


##
## Model 1: AVERAGE_CRIME ~ POPULATION + RATE_per_1000 + IMD_SCORE + Private_Rented +
## CRIME_RANK
## Model 2: AVERAGE_CRIME ~ POPULATION + RATE_per_1000 + IMD_SCORE
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4829 428587
## 2 4831 433934 -2 -5347 30.1 1e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

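The F-test is significant (p = 1e-13), so dropping Private_Rented and CRIME_RANK significantly worsens the fit.
This enables us to select the 3rd model as our final regression model. We will use the third model for
further testing towards the final spatial model.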
Diagnostic Plots (mod3):
# Draw the lm diagnostic plots for mod3 on a 2 x 2 grid, then put the previous
# layout back so that later plots are not squeezed into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(mod3)
par(old_par)

The diagnostic plots are almost the same as those of model 2, and the outliers (observations 2903, 4641 and
4783) might be the problem.
Breusch-Pagan test (mod3):
# Studentized Breusch-Pagan test for heteroskedasticity in the third model.
lmtest::bptest(mod3)

##
## studentized Breusch-Pagan test
##
## data: mod3
## BP = 1158, df = 5, p-value <2e-16

The null hypothesis of homoskedasticity is rejected for this model too. We do not remove the large outliers,
as our R-squared value is already good, and we continue to work with mod3.

Residuals plot:
library(sf)
library(tmap)
# Point layer built from the long/lat columns. The CRS is given as the EPSG
# code 27700 because the "+init=epsg:..." proj4 form is deprecated.
# NOTE(review): the old string also carried a +towgs84 datum shift; that only
# matters if the layer is later re-projected - confirm it is not needed.
london_sf <- st_as_sf(london, coords = c("long", "lat"), crs = 27700)
# One residual per LSOA from the third OLS model, mapped as solid points.
london_sf$Residuals_mod3 <- residuals(mod3)
plot(london_sf["Residuals_mod3"], pch = 20)
Moran Test for spatial dependence in third model:
# Two-sided Moran's I test on the residuals of the third OLS model.
moran.lm <- lm.morantest(
  model = mod3,
  listw = listw1,
  alternative = "two.sided"
)
print(moran.lm)

##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = AVERAGE_CRIME ~ POPULATION + RATE_per_1000 +
## IMD_SCORE + Private_Rented + CRIME_RANK, data = london)
## weights: listw1
##
## Moran I statistic standard deviate = 3.5, p-value = 0.0004
## alternative hypothesis: two.sided
## sample estimates:
## Observed Moran I Expectation Variance
## 0.0385559 -0.0006093 0.0001228

The test statistic is 3.5 and the p-value (0.0004) is below 0.05, so we can reject the null hypothesis of no
spatial autocorrelation in the residuals of the third model.

Lagrange Multiplier test: A nice feature of Moran’s I test is that it has high power against a wide range
of alternatives. However, it does not guide us in the selection of alternative models. The Lagrange
Multiplier tests, on the other hand, specify the alternative hypothesis, which helps us with that task. The LM
tests for spatial dependence are provided by the lm.LMtests function and include as alternatives the presence
of a spatial lag and the presence of a spatial lag in the error term. Both tests, as well as their robust forms,
are included in the lm.LMtests function. To run all of them we use the option test = "all".
# All Lagrange multiplier diagnostics (error, lag, their robust forms, SARMA)
# for the third OLS model.
lmLMtests <- lm.LMtests(model = mod3, listw = listw1, test = "all")
print(lmLMtests)

##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = AVERAGE_CRIME ~ POPULATION + RATE_per_1000 +
## IMD_SCORE + Private_Rented + CRIME_RANK, data = london)
## weights: listw1
##
## LMerr = 12, df = 1, p-value = 0.0005
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = AVERAGE_CRIME ~ POPULATION + RATE_per_1000 +
## IMD_SCORE + Private_Rented + CRIME_RANK, data = london)
## weights: listw1
##
## LMlag = 31, df = 1, p-value = 0.00000003
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = AVERAGE_CRIME ~ POPULATION + RATE_per_1000 +
## IMD_SCORE + Private_Rented + CRIME_RANK, data = london)
## weights: listw1
##
## RLMerr = 29, df = 1, p-value = 0.00000006
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = AVERAGE_CRIME ~ POPULATION + RATE_per_1000 +
## IMD_SCORE + Private_Rented + CRIME_RANK, data = london)
## weights: listw1
##
## RLMlag = 48, df = 1, p-value = 4e-12
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = AVERAGE_CRIME ~ POPULATION + RATE_per_1000 +
## IMD_SCORE + Private_Rented + CRIME_RANK, data = london)
## weights: listw1
##
## SARMA = 60, df = 2, p-value = 9e-14

This test enables us to determine which spatial model is suitable. The answer is the spatial lag model, as
RLMlag is more significant than RLMerr, but both are significant, so we will try both the Error and Lag models.

Fitting the Spatial Error Model: Spatial Error Models (SEM) are another way to model spatial autocorrelation in
a regression model, by specifying the autoregressive process in the error term: y = Xβ + ϵ with ϵ = λWϵ + u. If
this is the “true” form of spatial dependence, OLS estimates will be unbiased but inefficient.
# Spatial error model with the same specification as mod3. `data` is passed
# explicitly, as in the spatial lag fit, so that the variables are taken from
# `london` rather than from wherever the formula happens to be evaluated.
spatial_error <- errorsarlm(mod3, data = london, listw = listw1)

summary(spatial_error)

## Call:errorsarlm(formula = mod3, listw = listw1)


##
## Residuals:
## Min 1Q Median 3Q Max
## -194.64941 -2.33108 -0.23104 2.05600 190.61924
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -49.123006221 0.867196283 -56.6458 < 2.2e-16
## POPULATION 0.030830646 0.000321287 95.9598 < 2.2e-16
## RATE_per_1000 1.875077488 0.007432619 252.2768 < 2.2e-16
## IMD_SCORE -0.166294558 0.017257146 -9.6363 < 2.2e-16
## Private_Rented -0.077868122 0.011930737 -6.5267 0.00000000006724
## CRIME_RANK -0.000125714 0.000024725 -5.0844 0.00000036880106
##
## Lambda: 0.07185, LR test value: 13.44, p-value: 0.00024624
## Asymptotic standard error: 0.01836
## z-value: 3.914, p-value: 0.000090793
## Wald statistic: 15.32, p-value: 0.000090793
##
## Log likelihood: -17695 for error model
## ML residual variance (sigma squared): 88.28, (sigma: 9.396)
## Number of observations: 4835
## Number of parameters estimated: 8
## AIC: 35407, (AIC for lm: 35418)

Lambda is positive (0.07) and significant, which shows that the unexplained variation in Average Crime is
spatially correlated. The error model equation for the dependent variable: Average Crime = -49.123006 +
0.03083065(Population) + 1.875077488(Rate per 1000) - 0.16629456(IMD Score) - 0.077868122(Private
Rented) - 0.000125714(Crime Rank)

Hausman Test(Spatial Error Model):


# Spatial Hausman test comparing the spatial error estimates with OLS.
spatialreg::Hausman.test(spatial_error)

##
## Spatial Hausman test (asymptotic)
##
## data: NULL
## Hausman test = 30, df = 6, p-value = 0.00004

From the spatial Hausman test, we can reject the null hypothesis that the spatial error model and the OLS
model give us the same estimates.

Spatial Lag model:


# Spatial lag model with the same specification as mod3 and the k = 3
# neighbour weights.
spatial_lag <- lagsarlm(mod3, data = london, listw = listw1)

summary(spatial_lag)

##
## Call:lagsarlm(formula = mod3, data = london, listw = listw1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -186.19149 -2.35244 -0.25572 2.13055 189.72679
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -48.691951741 0.853863552 -57.0254 < 2.2e-16
## POPULATION 0.031008906 0.000319179 97.1520 < 2.2e-16
## RATE_per_1000 1.888235669 0.007810951 241.7421 < 2.2e-16
## IMD_SCORE -0.157374516 0.016862462 -9.3328 < 2.2e-16
## Private_Rented -0.068048979 0.011525962 -5.9040 0.000000003548
## CRIME_RANK -0.000137363 0.000024382 -5.6339 0.000000017622
##
## Rho: -0.03123, LR test value: 30.92, p-value: 0.000000026894
## Asymptotic standard error: 0.005622
## z-value: -5.555, p-value: 0.000000027741
## Wald statistic: 30.86, p-value: 0.000000027741
##
## Log likelihood: -17687 for lag model
## ML residual variance (sigma squared): 88.06, (sigma: 9.384)
## Number of observations: 4835
## Number of parameters estimated: 8
## AIC: 35389, (AIC for lm: 35418)
## LM test for residual autocorrelation
## test value: 29.4, p-value: 0.000000058888

From the results of both the Error and Lag models we can see that the AIC value is reduced more by the Lag
model (35389) than by the Error model (35407). The results of the model indicate that each of the predictors
and the spatial lag are significant. We can evaluate the significance of the spatial lag in a variety of ways.
The output includes a z-test (“z-value” in the output) based on the asymptotic standard error of Rho:
-0.03123/0.005622 = -5.555. It also includes a Likelihood Ratio Test (“LR test”), which is a test of the model
with and without the spatial lag/error. The reported “LR test” suggests that the addition of the lag/error is
an improvement in the model.
Finally, if we look at the log likelihood for the Lag model (-17687) and the Error model (-17695), we see that
the Lag model achieves the higher (better) value; it was also the model favored by the LM tests. The residuals
plot presented above still shows some presence of spatial autocorrelation, as does the LM test for residual
autocorrelation reported with the lag model. The lag model equation for the dependent variable, where
W(Average Crime) is the spatially lagged dependent variable:
Average Crime = -0.03123·W(Average Crime) - 48.691951741 + 0.031008906(Population) + 1.888235669(Rate per 1000) -
0.157374516(IMD Score) - 0.068048979(Private Rented) - 0.000137363(Crime Rank)

Direct and indirect impacts of independent variables on the dependent variable:


# Direct, indirect and total impact of every predictor in the spatial lag model.
impacts_lag <- impacts(spatial_lag, listw = listw1)

print(impacts_lag)

## Impact measures (lag, exact):


## Direct Indirect Total
## POPULATION 0.0310168 -0.000946955 0.0300698
## RATE_per_1000 1.8887144 -0.057663255 1.8310512
## IMD_SCORE -0.1574144 0.004805929 -0.1526085
## Private_Rented -0.0680662 0.002078091 -0.0659881
## CRIME_RANK -0.0001374 0.000004195 -0.0001332

The direct impact refers to the average total impact of a change in an independent variable on the dependent
variable for each observation, i.e., $n^{-1}\sum_{i=1}^{n} \frac{\partial E(y_i)}{\partial X_i}$. The indirect
impact is the sum of the impact produced on one single observation by all other observations and the impact of
one observation on all the others. The total is the sum of the two. This means that a 1-unit change in
Population will increase Average Crime by 0.0300698 in total; similarly, 100 units will cause an increase of
3.00698. An increase in IMD Score will cause a decrease in Average Crime.
A comparison between the predicted values of both models and the actual values (first 20):
# In-sample predictions from the spatial error model (newdata = NULL), using
# the "TS" predictor type. The call is re-wrapped so that the `all.data`
# argument name is no longer split across two lines.
predict_error <- predict(
  spatial_error,
  newdata = NULL, listw = listw1, pred.type = "TS", all.data = FALSE,
  zero.policy = NULL, legacy = TRUE, legacy.mixed = FALSE,
  power = NULL, order = 250,
  tol = .Machine$double.eps^(3/5), spChk = NULL
)
head(predict_error, 20)

## 1 2 3 4 5 6 7 8 9 10 11
## 270.82 277.08 251.13 236.63 36.71 116.09 56.40 82.33 201.54 42.06 56.97
## 12 13 14 15 16 17 18 19 20
## 77.12 52.41 48.33 31.67 35.22 56.08 19.83 40.88 51.46

# In-sample predictions from the spatial lag model (newdata = NULL), using the
# "TS" predictor type. The call is re-wrapped so that the `all.data` argument
# name is no longer split across several lines.
predict_lag <- predict(
  spatial_lag,
  newdata = NULL, listw = listw1, pred.type = "TS", all.data = FALSE,
  zero.policy = NULL, legacy = TRUE, legacy.mixed = FALSE,
  power = NULL, order = 250,
  tol = .Machine$double.eps^(3/5), spChk = NULL
)
head(predict_lag, 20)

## 1 2 3 4 5 6 7 8 9 10 11
## 270.33 273.67 251.80 237.59 36.38 112.33 55.43 83.16 200.82 40.63 56.34
## 12 13 14 15 16 17 18 19 20
## 77.37 52.47 47.02 31.59 35.17 56.46 19.85 41.12 52.15

# Observed AVERAGE_CRIME for the same first twenty LSOAs, for comparison.
head(london$AVERAGE_CRIME, n = 20)

## [1] 224.0 224.0 224.0 224.0 33.5 126.5 55.5 83.5 281.5 43.0 55.0 76.5
## [13] 51.5 41.5 32.5 37.5 55.0 24.0 39.5 52.5

You might also like