library(ggplot2)
library(trelliscopejs)
library(tidyr)
library(purrr)
library(gapminder)
space_to_dash <- function(x) gsub(" ", "-", x)
library(plotly)
Now that we have a data frame nested by stock symbol, let's make a Trelliscope
display with a plot for each stock. In this exercise, we use just the first 10
stocks in the by_symbol dataset we created in the previous exercise and create an
"open-high-low-close" plot, similar to a candlestick plot. The by_symbol dataset
available in your session has already been subsetted to the first 10 stocks.
Question: Examine the by_symbol_plot output you just created to see what the plot
variables look like. The panel column contains a plotly object.
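A minimal sketch of how such a display could be built with map_plot() and plot_ly(), assuming the nested list-column is named data and holds date, open, high, low, and close columns (the column names are assumptions):
by_symbol_plot <- by_symbol %>%
  mutate(panel = map_plot(data, function(d) {
    # one OHLC panel per stock
    plot_ly(d, x = ~date, type = "ohlc",
            open = ~open, high = ~high, low = ~low, close = ~close)
  }))
by_symbol_plot %>%
  trelliscope(name = "ohlc_top_10", nrow = 2, ncol = 5)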
# Create market_cap_log
by_symbol <- mutate(by_symbol,
market_cap_log = cog(
val = log10(market_cap), desc = "log base 10 market capitalization"
)
)
# Compute by_symbol_avg
by_symbol_avg <- mutate(by_symbol,
stats = map(data, function(x) {
data_frame(
mean_close = mean(x$close),
mean_volume = mean(x$volume),
annual_return = annual_return(x)
)
}))
## call map not map_dbl
library(trelliscopejs)
pokemon %>%
  # Reduce the variables in the dataset
  select(pokemon, type_1, attack, generation_id, url_image) %>%
  # Respecify pokemon with a default label
  mutate(pokemon = cog(pokemon, default_label = TRUE),
         # Create the panel variable from the image URLs
         panel = img_panel(url_image)
  ) %>%
### trelliscope(pokemon, name = 'pokemon', nrow = 3, ncol = 6) ### error
trelliscope(name = "pokemon", nrow = 3, ncol = 6)
# Compute daily_may
daily_may <- bike %>%
filter(start_mon == 5 ) %>%
group_by(start_day, start_hod, membership) %>%
summarise(n = n())
# Plot the result
ggplot(daily_may, aes(start_hod, n, color = membership)) +
geom_point() +
facet_wrap( ~ start_day, ncol = 7)
### rbokeh
# Filter gapminder data by year 1982
dat_1982 <- gapminder %>% filter(year == 1982)
## plot
figure(dat_80_13) %>%
ly_points(x = log(budget_2013), y = log(roi_total))
# Create a bar plot for age group tb_2534 with % on the y-axis
bar_2534_percent <- figure(ylab = "share") %>%
  ly_bar(x = year, y = count, data = tb_2534, color = gender, hover = TRUE, position = "fill")
# View figure
bar_2534_percent
## define a function to create a bar plot with the number of tb cases over time
plot_bar <- function(x){
figure() %>% ly_bar( x = year, y = count, color = gender, position = 'dodge',
data = x, hover = T)
}
### PLOTLY
## add_histogram(nbinsx = 25)
## add_histogram(xbins = list(start = 0, end = 100, size = 10))
# Create a frequency table for Genre
genre_table <- vgsales %>%
count(Genre)
# Create a histogram of Critic_Score with navy bars that are 50% transparent
vgsales2016 %>%
plot_ly(x = ~Critic_Score) %>%
add_histogram(color = I('navy'), opacity = .5)
# Create a bar chart of Platform with hoverinfo only for the bar heights
vgsales2016 %>%
count(Platform) %>%
plot_ly(x = ~Platform, y = ~n, hoverinfo = 'y') %>%
add_bars()
# Polish the scatterplot by transforming the x-axis and labeling both axes
vgsales2016 %>%
plot_ly(x = ~Global_Sales, y = ~Critic_Score) %>%
add_markers(marker = list(opacity = 0.5)) %>%
layout(xaxis = list(title = "Global sales (millions of units)", type = "log"),
yaxis = list(title = "Critic score"))
# Set the background color to #ebebeb and remove the vertical grid
annual_vgsales %>%
plot_ly(x = ~Year, y = ~Global_Sales) %>%
add_lines() %>%
layout( xaxis = list(showgrid = F), paper_bgcolor = "#ebebeb")
vgsales %>%
  plot_ly(x = ~Critic_Score, y = ~User_Score) %>%
  add_histogram2dcontour()
###COMPARE
turnout %>%
plot_ly() %>%
add_markers(x = ~turnout2014, y = ~turnout2018 ) %>%
layout(xaxis = list(title = '2014 voter turnout'),
yaxis = list(title = '2018 voter turnout'))
# Create a choropleth map of the change in voter turnout from 2014 to 2018
turnout %>%
mutate(change = turnout2018 - turnout2014) %>%
plot_geo(locationmode = 'USA-states') %>%
add_trace(z = ~change, locations = ~state.abbr) %>%
layout(geo = list(scope = 'usa'))
###HIGHCHART
## highchart() vs hchart()
# Show the dates
index(xlk_prices)
# Use the base function and set the correct chart type
highchart(type = 'stock') %>%
# Add the price data
hc_add_series(xlk_prices)
# Create a line chart of the 'close' prices
hchart(xlk_prices_tibble, hcaes(x = date, y = close), type = "line")
# Create a line chart of the open prices
hchart(xlk_prices_tibble, hcaes(x = date, y = open), type = 'line')
# Add JPM to the chart, enable a shared tooltip, and change the y-axis title
hchart(stock_wide_tibble_prices, hcaes(x = date, y = KO), name = "KO", type = "line") %>%
  hc_add_series(stock_wide_tibble_prices, hcaes(x = date, y = JPM), name = "JPM", type = "line") %>%
  hc_tooltip(shared = TRUE, pointFormat = "{series.name}: ${point.y:.2f}<br>") %>%
  hc_yAxis(title = list(text = "prices (USD)"))
stock_tidy_tibble_prices %>%
hchart(., hcaes(x = date, y = price, group = symbol), type = "line") %>%
hc_title(text = "Daily Prices from Tidy Tibble") %>%
hc_yAxis(title = list(text = "Prices (USD)"),
labels = list(format = "${value}"),
opposite = FALSE)
## grouped_df
stock_tidy_tibble_returns %>%
# Calculate the standard deviation and mean of returns
summarize(std_dev = sd(returns),
mean = mean(returns)) %>%
  hchart(., hcaes(x = symbol, y = std_dev, group = symbol, size = mean), type = "scatter") %>%
hc_title(text = "Standard Dev and Mean Return")
stock_tidy_tibble_returns %>%
summarize(avg_returns = mean(returns),
risk = sd(returns),
risk_return = risk/avg_returns) %>%
# Pass the summary statistics to hchart # Color by symbol
  hchart(., hcaes(x = symbol, y = risk_return, group = symbol), type = 'column') %>%
hc_title(text = "Risk/Return") %>%
hc_subtitle(text = "lower bars are better")
stock_tidy_tibble_prices %>%
mutate(sector = case_when(symbol == "AMZN" ~ "tech",
symbol == "GOOG" ~ "tech",
symbol == "DIS" ~ "fun",
symbol == "JPM" ~ "bank",
symbol == "KO" ~ "food")) %>%
hchart(., hcaes(x = date, y = price, group = symbol), type = "line") %>%
  hc_tooltip(pointFormat = "{point.symbol}: ${point.price:.2f}<br> sector: {point.sector}")
######
### MARKETING
## RESPONSE MODEL
##
colMeans(choice.data[c('HOPPINESS','BUD','PRICE.HOP','PRICE.BUD')])
price.ratio <- log(choice.data$PRICE.HOP/choice.data$PRICE.BUD)
head(cbind(price.ratio, choice.data$PRICE.BUD, choice.data$PRICE.HOP))
library(margins)
coef(probability.model)
margins(logistic.model)
> library(margins)
> coef(probability.model)
(Intercept) price.ratio
0.09700236 -0.29594939
> margins(logistic.model)
Average marginal effects
glm(formula = HOPPINESS ~ price.ratio, family = binomial, data = choice.data)
price.ratio
-0.4585
## The average marginal effect of price.ratio in the logistic model (-0.46) is larger in
magnitude than the price.ratio coefficient of the linear probability model (-0.30). On
average, the purchase probability for Hoppiness increases by around 46 percentage points if
the price ratio decreases by one unit, which is considerably more than the linear
probability model suggests.
margins(logistic.model)
margins(probit.model)
# Backward elimination
final.model <- stepAIC(extended.model, direction = 'backward', trace = TRUE)
summary(final.model)
SPATIAL R SF RASTER
library(dplyr)
library(sf)
# Read in the trees shapefile
trees <- st_read("trees.shp")
# Use filter() to limit to honey locust trees
honeylocust <- trees %>% filter(species == "honeylocust")
# Count the number of rows
nrow(honeylocust)
### Computing geo-information for vector data can be done with functions like st_area() and
st_length(). The result can be used in additional calculations. Careful: the result is a
units object that requires additional processing, for example with unclass().
### These plots are not pretty, but you can't beat plot() for a quick look with just a few
keystrokes. And remember, you can use plot(st_geometry(geo_object)) to plot just the
geometry of your object.
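A small sketch of those two points, using the neighborhoods object from the later exercises:
areas <- st_area(neighborhoods)   # returns a units object
summary(unclass(areas))           # unclass() drops the units class for plain numeric work
plot(st_geometry(neighborhoods))  # quick look at just the geometry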
# Get the extent of the canopy object
extent(canopy)
# Get the CRS of the manhattan object
crs(manhattan)
# Determine the number of grid cells in both raster objects
ncell(manhattan)
ncell(canopy)
# Check if the data is in memory
inMemory(canopy)
# Use getValues() to read the values into a vector
vals <- getValues(canopy)
# Use hist() to create a histogram of the values
hist(vals)
plot(canopy)
plot(manhattan) # Plot a single image for each layer
plotRGB(manhattan) # Plot the manhattan raster as an image
# Determine the CRS for the neighborhoods and trees vector objects
st_crs(neighborhoods)
st_crs(trees)
####
# Create a data frame of counts by species
species_counts <-count(trees, species, sort = T)
# Convert back to sf
trees_sf <- st_as_sf(trees_sp)
# Confirm conversion
class(trees_sf)
# Plot the neighborhoods, add the beech trees and add the new box
plot(st_geometry(neighborhoods), col = "grey", border = "white")
plot(beech, add = T, pch = 16, col = "forestgreen")
plot(beech_box, add = T)
Plot the beech trees (beech) on top of the neighborhoods (neighborhoods). You will
want to plot only the geometry of the neighborhoods.
Use class() to see if the beech object has class data.frame or if it's just
geometry.
Convert the sf geometry object beech to an sf data frame with st_sf().
Use class() to confirm that beech now has a class of data.frame (as well as sf).
Use st_join() to conduct a spatial join in order to add neighborhood information to
the beech object.
Use head() to confirm that the new object has neighborhood information -- for
example, it should now have neighborhood name (ntaname).
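A sketch of the steps just described (beech is assumed to be an sfc geometry column of beech trees):
class(beech)                       # sfc geometry, not a data frame
beech_df <- st_sf(beech)           # promote the geometry to an sf data frame
class(beech_df)                    # now "sf" and "data.frame"
beech_neigh <- st_join(beech_df, neighborhoods)
head(beech_neigh)                  # should include the neighborhood name (ntaname)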
## A note about the output of functions that test relationships between two sets of
features. The output of these and related functions is a special kind of list (with the
class sgbp). For example, when using st_intersects(x, y), the first element of the output,
accessed with [[1]], gives the indices of the features in y that intersect the first
feature of x. Likewise, [[2]] gives the indices of the features in y that intersect the
second feature of x.
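For example (object names assumed; parks appears in the snippet below):
int <- st_intersects(parks, neighborhoods)
class(int)    # "sgbp"
int[[1]]      # indices of neighborhoods that intersect the first park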
## If you look at the result for the last line of code carefully, you'll see that the
closest park is Greeley Square Park; it's just around the corner from the Empire State
Building. In this case one of our feature sets was a single feature. You may end up
applying this function in situations where there are multiple features in both objects; in
that situation sf will return a matrix.
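A sketch of the single-feature case (empire_state and the parks name column are assumptions):
d <- st_distance(empire_state, parks)    # 1 x nrow(parks) matrix of distances (units)
parks$name[which.min(as.numeric(d))]     # nearest park, e.g. Greeley Square Park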
# Project parks to match canopy
parks_cp <- st_transform(parks, crs = crs(canopy, asText = TRUE))
# Plot the result (low tree canopy and high impervious areas)
plot(canopy_imperv_overlay)
## You've now learned to perform raster math using the raster function overlay(). You
limited the result to areas with < 20% tree canopy and > 80% impervious surface; these are
the most urban areas of the city, including parts of Manhattan and Brooklyn.
# Compute the mean of canopy values by neighborhood
canopy_neighborhoods <- extract(canopy, neighborhoods_sp, fun = mean)
# Add the mean canopy values to neighborhoods
neighborhoods_avg_canopy <- mutate(neighborhoods, avg_canopy = canopy_neighborhoods)
## Note that you transformed the neighborhoods object's CRS. This is not strictly necessary
because extract() can transform the CRS on the fly, but it will be needed for plotting and
other operations later, so doing it manually is important here.
## GGPLOT
# Load the ggplot2 package
library(ggplot2)
ggplot(neighborhoods) +
geom_sf(aes(fill = avg_canopy)) +
scale_fill_gradient(low = "#edf8e9", high = "#005a32")
## TMAP
# Create a simple map of neighborhoods
tm_shape(neighborhoods) +
tm_polygons()
## Combine two maps with tmap_arrange(); setting asp = NA makes each map's height/width
ratio match its bounding box.
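A minimal sketch, assuming neighborhoods_avg_canopy from the earlier extract() step:
map1 <- tm_shape(neighborhoods) + tm_polygons()
map2 <- tm_shape(neighborhoods_avg_canopy) + tm_polygons(col = "avg_canopy")
tmap_arrange(map1, map2, asp = NA)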
### LEAFLET
# Map with CartoDB tile centered on DataCamp's NYC office with zoom of 6
leaflet() %>% addProviderTiles("CartoDB") %>%
setView(lng = -73.98575, lat = 40.74856, zoom = 6)
# Set minZoom and dragging, Set default zoom level, set max bounds of map
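A sketch of those map options (the zoom level and bounds are illustrative):
leaflet(options = leafletOptions(minZoom = 12, dragging = FALSE)) %>%
  addProviderTiles("CartoDB") %>%
  setView(lng = -73.98575, lat = 40.74856, zoom = 14) %>%
  setMaxBounds(lng1 = -73.98575 + 0.05, lat1 = 40.74856 + 0.05,
               lng2 = -73.98575 - 0.05, lat2 = 40.74856 - 0.05)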
# Use `addMarkers` to plot all of the colleges in `ca` on the `m` leaflet map
map %>% addMarkers(lng = ca$lng, lat = ca$lat)
# Center the map on LA
map %>% addMarkers(data = ca) %>%
setView(lat = la_coords$lat, lng = la_coords$lon, zoom = 12)
# Make a color palette called pal for the values of `sector_label` using `colorFactor()`
# Colors "red", "blue", and "#9b4a11" for "Public", "Private", and "For-Profit"
pal <- colorFactor(palette = c("red", "blue", "#9b4a11"),
levels = c("Public", "Private", "For-Profit"))
# Add circle markers that color colleges using pal() and the values of sector_label
map2 <- map %>% addCircleMarkers(data = ca, radius = 2,
                                 color = ~pal(sector_label),
                                 label = ~paste0(name, " (", sector_label, ")"))
library(leaflet.extras)
leaflet() %>%
  # Add the OSM, CartoDB and Esri tiles; use addLayersControl to allow users to toggle between basemaps
addTiles(group = "OSM") %>%
addProviderTiles('CartoDB', group = "CartoDB") %>%
addProviderTiles("Esri", group = "Esri") %>%
addLayersControl(baseGroups = c('OSM', 'CartoDB', 'Esri'))
ipeds %>%
leaflet() %>% addTiles() %>%
  # Sanitize any html in our labels, color colleges by sector using the `pal` palette,
  # and cluster all colleges using `clusterOptions`
  addCircleMarkers(radius = 2, label = ~htmlEscape(name),
                   color = ~pal(sector_label), clusterOptions = markerClusterOptions())
## POLYGONS
class(shp)
slotNames(shp)
# Glimpse the data
glimpse(shp@data)
glimpse(nc_income)
# subset shp to include only zip codes in the top quartile of mean income
high_inc <- shp[!is.na(shp$mean_income) & shp$mean_income > 55917,]
# map the boundaries of the zip codes in the top quartile of mean income
high_inc %>%
leaflet() %>% addTiles() %>% addPolygons()
high_inc %>%
leaflet() %>% addTiles() %>%
  # Set boundary thickness to 1 and color polygons; add labels that display mean income; highlight polygons on hover
addPolygons(weight = 1, color = ~nc_pal(mean_income),
label = ~paste0("Mean Income: ", dollar(mean_income)),
highlight = highlightOptions(weight = 5, color = "white",
bringToFront = TRUE))
wealthy_zips %>%
leaflet() %>%
addProviderTiles("CartoDB") %>%
  # Set color to green and create the Wealthy Zipcodes group
  addPolygons(weight = 1, fillOpacity = .7, color = "green", group = "Wealthy Zipcodes",
              label = ~paste0("Mean Income: ", dollar(mean_income)),
highlightOptions = highlightOptions(weight = 5, color = "white",
bringToFront = TRUE))
################
library(sp)
library(rgdal)
dir()
dir("nynta_16c")
# Read in shapefile with readOGR(): neighborhoods
neighborhoods <- readOGR("nynta_16c","nynta")
# neighborhoods
summary(neighborhoods)
plot(neighborhoods)
library(raster)
dir("nyc_grid_data")
# Use raster() with file path: income_grid
income_grid <- raster("nyc_grid_data/m5602ahhi00.tif")
summary(income_grid)
plot(income_grid)
library(tigris)
summary(nyc_tracts)
plot(nyc_tracts)
library(sp)
> proj4string(nyc_tracts)
[1] "+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0"
> proj4string(neighborhoods)
[1] "+proj=lcc +lat_1=40.66666666666666 +lat_2=41.03333333333333
+lat_0=40.16666666666666 +lon_0=-74 +x_0=300000 +y_0=0 +datum=NAD83 +units=us-ft
+no_defs +ellps=GRS80 +towgs84=0,0,0"
## We didn't see the tracts on our plot of neighborhoods because the coordinates of
the tracts put them way off the boundaries of our plot.
library(sp)
library(raster)
## If you plot the untransformed objects with tmap, it actually transforms on the
fly, but it's useful to know how to do it manually.
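A sketch of the manual step with sp::spTransform(), reprojecting the tracts into the neighborhoods CRS:
nyc_tracts <- spTransform(nyc_tracts, CRS(proj4string(neighborhoods)))
plot(neighborhoods)
plot(nyc_tracts, add = TRUE, col = "red")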
library(sp)
# Use str() on nyc_income # ...and on nyc_tracts@data
str(nyc_income)
str(nyc_tracts@data)
library(sp)
library(tmap)
## CAREFUL WITH """
tm_shape(nyc_tracts_merge) + tm_fill(col = "estimate") +
tm_shape(water) + tm_fill(col = "grey90") +
tm_shape(neighborhoods) + tm_borders()
library(tmap)
tm_shape(nyc_tracts_merge) +
tm_fill(col = "estimate", title = 'Median Income', palette = 'Greens') +
tm_borders(col = 'grey60', lwd = .5) +
tm_shape(water) +
tm_fill(col = "grey90") +
tm_shape(manhat_hoods) +
tm_borders(col = 'grey40', lwd = 2 ) +
tm_text(text = "name", size = 0.5) +
tm_credits("Source: ACS 2014 5-year Estimates, \n accessed via acs package",
position = c("right", "bottom"))
######
ls.str() ## list variables
n <- 300
x <- runif(n, 0, 1)
y <- runif(n, 0, 2)
mapxy(1)
Nearest-neighbor distributions
Another way of assessing clustering and regularity is to consider each point, and
how it relates to the other points. One simple measure is the distribution of the
distances from each point to its nearest neighbor.
The nndist() function in spatstat takes a point pattern and for each point returns
the distance to its nearest neighbor. You can then plot the histogram.
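For example (assuming a ppp object such as p_poisson from the exercise):
library(spatstat)
nnd <- nndist(p_poisson)   # nearest-neighbor distance for each point
hist(nnd)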
Plotting the output from Gest shows the theoretical cumulative distribution and
several estimates of the cumulative distribution using different edge corrections.
Often these edge corrections are almost indistinguishable, and the lines overlap.
The plot can be used as a quick exploratory test of complete spatial randomness.
The plot method for estimates of K uses a formula system where a dot on the left of
a formula refers to K(r). So the default plot uses . ~ r. You can compare the
estimate of K to a Poisson process by plotting . - pi * r ^ 2 ~ r. If the data was
generated by a Poisson process, then the line should be close to zero for all
values of r.
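A sketch of both plots (p_poisson assumed as above):
G_hat <- Gest(p_poisson)
plot(G_hat)                      # CDF estimates with different edge corrections
K_hat <- Kest(p_poisson)
plot(K_hat, . - pi * r^2 ~ r)    # deviation from the Poisson expectation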
# Take the square root of the values to get a uniform spatial distribution
x <- sqrt(r_squared) * cos(angle)
y <- sqrt(r_squared) * sin(angle)
plot(disc(radius)); points(x, y)
## spatstat::disc() plot a disc
# Set coordinates and window
ppxy <- ppp(x = x, y = y, window = disc(radius) )
# Estimate G(r)
G_poisson <- Gest(p_poisson)
## Bandwidth selection
## We can get a more principled measure of the violent crime ratio using a spatial
segregation model. The spatialkernel package implements the theory of spatial
segregation. The first step is to compute the optimal bandwidth for kernel
smoothing under the segregation model. A small bandwidth would result in a density
that is mostly zero, with spikes at the event locations. A large bandwidth would
flatten out any structure in the events, resulting in a large "blob" across the
whole window. Somewhere between these extremes is a bandwidth that best represents
an underlying density for the process.
## spseg() will scan over a range of bandwidths and compute a test statistic using
a cross-validation method. The bandwidth that maximizes this test statistic is the
one to use. The returned value from spseg() in this case is a list, with h and cv
elements giving the values of the statistic over the input h values. The
spatialkernel package supplies a plotcv function to show how the test value varies.
The hcv element has the value of the best bandwidth.
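A sketch of the bandwidth scan; the stock spatialkernel::spseg() interface takes coordinates and marks separately, so the exact call used in the course may differ:
library(spatialkernel)
bw_choice <- spseg(pts = coords(preston_crime), marks = marks(preston_crime),
                   h = seq(500, 1000, by = 50), opt = 1)
plotcv(bw_choice)       # test statistic across candidate bandwidths
bw_choice$hcv           # the bandwidth that maximizes it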
Segregation probabilities
The second step is to compute the probabilities for violent and non-violent crimes
as a smooth surface, as well as the p-values for a point-wise test of segregation.
This is done by calling spseg() with opt = 3 and a fixed bandwidth parameter h.
Normally you would run this process for at least 100 simulations, but that will
take too long to run here. Instead, run for only 10 simulations. Then you can use a
pre-loaded object seg which is the output from a 1000 simulation run that took
about 20 minutes to complete.
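A sketch of that call (same caveat about the exact interface):
seg10 <- spseg(pts = coords(preston_crime), marks = marks(preston_crime),
               h = bw_choice$hcv, opt = 3, ntest = 10, proc = FALSE)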
Mapping segregation
With a base map and some image and contour functions we can display both the
probabilities and the significance tests over the area with more control than the
plotmc() function.
The seg object is a list with several components. The X and Y coordinates of the
grid are stored in the $gridx and $gridy elements. The probabilities of each class
of data (violent or non-violent crime) are in a matrix element $p with a column for
each class. The p-value of the significance test is in a similar matrix element
called $stpvalue. Rearranging columns of these matrices into a grid of values can
be done with R's matrix() function. From there you can construct list objects with
a vector $x of X-coordinates, $y of Y-coordinates, and $z as the matrix. You can
then feed this to image() or contour() for visualization.
This process may seem complex, but remember that with R you can always write functions to
perform complex tasks that you may repeat often. For example, to help with the mapping in
this exercise you will create a function that builds a map from four different items.
The seg object from 1000 simulations is loaded, as well as the preston_crime points
and the preston_osm map image.
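A sketch of the list construction the function below expects as its first two arguments; the class column name and the fill order passed to matrix() are assumptions:
prob_list <- list(x = seg$gridx, y = seg$gridy,
                  z = matrix(seg$p[, "Violent crime"], nrow = length(seg$gridx)))
pv_list   <- list(x = seg$gridx, y = seg$gridy,
                  z = matrix(seg$stpvalue[, "Violent crime"], nrow = length(seg$gridx)))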
# Map-building function described above (its name and argument order are assumptions)
segmap <- function(prob_list, pv_list, low, high) {
# background map
plotRGB(preston_osm)
# p-value areas
image(pv_list,
col = c("#00000000", "#FF808080"), add = TRUE)
# probability contours
contour(prob_list,
levels = c(low, high),
col = c("#206020", "red"),
labels = c("Low", "High"),
add = TRUE)
# boundary window
plot(Window(preston_crime), add = TRUE)
}
The base R hist() function has a method for dates that lets you specify a time unit
for the breaks. You pass a string to the breaks argument, such as "days", "weeks",
"months", "quarters" or "years".
# Set the time limit to 1 day before and 1 day after the range of times
tlimits <- range(sasq_t) + c(-1, 1)
# Histogram the simulated statistics and add a line at the data value
ggplot(data.frame(sasq_mc), aes(x = t)) +
geom_histogram(binwidth = 1e13) +
geom_vline(aes(xintercept = t0))
# Compute the p-value as the proportion of tests greater than the data
sum(sasq_mc$t > sasq_mc$t0) / 1000
Cartogram
Large areas, such as cities or countries, are often divided into smaller
administrative units, often into zones of approximately equal population. But the
area of those units may vary considerably. When mapping them, the large areas carry
more visual "weight" than small areas, although just as many people live in the
small areas.
One technique for correcting for this is the cartogram. This is a controlled
distortion of the regions, expanding some and contracting others, so that the area
of each region is proportional to a desired quantity, such as the population. The
cartogram also tries to maintain the correct geography as much as possible, by
keeping regions in roughly the same place relative to each other.
The cartogram package contains functions for creating cartograms. You give it a
spatial data frame and the name of a column, and you get back a similar data frame
but with regions distorted so that the region area is proportional to the column
value of the regions.
You'll also use the rgeos package for computing the areas of individual regions
with the gArea() function.
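A minimal sketch, assuming london_ref is a SpatialPolygonsDataFrame and that its population column is named TOTAL_POP:
library(cartogram)
library(rgeos)
gArea(london_ref, byid = TRUE)                    # areas of the individual regions
carto <- cartogram_cont(london_ref, "TOTAL_POP")  # older package versions used cartogram()
plot(carto)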
The spdep package has functions for measures of spatial correlation, also known as
spatial dependency. Computing these measures first requires you to work out which
regions are neighbors via the poly2nb() function, short for "polygons to
neighbors". The result is an object of class nb. Then you can compute the test
statistic and run a significance test on the null hypothesis of no spatial
correlation. The significance test can either be done by Monte-Carlo or theoretical
models.
In this example you'll use the Moran "I" statistic to test the spatial correlation
of the population and the percentage "Remain" vote.
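A sketch of the workflow:
library(spdep)
nb <- poly2nb(london_ref)                 # neighbor list ("nb" object)
lw <- nb2listw(nb)                        # spatial weights
moran.mc(london_ref$Pct_Remain, listw = lw, nsim = 999)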
# Map % Remain
spplot(london_ref, zcol = "Pct_Remain")
>> You should have found that the p-value was around 0.1 in the first case, thus
you did not find any significant spatial correlation. In the second case, the p-
value was around 0.001, so you did find some significant spatial correlation.
Exceedance probabilities
Distributions and confidence intervals can be difficult things to present to non-
statisticians. An alternative is to present a probability that a value is over a
threshold. For example, public health teams might be interested in when an SMR has
more than doubled, and as a statistician you can give a probability that this has
happened. Then the public health team might decide to go to some alert level when
the probability of a doubling of SMR is over 0.95.
Again, the properties of the binomial distribution let you compute this for proportional
data. You can then map these exceedance probabilities for some threshold, and use a
sensible color scheme to highlight probabilities close to 1.
A Poisson GLM
A Poisson generalized linear model is a way of fitting count data to explanatory
variables. You get out parameter estimates and standard errors for your explanatory
variables, and can get fitted values and residuals.
The glm() function fits Poisson GLMs. It works just like the lm() function, but you
also specify a family argument. The formula has the usual meaning - response on the
left of the ~, and explanatory variables on the right.
To cope with count data coming from populations of different sizes, you specify an
offset argument. This adds a constant term for each row of the data in the model.
The log of the population is used in the offset term.
# Fit a poisson GLM.
model_flu <- glm(data = london, Flu_OBS ~ HealthDeprivation, offset =
log(TOTAL_POP), family = poisson)
# Is HealthDeprivation significant?
summary(model_flu)
Residuals
A linear model should fit the data and leave uncorrelated residuals. This applies
to non-spatial models, where, for example, fitting a straight line through points
on a curve would lead to serially-correlated residuals. A model on spatial data
should aim to have residuals that show no significant spatial correlation.
You can test the model fitted to the flu data using moran.mc() from the spdep
package. Monte Carlo Moran tests were previously discussed in the Spatial
autocorrelation test exercise earlier in the chapter.
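A sketch, assuming lw is a listw object built from the london boroughs with poly2nb() and nb2listw():
moran.mc(residuals(model_flu), listw = lw, nsim = 999)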
Before you fit a model with spatial correlation, you'll first fit the same model as
before, but using Bayesian inference.
# Use R2BayesX
library(R2BayesX)
# Fit a GLM
model_flu <- glm(Flu_OBS ~ HealthDeprivation, offset = log(TOTAL_POP),
data = london, family = poisson)
# Summarize it
summary(model_flu)
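The bayes_flu object summarized below would come from bayesx(); a sketch (exact control settings and data handling may differ from the course):
bayes_flu <- bayesx(Flu_OBS ~ HealthDeprivation, offset = log(london$TOTAL_POP),
                    family = "poisson", data = as.data.frame(london))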
# Summarize it
summary(bayes_flu)
The summary function will show you information for the linear model terms and the
smoothing terms in two separate tables. The spatial term is called "sx(i):mrf" -
standing for "Markov Random Field".
Bayesian analysis returns samples from a distribution for our S(x) term at each of
the London boroughs. The fitted function from bayesx models returns summary
statistics for each borough. You'll just look at the mean of that distribution for
now.
Look at the names of columns in the data and get a summary of the numerical pH
values. You should notice there are some missing values (NA's). Make a histogram of
the acidity.
Construct a vector that is TRUE for the rows with missing pH values. You should
have 33.
Plot a map of the survey data. You need to subset the data to remove the missing
values. The spplot() function needs a column name in quotes to map that data.
The acidity survey data, ca_geo has been pre-defined.
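A sketch of that exploration (the pH column name is an assumption):
names(ca_geo)
summary(ca_geo$pH)
hist(ca_geo$pH)
miss <- is.na(ca_geo$pH)
sum(miss)                      # should be 33 missing values
spplot(ca_geo[!miss, ], "pH")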
The response, on the left of the ~ sign, is the name of the column we are modeling.
The explanatory variables are on the right of the ~ sign, separated by a + sign,
and are the names of the coordinate columns obtained by coordnames().
Fit the model and see if the model parameters are significant by seeing stars in
the coefficients table.
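A sketch of the trend model, assuming coordnames(ca_geo) returns x and y:
coordnames(ca_geo)
m_trend <- lm(pH ~ x + y, data = as.data.frame(ca_geo))
summary(m_trend)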
The acidity survey data, ca_geo, and the linear model, m_trend have been pre-
defined.
Construct a vector that is TRUE for the rows with missing pH values.
Take a subset of the data wherever the pH is missing, assigning the result to
ca_geo_miss.
By default predict() will return predictions at all the original locations.
Pass the model as the first argument, as usual.
Pass ca_geo_miss to the newdata argument to predict missing values.
Assign the result to predictions.
Alkaline soils are those with a pH over 7. Our linear model gives us estimates and
standard deviation based on a normal (Gaussian) assumption. Compute the probability
of the soil being over 7 using pnorm() with the mean and standard deviation values
from the prediction data.
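A sketch of the prediction steps (se.fit is used as the standard deviation here, per the exercise description):
miss <- is.na(ca_geo$pH)
ca_geo_miss <- ca_geo[miss, ]
predictions <- predict(m_trend, newdata = as.data.frame(ca_geo_miss), se.fit = TRUE)
1 - pnorm(7, mean = predictions$fit, sd = predictions$se.fit)   # P(pH > 7)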
Variogram estimation
You can use the gstat package to plot variogram clouds and the variograms from
data. Recall:
The variogram cloud shows the differences of the measurements against distance for
all pairs of data points.
The binned variogram divides the cloud into distance bins and computes the average
difference within each bin.
The y-range of the binned variogram is always much smaller than the variogram cloud
because the cloud includes the full range of values that go into computing the mean
for the binned variogram.
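A sketch with gstat (an intercept-only formula is assumed):
library(gstat)
plot(variogram(pH ~ 1, ca_geo[!miss, ], cloud = TRUE))   # variogram cloud
ph_vgm <- variogram(pH ~ 1, ca_geo[!miss, ])             # binned variogram
plot(ph_vgm)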
The shape of the previous variogram tells you there is a large-scale trend in the
data. You can fit a variogram considering this trend with gstat. This variogram
should flatten out, indicating there is no more spatial correlation after a certain
distance with the trend taken into account.
The sill is the upper limit of the model. That is, the long-range largest value, ignoring
any outliers.
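A sketch of the trend-adjusted variogram and a fitted model (the variogram family is an assumption):
ph_vgm_trend <- variogram(pH ~ x + y, ca_geo[!miss, ])
v_model <- fit.variogram(ph_vgm_trend, model = vgm("Ste"))
plot(ph_vgm_trend, model = v_model)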
# ca_geo, miss, ph_vgm have been pre-defined
ls.str()
To ensure that the grid and the study area have the same coordinates, some
housekeeping is involved. SpatialPoints() converts the points to a coordinate
reference system (CRS), or projection (different packages use different terminology
for the same concept). The CRS is created by wrapping the study area in
projection(), then in CRS(). For the purpose of this exercise, you don't need to
worry about exactly what these functions do, only that this data manipulation is
necessary to align the grid and the study area.
Now that you have that alignment, crop(), as the name suggests, crops the grid to
the study area.
The acidity survey data, ca_geo, the missing value index, miss, the variogram, vgm,
and the variogram model, v_model, have been pre-defined.
# Define a 2.5km square grid over the polygon extent. The first parameter is
# the bottom left corner.
grid <- GridTopology(c(537853,5536290), c(2500, 2500), c(72, 48))
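# Sketch of the intermediate housekeeping described above (geo_bounds is an assumed
# study-area polygon): convert the grid to SpatialPoints in the study-area CRS, then crop
gridpoints <- SpatialPoints(grid, proj4string = CRS(projection(geo_bounds)))
cropped_gridpoints <- crop(gridpoints, geo_bounds)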
# Convert to SpatialPixels
spgrid <- SpatialPixels(cropped_gridpoints)
coordnames(spgrid) <- c("x", "y")
plot(spgrid)
Gridded predictions
Constructing the grid was the hard part, and that's done. You can now compute kriged
estimates over the grid using the variogram model from before (v_model) and the grid of
SpatialPixels.
The spatial pixel grid of the region, spgrid, and the variogram model of pH,
v_model have been pre-defined.
Use kriging to predict pH in each grid rectangle throughout the study area.
Call krige().
The formula and input data are already specified.
Pass spgrid as the new data to predict.
Pass the variogram model to the model argument.
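A sketch of the call (carrying the trend formula over from the earlier model is an assumption):
ph_grid <- krige(pH ~ x + y, ca_geo[!miss, ], newdata = spgrid, model = v_model)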
Calculate the probability of alkaline samples in each grid rectangle.
The mean of the predictions is the var1.pred element of ph_grid.
The variance of the predictions is the var1.var element of ph_grid. Take the square
root to get the standard deviation.
Plot the alkalinity in each grid rectangle.
Call spplot().
Pass the alkalinity column to the zcol argument as a string.
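A sketch of those steps:
ph_grid$pAlkaline <- 1 - pnorm(7, mean = ph_grid$var1.pred, sd = sqrt(ph_grid$var1.var))
spplot(ph_grid, zcol = "pAlkaline")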
In this example you will get predictions at the missing data locations.
autoKrige() can try several variogram model types. In the example, you'll use a
Matern variogram model, which is commonly used in soil and forestry analyses. You
can see a complete list of available models by calling vgm() with no arguments.
The acidity survey data, ca_geo, and the missing value index, miss, have been pre-
defined.
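A sketch of the autoKrige() call from the automap package (the formula is an assumption):
library(automap)
ph_auto <- autoKrige(pH ~ 1, input_data = ca_geo[!miss, ],
                     new_data = ca_geo[miss, ], model = "Mat")
plot(ph_auto)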
The acidity survey data, ca_geo, the missing value index, miss, the spatial pixel
grid of the region, spgrid, the manual kriging grid model, ph_grid, and the
variogram model of pH, v_model have been pre-defined.
######
Your job is to build a ggplot stacked bar chart, then turn it into a pie chart by using the
transform coord_polar(theta = 'y').
Notice x = 1 in the aesthetics. This is because we only want one bar here.
# Use geom_col(), switch the coordinate system to polar and set theta to 'y'
ggplot(disease_counts, aes(x = 1, y = total_cases, fill = disease)) +
geom_col() +
coord_polar(theta = 'y')+
theme_void() +
ggtitle('Title')
##waffle !!!
disease_counts <- who_disease %>%
group_by(disease) %>%
summarise(total_cases = sum(cases)) %>%
mutate(percent = round(total_cases/sum(total_cases)*100))
# Create an array of rounded percentages for diseases.
# Name the percentage array with disease_counts$disease
# Pass case_counts vector to the waffle function to plot
case_counts <- disease_counts$percent
names(case_counts) <- disease_counts$disease
waffle(case_counts)
# add rug plot using geom_rug to see individual datapoints, set alpha to 0.5.
ggplot(truck_speeding, aes(x = hour_of_day)) +
geom_density(bw = 1.5, fill = 'steelblue', alpha = .7) +
geom_rug(alpha = .5) +
labs(title = 'Citations by hour', subtitle = "Gaussian kernel SD = 1.5")
library(ggbeeswarm)
md_speeding %>%
filter(vehicle_color == 'RED') %>%
ggplot(aes(x = gender, y = speed)) +
# change point size to 0.5 and alpha to 0.8
geom_beeswarm(cex = .5, alpha = .8) +
# add a transparent boxplot on top of points
geom_boxplot(alpha = 0)
# violin geometry with kernel width of 2.5, add individual points on top of violins
md_speeding %>%
filter(vehicle_color == 'RED') %>%
ggplot(aes(x = gender, y = speed)) +
geom_violin(bw = 2.5) +
geom_point(alpha = .3, size = .5)
### element_blank()
#make ridgeline densities a bit see-through
md_speeding %>%
mutate(day_of_week = factor(day_of_week, levels =
c("Mon","Tues","Wed","Thu","Fri","Sat","Sun") )) %>%
ggplot(aes( x = percentage_over_limit, y = day_of_week)) +
geom_density_ridges(bandwidth = 3.5, alpha = .7) +
scale_x_continuous(limits = c(0,150), expand = c(0,0) ) +
labs(subtitle = 'Gaussian kernel SD = 3.5') +
theme(axis.ticks.y = element_blank())
#####
library(lattice)
#trellis
# 'USCancerRates' is pre-loaded
str(USCancerRates)
# Inspect dimension
dim(tplot)
dimnames(tplot)
We have seen how the scales argument can be used to control various aspects of how
the coordinate axes are computed and annotated. A common use of the scales argument
is to explicitly specify tick mark locations using the at sub-component, and
optionally the associated text labels using the labels sub-component.
Log scales are useful for economic metrics which tend to show exponential growth
over time.
In this exercise we will create a dot plot of the WorldPhones dataset available in
R.
In the video, you learned how to log-transform the axis by specifying a suitable
base as the log component of scales. There is one more component you need to know,
equispaced.log. This component indicates if the tick marks are equispaced when log
scales are in use. By default, equispaced.log is set to TRUE. Note: If you set
equispaced.log = FALSE, you don't have to explicitly specify a base for the log
component; just log = TRUE should do the trick!
# Create the dot plot; log-transform the x-axis; set x-axis relation to "sliced"
dotplot(Year ~ Phones | Region, data = WorldPhonesDF,
        as.table = TRUE,
        scales = list(x = list(log = TRUE, equispaced.log = FALSE, relation = "sliced")))
# Recreate the dot plot with open circles for males (pch = 1) and plus signs for females (pch = 3), reflected in the legend as well
dotplot(Cause ~ Rate | Status, data = USMortality,
groups = Sex, auto.key = list(columns = 2),
par.settings = simpleTheme(pch = c(3,1) ),
scales = list(x = list(log = 2, equispaced.log = F)))
Convert the Month code into a new variable Month.Name containing month names, suitably
ordering the levels. Drop empty levels of Month.Name using droplevels(). Obtain five colors
with RColorBrewer::brewer.pal(n, name), where n is the desired number of colors and name is
the palette name. Create a density plot of Ozone grouped by Month.Name, with line colors
taken from RColorBrewer, line width doubled, and the legend on the right.
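A minimal sketch of the setup assumed by the densityplot() call below (the palette name is an assumption):
library(RColorBrewer)
airquality$Month.Name <- factor(month.name[airquality$Month], levels = month.name)
airquality$Month.Name <- droplevels(airquality$Month.Name)
my.colors <- brewer.pal(5, "Spectral")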
levels(airquality$Month.Name)
#Densityplot: colorbrewed
densityplot(~ Ozone , data = airquality, groups = Month.Name,
plot.points = FALSE,
auto.key = list(space = 'right'),
par.settings = simpleTheme(col = my.colors, lwd = 2))
The airquality dataset: A base R graphics scatter plot of ozone - solar radiation:
plot(Ozone ~ Solar.R, data = airquality)
However, these measurements include some missing values, which are omitted from the
scatter plot, but could be informative. One common approach to include them in the
plot is by adding marginal "rugs" after the main scatter plot has been plotted.
with(airquality,
{ na.ozone <- is.na(Ozone)
na.solar.r <- is.na(Solar.R)
rug(Ozone[na.solar.r], side = 2)
rug(Solar.R[na.ozone], side = 1)
})
# Create violin plot; specify outer, the x-axis relation, layout, label, and panel function
bwplot(Month.Name ~ Ozone + Temp, airquality,
outer = T, scales = list(x = list(relation = 'free')), layout = c(2,1),
xlab = 'Measured value', panel = panel.violin)
You can compare the output of this code to the default scatter plot produced when
you omit the panel argument.
In this exercise you will customize this plot further by adding a nonparametric
LOESS smooth and a reference line along the y=x diagonal.
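A sketch of the custom panel function assumed by the call below (the name panel.ss comes from the course):
panel.ss <- function(x, y, ...) {
  panel.xyplot(x, y, ...)       # raw points
  panel.loess(x, y, ...)        # nonparametric LOESS smooth
  panel.abline(a = 0, b = 1)    # y = x reference line
}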
# Create plot
xyplot(rate.female ~ rate.male, USCancerRates,
panel = panel.ss, main = "County-wise deaths due to cancer")
Prepanel functions with scales
While panel functions control the data display in each panel, prepanel functions
are used to set up the coordinate system for the display by calculating the minimal
axis extents required to contain the display. This calculation is done separately
for each panel, and these still need to be combined. As you have seen earlier, this
is controlled by the relation sub-component of the scales argument.
The goal of this exercise is to use the panel.histdens() and prepanel.histdens.2()
functions defined in the previous video exercise to reproduce the histogram of log-
transformed death rates in the USCancerRates data, but with "sliced" x-axis limits for each
panel, so that each panel has different axis limits but a common range (width).
When fitting a model, this kind of plot can help to decide whether it is reasonable
to assume equal variance for the two subgroups.
Graphics are used not only for initial exploration but also as an integral part of
the iterative process of model-building, where it is particularly useful for
analysis of residuals and other model diagnostics. Optional arguments that are
passed on to the default panel function make it relatively easy to create fairly
complex plots that are needed routinely in practice.
Suppose we fit the following linear model for the USRegionalMortality dataset based
on the previous interaction plot, and store the corresponding residuals in the
resid variable.
For this exercise, your task is to look at the following three interaction plots
and decide which of the explanatory variables are most likely to be needed in the
model.
For the last exercise in this chapter, your task is to recreate a grouped dot plot
you have seen before, but replace the plotting characters by emoji images. To do
so, you will use the panel.xyimage() function in the latticeExtra package, which is
similar to the panel.xyplot() function, except that plotting symbols are replaced
by images whose locations (file names or URLs of JPEG or PNG image files) are
specified as the pch argument.
# Reorder levels
library(dplyr)
USCancerRates.state <- mutate(USCancerRates.state, State = reorder(State, Rate))
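A sketch of the emoji dot plot (the image paths are placeholders and the Sex grouping column is an assumption):
library(latticeExtra)
dotplot(State ~ Rate, data = USCancerRates.state, groups = Sex,
        panel = panel.xyimage,
        pch = c("female-emoji.png", "male-emoji.png"),   # hypothetical image files
        cex = 1.5)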
lattice is less strict than ggplot2 about the format of the dataset. Here, the data
is a time series rather than a data frame.
library(latticeExtra)
xyplot(EuStockMarkets, panel = panel.horizonplot, prepanel = prepanel.horizonplot)
Map projections
latticeExtra::mapplot() displays numeric data associated with geographical regions,
as colors.
Since the earth is three dimensional but the plot is two dimensional, a projection is
required to reduce the number of dimensions. The list of available projections is given in
the Details section of the mapproject() help page, e.g. the Mercator and polyconic
projections.
Map plots are drawn in two stages.
First, a map object is created using the maps::map() with plot = FALSE.
Second, mapplot() is called with a formula, a data frame, and a map.
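A sketch based on the latticeExtra documentation example:
library(maps)
library(latticeExtra)
county_map <- map("county", plot = FALSE, fill = TRUE, projection = "mercator")
mapplot(rownames(USCancerRates) ~ rate.male + rate.female,
        data = USCancerRates, map = county_map)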
Categories are displayed on the y-axis, and the confidence intervals are displayed
on the x-axis. The point estimates, usually a mean or median value for that
category, are specified using the centers argument, not the formula. An optional
argument, draw.bands, lets you choose between confidence bands and confidence
intervals. This argument is passed to the default panel function panel.segplot().
The estimated county-wise death rates in the USCancerRates also have associated 95%
confidence bounds, LCL95.male UCL95.male LCL95.female UCL95.female. Plotting the
confidence bounds for all counties is not useful because there are too many
counties. For this exercise, your goal is to plot the county-wise confidence
intervals for males for the state of Louisiana.
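A sketch of the Louisiana plot, mirroring the latticeExtra segplot() example:
library(latticeExtra)
segplot(reorder(factor(county), rate.male) ~ LCL95.male + UCL95.male,
        data = subset(USCancerRates, state == "Louisiana"),
        centers = rate.male, draw.bands = FALSE)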
Your task for this exercise is to use hexbinplot() to create a plot of death rates
among males and females in the USCancerRates dataset.
The formula and data argument in a hexbinplot() call is interpreted in the same way
as xyplot(). You will also use the following optional arguments:
- type argument can be set to "r" to add a regression line.
- trans argument can be a function that is applied to the observed counts before
creating bands for different colors. By default, the range of counts is divided up
evenly into bands, but taking the square root of the counts, for example,
emphasizes differences in the lower range of counts more.
- inv argument gives the inverse function of trans, so that transformed counts can
be converted back before being shown in the legend.
library(hexbin)
# Create hexbin plot and add regression line; trans/inv transform the counts and invert the transform
hexbinplot(rate.female ~ rate.male, data = USCancerRates, type = "r", trans = sqrt,
inv = function(x) x^2)
library(directlabels)
# Create factor variable
airquality$Month.Name <- factor(month.name[airquality$Month], levels = month.name)
#### GEOSPATIAL R
What does this look like? You've seen how you might make a basic plot of the sales:
ggplot() +
geom_point(aes(lon, lat, size = $, col = $), data = sales)
ggmap(corvallis_map_bw,
base_layer = ggplot(sales, aes(lon, lat))) +
geom_point(aes(color = class)) +
facet_wrap(~class)
A quick alternative
Like qplot() in ggplot2, qmplot() is less flexible than a full specification but often
involves significantly less typing. qmplot() handles both downloading and displaying the
map, and its syntax is a blend of qplot(), get_map(), and ggmap().
Using qmplot():
We didn't specify a map, since qmplot() will grab one on its own. The qmplot() call looks a
lot like the qplot() call: use points to display the sales data, mapping lon to the x-axis,
lat to the y-axis, and class to color. qmplot() also sets the default dataset and mapping
(without the need for base_layer), so you can add facets without any extra work.
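A sketch of such a call:
qmplot(lon, lat, data = sales, geom = "point", color = class) +
  facet_wrap(~ class)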
# geom_path geom_polygon
### alternative
qmplot(lon, lat, data = ward_sales, geom = "polygon", group = group, fill = avg_price)
Raster data as a heatmap. The predicted house prices in preds are called raster
data: you have a variable measured (or in this case predicted) at every location in
a regular grid.
Looking at head(preds) in the console, you can see the lat values stepping up in
intervals of about 0.002, as lon is constant. After 40 rows, lon increases by about
0.003, as lat runs through the same values. For each lat/lon location, you also
have a predicted_price. You'll see later in Chapter 3 that a more useful way to think
about (and store) this kind of data is in a matrix.
Spatial objects
# Call str() on countries_sp
str(countries_sp)
# Plot nz
plot(nz)
###
library(sp)
library(tmap)
The tmap package makes visualizing spatial classes in sp easy. It works with the raster
classes too! You simply pass your Raster* object as the shp argument to tm_shape(), and
then add a tm_raster():
tm_shape(raster_object) +
tm_raster()
You'll work with tmap throughout the course, but we also want to show you another
package, rasterVis, also designed specifically for visualizing raster objects.
There are a few different functions you can use in rasterVis to make plots, but
let's just try one of them for now: levelplot().
library(tmap)
library(rasterVis)
# Call levelplot() on pop
levelplot(pop)
library(RColorBrewer)
blups <- brewer.pal(n = 9, 'BuPu')
ggplot(preds) +
geom_tile(aes(lon, lat, fill = predicted_price), alpha = 0.8) +
scale_fill_gradientn(colors = blups)
library(viridisLite)
vir = viridis(9)
ggplot(preds) +
geom_tile(aes(lon, lat, fill = predicted_price), alpha = 0.8) +
scale_fill_gradientn(colors = vir)
library(viridisLite)
mag <- magma(9)
ggplot(preds) +
geom_tile(aes(lon, lat, fill = predicted_price), alpha = 0.8) +
scale_fill_gradientn(colors = mag)
### Shortcut to get an RColorBrewer palette: add scale_fill_distiller() (or another
scale_*_distiller() function) and you only need to specify the palette name in the palette
argument. See ?scale_fill_distiller.
Unlike ggplot2, where setting a custom color scale happens in a scale_ call, colors
in tmap layers are specified in the layer in which they are mapped. For example,
take a plot of the age_18_24 variable from prop_by_age:
tm_shape(prop_by_age) +
tm_raster(col = "age_18_24")
Since color is mapped in the tm_raster() call, the specification of the palette
also occurs in this call. Specify a vector of colors in the palette argument. This
is another reason it's worth learning ways to generate a vector of colors. While
different packages could have very different shortcuts for specifying palettes from
color packages, they will generally always have a way to pass in a vector of
colors.
library(viridisLite)
vir <- viridis(9)
mag <- magma(9)
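Putting it together, a minimal sketch assuming prop_by_age from the example above:
tm_shape(prop_by_age) +
  tm_raster(col = "age_18_24", palette = vir)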
library(classInt)
# Print migration
migration
library(raster)
# Plot land_cover
tm_shape(land_cover) +
tm_raster()