You are on page 1of 55

Exploring Data with R

Abhik Seal
May 8, 2014
This is an introductory tutorial to get you started with visualizing and exploring data with R. There
are some popular books and many online materials; I will provide links and references at the end of the
tutorial.
library(ggplot2)
library(gcookbook)
Scatter Plots and line plots
# Base-graphics scatter plot of dist against speed from the built-in cars data.
# NOTE(review): the cars help page describes dist as a stopping distance in
# feet -- confirm the units used in the y axis title.
plot(
  dist ~ speed, data = cars,                            # y ~ x
  main = "Relationship between car distance & speed",   # plot title
  xlab = "Speed (miles per hour)",                      # x axis title
  ylab = "Distance travelled (miles)",                  # y axis title
  xlim = c(0, 30),                                      # x axis runs from 0 to 30
  yaxs = "i",                                           # "internal" y axis style
  col  = "red",                                         # red plotting symbols
  pch  = 19                                             # filled dots
)
0 5 10 15 20 25 30
2
0
4
0
6
0
8
0
1
2
0
Relationship between car distance & speed
Speed (miles per hour)
D
i
s
t
a
n
c
e

t
r
a
v
e
l
l
e
d

(
m
i
l
e
s
)
Let's draw vertical error bars of ±5% on a scatter plot of the mtcars data using the arrows() function
1
# Scatter plot of mpg against disp, then one vertical bar per point running
# from 95% to 105% of its mpg value. angle = 90 together with code = 3 turns
# the arrow heads into flat caps at both ends.
plot(mpg ~ disp, data = mtcars)
with(mtcars, arrows(
  x0 = disp, y0 = 0.95 * mpg,
  x1 = disp, y1 = 1.05 * mpg,
  angle = 90, code = 3, length = 0.04, lwd = 0.4
))
100 200 300 400
1
0
1
5
2
0
2
5
3
0
disp
m
p
g
How to draw histograms in the top and right margins of a bivariate scatter plot
# Scatter plot with marginal histograms.
# layout() splits the device into a 2 x 2 grid: panel 1 (bottom-left) holds the
# scatter plot, panel 2 (top-left) the histogram of speed and panel 3
# (bottom-right) the histogram of dist; the top-right cell (0) stays empty.
layout(matrix(c(2, 0, 1, 3), 2, 2, byrow = TRUE),
       widths = c(3, 1), heights = c(1, 3), respect = TRUE)

# Panel 1: the scatter plot. par() returns the previous margins so they can be
# put back once all three panels are drawn.
old_par <- par(mar = c(5.1, 4.1, 0.1, 0))
# NOTE(review): the cars help page describes dist as a stopping distance in
# feet -- confirm the units used in the y axis title.
plot(cars$dist ~ cars$speed,                # y ~ x
     xlab = "Speed (miles per hour)",       # x axis title
     ylab = "Distance travelled (miles)",   # y axis title
     xlim = c(0, 30),                       # x axis limits from 0 to 30
     ylim = c(0, 140),                      # y axis limits from 0 to 140
     xaxs = "i",                            # x axis style "internal"
     yaxs = "i",                            # y axis style "internal"
     col = "red",                           # red plotting symbols
     pch = 19)                              # filled dots

# Panel 2: histogram of speed above the scatter plot, without axes or labels
par(mar = c(0, 4.1, 3, 0))
hist(cars$speed, ann = FALSE, axes = FALSE, col = "black", border = "white")

# Panel 3: histogram of dist to the right, drawn sideways as a horizontal
# bar plot of the precomputed densities
yhist <- hist(cars$dist, plot = FALSE)
par(mar = c(5.1, 0, 0.1, 1))
barplot(yhist$density,
        horiz = TRUE, space = 0, axes = FALSE,
        col = "black", border = "white")

# Return the device to a single panel with the original margins so later
# plots are not drawn into this layout
layout(1)
par(old_par)
0 5 10 15 20 25 30
2
0
4
0
6
0
8
0
1
0
0
1
2
0
Speed (miles per hour)
D
i
s
t
a
n
c
e

t
r
a
v
e
l
l
e
d

(
m
i
l
e
s
)
# Using the ggplot2 library: scatter plot of mpg against wt from mtcars
ggplot(mtcars, aes(x=wt, y=mpg)) + geom_point()
3
10
15
20
25
30
35
2 3 4 5
wt
m
p
g
# Multiple lines in a plot
# Draw the pressure curve as a line, then mark its observations with points
plot(pressure$temperature, pressure$pressure, type="l")
points(pressure$temperature, pressure$pressure)
# Add a second series at half the pressure, in red, again as line plus points
lines(pressure$temperature, pressure$pressure/2, col="red")
points(pressure$temperature, pressure$pressure/2, col="red")
4
0 50 150 250 350
0
2
0
0
4
0
0
6
0
0
8
0
0
pressure$temperature
p
r
e
s
s
u
r
e
$
p
r
e
s
s
u
r
e
# The pressure curve as a ggplot2 line plot
ggplot(pressure, aes(x=temperature, y=pressure)) + geom_line()
0
200
400
600
800
0 100 200 300
temperature
p
r
e
s
s
u
r
e
5
# Line and points together: draw the curve and mark each observation on it
ggplot(data = pressure, mapping = aes(x = temperature, y = pressure)) +
  geom_line() +
  geom_point()
0
200
400
600
800
0 100 200 300
temperature
p
r
e
s
s
u
r
e
# Show lines along the axes by setting the theme's axis.line element to a
# black line
ggplot(data = pressure, mapping = aes(x = temperature, y = pressure)) +
  geom_line() +
  geom_point() +
  theme(axis.line = element_line(colour = "black"))
6
0
200
400
600
800
0 100 200 300
temperature
p
r
e
s
s
u
r
e
# Logarithmic axes: the same line-and-point plot with both scales switched
# to log10
ggplot(data = pressure, mapping = aes(x = temperature, y = pressure)) +
  geom_line() +
  geom_point() +
  theme(axis.line = element_line(colour = "black")) +
  scale_x_log10() +
  scale_y_log10()
7
1e03
1e01
1e+01
1e+03
100
temperature
p
r
e
s
s
u
r
e
From library(gcookbook), I am using the heightweight dataset to group data points by a variable. The grouping
variable must be categorical; in other words, a factor or character vector.
# Map sex to both point shape and colour so the groups are drawn differently.
# Other shapes and colours can be chosen with scale_shape_manual() and
# scale_colour_manual().
ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex, colour=sex)) +
geom_point()
8
50
55
60
65
70
12 14 16
ageYear
h
e
i
g
h
t
I
n
sex
f
m
# Change the shape of all points by passing a fixed shape code (3) to
# geom_point() instead of mapping shape to a variable
ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
geom_point(shape=3)
50
55
60
65
70
12 14 16
ageYear
h
e
i
g
h
t
I
n
9
# Change the point size (size=3) and choose the shapes for the groups by hand
# with scale_shape_manual(); sex is categorical, so it can be mapped to shape
ggplot(heightweight, aes(x=ageYear, y=heightIn, shape=sex)) +
geom_point(size=3) +
scale_shape_manual(values=c(1, 4))
50
55
60
65
70
12 14 16
ageYear
h
e
i
g
h
t
I
n
sex
f
m
# Represent a third continuous variable using color or size.
# Here ageYear is mapped to the fill of the points, on a black-to-white
# gradient with breaks at 12..17; guide_legend() is requested as the guide
# for that scale.
ggplot(heightweight, aes(x=weightLb, y=heightIn, fill=ageYear)) +
geom_point(shape=21, size=2.5) +
scale_fill_gradient(low="black", high="white", breaks=12:17,
guide=guide_legend())
10
50
55
60
65
70
50 75 100 125 150 175
weightLb
h
e
i
g
h
t
I
n
ageYear
12
13
14
15
16
17
Adding Fitted Regression Model Lines
# Base plot of heightIn against ageYear, stored in sp
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn))
# Add the points and a smoother fitted with lm
sp + geom_point() + stat_smooth(method=lm)
50
55
60
65
70
12 14 16
ageYear
h
e
i
g
h
t
I
n
11
# Adding annotations to regression plot
# Fit a linear model of heightIn on ageYear and print its summary
model <- lm(heightIn ~ ageYear, heightweight)
summary(model)
# First generate prediction data.
#
# predictvals(): given a fitted model, predict values of `yvar` from `xvar`
# over an evenly spaced grid. Supports one predictor and one predicted
# variable.
#
# Arguments:
#   model    A fitted model object that predict() understands.
#   xvar     Name of the predictor column (string).
#   yvar     Name to give the column of predicted values (string).
#   xrange   If NULL, determine the x range from the model object (supported
#            for "lm", "glm" and "loess" models). If a vector with two
#            numbers, use those as the min and max of the prediction range.
#   samples  Number of samples across the x range.
#   ...      Further arguments to be passed to predict().
#
# Returns a data frame with one column named by `xvar` (the grid) and one
# named by `yvar` (the predictions).
predictvals <- function(model, xvar, yvar, xrange = NULL, samples = 100, ...) {
  # If xrange isn't passed in, determine it from the data stored in the model.
  # Where that data lives depends on the model type.
  if (is.null(xrange)) {
    if (inherits(model, c("lm", "glm"))) {
      xrange <- range(model$model[[xvar]])
    } else if (inherits(model, "loess")) {
      xrange <- range(model$x)
    } else {
      # Without this branch xrange stayed NULL and the failure only surfaced
      # later, inside seq(), with a message unrelated to the real cause.
      stop("Cannot determine xrange for a model of class '",
           class(model)[1], "'; please supply xrange.", call. = FALSE)
    }
  }

  newdata <- data.frame(x = seq(xrange[1], xrange[2], length.out = samples))
  names(newdata) <- xvar
  newdata[[yvar]] <- predict(model, newdata = newdata, ...)
  newdata
}
# Predicted heights across the age range, drawn as a line over the points;
# the final call adds a text label that is parsed as a plotmath expression
pred <- predictvals(model, xvar = "ageYear", yvar = "heightIn")
sp <- ggplot(data = heightweight, mapping = aes(x = ageYear, y = heightIn)) +
  geom_point() +
  geom_line(data = pred)
sp + annotate("text", x = 16.5, y = 52, label = "r^2 == 0.42", parse = TRUE)
12
r
2
= 0.42
50
55
60
65
70
12 14 16
ageYear
h
e
i
g
h
t
I
n
Scatter plot matrix and correlation matrix using the mtcars dataset and its first five variables
library(corrplot)
# Scatter plot matrix of the first five columns of mtcars
pairs(mtcars[,1:5])
mpg
4 6 8 50 250
1
0
2
5
4
6
8
cyl
disp
1
0
0
4
0
0
5
0
2
5
0
hp
10 25 100 400 3.0 4.5
3
.
0
4
.
5
drat
13
# Scatter plot with correlations in the upper triangle, smoothing lines in the
# lower triangle, and histograms on the diagonal
# Panel function for pairs(): writes the absolute correlation of x and y in
# the middle of the panel, with the text size scaled by the correlation.
#
#   x, y     Numeric vectors holding the panel's two variables.
#   digits   Number of significant digits used to format the correlation.
#   prefix   Text pasted in front of the number.
#   cex.cor  Base text size; if missing it is derived from the width of the
#            label (0.8 / strwidth).
#   ...      Accepted so pairs() can pass extra arguments; not used here.
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  # Switch the panel to unit coordinates so (0.5, 0.5) is its centre. par()
  # returns the previous setting as a named list, which is the form par()
  # needs to restore it; passing the bare numeric vector (par(usr)) only
  # raises a warning and restores nothing.
  old_par <- par(usr = c(0, 1, 0, 1))
  on.exit(par(old_par), add = TRUE)

  r <- abs(cor(x, y, use = "complete.obs"))
  # Format r together with a reference value and keep only the first result
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if (missing(cex.cor)) {
    cex.cor <- 0.8 / strwidth(txt)
  }
  # Stronger correlations are printed larger
  text(0.5, 0.5, txt, cex = cex.cor * (1 + r) / 2)
}
# Panel function for the diagonal of pairs(): draws a histogram of x with the
# bar heights scaled so the tallest bar has height 1.
#
#   x    Numeric vector for this panel.
#   ...  Further arguments passed on to rect().
panel.hist <- function(x, ...) {
  usr <- par("usr")
  # Keep the panel's x range but give the y axis a 0..1.5 range for the
  # scaled bars. par() returns the previous setting as a named list, which is
  # restored on exit; par(usr) with the bare numeric vector would only warn.
  old_par <- par(usr = c(usr[1:2], 0, 1.5))
  on.exit(par(old_par), add = TRUE)

  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  n_breaks <- length(breaks)
  heights <- h$counts / max(h$counts)
  # One rectangle per bin, from its left break to its right break
  rect(breaks[-n_breaks], 0, breaks[-1], heights, col = "white", ...)
}
pairs(mtcars[,1:5], upper.panel = panel.cor,
diag.panel = panel.hist,
lower.panel = panel.smooth)
mpg
4 6 8
0.85 0.85
50 250
0.78
1
0
2
5
0.68
4
6
8
cyl
0.90 0.83 0.70
disp
0.79
1
0
0
4
0
0
0.71
5
0
2
5
0
hp
0.45
10 25 100 400 3.0 4.5
3
.
0
4
.
5
drat
14
# Correlation matrix of all mtcars columns, drawn with corrplot's defaults
mcor <- cor(mtcars)
corrplot(mcor)
1
0.8
0.6
0.4
0.2
0
0.2
0.4
0.6
0.8
1
m
p
g
c
y
l
d
i
s
p
h
p
d
r
a
t
w
t
q
s
e
c
v
s
a
m
g
e
a
r
c
a
r
b
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
# Correlation matrix with colored squares and black, rotated labels
corrplot(mcor, method="shade", shade.col=NA, tl.col="black", tl.srt=45)
15
1
0.8
0.6
0.4
0.2
0
0.2
0.4
0.6
0.8
1
m
p
g
c
y
l
d
i
s
p
h
p
d
r
a
t
w
t
q
s
e
c
v
s
a
m
g
e
a
r
c
a
r
b
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
# create a three-dimensional (3D) scatter plot.
library(rgl)
plot3d(mtcars$wt, mtcars$disp, mtcars$mpg, type="s", size=0.75, lit=FALSE)
# add vertical segments to help give a sense of the spatial positions of the points
# Interleave two vectors element by element: v1[1], v2[1], v1[2], v2[2], ...
# rbind() stacks them as two rows (recycling the shorter one), and flattening
# the matrix column by column gives the alternating order.
interleave <- function(v1, v2) {
  stacked <- rbind(v1, v2)
  as.vector(stacked)
}
# Plot the points
plot3d(mtcars$wt, mtcars$disp, mtcars$mpg,
xlab="Weight", ylab="Displacement", zlab="MPG",
size=.75, type="s", lit=FALSE)
# Add the segments
# Each coordinate vector is interleaved so consecutive entries form the two
# ends of one segment: wt and disp are repeated unchanged, while mpg is paired
# with the minimum mpg, so every segment runs from a point down to that level.
segments3d(interleave(mtcars$wt, mtcars$wt),
interleave(mtcars$disp, mtcars$disp),
interleave(mtcars$mpg, min(mtcars$mpg)),
alpha=0.4, col="blue")
Scatter plot with jitter rugs, spikes and density
# Simulated data: x is normal (mean 50, sd 30) and y is 3*x plus normal noise
x <- rnorm(1000, 50, 30)
y <- 3 * x + rnorm(1000, 0, 20)
# library() stops with an error if Hmisc is not installed; require() would
# only return FALSE and let the scat1d() calls below fail instead
library(Hmisc)
plot(x, y)
# scat1d adds tick marks (bar codes, rug plot)
# on any of the four sides of an existing plot,
# corresponding with non-missing values of a vector x.
scat1d(x, col = "red")      # density bars on top of graph
scat1d(y, 4, col = "blue")  # density bars at right
16
50 0 50 100 150

2
0
0
0
1
0
0
2
0
0
3
0
0
4
0
0
x
y
# Scatter plot with spike histograms and smooth density curves in the margins
plot(x, y, pch = 20)
histSpike(x, add = TRUE, col = "green4", lwd = 2)   # spike histogram of x
histSpike(y, 4, add = TRUE, col = "blue", lwd = 2)  # spike histogram of y, side 4
# type must be the string "density"; the bare name density refers to the
# stats function and is not a valid choice for type
histSpike(x, type = "density", col = "red", add = TRUE)  # smooth density at bottom
histSpike(y, 4, type = "density", col = "red", add = TRUE)
17
50 0 50 100 150

2
0
0
0
1
0
0
2
0
0
3
0
0
4
0
0
x
y
Bar graphs and Histograms
# Bar graph of the demand values in BOD, with the Time values as bar names
barplot(BOD$demand, names.arg=BOD$Time)
18
1 2 3 4 5 7
0
5
1
0
1
5
# Using the table function
barplot(table(mtcars$cyl))
4 6 8
0
2
4
6
8
1
0
1
4
19
# Bar graph of values with qplot(). qplot() no longer honours a stat argument
# (deprecated since ggplot2 2.0.0), so geom = "bar" would try to count rows and
# reject the y values; geom = "col" draws bars whose heights are the y values.
qplot(BOD$Time, BOD$demand, geom = "col")
0
5
10
15
20
2 4 6
BOD$Time
B
O
D
$
d
e
m
a
n
d
# Considering Time as a factor: the x axis becomes discrete, with one bar per
# level. geom = "col" is used because qplot() no longer honours a stat
# argument (deprecated since ggplot2 2.0.0).
qplot(factor(BOD$Time), BOD$demand, geom = "col")
20
0
5
10
15
20
1 2 3 4 5 7
factor(BOD$Time)
B
O
D
$
d
e
m
a
n
d
# cyl is continuous here
qplot(mtcars$cyl)
0
5
10
4 5 6 7 8
mtcars$cyl
c
o
u
n
t
21
# Treat cyl as discrete
qplot(factor(mtcars$cyl))
0
5
10
4 6 8
factor(mtcars$cyl)
c
o
u
n
t
# Bar graph of values. This uses the BOD data frame, with the
# "Time" column for x values and the "demand" column for y values.
ggplot(BOD, aes(x=Time, y=demand)) +
geom_bar(stat="identity")
22
0
5
10
15
20
2 4 6
Time
d
e
m
a
n
d
# Bar graph of counts: no y aesthetic is given, so geom_bar() counts the rows
# for each level of factor(cyl); bars are white with a black outline
ggplot(mtcars, aes(x=factor(cyl))) +
geom_bar(fill="white",color="black")
0
5
10
4 6 8
factor(cyl)
c
o
u
n
t
23
# Histogram of mpg with the bin width set explicitly to 4 via binwidth
ggplot(mtcars, aes(x=mpg)) +
geom_histogram(binwidth=4,fill="white", colour="black")
0
2
4
6
8
10 20 30 40
mpg
c
o
u
n
t
# Change the x axis origin using origin parameter
# NOTE(review): recent ggplot2 releases deprecate origin in favour of
# boundary -- confirm against the installed version before relying on this.
ggplot(mtcars, aes(x=mpg)) +
geom_histogram(binwidth=4,fill="white", colour="black",origin=20)
24
0
2
4
6
20 25 30 35
mpg
c
o
u
n
t
Histograms of multiple groups of data
library(MASS)
# One histogram of heightIn per sex, arranged in rows by facet_grid(sex ~ .)
ggplot(heightweight, aes(x=heightIn)) +
geom_histogram(fill="white", colour="black") +
facet_grid(sex ~ .)
25
0
5
10
15
20
0
5
10
15
20
f
m
50 55 60 65 70
heightIn
c
o
u
n
t
# Work on a copy (hw) so heightweight itself is not modified
hw<-heightweight
# Use revalue() from plyr to rename the levels of the sex variable
library(plyr)
hw$sex<- revalue(hw$sex,c("f"="Female","m"="Male"))
# Facet by the relabelled sex variable
ggplot(hw, aes(x=heightIn)) +
geom_histogram(fill="white", colour="black") +
facet_grid(sex ~ .)
26
0
5
10
15
20
0
5
10
15
20
F
e
m
a
l
e
M
a
l
e
50 55 60 65 70
heightIn
c
o
u
n
t
# Overlaid semi-transparent histograms on the density scale, filled by sex,
# with a semi-transparent density curve per group drawn on top
ggplot(hw, aes(x=heightIn, y = ..density.. ,fill=sex)) +
geom_histogram(position="identity",alpha=0.4)+
theme_bw()+geom_density(alpha=0.3)
0.00
0.05
0.10
0.15
0.20
0.25
50 55 60 65 70
heightIn
d
e
n
s
i
t
y
sex
Female
Male
27
Negative and Positive Bar plot
# Keep the rows of climate whose Source is "Berkeley" from the year 1900 on
csub <- subset(climate, Source=="Berkeley" & Year >= 1900)
head(csub)
# Flag whether each Anomaly10y value is non-negative; the flag drives the fill
csub$pos <- csub$Anomaly10y >= 0
ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) +
geom_bar(stat="identity", color="black",position="identity")
0.0
0.5
1920 1950 1980
Year
A
n
o
m
a
l
y
1
0
y
pos
FALSE
TRUE
Error Bar plot in ggplot2
# Example data: two groups measured at X = 1..12, each group with its own
# constant error size.
myd <- data.frame(
  X = c(1:12, 1:12),
  Y = c(8, 12, 13, 18, 22, 16, 24, 29, 34, 15, 8, 6,
        9, 10, 12, 18, 26, 28, 28, 30, 20, 10, 9, 9),
  group = rep(c("X-Group", "Y-group"), each = 12),
  error = rep(c(2.5, 3.0), each = 12)
)

# Dodged bars with upward error bars (from Y to Y + error). The error bars
# are added first, so the bars are drawn over their lower ends.
# NOTE(review): X is numeric but is given a discrete x scale; recent ggplot2
# versions may warn about this -- confirm with the installed version.
plt <- ggplot(data = myd, aes(x = X, y = Y, fill = group, width = 0.8)) +
  geom_errorbar(aes(ymin = Y, ymax = Y + error, width = 0.2),
                position = position_dodge(width = 0.8)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
  # Second bar layer adds the black outlines; show.legend = FALSE keeps this
  # layer out of the legend (legend = FALSE is not a current parameter name)
  geom_bar(stat = "identity", position = position_dodge(width = 0.8),
           colour = "black", show.legend = FALSE) +
  scale_fill_manual(values = c("grey70", "white")) +
  scale_x_discrete("X", limits = c(1:12)) +
  scale_y_continuous("Y (units)", expand = c(0, 0),
                     limits = c(0, 40), breaks = seq(0, 40, by = 5)) +
  ggtitle("My nice plot") +
  theme_bw() +
  theme(plot.title = element_text(face = "bold", size = 14),
        axis.title.x = element_text(face = "bold", size = 12),
        axis.title.y = element_text(face = "bold", size = 12, angle = 90),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text.y = element_text(angle = 90, hjust = 0.5),
        legend.title = element_blank(),
        legend.position = c(0.85, 0.85),
        # grid:: prefix so unit() resolves even when grid is not attached
        legend.key.size = grid::unit(1.5, "lines"),
        legend.key = element_rect())
plt
0
5
1
0
1
5
2
0
2
5
3
0
3
5
4
0
1 2 3 4 5 6 7 8 9 10 11 12
X
Y

(
u
n
i
t
s
)
XGroup
Ygroup
My nice plot
Box plots
# Using the ToothGrowth dataset
# Formula syntax
boxplot(len ~ supp, data = ToothGrowth)
29
OJ VC
5
1
0
1
5
2
0
2
5
3
0
3
5
# Put interaction of two variables on x-axis
boxplot(len ~ supp + dose, data = ToothGrowth)
OJ.0.5 OJ.1 OJ.2
5
1
0
1
5
2
0
2
5
3
0
3
5
30
# Box plot of len for each supp group, drawn with ggplot2
ggplot(ToothGrowth, aes(x=supp, y=len)) +
geom_boxplot()
10
20
30
OJ VC
supp
l
e
n
# Adding notches
ggplot(ToothGrowth, aes(x=supp, y=len)) +
geom_boxplot(notch=TRUE)
31
10
20
30
OJ VC
supp
l
e
n
# Adding mean
ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot() +
stat_summary(fun.y="mean", geom="point", shape=24, size=4, fill="white")
10
20
30
OJ VC
supp
l
e
n
32
# One box per combination of supp and dose, built with interaction()
ggplot(ToothGrowth, aes(x=interaction(supp, dose), y=len)) +
geom_boxplot()
10
20
30
OJ.0.5 VC.0.5 OJ.1 VC.1 OJ.2 VC.2
interaction(supp, dose)
l
e
n
Violin plots are a way of comparing multiple data distributions
# Use the heightweight dataset
p <- ggplot(heightweight, aes(x = sex, y = heightIn))
# Violins with untrimmed tails and the density bandwidth adjusted by a factor
# of 2 (the argument is spelled adjust), a narrow grey box plot without
# outlier points inside each violin, and the group mean marked with a
# white-filled point.
p + geom_violin(trim = FALSE, adjust = 2) +
  geom_boxplot(width = .1, fill = "Grey", outlier.colour = NA) +
  theme_bw() +
  stat_summary(fun.y = "mean", geom = "point", shape = 24, size = 4, fill = "white")
33
50
60
70
f m
sex
h
e
i
g
h
t
I
n
Plotting curves
# Plot the expression x^3 - 5*x for x from -4 to 4
curve(x^3 - 5*x, from=-4, to=4)
34
4 2 0 2 4

4
0

2
0
0
2
0
4
0
x
x
^
3


5

*

x
# Plot a user-defined function
# myfun(): logistic curve in xvar that equals 0.5 at xvar = 10
myfun <- function(xvar) {
  shifted <- 10 - xvar
  1 / (1 + exp(shifted))
}
# Draw myfun over the range 0 to 20
curve(myfun(x), from=0, to=20)
# Add a line: overlay 1 - myfun(x) in red on the existing plot
curve(1-myfun(x), add = TRUE, col = "red")
35
0 5 10 15 20
0
.
0
0
.
2
0
.
4
0
.
6
0
.
8
1
.
0
x
m
y
f
u
n
(
x
)
# This sets the x range from 0 to 20
ggplot(data.frame(x=c(0, 20)), aes(x=x)) +
stat_function(fun=myfun, geom="line")
0.00
0.25
0.50
0.75
1.00
0 5 10 15 20
x
y
36
Miscellaneous plots
Making Density Plot of Two-Dimensional Data
# Base plot of waiting against eruptions from the faithful data, stored in p
p <- ggplot(faithful, aes(x=eruptions, y=waiting))
# Points with the default stat_density2d() layer added on top
p + geom_point() + stat_density2d()
50
60
70
80
90
2 3 4 5
eruptions
w
a
i
t
i
n
g
# Map the computed density level to the colour of the stat_density2d() layer
p + stat_density2d(aes(colour=..level..))
37
50
60
70
80
90
2 3 4 5
eruptions
w
a
i
t
i
n
g
0.005
0.010
0.015
0.020
level
# Show the density estimate as a raster with fill mapped to density; contour
# drawing is switched off
p + stat_density2d(aes(fill=..density..), geom="raster", contour=FALSE)
50
60
70
80
90
2 3 4 5
eruptions
w
a
i
t
i
n
g
0.005
0.010
0.015
0.020
0.025
density
38
# With points, and map density estimate to alpha
p + geom_point() +
stat_density2d(aes(alpha=..density..), geom="tile", contour=FALSE)
50
60
70
80
90
2 3 4 5
eruptions
w
a
i
t
i
n
g
density
0.005
0.010
0.015
0.020
0.025
Plotting Pie Charts
library(RColorBrewer)
# Slice sizes and the label for each slice
slices <- c(10, 12,4, 16, 8)
lbls <- c("IN", "AK", "ID", "MA", "MO")
# brewer.pal(7, "Set1") supplies seven colours for five slices; presumably
# only the first five are used -- TODO confirm
pie(slices, labels = lbls, main="Pie Chart of Countries",col=brewer.pal(7,"Set1"))
39
IN
AK
ID
MA
MO
Pie Chart of Countries
Pie Chart with Percentages
# Pie chart whose labels carry each slice's share of the total.
slices <- c(10, 12, 4, 16, 8)
# Percentage of the total per slice, rounded to whole numbers
pct <- round(slices / sum(slices) * 100)
# Label text is "<code> <percent>%", built in one step
lbls <- paste0(c("IN", "AK", "ID", "MA", "MO"), " ", pct, "%")
pie(slices,
    labels = lbls,
    main = "Pie Chart of US States",
    col = rainbow(length(lbls)))
40
IN 20%
AK 24%
ID 8%
MA 32%
MO 16%
Pie Chart of US States
3D Pie chart
library(plotrix)
slices <- c(10, 12, 4, 16, 8)
lbls <- c("IN", "AK", "ID", "MA", "MO")
pie3D(slices,labels=lbls,explode=0.1,
main="Pie Chart of Countries ",col=brewer.pal(7,"Set1"))
41
Pie Chart of Countries
IN
AK
ID
MA
MO
A dendrogram is the fancy word that we use to name a tree diagram that displays the groups formed by
hierarchical clustering.
# Using the corrgram package
library(corrgram)
# Correlation matrix of all mtcars columns, stored in R
R <- cor(mtcars)
# default corrgram
corrgram(R)
42
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
# corrgram with pie charts
corrgram(R, order = TRUE, lower.panel = panel.shade, upper.panel = panel.pie,
text.panel = panel.txt, main = "mtcars Data")
gear
am
drat
mpg
vs
qsec
wt
disp
cyl
hp
carb
mtcars Data
43
The package ellipse provides the function plotcorr() that helps us to visualize correlations. plotcorr() uses
ellipse-shaped glyphs for each entry of the correlation matrix. Here's the default plot using our correlation matrix R:
# default corrgram
library(ellipse)
plotcorr(R)
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
m
p
g
c
y
l
d
i
s
p
h
p
d
r
a
t
w
t
q
s
e
c
v
s
a
m
g
e
a
r
c
a
r
b
# colored corrgram
plotcorr(R, col = colorRampPalette(c("firebrick3", "white", "navy"))(10))
44
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
m
p
g
c
y
l
d
i
s
p
h
p
d
r
a
t
w
t
q
s
e
c
v
s
a
m
g
e
a
r
c
a
r
b
Another colored corrgram
plotcorr(R, col = colorRampPalette(c("#E08214", "white", "#8073AC"))(10), type = "lower")
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
m
p
g
c
y
l
d
i
s
p
h
p
d
r
a
t
w
t
q
s
e
c
v
s
a
m
g
e
a
r
45
Visualizing Dendrograms
# Prepare the hierarchical clustering of the mtcars rows from their pairwise
# distances
hc <- hclust(d = dist(mtcars))
# hang = -1 puts all labels at the same level
plot(hc, hang = -1)
M
a
s
e
r
a
t
i

B
o
r
a
C
h
r
y
s
l
e
r

I
m
p
e
r
i
a
l
C
a
d
i
l
l
a
c

F
l
e
e
t
w
o
o
d
L
i
n
c
o
l
n

C
o
n
t
i
n
e
n
t
a
l
F
o
r
d

P
a
n
t
e
r
a

L
D
u
s
t
e
r

3
6
0
C
a
m
a
r
o

Z
2
8
H
o
r
n
e
t

S
p
o
r
t
a
b
o
u
t
P
o
n
t
i
a
c

F
i
r
e
b
i
r
d
H
o
r
n
e
t

4

D
r
i
v
e
V
a
l
i
a
n
t
M
e
r
c

4
5
0
S
L
C
M
e
r
c

4
5
0
S
E
M
e
r
c

4
5
0
S
L
D
o
d
g
e

C
h
a
l
l
e
n
g
e
r
A
M
C

J
a
v
e
l
i
n
H
o
n
d
a

C
i
v
i
c
T
o
y
o
t
a

C
o
r
o
l
l
a
F
i
a
t

1
2
8
F
i
a
t

X
1

9
F
e
r
r
a
r
i

D
i
n
o
L
o
t
u
s

E
u
r
o
p
a
M
e
r
c

2
3
0
V
o
l
v
o

1
4
2
E
D
a
t
s
u
n

7
1
0
T
o
y
o
t
a

C
o
r
o
n
a
P
o
r
s
c
h
e

9
1
4

2
M
e
r
c

2
4
0
D
M
a
z
d
a

R
X
4
M
a
z
d
a

R
X
4

W
a
g
M
e
r
c

2
8
0
M
e
r
c

2
8
0
C
0
3
0
0
Cluster Dendrogram
hclust (*, "complete")
dist(mtcars)
H
e
i
g
h
t
An alternative way to produce dendrograms is to specifically convert hclust objects into dendrogram objects.
# Using dendrogram objects: convert the hclust result with as.dendrogram()
hcd <- as.dendrogram(hc)
# Plotting the dendrogram object is an alternative way to draw the tree
plot(hcd)
46
0
1
0
0
2
0
0
3
0
0
4
0
0
M
a
s
e
r
a
t
i

B
o
r
a
C
h
r
y
s
l
e
r

I
m
p
e
r
i
a
l
C
a
d
i
l
l
a
c

F
l
e
e
t
w
o
o
d
L
i
n
c
o
l
n

C
o
n
t
i
n
e
n
t
a
l
F
o
r
d

P
a
n
t
e
r
a

L
D
u
s
t
e
r

3
6
0
C
a
m
a
r
o

Z
2
8
H
o
r
n
e
t

S
p
o
r
t
a
b
o
u
t
P
o
n
t
i
a
c

F
i
r
e
b
i
r
d
H
o
r
n
e
t

4

D
r
i
v
e
V
a
l
i
a
n
t
M
e
r
c

4
5
0
S
L
C
M
e
r
c

4
5
0
S
E
M
e
r
c

4
5
0
S
L
D
o
d
g
e

C
h
a
l
l
e
n
g
e
r
A
M
C

J
a
v
e
l
i
n
H
o
n
d
a

C
i
v
i
c
T
o
y
o
t
a

C
o
r
o
l
l
a
F
i
a
t

1
2
8
F
i
a
t

X
1

9
F
e
r
r
a
r
i

D
i
n
o
L
o
t
u
s

E
u
r
o
p
a
M
e
r
c

2
3
0
V
o
l
v
o

1
4
2
E
D
a
t
s
u
n

7
1
0
T
o
y
o
t
a

C
o
r
o
n
a
P
o
r
s
c
h
e

9
1
4

2
M
e
r
c

2
4
0
D
M
a
z
d
a

R
X
4
M
a
z
d
a

R
X
4

W
a
g
M
e
r
c

2
8
0
M
e
r
c

2
8
0
C
Having an object of class dendrogram, we can also plot the branches in a triangular form.
# using dendrogram objects
plot(hcd, type = "triangle")
47
0
1
0
0
2
0
0
3
0
0
4
0
0
M
a
s
e
r
a
t
i

B
o
r
a
C
h
r
y
s
l
e
r

I
m
p
e
r
i
a
l
C
a
d
i
l
l
a
c

F
l
e
e
t
w
o
o
d
L
i
n
c
o
l
n

C
o
n
t
i
n
e
n
t
a
l
F
o
r
d

P
a
n
t
e
r
a

L
D
u
s
t
e
r

3
6
0
C
a
m
a
r
o

Z
2
8
H
o
r
n
e
t

S
p
o
r
t
a
b
o
u
t
P
o
n
t
i
a
c

F
i
r
e
b
i
r
d
H
o
r
n
e
t

4

D
r
i
v
e
V
a
l
i
a
n
t
M
e
r
c

4
5
0
S
L
C
M
e
r
c

4
5
0
S
E
M
e
r
c

4
5
0
S
L
D
o
d
g
e

C
h
a
l
l
e
n
g
e
r
A
M
C

J
a
v
e
l
i
n
H
o
n
d
a

C
i
v
i
c
T
o
y
o
t
a

C
o
r
o
l
l
a
F
i
a
t

1
2
8
F
i
a
t

X
1

9
F
e
r
r
a
r
i

D
i
n
o
L
o
t
u
s

E
u
r
o
p
a
M
e
r
c

2
3
0
V
o
l
v
o

1
4
2
E
D
a
t
s
u
n

7
1
0
T
o
y
o
t
a

C
o
r
o
n
a
P
o
r
s
c
h
e

9
1
4

2
M
e
r
c

2
4
0
D
M
a
z
d
a

R
X
4
M
a
z
d
a

R
X
4

W
a
g
M
e
r
c

2
8
0
M
e
r
c

2
8
0
C
Phylogenetic trees
library(ape)
# plot basic tree
plot(as.phylo(hc), cex = 0.9, label.offset = 1)
48
Mazda RX4
Mazda RX4 Wag
Datsun 710
Hornet 4 Drive
Hornet Sportabout
Valiant
Duster 360
Merc 240D
Merc 230
Merc 280
Merc 280C
Merc 450SE
Merc 450SL
Merc 450SLC
Cadillac Fleetwood
Lincoln Continental
Chrysler Imperial
Fiat 128
Honda Civic
Toyota Corolla
Toyota Corona
Dodge Challenger
AMC Javelin
Camaro Z28
Pontiac Firebird
Fiat X19
Porsche 9142
Lotus Europa
Ford Pantera L
Ferrari Dino
Maserati Bora
Volvo 142E
# fan
plot(as.phylo(hc), type = "fan")
49
M
a
z
d
a

R
X
4
M
a
z
d
a

R
X
4

W
a
g
D
a
t
s
u
n

7
1
0
H
o
r
n
e
t

4

D
r
i
v
e
H
o
r
n
e
t

S
p
o
r
t
a
b
o
u
t
V
a
l
i
a
n
t
D
u
s
t
e
r

3
6
0
M
e
r
c

2
4
0
D
M
e
r
c

2
3
0
M
e
r
c

2
8
0
M
e
rc
2
8
0
C
M
e
r
c

4
5
0
S
E
M
e
r
c

4
5
0
S
L
M
e
r
c

4
5
0
S
L
C
C
a
d
i
l
l
a
c

F
l
e
e
t
w
o
o
d
L
i
n
c
o
l
n

C
o
n
t
i
n
e
n
t
a
l
C
h
ry
s
le
r Im
p
e
ria
l
F
i
a
t

1
2
8
Honda Civic
T
o
y
o
ta
C
o
ro
lla
T
o
y
o
t
a

C
o
r
o
n
a
D
o
d
g
e

C
h
a
l
l
e
n
g
e
r
A
M
C
J
a
v
e
lin
C
a
m
a
r
o

Z
2
8
P
o
n
t
i
a
c

F
i
r
e
b
i
r
d
F
i
a
t

X
1

9
P
o
r
s
c
h
e

9
1
4

2
L
o
t
u
s

E
u
r
o
p
a
F
o
r
d

P
a
n
t
e
r
a

L
F
e
r
r
a
r
i

D
i
n
o
Maserati Bora
V
o
l
v
o

1
4
2
E
# add colors randomly
# runif() draws the hues and edge widths at random, so the plot differs on
# every run. The colour and width vectors have 15, 10 and 20 entries;
# presumably they are recycled across tips and edges -- TODO confirm against
# the ape plot.phylo documentation.
plot(as.phylo(hc), type = "fan", tip.color = hsv(runif(15, 0.65,
0.95), 1, 1, 0.7),
edge.color = hsv(runif(10, 0.65, 0.75), 1, 1, 0.7),
edge.width = runif(20,0.5, 3), use.edge.length = TRUE, col = "gray80")
50
M
a
z
d
a

R
X
4
M
a
z
d
a

R
X
4

W
a
g
D
a
t
s
u
n

7
1
0
H
o
r
n
e
t

4

D
r
i
v
e
H
o
r
n
e
t

S
p
o
r
t
a
b
o
u
t
V
a
l
i
a
n
t
D
u
s
t
e
r

3
6
0
M
e
r
c

2
4
0
D
M
e
r
c

2
3
0
M
e
r
c

2
8
0
M
e
rc
2
8
0
C
M
e
r
c

4
5
0
S
E
M
e
r
c

4
5
0
S
L
M
e
r
c

4
5
0
S
L
C
C
a
d
i
l
l
a
c

F
l
e
e
t
w
o
o
d
L
i
n
c
o
l
n

C
o
n
t
i
n
e
n
t
a
l
C
h
ry
s
le
r Im
p
e
ria
l
F
i
a
t

1
2
8
Honda Civic
T
o
y
o
ta
C
o
ro
lla
T
o
y
o
t
a

C
o
r
o
n
a
D
o
d
g
e

C
h
a
l
l
e
n
g
e
r
A
M
C
J
a
v
e
lin
C
a
m
a
r
o

Z
2
8
P
o
n
t
i
a
c

F
i
r
e
b
i
r
d
F
i
a
t

X
1

9
P
o
r
s
c
h
e

9
1
4

2
L
o
t
u
s

E
u
r
o
p
a
F
o
r
d

P
a
n
t
e
r
a

L
F
e
r
r
a
r
i

D
i
n
o
Maserati Bora
V
o
l
v
o

1
4
2
E
Triple heat map plot
library(reshape2)
library (grid)
library(ggplot2)
# X axis ggplot data: individuals ID1..ID20, each with four columns of letters
# sampled from A-G. The factor levels are given in reverse order (ID20 first).
datfx <- data.frame(
  indv = factor(paste0("ID", 1:20), levels = rev(paste0("ID", 1:20))),
  matrix(sample(LETTERS[1:7], 80, TRUE), ncol = 4)
)
# Converting data to long form for ggplot2 use; the id column is named by a
# string
datf1x <- melt(datfx, id.vars = "indv")
plotx <- ggplot(datf1x, aes(indv, variable)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_manual(values = terrain.colors(7)) +
  scale_x_discrete(expand = c(0, 0))
px <- plotx

# Y axis ggplot data: individuals ID21..ID40, each with five columns of
# letters sampled from G-J
datfy <- data.frame(
  indv = factor(paste0("ID", 21:40), levels = rev(paste0("ID", 21:40))),
  matrix(sample(LETTERS[7:10], 100, TRUE), ncol = 5)
)
# Converting data to long form for ggplot2 use
datf1y <- melt(datfy, id.vars = "indv")
ploty <- ggplot(datf1y, aes(variable, indv)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_manual(values = c("cyan4", "midnightblue", "green2", "lightgreen")) +
  scale_x_discrete(expand = c(0, 0))
py <- ploty + theme(legend.position = "left", axis.title = element_blank())

# Plot XY quantitative fill: a 20 x 20 grid of normal values (mean 50, sd 10),
# rows ID1..ID20 against columns ID21..ID40
datfxy <- data.frame(
  indv = factor(paste0("ID", 1:20), levels = rev(paste0("ID", 1:20))),
  matrix(rnorm(400, 50, 10), ncol = 20)
)
names(datfxy) <- c("indv", paste0("ID", 21:40))
datfxy <- melt(datfxy, id.vars = "indv")
levels(datfxy$variable) <- rev(paste0("ID", 21:40))
pxy <- plotxy <- ggplot(datfxy, aes(indv, variable)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(low = "red", high = "yellow") +
  theme(axis.title = element_blank())

# Define layout for the plots (2 rows, 2 columns). The unit names are the
# strings "null"; a bare null is not an R object.
layt <- grid.layout(nrow = 2, ncol = 2,
                    heights = c(6/8, 2/8), widths = c(2/8, 6/8),
                    default.units = c("null", "null"))
# View the layout of plots
grid.show.layout(layt)
52
(1, 1) 0.75null
0.25null
(1, 2) 0.75null
0.75null
(2, 1) 0.25null
0.25null
(2, 2)
0.75null
0.25null
#Draw plots one by one in their positions
# Start a fresh page and push a viewport that uses the layout in layt
grid.newpage()
pushViewport(viewport(layout=layt))
# py goes to row 1 / column 1, pxy to row 1 / column 2 and px to
# row 2 / column 2; cell (2, 1) is left empty
print(py,vp=viewport(layout.pos.row=1,layout.pos.col=1))
print(pxy,vp=viewport(layout.pos.row=1,layout.pos.col=2))
print(px,vp=viewport(layout.pos.row=2,layout.pos.col=2))
53
ID40
ID39
ID38
ID37
ID36
ID35
ID34
ID33
ID32
ID31
ID30
ID29
ID28
ID27
ID26
ID25
ID24
ID23
ID22
ID21
X1X2X3X4X5
value
G
H
I
J
ID40
ID39
ID38
ID37
ID36
ID35
ID34
ID33
ID32
ID31
ID30
ID29
ID28
ID27
ID26
ID25
ID24
ID23
ID22
ID21
ID20ID19ID18ID17ID16ID15ID14ID13ID12ID11ID10 ID9 ID8 ID7 ID6 ID5 ID4 ID3 ID2 ID1
30
40
50
60
70
value
X1
X2
X3
X4
ID20ID19ID18ID17ID16ID15ID14ID13ID12ID11ID10 ID9 ID8 ID7 ID6 ID5 ID4 ID3 ID2 ID1
indv
v
a
r
i
a
b
l
e
value
A
B
C
D
E
F
G
Mosaic plot for categorical data
# Simulated categorical data: three columns, each sampled with replacement
# from its own set of labels, 200 rows in total
myd <- data.frame(
  fact1 = sample(c("A", "B", "C", "D"), 200, replace = TRUE),
  fact2 = sample(c("HL", "PS", "DS"), 200, replace = TRUE),
  fact3 = sample(c("Male", "Female"), 200, replace = TRUE)
)
# plot
# vcd package is for visualization of categorical data. library() stops with
# an error if vcd is missing; require() would only return FALSE and let
# mosaic() fail instead.
library(vcd)
# Three-way contingency table of the columns
mytable <- table(myd)
mosaic(mytable, shade = TRUE, legend = TRUE)
54
1.5
0.0
1.9
Pearson
residuals:
pvalue =
0.27
fact2
f
a
c
t
1
f
a
c
t
3
D
M
a
l
e
F
e
m
a
l
e
C
M
a
l
e
F
e
m
a
l
e
B
M
a
l
e
F
e
m
a
l
e
A
DS HL PS
M
a
l
e
F
e
m
a
l
e
References
1.R Graphics Cookbook
2.ggplot2 book by Hadley Wickham
3.R graphs examples
4.R Graph cookbook
5.xkcd style graphs
55

You might also like