Professional Documents
Culture Documents
g <- c(1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3)
1
c <- c(rep("A", 5), rep("B", 3), rep("C", 4))
df <- data.frame(x = sample(12), y = sample(1:20, 12, replace = TRUE), g, c)
rate <- matrix(c(29, 48, 10, 13, 29, 46, 10, 15, 27, 43, 9, 21, 28, 46, 10, 16, 22, 45, 10, 23, 26, 46,
10, 17, 23, 46, 10, 21, 24, 40, 14, 22, 24, 44, 15, 18, 25, 43, 15, 17, 25, 44, 14, 16, 24, 43, 16, 18),
nrow = 4)
vote <- data.frame(Name = c("ChuWang", "TsaiChen", "SoongHsu", "Unknown"), rate)
colnames(vote) <- c("Name", "1041019", "1041108", "1041119", "1041213", "1041220",
"1041226", "1041227", "1050102", "1050104", "1050112", "1050114")
2
直方圖 (histogram)
呈現單一變數的分佈,將資料分群,每個 bin 的高度代表觀測值落在該群集的數量
ggplot(data, aes(x = Var)) + geom_histogram(arguments)
Argument: stat = "bin" or "count"
Var 若為離散數值,必須設定 stat = "count"
ggplot(dsel, aes(carat)) + geom_histogram()
ggplot(dsel, aes(carat)) + geom_histogram(stat = "count")
ggplot(dsel, aes(clarity)) + geom_histogram(stat = "count")
Argument: bins = 30 or 數值
設定 bins 數量,適用於 Var 為連續數值
ggplot(dsel, aes(carat)) + geom_histogram(bins = 10)
Argument: binwidth = (Var 數值範圍/30) or 數值
設定每個 bin 的範圍,適用於 Var 為連續數值
ggplot(dsel, aes(carat)) + geom_histogram(binwidth = 0.3)
ggplot(dsel, aes(carat)) + geom_histogram(binwidth = 0.02)
Arguments: fill/color = 數值 (依色表順序) or 顏色名稱 or 色碼
設定 (所有) bins 顏色,color 只設定 bins 外框顏色
ggplot(dsel, aes(carat)) + geom_histogram(fill = 4, color = "red")
ggplot(dsel, aes(carat)) + geom_histogram(fill = "#FF9999", color = 4)
ggplot(data, aes(x = Var, y = ..computed variable..)) + geom_histogram(arguments)
改變 y 軸定義,適用於 Var 為連續數值
Computed variables: count or density or ncount or ndensity
每個 bin 資料點數 or 每個 bin 資料點密度 (總面積為 1) or 標準化 count (最大值為 1) or 標準化 density (最大值為 1)
ggplot(dsel, aes(carat, y = ..density..)) + geom_histogram()
ggplot(data, aes(x = Var1, fill = Var2)) + geom_histogram(arguments)
Var2 通常是 factor,將各群集 Var1 資料依據 Var2 的 level 值著色
自動產生 legend,順序為 level 值在 factor 中的排序
ggplot(dsel, aes(carat, fill = cut)) + geom_histogram()
ggplot(dsel, aes(clarity, fill = cut)) + geom_histogram(stat = "count")
若 Var2 用數值代表顏色,可用 factor(Var2) 轉成 factor 格式
ggplot(dsel, aes(clarity, fill = v)) + geom_histogram(stat = "count")
ggplot(dsel, aes(clarity, fill = factor(v))) + geom_histogram(stat = "count")
Aesthetic 引數可改成 color,只對 bins 外框著色
ggplot(dsel, aes(clarity, color = factor(v))) + geom_histogram(stat = "count")
Argument: position = "stack" or "identity" or "dodge" or "fill"
顯示方式:堆疊 or 不調整 or 並列 or 標準化堆疊
ggplot(dsel, aes(clarity, fill = cut)) + geom_histogram(stat = "count", position = "dodge")
ggplot(dsel, aes(clarity, fill = cut)) + geom_histogram(stat = "count", position = "fill")
ggplot(data, aes(x = Var1, group = Var2)) + geom_histogram(arguments)
3
Var2 通常是 factor,將各群集 Var1 資料依據 Var2 的 level 值分組
Argument: position = "stack" or "identity" or "dodge" or "fill"
ggplot(dsel, aes(clarity, group = cut)) + geom_histogram(stat = "count", position =
"dodge")
ggplot(dsel, aes(clarity, group = v)) + geom_histogram(stat = "count", position =
"dodge")
以折線圖呈現直方圖內容
用 geom_freqpoly() 取代 geom_histogram()
ggplot(data, aes(x = Var)) + geom_freqpoly(arguments)
Argument: stat = "bin" or "count"
ggplot(dsel, aes(carat)) + geom_freqpoly()
Var 若為離散數值,每個群集只有一個數值,不適合使用 geom_freqpoly 繪圖
Arguments: bins/binwidth/color 用法同 geom_histogram
ggplot(data, aes(x = Var, y = ..computed variable..)) + geom_freqpoly(arguments)
用法同 geom_histogram
ggplot(data, aes(x = Var1, color = Var2)) + geom_freqpoly(arguments)
用法同 geom_histogram
Argument: position = "identity" or "stack" or "dodge" or "fill"
用法同 geom_histogram (預設值不同)
ggplot(dsel, aes(carat, color = cut)) + geom_freqpoly(position = "stack")
ggplot(dsel, aes(carat, color = cut)) + geom_freqpoly(position = "fill")
ggplot(data, aes(x = Var1, group = Var2)) + geom_freqpoly(arguments)
用法同 geom_histogram
若 Var1 為離散數值,由於已將各群集 Var1 資料分組,可用 geom_freqpoly 繪圖
ggplot(dsel, aes(carat, group = cut)) + geom_freqpoly()
ggplot(dsel, aes(clarity, group = cut)) + geom_freqpoly(stat = "count")
Argument: position = "identity" or "stack" or "dodge" or "fill"
用法同 geom_histogram (預設值不同)
圓餅圖
圓餅圖 = 在極座標 (polar coordinate) 系統畫堆疊長條圖 (stacked bar chart)
把資料畫成「只有一個 bin」的堆疊長條圖 (資料放 Y 軸)
將 Y 軸資料依比例轉成 360 度 (2*pi 弧度)
ggplot(data, aes(x = factor(1), y = Var1, fill = Var2)) + geom_col() + coord_polar(theta = "y",
arguments)
x = factor(1): X 軸只有一個 bin
Var1: 數值資料 (vector)
Var2: label 資料 (factor),若為數值資料須轉成 factor 格式
Argument: start = 數值 (預設為 0)
起始角度 (以弧度為單位),0 代表正上方,一圓周為 2*pi
Argument: direction = 1 or -1
旋轉方向,1 為順時針,-1 為逆時針
5
df3 <- data.frame(Age = c("Child", "Teen", "Adult", "Old Man"), Num = c(21, 53, 85, 8))
g <- ggplot(df3, aes(factor(1), Num, fill = Age)) + geom_col()
g + coord_polar("y")
g + coord_polar("y", direction = -1)
g + coord_polar("y", start = pi)
折線圖
指定長度相同的 Var1 和 Var2 二個向量當成 (x, y) 座標
geom_point() 畫座標點
geom_line() 將座標點依據 x 座標值順序連線
geom_path() 將座標點依據 (Var1, Var2) 在向量中順序連線
Var1 要有順序性 (通常是不重複的數值或時間),連線才有意義
用單純數字座標舉例
ggplot(data, aes(x = Var1, y = Var2)) + geom_point(arguments) + geom_line(arguments)
geom_line() 可替換為 geom_path(),geom_point() 可以省略不畫座標點
ggplot(df, aes(x, y)) + geom_point() + geom_line()
ggplot(df, aes(x, y)) + geom_point() + geom_path()
geom_point arguments: 改變 (所有) 座標點格式
size/stroke = 數值 (mm)
color/fill = 數值 or 顏色名稱 or 色碼
shape = 數值 or 形狀名稱
ggplot(df, aes(x, y)) + geom_point(size = 5, color = "red", shape = "diamond") +
geom_line()
geom_line (geom_path) arguments: 改變 (所有) 線條格式
size = 數值 (mm)
linetype = "solid", "dashed", "dotted", "dotdash", "longdash", "twodash" (可用數值指定)
color = 數值 or 顏色名稱 or 色碼
lineend = "round", "butt", "square" (可用數值指定)
linejoin = "round", "mitre", "bevel" (可用數值指定)
ggplot(df, aes(x, y)) + geom_point(size = 2) + geom_line(size = 1, linetype = 2, color =
4)
ggplot(data, aes(x = Var1, y = Var2, group = Var3)) + geom_point(arguments) +
geom_line(arguments)
Var3 通常是 factor,將 (Var1, Var2) 座標點依據 Var3 的 level 值分組,畫出多條折線
ggplot(df, aes(x, y, group = g)) + geom_point() + geom_line()
ggplot(df, aes(x, y, group = c)) + geom_point() + geom_line()
ggplot(data, aes(x = Var1, y = Var2, color = Var3)) + geom_point(arguments) +
geom_line(arguments)
6
Var3 通常是 factor,將 (Var1, Var2) 座標點依據 Var3 的 level 值著色及分組
ggplot(df, aes(x, y, color = g)) + geom_point() + geom_line()
ggplot(df, aes(x, y, color = factor(g))) + geom_point() + geom_line()
ggplot(df, aes(x, y, color = c)) + geom_point() + geom_line()
實例
ggplot(recent, aes(date, pce)) + geom_point() + geom_line()
ggplot(recent, aes(date, unemploy)) + geom_point() + geom_line()
Name <- rep(vote$Name, times = 12)
Num <- rep(1:12, each = 4)
Percent <- as.numeric(as.matrix(vote[-1]))
df4 <- data.frame(Name, Percent, Num)
ggplot(df4, aes(Num, Percent, color = factor(Name))) + geom_point() + geom_line()
XY 散佈圖
觀察二個變數的比較,圖中每個點代表每對變數的觀測值,X 軸和 Y 軸各代表一個變數
ggplot(data, aes(x = Var1, y = Var2)) + geom_point(arguments)
Var1 和 Var2 通常至少有一個是連續數值
ggplot(dsel, aes(carat, price)) + geom_point()
ggplot(dsel, aes(carat, clarity)) + geom_point()
ggplot(dsel, aes(clarity, price)) + geom_point()
ggplot(dsel, aes(clarity, cut)) + geom_point()
Arguments: size/stroke/color/fill/shape
用法同折線圖的 geom_point() arguments
Argument: alpha = 1 or 數值 (<1)
設定透明度,讓點的顏色依數量而深淺 (透明度) 不同,數值越小越透明
ggplot(dsel, aes(carat, price)) + geom_point(alpha = 0.5)
ggplot(dsel, aes(carat, price)) + geom_point(alpha = 0.1)
Argument: position = "identity" or "jitter"
將重複的點隨機分散,使用 geom_jitter() 可設定散落範圍
ggplot(dsel, aes(clarity, price)) + geom_point(position = "jitter")
ggplot(dsel, aes(clarity, price)) + geom_jitter(width = 0.2, height = 0)
ggplot(data, aes(x = Var1, y = Var2, color = Var3)) + geom_point(arguments)
Var3 通常是 factor,將 (Var1, Var2) 座標點依據 Var3 的 level 值著色
ggplot(dsel, aes(carat, price, color = color)) + geom_point()
ggplot(dsel, aes(carat, price, color = clarity)) + geom_point()
函數曲線 y = f(x)
將函數 f(x) 寫入 function myfun,自變數 x 的範圍設定為 data 的 Var1 (向量)
可以使用 geom_line 的 arguments 設定曲線格式
7
ggplot(data, aes(x = Var1)) + stat_function(fun = myfun, arguments)
Argument: n = 數值 (預設為 101)
繪圖點數,數字越大精確度越高
x1 <- data.frame(x = c(-10*pi, 10*pi))
ggplot(x1, aes(x)) + stat_function(fun = sin)
ggplot(x1, aes(x)) + stat_function(fun = sin, n = 1000)
x2 <- data.frame(x = c(0, 10))
ggplot(x2, aes(x)) + stat_function(fun = sin, color = "red") + stat_function(fun = sqrt, color =
"blue")
在既有圖上畫參考線
參考既有圖的座標,可以使用 geom_line 的 arguments 設定直線格式
承襲既有圖的 XY 軸範圍 (不可指定起終點座標)
水平線: geom_hline(yintercept = 數值, arguments)
垂直線: geom_vline(xintercept = 數值, arguments)
指定水平線的 Y 座標或垂直線的 X 座標,若為向量可畫出多條線
g <- ggplot(mtcars, aes(mpg, wt)) + geom_point()
g + geom_hline(yintercept = c(1.7, 4.2), color = "blue", linetype = "dashed")
g + geom_vline(xintercept = c(12, 28), color = "green", linetype = "dotted", size = 1.2)
斜線: geom_abline(slope = 數值, intercept = 數值, arguments)
公式: y = ax + b a: slope (斜率) b: intercept (與 Y 軸交點座標)
g + geom_abline(slope = c(-0.13, 0.36), intercept = c(5, -6.8), color = c("red", "brown"))
迴歸線: geom_smooth(arguments)
瞭解二個或多個變數間是否相關、相關方向與強度,可以由已知的自變數估計應變數的條件期望值
Argument: method = "auto", "lm", "glm", "gam", "loess" or a function
使用的迴歸分析方法,auto 會依資料筆數自動選擇 (少於 1000 筆用 loess,否則用 gam)
Argument: se = TRUE or FALSE
是否顯示 confidence interval
Argument: formula = y~x or a formula
可指定其他 formulas
g + geom_smooth()
g + geom_smooth(se = FALSE)
g + geom_smooth(method = "lm", color = "red", size = 2)
指定起終點座標
斜線: geom_segment(data = df, aes(x = Var1, y = Var2, xend = Var3, yend = Var4),
arguments)
在座標點 (Var1, Var2) 和 (Var3, Var4) 之間畫直線,座標點必須存在 df 中
xypoint <- data.frame(x1 = 12, y1 = 2, x2 = 30, y2 = 4.5)
8
g + geom_segment(data = xypoint, aes(x = x1, y = y1, xend = x2, yend = y2), color =
"blue", size = 1.2)
曲線: geom_curve(data = df, aes(x = Var1, y = Var2, xend = Var3, yend = Var4), curvature =
數值, angle = 數值, ncp = 數值, arguments)
在座標點 (Var1, Var2) 和 (Var3, Var4) 之間畫曲線,座標點必須存在 df 中
Argument: curvature = 數值 (預設值 0.5,0 代表直線)
Argument: angle = 數值 (0~180 之間,預設值 90)
Argument: ncp = 數值 (預設值 5)
g + geom_curve(data = xypoint, aes(x = x1, y = y1, xend = x2, yend = y2), color =
"blue")
面向 or 圖層 (facets)
根據指定變數 (通常是 factor),將原圖分成多個層面來觀察
所有種類統計圖都可以使用
facet_wrap(~Var)
找出變數 Var 的個別圖層,據此將資料分群,每一群資料各繪一張小圖
所有小圖排列成一張大圖,直行或橫列位置不具特別意義
ggplot(dsel, aes(carat, price, color = color)) + geom_point() + facet_wrap(~color)
ggplot(dsel, aes(carat, price, color = color)) + geom_point() + facet_wrap(~cut)
ggplot(dsel, aes(carat, price, color = color)) + geom_point() + facet_wrap(~clarity)
facet_grid(Var1~Var2~…)
指定二個以上變數,將變數的所有分層指派到直行或橫列上
ggplot(dsel, aes(carat, price, color = color)) + geom_point() + facet_grid(color~cut)
ggplot(dsel, aes(carat, price, color = color)) + geom_point() + facet_grid(color~clarity)
ggplot(dsel, aes(carat, price, color = color)) + geom_point() + facet_grid(cut~clarity)
繪圖區域控制
調整 XY 軸顯示範圍 (顯示範圍之外的資料會被刪除)
xlim(Var1, Var2) or ylim(Var1, Var2)
Var1 和 Var2 分別設定最小值和最大值,NA 為自動設定
ggplot(mtcars, aes(mpg, wt)) + geom_point()
ggplot(mtcars, aes(mpg, wt)) + geom_point() + xlim(15, 20)
ggplot(mtcars, aes(mpg, wt)) + geom_point() + xlim(NA, 20)
ggplot(mtcars, aes(mpg, wt)) + geom_point() + xlim(20, 15)
設定標題名稱
labs(arguments)
Arguments: aesthetics = 字串
Arguments: title/subtitle/caption/tag = 字串
g <- ggplot(dsel, aes(carat, price, color = cut)) + geom_point()
9
g + labs(x = "x label", y = "y label", color = "Cut Quality", title = "Title", subtitle =
"Subtitle", caption = "Caption", tag = "Tag")
ggtitle(字串) or xlab(字串) or ylab(字串)
設定個別 labels
加入文字註解
geom_text(aes(x = 數值, y = 數值, label = 字串), arguments)
在指定 XY 座標值處加上字串
Arguments: 設定格式
g <- ggplot(dsel, aes(carat, price, color = cut)) + geom_point()
g + geom_text(aes(0.5, 15000, label = "AA"), color = "red")
設定圖例
guides(aesthetics = guide_legend(arguments))
aesthetics 必須在原圖中有設定
g <- ggplot(df4, aes(Num, Percent, color = factor(Name))) + geom_point() + geom_line()
g + guides(color = guide_legend(title = "Candidates"))
設定座標軸
coord_fixed(ratio = 數值)
ratio = y/x,一個 grid 的顯示數值比值
coord_flip()
對調座標軸
g <- ggplot(dsel, aes(carat, price, color = cut)) + geom_point()
g + coord_fixed(ratio = 1/2500)
g + coord_fixed(ratio = 1/10000)
g + coord_flip()
設定主題風格
theme_XXX()
10