You are on page 1of 11

GENOMICS, TRANSCRIPTOMICS AND EPIGENOMICS

Introduction to R
# Simple operations
12 + 64 * 2
# variable assignment
x <- 1
y <- 2
z <- x + y
# Simple vector
v <- c(1, 2, 3)
# length of the vector (function call)
length(v)
# vector indexing
v[1]
v[2]
v[3]
# Indexing a vector
# create a new vector v
v <- c(10, 8, 15)
# first element of v
v[1]
# math operations
sum(v)
max(v)

Data types in R :
- Vectors
- Lists
- Matrices
- Arrays
- Factors
- Data Frames

> # type (class) of an object


> class(1)
[1] "numeric"
> class(c(1, 2))
[1] "numeric"
> # vector
> a <- c(1, 2)
> length(a)
[1] 2
> a[1]
[1] 1
> # vector (R has no scalar types!)
> a <- 1
> length(a)
[1] 1
> a[1]
[1] 1
Lists
> # a simple vector (fixed type)
> a <- c(1, 2, 3)
> # a simple list
> b <- list("a", 2, 3)
> # index the list
> b[[1]]
[1] "a"
> b[[2]] + b[[3]]
[1] 5

What are data.frames?

data.frame is the main data type in R for storing table data.


Each row of a given column must contain the same type of data. A data.frame is a special list of vectors of equal length, one vector per column.
> # create a new data.frame df
> df <- data.frame(
+ name = c("a", "b", "c"),
+ value = c(10, 8, 15)
+ )

Indexing a data.frame
Loading and saving data.frame

# loading a data.frame from a CSV file


t <- read.csv('mydata.csv')
v <- read.table('my data.tsv', sep = '\t')

# writing a data.frame to a CSV file


write.csv(t, 'mydata_updated.csv')

Ordering a data.frame

> # data
> d <- data.frame(
+ count = c(8, 21, 32, 12, 4),
+ cat = c("A", "A", "B", "B", "B"))
> # get the order of each element in d$count
> order(d$count)
[1] 5 1 4 2 3
> order(d$count, decreasing = TRUE)
[1] 3 2 4 1 5
> # create a reordered data.frame
> d[order(d$count),]
count cat
5 4 B
1 8 A
4 12 B
2 21 A
3 32 B
> # reverse order data.frame
> d[order(d$cat, decreasing = TRUE),]
count cat
3 32 B
4 12 B
5 4 B
1 8 A
2 21 A

Factor

factor is a special categorical data type in R. A vector can be converted to a factor.


The categories can be set or retrieved using the levels function.

> # Create a vector and a factor


> f <- c("red", "blue", "red", "red", "blue")
> a <- factor(f)
> # Get the levels from a factor
> levels(a)
[1] "blue" "red"
> # Change the level names
> levels(a) <- c("A", "B")
> # Convert back a factor to a vector
> b <- as.vector(a)

Function

A new function can be created using the function keyword.


The input can be composed of several parameters.
The output corresponds to the value of the last instruction or the value passed to return.
> g <- function(a, b, c) {return(a + b - c)}
> g(2,1,6)
[1] -3

> # function definition


> f <- function(x, y)
+ {
+ z <- x + y
+ return(z + 1)
+ }
> # function call
> f(1, 4)
[1] 6
> # combining function call
> a <- 3
> f(a, f(a, a))
[1] 11

sapply, apply, tapply

> f<- c("red", "blue", "red", "red", "blue")


> x <- c(1, 3, 2, 4, 2)
> tapply(x, f, sum)
blue red
5 7

apply() – used to apply a function to the rows or columns of matrices or data frames
apply(x, MARGIN, FUN)
x = matrix, data frame or array
MARGIN = 1 indicates rows, 2 indicates columns, c(1,2) indicates rows and columns
FUN = function to be applied

# Get the sum of each column


data <- matrix(1:9, nrow=3, ncol=3)
data
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9

apply(data, 2, sum)
[1] 6 15 24

# Get the sum of each row


data <- matrix(1:9, nrow=3, ncol=3)
data
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9

apply(data, 1, sum)
[1] 12 15 18
lapply() – used to apply a function to each element of the list
lapply(x, FUN)
x = list
FUN = function

# Get the sum of each list item


data <- list(item1 = 1:5,
item2 = seq(4,36,8),
item4 = c(1,3,5,7,9))
data
$item1
[1] 1 2 3 4 5

$item2
[1] 4 12 20 28 36

$item4
[1] 1 3 5 7 9

lapply(data, sum)
$item1
[1] 15

$item2
[1] 100

$item4
[1] 25

sapply() and lapply() work basically the same


The only difference is that lapply() always returns a list, whereas sapply() tries to simplify the
result into a vector or matrix.

# Get the sum of each list item and simplify the result into a vector
data <- list(item1 = 1:5,
item2 = seq(4,36,8),
item4 = c(1,3,5,7,9))
data
$item1
[1] 1 2 3 4 5

$item2
[1] 4 12 20 28 36

$item4
[1] 1 3 5 7 9

sapply(data, sum)
item1 item2 item4
15 100 25

tapply() – breaks the data set into groups and applies a function to each group
tapply(x, INDEX, FUN, simplify)
x = a vector
INDEX = a grouping factor or a list of factors
FUN = the function to be applied
simplify = return a simplified result if set to TRUE. Default is TRUE

# Find the age of youngest male and female


data <- data.frame(name=c("Amy","Max","Ray","Kim","Sam","Eve","Bob"),
age=c(24, 22, 21, 23, 20, 24, 21),
gender=factor(c("F","M","M","F","M","F","M")))
data
name age gender
1 Amy 24 F
2 Max 22 M
3 Ray 21 M
4 Kim 23 F
5 Sam 20 M
6 Eve 24 F
7 Bob 21 M

tapply(data$age, data$gender, min)


F M
23 20

grep() = searches for matches of a certain character pattern in a vector of character strings and
returns the indices that yielded a match
grepl() = searches for matches of a certain character pattern in a vector of character strings and
returns a logical vector indicating which elements of the vector contained a match

x <- c("d", "a", "c", "abba") # Create example character vector


grep("a", x) # Apply grep function in R
# 2 4
grepl("a", x) # Apply grepl function in R
# FALSE TRUE FALSE TRUE

aggregate() = splits the data into subsets, computes summary statistics for each, and returns
the result in a convenient form

> d <- data.frame(count = c(8, 21, 32, 12, 4),


+ cat = c('A', 'A', 'B', 'B', 'B'))
> aggregate(d$count, list(cat = d$cat), sum)
cat x
1 A 29
2 B 48

Libraries
> # load the library
> # install.packages("package_name")
> library(stringr)
> # use a function from the library
> str_to_upper("aaaa")
[1] "AAAA"
Bioconductor
- provides tools for the analysis and comprehension of high-throughput genomic data.
- uses the R statistical programming language and is open source and open
development.

# load a Bioconductor library


library("Biostrings")
# read the FASTA file
fa <- readDNAStringSet("data/hg19.fa")
dunif, runif, qunif, punif
dnorm, rnorm, qnorm, pnorm
These functions provide information about the uniform distribution (on the interval from min
to max) and the normal distribution (with the given mean and sd).
- d gives the height of the probability density function,
- p gives the distribution function (cumulative distribution function)
- q gives the quantile function (inverse cumulative distribution function)
- r generates random deviates (random numbers).

> # uniform distribution


> dunif(0.5)
[1] 1
> runif(3)
[1] 0.07315138 0.65871179 0.29570629
> # normal distribution
> dnorm(0.5)
[1] 0.3520653
> rnorm(3, mean = 3, sd = 1)
[1] 4.219802 1.681584 1.945536

Color palette Plot pch

xy plot

Scatter plot
Histogram

Boxplot

Barplots
Heatmap

Multiplots

par(mfrow = c(1, 2))


plot(x, y)
plot(x, z)

You might also like