Introduction To R

GENOMICS, TRANSCRIPTOMICS AND EPIGENOMICS
Introduction to R
# Simple operations
12 + 64 * 2
# Variable assignment
x <- 1
y <- 2
z <- x + y
# Simple vector
v <- c(1, 2, 3)
# length of the vector (function call)
length(v)
# vector indexing
v[1]
v[2]
v[3]
# create a new vector v
v <- c(10, 8, 15)
# first element of v
v[1]
# math operations
sum(v)
max(v)
# Indexing a vector
Data types in R :
- Vectors
- Lists
- Matrices
- Arrays
- Factors
- Data Frames
> # type (class) of an object

> class(1)
[1] "numeric"
> class(c(1, 2))
[1] "numeric"
> # vector
> a <- c(1, 2)
> length(a)
[1] 2
> a[1]
[1] 1
> # vector (R has no scalar types!)
> a <- 1
> length(a)
[1] 1
> a[1]
[1] 1
Lists
> # a simple vector (fixed type)
> a <- c(1, 2, 3)
> # a simple list
> b <- list("a", 2, 3)
> # index the list
> b[[1]]
[1] "a"
> b[[2]] + b[[3]]
[1] 5
What are data.frames?
data.frame is the main data type in R for storing table data.

Each row of a given column must contain the same type of data. It is a special list of vectors.
> # create a new data.frame df
> df <- data.frame(
+ name = c("a", "b", "c"),
+ value = c(10, 8, 15)
+ )
Indexing a data.frame
Loading and saving data.frame
# loading a data.frame from a CSV file

t <- read.csv('mydata.csv')
v <- read.table('my data.tsv', sep = '\t')
# writing a data.frame to a CSV file

write.csv(t, 'mydata_updated.csv')
Ordering a data.frame
> # data
> d <- data.frame(
+ count = c(8, 21, 32, 12, 4),
+ cat = c("A", "A", "B", "B", "B"))
> # get the order of each element in d$count
> order(d$count)
[1] 5 1 4 2 3
> order(d$count, decreasing = TRUE)
[1] 3 2 4 1 5
> # create a reordered data.frame
> d[order(d$count),]
count cat
5 4 B
1 8 A
4 12 B
2 21 A
3 32 B
> # reverse order data.frame
> d[order(d$cat, decreasing = TRUE),]
count cat
3 32 B
4 12 B
5 4 B
1 8 A
2 21 A
Factor
A factor is a special categorical data type in R. A vector can be converted to a factor.

The categories can be set or retrieved using the levels() function.
> # Create a vector and a factor

> f <- c("red", "blue", "red", "red", "blue")
> a <- factor(f)
> # Get the levels from a factor
> levels(a)
[1] "blue" "red"
> # Change the level names
> levels(a) <- c("A", "B")
> # Convert back a factor to a vector
> b <- as.vector(a)
Function
A new function can be created using the function keyword.

The input can be composed of several parameters.
The output corresponds to the value of the last instruction or the value passed to return.
> g <- function(a, b, c) {return(a + b - c)}
> g(2,1,6)
[1] -3
> # function definition

> f <- function(x, y)
+ {
+ z <- x + y
+ return(z + 1)
+ }
> # function call
> f(1, 4)
[1] 6
> # combining function call
> a <- 3
> f(a, f(a, a))
[1] 11
sapply, apply, tapply
> f<- c("red", "blue", "red", "red", "blue")

> x <- c(1, 3, 2, 4, 2)
> tapply(x, f, sum)
blue red
5 7
apply() – used to apply a function to the rows or columns of matrices or data frames
apply(x, MARGIN, FUN)
x = matrix, data frame or array
MARGIN = 1 indicates rows, 2 indicates columns, c(1,2) indicates rows and columns
FUN = function to be applied
# Get the sum of each column

data <- matrix(1:9, nrow=3, ncol=3)
data
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
apply(data, 2, sum)
[1] 6 15 24
# Get the sum of each row

data <- matrix(1:9, nrow=3, ncol=3)
data
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
apply(data, 1, sum)
[1] 12 15 18
lapply() – used to apply a function to each element of the list
lapply(x, FUN)
x = list
FUN = function
# Get the sum of each list item

data <- list(item1 = 1:5,
item2 = seq(4,36,8),
item4 = c(1,3,5,7,9))
data
$item1
[1] 1 2 3 4 5
$item2
[1] 4 12 20 28 36
$item4
[1] 1 3 5 7 9
lapply(data, sum)
$item1
[1] 15
$item2
[1] 100
$item4
[1] 25
sapply() and lapply() work basically the same

The only difference is that lapply() always returns a list, whereas sapply() tries to simplify the result into a vector or matrix.
# Get the sum of each list item and simplify the result into a vector
data <- list(item1 = 1:5,
item2 = seq(4,36,8),
item4 = c(1,3,5,7,9))
data
$item1
[1] 1 2 3 4 5
$item2
[1] 4 12 20 28 36
$item4
[1] 1 3 5 7 9
sapply(data, sum)
item1 item2 item4
15 100 25
tapply() – breaks the data set into groups and applies a function to each group
tapply(x, INDEX, FUN, simplify)
x = a vector
INDEX = a grouping factor or a list of factors
FUN = the function to be applied
simplify = if TRUE (the default), the result is simplified where possible
# Find the age of youngest male and female

data <- data.frame(name=c("Amy","Max","Ray","Kim","Sam","Eve","Bob"),
age=c(24, 22, 21, 23, 20, 24, 21),
gender=factor(c("F","M","M","F","M","F","M")))
data
name age gender
1 Amy 24 F
2 Max 22 M
3 Ray 21 M
4 Kim 23 F
5 Sam 20 M
6 Eve 24 F
7 Bob 21 M
tapply(data$age, data$gender, min)

F M
23 20
grep() = searches for matches of a character pattern in a vector of character strings and returns the indices of the elements that yielded a match
grepl() = searches for matches of a character pattern in a vector of character strings and returns a logical vector indicating which elements of the vector contained a match
x <- c("d", "a", "c", "abba") # Create example character vector

grep("a", x) # Apply grep function in R
# 2 4
grepl("a", x) # Apply grepl function in R
# FALSE TRUE FALSE TRUE
aggregate() = splits the data into subsets, computes summary statistics for each, and returns the result in a convenient form
> d <- data.frame(count = c(8, 21, 32, 12, 4),

+ cat = c('A', 'A', 'B', 'B', 'B'))
> aggregate(d$count, list(cat = d$cat), sum)
cat x
1 A 29
2 B 48
Libraries
> # load the library
> # install.packages("package_name")
> library(stringr)
> # use a function from the library
> str_to_upper("aaaa")
[1] "AAAA"
Bioconductor
- provides tools for the analysis and comprehension of high-throughput genomic data.
- uses the R statistical programming language and is open source and open development.
# load a Bioconductor library

library("Biostrings")
# read the FASTA file
fa <- readDNAStringSet("data/hg19.fa")
dunif, runif, qunif, punif
dnorm, rnorm, qnorm, pnorm
These functions provide information about the uniform distribution (on the interval from min to max) and the normal distribution (with the given mean and sd).
- d gives the height of the probability density function
- p gives the distribution function (cumulative distribution function)
- q gives the quantile function (inverse cumulative distribution function)
- r generates random deviates (random numbers).
> # uniform distribution

> dunif(0.5)
[1] 1
> runif(3)
[1] 0.07315138 0.65871179 0.29570629
> # normal distribution
> dnorm(0.5)
[1] 0.3520653
> rnorm(3, mean = 3, sd = 1)
[1] 4.219802 1.681584 1.945536
Color palette Plot pch
xy plot
Scatter plot
Histogram
Boxplot
Barplots
Heatmap
Multiplots
par(mfrow = c(1, 2))

plot(x, y)
plot(x, z)

Introduction To R

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Introduction To R

Uploaded by

Copyright:

Available Formats

GENOMICS, TRANSCRIPTOMICS AND EPIGENOMICS

> # type (class) of an object

What are data.frames?

data.frame is the main data type in R for storing table data.

# loading a data.frame from a CSV file

# writing a data.frame to a CSV file

A factor is a special categorical data type in R. A vector can be converted to a factor.

> # Create a vector and a factor

A new function can be created using the function keyword.

> # function definition

sapply, apply, tapply

> f<- c("red", "blue", "red", "red", "blue")

# Get the sum of each column

# Get the sum of each row

# Get the sum of each list item

sapply() and lapply() work basically the same

# Find the age of youngest male and female

tapply(data$age, data$gender, min)

x <- c("d", "a", "c", "abba") # Create example character vector

> d <- data.frame(count = c(8, 21, 32, 12, 4),

# load a Bioconductor library

> # uniform distribution

Color palette Plot pch

par(mfrow = c(1, 2))

You might also like