
Introduction to Data Analysis in R

125L  # the L suffix makes this an integer literal
What is Literal? | Webopedia

R: Arithmetic Operators (ethz.ch)


128L %/% 5L

FALSE
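As a reminder, %/% is R's integer-division operator and %% gives the remainder; a quick console check (results verified by hand):

128L %/% 5L        # 25
128L %% 5L         # 3
128L %% 5L == 0L   # FALSE -- 128 is not evenly divisible by 5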
10.50 * 200L
21.15 * 50L
3.20 * 500L
2100 + 1057.5 + 1600

12.10 * 200
24.90 * 50
4.99 * 500
12.10 * 200 + 24.90 * 50 + 4.99 * 500


total_purchase_cost <- 10.50 * 200L + 21.15 * 50L + 3.20 * 500L
total_selling_cost <- 12.10 * 200 + 24.90 * 50 + 4.99 * 500

profit_1 <- total_selling_cost - total_purchase_cost

q_1 <- class(total_purchase_cost)
q_2 <- class(total_selling_cost)
q_3 <- class(profit_1)
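R promotes integers to doubles in mixed arithmetic, so all three variables above are doubles and the three class() calls return "numeric"; a quick check:

class(10.50 * 200L)  # "numeric" -- the integer operand is coerced to double
class(200L * 3L)     # "integer" -- stays integer only when both operands are integers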


Consistent naming conventions in R | R-bloggers

Logical Expressions in R
R: Relational Operators (ethz.ch)

R: Logical Operators (ethz.ch)


Project: Install RStudio

Data Visualization in R
library(readr)
library(dplyr)
library(ggplot2)

life_expec <- read_csv("life_expec.csv")

life_expec %>%
  ggplot()

life_expec_filter <- life_expec %>%
  filter(Race == "All Races" & Sex == "Both Sexes")

life_expec_filter %>%
  ggplot(aes(x = Year, y = Avg_Life_Expec)) +
  geom_line() +
  labs(
    title = "United States Life Expectancy: 100 Years of Change",
    y = "Average Life Expectancy (Years)"
  )
String Manipulation in R: Fundamentals
library(tidyverse)

# Supply col_types = cols() to suppress column specification output.
recent_grads <- read_csv("recent_grads.csv", col_types = cols())

lower_case_colnames <- str_to_lower(colnames(recent_grads))
colnames(recent_grads) <- lower_case_colnames

library(tidyverse)

input_sentence <- "You'll need to split this sentence."
output1 <- str_split(input_sentence, " ")[[1]]
output2 <- output1[1:4]
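str_split() returns a list with one element per input string, so [[1]] extracts the character vector for our single sentence; the resulting values are:

output1  # "You'll" "need" "to" "split" "this" "sentence."
output2  # "You'll" "need" "to" "split"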


Function reference • stringr (tidyverse.org)

Regular expressions • stringr (tidyverse.org)

Regular-Expressions.info - Regex Tutorial, Examples and Reference - Regexp Patterns

Working with APIs


import requests

# Make a GET request to get the latest position of the ISS from the OpenNotify API.
response = requests.get("http://api.open-notify.org/iss-now.json")
status_code = response.status_code

# Enter your answer below.
response = requests.get("http://api.open-notify.org/iss-pass")
status_code = response.status_code
# Set up the parameters we want to pass to the API.
# This is the latitude and longitude of New York City.
parameters = {"lat": 40.71, "lon": -74}

# Make a GET request with the parameters.
response = requests.get("http://api.open-notify.org/iss-pass.json", params=parameters)

# Print the content of the response (the data the server returned).
print(response.content)

# This gets the same data as the command above.
response = requests.get("http://api.open-notify.org/iss-pass.json?lat=40.71&lon=-74")
print(response.content)

parameters = {"lat": 37.78, "lon": -122.41}
response = requests.get("http://api.open-notify.org/iss-pass.json", params=parameters)
content = response.content
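requests URL-encodes the params dictionary into the query string for you, which is why the two approaches above return the same data; response.url shows the final URL that was actually requested:

print(response.url)
# http://api.open-notify.org/iss-pass.json?lat=37.78&lon=-122.41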

# Make a list of fast food chains.
best_food_chains = ["Taco Bell", "Shake Shack", "Chipotle"]
print(type(best_food_chains))

# Import the JSON library.
import json

# Use json.dumps to convert best_food_chains to a string.
best_food_chains_string = json.dumps(best_food_chains)
print(type(best_food_chains_string))

# Convert best_food_chains_string back to a list.
print(type(json.loads(best_food_chains_string)))

# Make a dictionary.
fast_food_franchise = {
    "Subway": 24722,
    "McDonalds": 14098,
    "Starbucks": 10821,
    "Pizza Hut": 7600
}

# We can also dump a dictionary to a string and load it.
fast_food_franchise_string = json.dumps(fast_food_franchise)
print(type(fast_food_franchise_string))

fast_food_franchise_2 = json.loads(fast_food_franchise_string)
# Headers is a dictionary.
print(response.headers)
content_type = response.headers["content-type"]

# Call the API here.
response = requests.get("http://api.open-notify.org/astros.json")
json_data = response.json()
in_space_count = json_data["number"]
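json_data is a plain Python dictionary; the astros.json payload has roughly this shape (the exact values change over time):

# {"message": "success", "number": 3,
#  "people": [{"name": "...", "craft": "ISS"}, ...]}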
Requests: HTTP for Humans™ — Requests 2.26.0 documentation (python-requests.org)

19.2. json — JSON encoder and decoder — Python 3.6.15 documentation


# Create a dictionary of headers containing our Authorization header.
headers = {"Authorization": "token 1f36137fbbe1602f779300dad26e4c1b7fbab631"}

# Make a GET request to the GitHub API with our headers.
# This API endpoint will give us details about Vik Paruchuri.
response = requests.get("https://api.github.com/users/VikParuchuri", headers=headers)

# Print the content of the response. As you can see, this token corresponds to the account of Vik Paruchuri.
print(response.json())

response = requests.get("https://api.github.com/users/VikParuchuri/orgs", headers=headers)
orgs = response.json()

# We've loaded headers in.
response = requests.get("https://api.github.com/users/torvalds", headers=headers)
torvalds = response.json()

# Enter your answer here.
response = requests.get("https://api.github.com/repos/octocat/Hello-World", headers=headers)
hello_world = response.json()

params = {"per_page": 50, "page": 1}
response = requests.get("https://api.github.com/users/VikParuchuri/starred",
                        headers=headers, params=params)
page1_repos = response.json()

params = {"per_page": 50, "page": 2}
response = requests.get("https://api.github.com/users/VikParuchuri/starred",
                        headers=headers, params=params)
page2_repos = response.json()
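To collect every page rather than hard-coding page numbers, you can loop until the API returns an empty list; a minimal sketch, assuming headers is defined as above:

all_repos = []
page = 1
while True:
    params = {"per_page": 50, "page": page}
    response = requests.get("https://api.github.com/users/VikParuchuri/starred",
                            headers=headers, params=params)
    repos = response.json()
    if not repos:  # an empty page means there is nothing left to fetch
        break
    all_repos.extend(repos)
    page += 1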
# Enter your code here.
response = requests.get("https://api.github.com/user", headers=headers)

# Create the data we'll pass into the API endpoint. While this endpoint only requires the "name" key, there are other optional keys.
payload = {"name": "test"}

# We need to pass in our authentication headers!
response = requests.post("https://api.github.com/user/repos", json=payload, headers=headers)
print(response.status_code)

payload = {"name": "learning-about-apis"}

response = requests.post("https://api.github.com/user/repos", json=payload, headers=headers)


status = response.status_code

payload = {"description": "The best repository ever!", "name": "test"}

response = requests.patch("https://api.github.com/repos/VikParuchuri/test", json=payload,


headers=headers)

print(response.status_code)

payload = {"description": "Learning about requests!", "name": "learning-about-apis"}

response = requests.patch("https://api.github.com/repos/VikParuchuri/learning-about-apis",
json=payload, headers=headers)

status = response.status_code
user = response.json()
response = requests.delete("https://api.github.com/repos/VikParuchuri/test", headers=headers)
print(response.status_code)

response = requests.delete("https://api.github.com/repos/VikParuchuri/learning-about-apis", headers=headers)
status = response.status_code

Sign in to GitHub · GitHub

GitHub REST API - GitHub Docs


python_top_articles = python_top["data"]["children"]
most_upvoted = ""
most_upvotes = 0
for article in python_top_articles:
    ar = article["data"]
    if ar["ups"] >= most_upvotes:
        most_upvoted = ar["id"]
        most_upvotes = ar["ups"]
headers = {"Authorization": "bearer 13426216-4U1ckno9J5AiK72VRbpEeBaMSKk", "User-Agent":
"Dataquest/1.0"}

response = requests.get("https://oauth.reddit.com/r/python/comments/4b7w9u", headers=headers)

comments = response.json()
payload = {"dir": 1, "id": "d16y4ry"}

headers = {"Authorization": "bearer 13426216-4U1ckno9J5AiK72VRbpEeBaMSKk", "User-Agent":


"Dataquest/1.0"}

response = requests.post("https://oauth.reddit.com/api/vote", json=payload, headers=headers)

status = response.status_code
Web Scraping
import requests
from bs4 import BeautifulSoup

# `content` is the HTML of the page to parse, fetched earlier with requests.
parser = BeautifulSoup(content, 'html.parser')

# Get a list of all occurrences of the body tag in the element.
body = parser.find_all("body")

# Get the paragraph tag.
p = body[0].find_all("p")

# Get the text.
print(p[0].text)

head = parser.find_all("head")
title = head[0].find_all("title")
title_text = title[0].text
# Get the website that contains classes.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple_classes.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Get the first inner paragraph.
# Find all the paragraph tags with the class inner-text.
# Then, take the first element in that list.
first_inner_paragraph = parser.find_all("p", class_="inner-text")[0]
print(first_inner_paragraph.text)

second_inner_paragraph = parser.find_all("p", class_="inner-text")[1]
second_inner_paragraph_text = second_inner_paragraph.text

first_outer_paragraph = parser.find_all("p", class_="outer-text")[0]
first_outer_paragraph_text = first_outer_paragraph.text
# Get the website that contains classes and IDs.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Select all of the elements that have the first-item class.
first_items = parser.select(".first-item")

# Print the text of the first paragraph (the first element with the first-item class).
print(first_items[0].text)

first_outer_text = parser.select(".outer-text")[0].text
second_text = parser.select("#second")[0].text

# Get the Super Bowl box score data.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/2014_super_bowl.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Find the number of turnovers the Seahawks committed.
turnovers = parser.select("#turnovers")[0]
seahawks_turnovers = turnovers.select("td")[1]
seahawks_turnovers_count = seahawks_turnovers.text
print(seahawks_turnovers_count)

patriots_total_plays_count = parser.select("#total-plays")[0].select("td")[2].text
seahawks_total_yards_count = parser.select("#total-yards")[0].select("td")[1].text
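Each select("#...") call grabs a table row by its id attribute, and indexing the row's td cells picks the column: judging from the lookups above, cell 1 holds the Seahawks' value and cell 2 the Patriots'.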
HTML basics - Learn web development | MDN (mozilla.org)

HTML elements reference - HTML: HyperText Markup Language | MDN (mozilla.org)

Beautiful Soup Documentation — Beautiful Soup 4.9.0 documentation (crummy.com)

Fuzzy Language in Data Science


# ans = "A"

# ans = "B"
# ans = "C"

ans = "A"
best_churn["scaled_tran"] = (best_churn["nr_of_transactions"] \

- best_churn["nr_of_transactions"].min()) \

/ (best_churn["nr_of_transactions"].max() \

- best_churn["nr_of_transactions"].min())

best_churn["scaled_amount"] = (best_churn["amount_spent"] \

-best_churn["amount_spent"].min()) \

/ (best_churn["amount_spent"].max() \

- best_churn["amount_spent"].min())
best_churn["score"] = 100*(.5*best_churn["scaled_tran"] \

+ .5*best_churn["scaled_amount"])

best_churn.sort_values("score", inplace=True, ascending=False)

top_50_churned = best_churn.loc[best_churn["churned"] == 1].head(50)

top_50_churned.to_csv("best_customers.txt")
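Each feature is min-max scaled onto [0, 1] with scaled_x = (x - min(x)) / (max(x) - min(x)), which puts transaction counts and amounts spent on the same footing; the score then weights the two scaled features equally and rescales the result to a 0-100 range.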
Communicating Results
import pandas as pd

playstore = pd.read_csv("googleplaystore.csv")
print(playstore.shape)

answer = "no"  # We don't care about free apps for this project.

# Drop row 10472, a known malformed row in this dataset.
playstore.drop(labels=10472, inplace=True)
# `paid` (the subset of paid apps) is created in an earlier step not shown here.
paid.sort_values("Reviews", ascending=False, inplace=True)
paid.drop_duplicates("App", inplace=True)
print(paid.duplicated("App").sum())
paid.reset_index(inplace=True, drop=True)

affordable_apps = paid[paid["Price"] < 50].copy()

cheap = affordable_apps["Price"] < 5
reasonable = affordable_apps["Price"] >= 5

affordable_apps[cheap].hist(column="Price", grid=False, figsize=(12,6))
affordable_apps[reasonable].hist(column="Price", grid=False, figsize=(12,6))

affordable_apps["affordability"] = affordable_apps.apply(

lambda row: "cheap" if row["Price"] < 5 else "reasonable",

axis=1

cheap = affordable_apps["Price"] < 5

reasonable = affordable_apps["Price"] >= 5

cheap_mean = affordable_apps.loc[cheap, "Price"].mean()

affordable_apps.loc[cheap, "price_criterion"] = affordable_apps["Price"].apply(

lambda price: 1 if price < cheap_mean else 0

affordable_apps[reasonable].plot(kind="scatter", x="Price", y="Rating")

reasonable_mean = affordable_apps.loc[reasonable, "Price"].mean()

affordable_apps.loc[reasonable,"price_criterion"] = affordable_apps["Price"].apply(

lambda price: 1 if price < reasonable_mean else 0

)
affordable_apps["genre_count"] = affordable_apps["Genres"].str.count(";")+1

genres_mean = affordable_apps.groupby(

["affordability", "genre_count"]

).mean()[["Price"]]

def label_genres(row):

"""For each segment in `genres_mean`,

labels the apps that cost less than its segment's mean with `1`

and the others with `0`."""

aff = row["affordability"]

gc = row["genre_count"]

price = row["Price"]

if price < genres_mean.loc[(aff, gc)][0]:


return 1

else:

return 0

affordable_apps["genre_criterion"] = affordable_apps.apply(

label_genres, axis="columns"

categories_mean = affordable_apps.groupby(
    ["affordability", "Category"]
).mean()[["Price"]]

def label_categories(row):
    """For each segment in `categories_mean`,
    labels the apps that cost less than its segment's mean with `1`
    and the others with `0`."""
    aff = row["affordability"]
    cat = row["Category"]
    price = row["Price"]
    if price < categories_mean.loc[(aff, cat)][0]:
        return 1
    else:
        return 0

affordable_apps["category_criterion"] = affordable_apps.apply(
    label_categories, axis="columns"
)

criteria = ["price_criterion", "genre_criterion", "category_criterion"]
affordable_apps["Result"] = affordable_apps[criteria].mode(axis='columns')

def new_price(row):
    if row["affordability"] == "cheap":
        return round(max(row["Price"], cheap_mean), 2)
    else:
        return round(max(row["Price"], reasonable_mean), 2)

affordable_apps["New Price"] = affordable_apps.apply(new_price, axis="columns")

# regex=True makes the intent explicit: "[+,]" is a character class, not a literal string.
affordable_apps["Installs"] = affordable_apps["Installs"].str.replace("[+,]", "", regex=True).astype(int)

affordable_apps["Impact"] = (affordable_apps["New Price"] - affordable_apps["Price"]) * affordable_apps["Installs"]

total_impact = affordable_apps["Impact"].sum()
print(total_impact)
ans1 = "no"

ans21 = "no"

ans22 = "yes"
import pandas as pd

subs = pd.read_csv("muscle_labs.csv", parse_dates=["end_date", "start_date"])

subs["churn_month"] = subs["end_date"].dt.year*100 + subs["end_date"].dt.month


monthly_churn = pd.DataFrame({"total_churned": subs.groupby("churn_month").size()})

import datetime as dt
import numpy as np
from matplotlib.patches import Ellipse

def get_customers(yearmonth):
    # Decode the YYYYMM integer (e.g., 201412 -> year 2014, month 12).
    year = yearmonth // 100
    month = yearmonth - year * 100
    date = dt.datetime(year, month, 1)
    # Count the subscriptions active on the first day of that month.
    return ((subs["start_date"] < date) & (date <= subs["end_date"])).sum()

# `churn` is `monthly_churn` with `churn_month` turned into a `yearmonth` column,
# and `yearmonths` holds its string labels (both created in a step not shown here).
churn["total_customers"] = churn["yearmonth"].apply(get_customers)
churn["churn_rate"] = churn["total_churned"] / churn["total_customers"]
churn["yearmonth"] = churn["yearmonth"].astype(str)

ax = churn.plot(x="yearmonth", y="churn_rate", figsize=(12,6), rot=45, marker=".")
start, end = ax.get_xlim()
ax.set_xticks(np.arange(2, end, 3))
ax.set_xticklabels(yearmonths[2::3])

# Circle the December 2013 spike in the churn rate.
circle = Ellipse((35, churn.loc[churn.yearmonth == "201312", "churn_rate"].iloc[0]),
                 5, 0.065, color='sandybrown', fill=False)
ax.add_artist(circle)

ax.xaxis.label.set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.get_legend().remove()

The Mean
set.seed(1)
distribution <- sample.int(100, size=10)
ndistribution <- replicate(n=50, expr=sample.int(25, size=5))

set.seed(1)
checkDist <- function() {
  distribution <- sample.int(1000, size=10)
  mean <- sum(distribution) / length(distribution)
  round(sum(distribution - mean)) == 0
}
equal_distances <- sum(replicate(n=5000, expr=checkDist()))
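This checks the defining property of the mean: the deviations from it always cancel out, sum(x - mean(x)) = 0 for any distribution (round() only guards against floating-point error), so every replication returns TRUE and equal_distances comes out to 5000.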


distribution_1 <- c(42, 24, 32, 11)
distribution_2 <- c(102, 32, 74, 15, 38, 45, 22)
distribution_3 <- c(3, 12, 7, 2, 15, 1, 21)

compute_mean <- function(distribution) {
  N <- length(distribution)
  sum_of_the_distribution <- 0
  for (i in 1:N) {
    sum_of_the_distribution <- sum_of_the_distribution + distribution[i]
  }
  sum_of_the_distribution / N
}

mean_1 <- compute_mean(distribution_1)
mean_2 <- compute_mean(distribution_2)
mean_3 <- compute_mean(distribution_3)

library(readr)

houses <- read_tsv('AmesHousing_1.txt')

one <- TRUE    # every column that describes years is measured on an interval scale
two <- FALSE   # `SalePrice` is measured on a ratio scale
three <- TRUE  # this data set has fewer rows than the initial 3,970-row one, and we don't know whether either represents a population
compute_mean <- function(distribution) {
  N <- length(distribution)
  sum_of_the_distribution <- 0
  for (i in 1:N) {
    sum_of_the_distribution <- sum_of_the_distribution + distribution[i]
  }
  sum_of_the_distribution / N
}

computed_mean <- compute_mean(houses$SalePrice)
r_mean <- mean(houses$SalePrice)
means_are_equal <- (computed_mean == r_mean)


library(tibble)
library(ggplot2)
library(purrr)

set.seed(4)
parameter <- mean(houses$SalePrice)
sample_sizes <- seq(5, by=29, length.out=100)
sampling_errors <- map_dbl(sample_sizes,
                           function(x) parameter - mean(sample(houses$SalePrice, size=x)))

df <- tibble(x = sample_sizes, y = sampling_errors)
ggplot(data = df, aes(x = sample_sizes, y = sampling_errors)) +
  geom_point(size=2) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 2930) +
  labs(x = "Sample size",
       y = "Sampling error")
library(tibble)
library(ggplot2)

set.seed(1)
mean_points <- replicate(n = 10000,
                         expr = mean(sample(houses$SalePrice, size = 100)))

ggplot(data = tibble(mean_points), aes(x = mean_points)) +
  geom_histogram(bins = 100,
                 position = "identity",
                 alpha = 0.5) +
  geom_vline(aes(xintercept = mean(houses$SalePrice))) +
  xlab("Sample mean") +
  ylab("Frequency") +
  xlim(0, 500000)
population <- c(3, 7, 2)

library(purrr)
samples <- list(c(3, 7),
                c(3, 2),
                c(7, 2),
                c(7, 3),
                c(2, 3),
                c(2, 7))

sample_means <- map_dbl(samples, function(x) mean(x))
population_mean <- mean(population)
mean_of_sample_means <- mean(sample_means)
unbiased <- (population_mean == mean_of_sample_means)
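Here the population mean is (3 + 7 + 2) / 3 = 4, and the mean of the six sample means (5, 2.5, 4.5, 5, 2.5, 4.5) is 24 / 6 = 4 as well, so unbiased is TRUE: on average, the sample mean neither overestimates nor underestimates the population mean.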


R: Random Samples and Permutations (ethz.ch)

R: Arithmetic Mean (ethz.ch)

R: Apply a Function over a List or Vector (ethz.ch)

Machine Learning Introduction with Python

Introduction to K-Nearest Neighbors


# `dc_listings` is assumed to have been loaded earlier with pd.read_csv('dc_airbnb.csv').
stripped_commas = dc_listings['price'].str.replace(',', '')
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')

mean_price = dc_listings.iloc[0:5]['price'].mean()
print(mean_price)

import numpy as np
import pandas as pd

# Brought along the changes we made to the `dc_listings` DataFrame.
dc_listings = pd.read_csv('dc_airbnb.csv')
stripped_commas = dc_listings['price'].str.replace(',', '')
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]

def predict_price(new_listing):
    temp_df = dc_listings.copy()
    ## Complete the function.
    return(new_listing)

acc_one = predict_price(1)
acc_two = predict_price(2)
acc_four = predict_price(4)

def predict_price(new_listing):
    temp_df = dc_listings.copy()
    # Distance is the absolute difference in how many people each listing accommodates.
    temp_df['distance'] = temp_df['accommodates'].apply(lambda x: np.abs(x - new_listing))
    temp_df = temp_df.sort_values('distance')
    # Average the prices of the k = 5 nearest neighbors.
    nearest_neighbors = temp_df.iloc[0:5]['price']
    predicted_price = nearest_neighbors.mean()
    return(predicted_price)

acc_one = predict_price(1)
acc_two = predict_price(2)
acc_four = predict_price(4)

print(acc_one)
print(acc_two)
print(acc_four)
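The three calls predict nightly prices for listings that accommodate 1, 2, and 4 people. The same function works for any capacity; for instance (a hypothetical call, not part of the original exercise):

acc_three = predict_price(3)  # predicted price for a listing that accommodates 3 people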
k-nearest neighbors algorithm - Wikipedia

Five most popular similarity measures implementation in python (dataaspirant.com)


Machine Learning Tutorial Using K-NN | Dataquest

You might also like