[38]: ################################################################################
# Program Summary:
# This program intakes a set of data from GoodReads that includes attributes
# related to books reviewed on the website. The data is cleaned, explored,
# then used to develop predictive models using review data.

# Questions:
# 1. What is the distribution of genres in the data set?
# 2. Who are the authors with the best average reviews? The lowest?
# 3. Can the publications of an author be visualized via time series?
# 4. What are the average ratings of publishers present in the data set?
################################################################################
# Data handling
import pandas as pd
import requests
from tqdm import tqdm
# Plotting
import matplotlib.pyplot as plt
import numpy as np
# Predictive models
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error
[40]: # This data set was pulled from Kaggle. It contains data collected from the
# website GoodReads, a site where users can leave reviews for books they have
# read. On this site, users can leave reviews in the form of a star rating,
# which ranges from 0 stars to 5 stars, along with text reviews where users
# can write their thoughts and opinions about the book. The data also contains
# relevant information about each book such as the author, publisher, and
# publication information. In total, the data set contains 45,000 observations
# of 12 attributes.
[41]: df = pd.read_csv("books.csv")
print(df.head(10))
bookID title \
7 Douglas Adams 4.38 517226952 9.78E+12
8 Douglas Adams 4.38 345453743 9.78E+12
9 Douglas Adams 4.22 1400052920 9.78E+12

publication_date publisher Unnamed: 12
0 9/16/2006 Scholastic Inc. NaN
1 9/1/2004 Scholastic Inc. NaN
2 11/1/2003 Scholastic NaN
3 5/1/2004 Scholastic Inc. NaN
4 9/13/2004 Scholastic NaN
5 4/26/2005 Nimble Books NaN
6 9/12/2005 Scholastic NaN
[42]: # The first step in cleaning this data is to check for NA values. In this case,
# the NA values are concentrated in the last column, which is perhaps a column
# that was mistakenly added during data collection, as it contains no relevant
# data, so it is dropped entirely. Then, any other rows containing NA values in
# the other columns are dropped, with a final check to confirm none remain.
bookID 1
title 1
authors 1
average_rating 1
isbn 1
isbn13 1
language_code 1
num_pages 1
ratings_count 1
text_reviews_count 1
publication_date 1
publisher 1
Unnamed: 12 11126
dtype: int64
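The cell that performs the drop itself did not survive the export; a minimal sketch of the described cleaning, run here on a tiny invented frame in place of the real books data:

```python
import pandas as pd

# Tiny stand-in for the books data; column names match, values are made up
df = pd.DataFrame({
    'title': ['A', 'B', None],
    'average_rating': [4.1, None, 3.9],
    'Unnamed: 12': [None, None, None],  # accidental all-NA column
})

# Drop the accidental column entirely, then any rows still containing NA values
df = df.drop(columns=['Unnamed: 12']).dropna()
print(df.isna().sum().sum())  # → 0
```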
# Recounts NA values
nan_count = df.isna().sum()
print(nan_count)
bookID 0
title 0
authors 0
average_rating 0
isbn 0
isbn13 0
language_code 0
num_pages 0
ratings_count 0
text_reviews_count 0
publication_date 0
publisher 0
dtype: int64
[45]: # Next, to make the data set more manageable and to develop visualizations/
# analyses centered on individual authors, books that have two or more authors
# credited to them are dropped. These can be identified by the "/" character,
# which is only present in observations that have two or more authors.
[46]: # Filters out rows that contain two or more authors using the "/" character
substring = '/'
mask = df['authors'].str.contains(substring, regex=False)
df = df[~mask]
# Check
display(df)
bookID title \
2 4.0 Harry Potter and the Chamber of Secrets (Harry…
5 9.0 Unauthorized Harry Potter Book Seven News: "Ha…
6 10.0 Harry Potter Collection (Harry Potter #1-6)
7 12.0 The Ultimate Hitchhiker's Guide: Five Complete…
8 13.0 The Ultimate Hitchhiker's Guide to the Galaxy …
… … …
11120 45630.0 Whores for Gloria
11122 45633.0 You Bright and Risen Angels
11123 45634.0 The Ice-Shirt (Seven Dreams #1)
11124 45639.0 Poor People
11125 45641.0 Las aventuras de Tom Sawyer

11123 William T. Vollmann 3.96 140131965 9.78E+12
11124 William T. Vollmann 3.72 60878827 9.78E+12
11125 Mark Twain 3.91 8497646983 9.79E+12

language_code num_pages ratings_count text_reviews_count \
2 eng 352 6333.0 244.0
5 en-US 152 19.0 1.0
6 eng 3342 28242.0 808.0
… … … … …

publication_date publisher
2 11/1/2003 Scholastic
5 4/26/2005 Nimble Books
6 9/12/2005 Scholastic
7 11/1/2005 Gramercy Books
8 4/30/2002 Del Rey Books
… … …
11120 2/1/1994 Penguin Books
11122 12/1/1988 Penguin Books
11123 8/1/1993 Penguin Books
11124 2/27/2007 Ecco
11125 5/28/2006 Edimat Libros
[47]: # With the data set narrowed down, the next cleaning step is to ensure that
# columns are of the correct data type. An initial check shows that most
# columns in this data set are being treated as objects, which is incorrect.
# Several columns need to be converted to their appropriate data types, which
# is done through a series of conversion functions. A final check shows that
# the conversions were successful.
result = df.dtypes
print("Output:")
print(result)
Output:
bookID float64
title object
authors object
average_rating object
isbn object
isbn13 object
language_code object
num_pages object
ratings_count float64
text_reviews_count float64
publication_date object
publisher object
dtype: object
[49]: # Renames column that had a typo, converts columns to numeric, then drops NA values
df = df.dropna(subset=['num_pages'])
# Drops problematic row, then converts column to date/time, but limits to just
# date for clarity
df = df.drop(df.index[4757])
df['publication_date'] = pd.to_datetime(df['publication_date'], format='%m/%d/%Y').dt.date
# Check
result = df.dtypes
print("Output:")
print(result)
Output:
bookID int32
title object
authors object
average_rating float64
isbn object
isbn13 object
language_code object
num_pages int32
ratings_count int32
text_reviews_count int32
publication_date object
publisher object
dtype: object
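The rename and numeric-conversion lines themselves fell out of the export; a sketch consistent with the before/after dtype checks, assuming the header typo was stray whitespace in `num_pages` (an assumption, not confirmed by the source):

```python
import pandas as pd

# Stand-in frame: all-object columns, mimicking the initial dtype check above
df = pd.DataFrame({'  num_pages': ['352'], 'average_rating': ['4.38'], 'bookID': ['4']})

# Rename the column with the typo (assumed here to be leading spaces)
df = df.rename(columns={'  num_pages': 'num_pages'})

# Convert count-like columns to integers, ratings to float
for col in ['bookID', 'num_pages']:
    df[col] = pd.to_numeric(df[col]).astype('int32')
df['average_rating'] = pd.to_numeric(df['average_rating'])

print(df.dtypes)
```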
[50]: # With the data set cleaned, it can now be augmented with further data scraped
# from relevant APIs. In this case, to supplement the data, the program will
# use OpenLibrary's API to find the genre of each book by its ISBN.
def get_genre(isbn):
    url = f"https://openlibrary.org/isbn/{isbn}.json"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if 'subjects' in data:
            return data['subjects']
    return None

# Iterate through each row and get genre, includes progress bar
for index, row in tqdm(df.iterrows(), total=len(df)):  # Use tqdm for progress bar
    isbn = row['isbn']
    genres = get_genre(isbn)
    if genres:
        df.at[index, 'genres'] = genres
100%|██████████| 6560/6560 [50:40<00:00, 2.16it/s]
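Since `get_genre` depends on a live HTTP call, its parsing logic can be sanity-checked offline against a canned dict that mimics the JSON shape returned by `https://openlibrary.org/isbn/<isbn>.json` (the values below are invented for illustration):

```python
def extract_subjects(data):
    # Mirrors get_genre's handling of a decoded response body:
    # return the 'subjects' list when present, otherwise None
    if 'subjects' in data:
        return data['subjects']
    return None

# Canned response bodies, not real API output
with_subjects = {'title': 'Example', 'subjects': ['Fiction', 'Fantasy']}
without_subjects = {'title': 'Example'}

print(extract_subjects(with_subjects))    # → ['Fiction', 'Fantasy']
print(extract_subjects(without_subjects)) # → None
```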
[52]: # Once genre data has been collected, the program will drop rows where genre
# data was unable to be found.
df_with_genres = df[df['genres'].notnull()]
print(df_with_genres)
[54]: # With this new book data set that has been augmented with genre information,
# initial visualizations can begin. First, the program will generate a pie
# chart showing the distribution of the top 10 genres present in the data.
# To do this, it will first rename certain genres to consolidate them into
# more general categories. It then collects the list of genres and counts the
# number of each in the data set, sorting by the most common before plotting
# the chart.
replacements = {
# Gets genres and collects them, then counts unique genres and stores the top
# 10 into a new variable
genre_counts = pd.Series(genres_list).value_counts()
top_10_genres = genre_counts.head(10)
C:\Users\jbeat\AppData\Local\Temp\ipykernel_2532\2300555406.py:13:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

df_with_genres['genres'] = df_with_genres['genres'].apply(lambda x:
[replacements.get(genre, genre) for genre in x])
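The consolidation-and-pie-chart cell did not survive the export intact; a minimal sketch of the described approach, with a placeholder `replacements` mapping and genre list (both invented for illustration):

```python
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so the sketch runs headless
import matplotlib.pyplot as plt
import pandas as pd

# Placeholder consolidation map and flattened genre list
replacements = {'Juvenile fiction': 'Fiction'}
genres_list = ['Fiction', 'Juvenile fiction', 'Fantasy', 'Fiction', 'Comics']
genres_list = [replacements.get(g, g) for g in genres_list]

# Count each genre and keep the most common for the pie chart
genre_counts = pd.Series(genres_list).value_counts()
top_10_genres = genre_counts.head(10)

plt.figure(figsize=(8, 8))
plt.pie(top_10_genres, labels=top_10_genres.index, autopct='%1.1f%%')
plt.title('Top 10 Genres')
plt.tight_layout()
plt.savefig('genres_pie.png')
```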
[56]: # ---------- ANALYSIS 1 ---------- #
# The pie chart shows the expected dominance of general fiction, but another
# significant genre that may not be as expected is the comics and graphic
# novels genre. This would include comics, graphic novels, Japanese manga, and
# other visual books that do not necessarily fall in line with the traditional
# idea of a book.
[57]: # Next, the program finds authors with the highest and lowest average rating.
# To do this, it first takes the original cleaned data frame into a new one
# with only authors that have more than 100,000 reviews. This will exclude
# authors that have a lower number of ratings, which could affect the accuracy
# of the output if included. Then, after retrieving the top authors under this
# criterion, the program plots them on a scatterplot for visualization.
[58]: # Subsets data frame to rows with more than 100k ratings, then groups authors
# and finds the top 10
top_authors = round(top_authors, 2)
print(top_authors)

authors

# Creates plot
plt.figure(figsize=(10, 6))
plt.scatter(top_authors_info['Total Reviews'], top_authors_info['Average Rating'], color='skyblue')
plt.show()
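The subset-and-group step above is partially elided; a sketch of what it likely looked like, run here on a tiny invented frame (`author_ratings` is the same name the lowest-rated-authors cell below relies on for its `nsmallest` call):

```python
import pandas as pd

# Invented miniature of the cleaned books frame
df = pd.DataFrame({
    'authors': ['A', 'A', 'B', 'C'],
    'average_rating': [4.5, 4.7, 3.9, 4.2],
    'ratings_count': [150_000, 200_000, 120_000, 50_000],
})

# Keep rows with more than 100k ratings, then average each author's ratings
popular = df[df['ratings_count'] > 100_000]
author_ratings = popular.groupby('authors')['average_rating'].mean()
top_authors = author_ratings.nlargest(10)
print(round(top_authors, 2))
```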
# The resulting plot shows a number of recognizable and popular authors,
# such as George R.R. Martin, J.R.R. Tolkien, and Diana Gabaldon, but there
# are some others present that may not immediately jump to mind when thinking
# of popular and highly rated authors. For example, Art Spiegelman is mostly
# known for his book "Maus", a graphic novel about his father's experience
# during the Holocaust. Such a mix of authorship shows that there is a wider
# breadth of audience that uses GoodReads than simply consumers of fiction.
[61]: # The program then performs the same process for authors that have the lowest
# average rating at over 100,000 ratings.
[62]: # Finds the authors with the lowest average ratings and over 100k ratings
bot_authors = author_ratings.nsmallest(10)
bot_authors = round(bot_authors, 2)
print(bot_authors)
authors
Joseph Conrad 3.42
Elizabeth Gilbert 3.55
John Gray 3.55
Stephenie Meyer 3.59
Azar Nafisi 3.61
Ayn Rand 3.63
Barbara Ehrenreich 3.63
[63]: # Combines author ratings and number of reviews into one data frame
# Creates plot
plt.figure(figsize=(10, 6))
plt.scatter(bot_authors_info['Total Reviews'], bot_authors_info['Average Rating'], color='skyblue')
# Labels points
for i, author in enumerate(bot_authors_info.index):
    plt.annotate(author, (bot_authors_info['Total Reviews'].iloc[i],
                          bot_authors_info['Average Rating'].iloc[i]), fontsize=8, ha='left')
plt.grid(True)
plt.tight_layout()
plt.show()
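The cell that builds `bot_authors_info` is reduced to its comment above; a plausible reconstruction, combining per-author average ratings with total review counts (the Series values here are invented):

```python
import pandas as pd

# Invented per-author statistics standing in for the real aggregates
avg_ratings = pd.Series({'Joseph Conrad': 3.42, 'Elizabeth Gilbert': 3.55})
total_reviews = pd.Series({'Joseph Conrad': 250_000, 'Elizabeth Gilbert': 400_000})

# Combine both Series into one frame keyed by author, as the plot expects
bot_authors_info = pd.DataFrame({
    'Average Rating': avg_ratings,
    'Total Reviews': total_reviews,
})
print(bot_authors_info)
```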
# These results also provide interesting insights. Here, there are again a
# decent amount of recognizable authors such as Stephenie Meyer and John
# Steinbeck. In fact, many of these authors remain widely read, despite the
# fact they have appeared in a list of the lowest rated authors in this
# particular data set. This could primarily be explained by the fact this
# process limited the search to authors with over 100,000 reviews, as authors
# who no doubt have even lower average ratings probably don't have more than
# that amount of reviews. However, it is still interesting to see that among
# popular authors these are the ones that appear the lowest rated on average.
[65]: # The program then tests whether it is possible to plot the publications of
# certain authors along a time series plot. Three authors were selected for
# the number of publications they could have: J.K. Rowling, Diana Gabaldon,
# and Agatha Christie. The program takes each author and separates their works
# into separate data frames, and then plots each book in their respective
# frames onto a plot using the "publication_date" column as the source of
# date data. The program also uses the number of ratings of each book as the
# y-axis value.
[66]: # Subsets all J.K. Rowling's Books and sorts by publication date
jk_books = df[df['authors'].str.contains('J.K. Rowling')]
jk_books = jk_books.sort_values(by='publication_date')
y_values = jk_books['ratings_count']
# Plots time series
plt.figure(figsize=(10, 6))
plt.plot(jk_books['publication_date'], y_values, marker='o', linestyle='-')
plt.title('Time Series of Books Published by J.K. Rowling')
plt.xlabel('Publication Date')
plt.ylabel('Ratings Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
[67]: # Subsets all Diana Gabaldon's Books and sorts by publication date
dia_books = df[df['authors'].str.contains('Diana Gabaldon')]
dia_books = dia_books.sort_values(by='publication_date')
y_values = dia_books['ratings_count']
# Plots time series
plt.figure(figsize=(10, 6))
plt.plot(dia_books['publication_date'], y_values, marker='o', linestyle='-')
plt.title('Time Series of Books Published by Diana Gabaldon')
plt.xlabel('Publication Date')
plt.ylabel('Ratings Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
[68]: # Subsets all Agatha Christie's Books and sorts by publication date
ac_books = df[df['authors'].str.contains('Agatha Christie')]
ac_books = ac_books.sort_values(by='publication_date')
y_values = ac_books['ratings_count']
# Plots time series
plt.figure(figsize=(10, 6))
plt.plot(ac_books['publication_date'], y_values, marker='o', linestyle='-')
plt.title('Time Series of Books Published by Agatha Christie')
plt.xlabel('Publication Date')
plt.ylabel('Ratings Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
[69]: # ---------- ANALYSIS 4 ---------- #
# The output of this portion of the program shows that it is very possible to
# plot the publications of each author along a time series plot using this
# data. In the case of more prolific authors, the plots become dense: Agatha
# Christie, for instance, published many times and for quite a while, even
# after her death, and had a large number of publications to plot.
[70]: # Next, the program analyzes publishers present in the data set and plots
# their average ratings. It groups publishers, counting them, then finding the
# average rating for each and consolidating it into a new data frame. The
# results are then plotted. It also finds the top 10 publishers with more than
# five publications and prints their average ratings.
[71]: # Finds publishers with more than 5 publications, then counts them and finds
# their average ratings
publisher_counts = df['publisher'].value_counts()
publisher_counts = publisher_counts[publisher_counts > 5]
df_filtered = df[df['publisher'].isin(publisher_counts.index)]
publisher_avg_ratings = df_filtered.groupby('publisher')['average_rating'].mean()
top_publishers = publisher_avg_ratings.sort_values(ascending=False)
top_publishers = round(top_publishers, 2)
# Prints result
print(top_publishers.head(10))
publisher
Andrews McMeel Publishing 4.43
Tyndale House Publishers 4.38
VIZ Media 4.31
��� 4.27
Warne 4.27
Glénat 4.27
Harvest House Publishers 4.24
Crossway Books 4.23
Gramercy Books 4.23
Everyman's Library 4.22
Name: average_rating, dtype: float64
# Counts books per publisher, then combines counts and average ratings into
# one data frame
publisher_book_counts = df_filtered['publisher'].value_counts()
publisher_info = pd.DataFrame({'Average Rating': publisher_avg_ratings, 'Book Count': publisher_book_counts})
# Plots results
plt.figure(figsize=(10, 6))
plt.scatter(publisher_info['Average Rating'], publisher_info['Book Count'], color='skyblue')
plt.xlabel('Average Rating')
plt.ylabel('Number of Books Published')
plt.grid(True)
plt.tight_layout()
plt.show()
[73]: # ---------- ANALYSIS 5 ---------- #
# Based on the results of this inquiry, it appears that most publishers fall
# within an average rating range of 3.6 to 4.2. There are some that settle
# outside of that range, but the majority of publishers land within it.
# This could also mean that the authors have more of an impact on book rating
# than one might expect, as one might assume the publisher to be a stronger
# predictor of a book's rating.
[74]: # In the final portion of this program, several predictive models are tested
# to determine if it is possible to predict the average rating of a book based
# on the number of reviews it has. Three models are tested: Naive Bayes,
# Random Forest, and Linear Regression.
# To develop the Naive Bayes model, the program first creates a function that
# assesses each book and, depending on its average rating, labels it as high,
# medium, or low. These classifications are used for both the Naive Bayes and
# Random Forest models. The program then creates testing and training data
# sets to develop the Naive Bayes model.
# Then, the program uses the same process, but instead applies the Random
# Forest algorithm to the training and test data. After that, the accuracies
# of the two classifiers are compared.
# Finally, the program develops a Linear Regression model, using the raw
# average ratings rather than the classifications like the other two. It then
# plots the Linear Regression model for visualization.
[75]: # Labels books based on their rating into high, medium, or low
def label_rating(rating):
    if rating >= 4.5:
        return 'high'
    elif rating >= 3:
        return 'medium'
    else:
        return 'low'

# Runs function and labels books
df['rating_label'] = df['average_rating'].apply(label_rating)
# Check
print(df)
bookID title \
… … … … …
11120 William T. Vollmann 3.69 140231579 9.78E+12
11122 William T. Vollmann 4.08 140110879 9.78E+12
11123 William T. Vollmann 3.96 140131965 9.78E+12
11124 William T. Vollmann 3.72 60878827 9.78E+12
11125 Mark Twain 3.91 8497646983 9.79E+12

11123 eng 415 820 95
11124 eng 434 769 139
11125 spa 272 113 12

publication_date publisher \
2 2003-11-01 Scholastic
5 2005-04-26 Nimble Books
6 2005-09-12 Scholastic
… … …

genres rating_label
2 None medium
5 None medium
6 None high
7 None medium
8 None medium
… … …
11120 None medium
11122 None medium
11123 None medium
11124 None medium
11125 [Twain, Mark 1835-1910, Fiction, Children's 9-… medium
[76]: # Sets variables
X = df[['ratings_count', 'text_reviews_count']]
y = df['rating_label']
# Splits data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Generates NB model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
# Applies NB model
y_pred = nb_model.predict(X_test)
# Prints accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.3833841463414634

# Generates RF model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
# Applies RF model
y_pred = rf_model.predict(X_test)
# Prints accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.979420731707317
[81]: # Re-assigns variables
X = df[['ratings_count', 'text_reviews_count']]
y = df['average_rating'] # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Generates LR model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
# Applies LR model
y_pred = lr_model.predict(X_test)
# Prints MSE
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
# Plots predicted vs actual ratings
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='skyblue', label='Predicted')
plt.title('Actual vs Predicted Average Rating')
plt.xlabel('Actual Rating')
plt.ylabel('Predicted Rating')
plt.legend()
plt.show()
# The final results of these models are interesting. Of the three, the most
# accurate one was the Random Forest model. Following it was the Naive Bayes
# model, with a very trailing third place awarded to the Linear Regression
# model. This can mean several things. For example, it is likely that the
# Random Forest model was extremely accurate due to the simplistic
# decision-making set up by the data preparation stage. That is, the model
# easily caught on to the trend that books with certain amounts of reviews
# were marked as "high", and so on. However, this is still interesting
# because it means that there is a relationship between the number of reviews
# and the average rating. The Linear Regression model, which used purely raw
# average ratings, also performed poorly, lending credit to this conclusion.
[ ]: # Conclusion
# This program takes in a data set of GoodReads book reviews, cleans it,
# explores it, and attempts to come up with predictive models that would be
# beneficial to an interested party. Based on the results of this exploration,
# it is fair to say that while there is plenty of interesting data that can be
# explored, such as the relationships between authors, publishers, and their
# ratings, the ability of the data set to act as a foundation for predictive
# modeling is not as strong as it could be. If the data set contained more
# data regarding how users chose which books to review, or what they enjoyed
# about books, then more classes could be developed that could support richer
# predictive models.