You are on page 1of 24

Beattie_SCDA_FinalProj

April 20, 2024

[38]: ################################################################################

# Scripting for Data Analysis: Final Project Report

# Class: IST 652


# Student: Jovan Beattie
# Submission Date: March 24, 2024

tie
at
# Program Summary:
# This program intakes a set of data from GoodReads that includes attributes␣

Be
↪related to books reviewed on the website.

# The data is cleaned, explored, then used to develop predictive models using␣
n
↪review data.
va
Jo

# Questions:
# 1. What is the distribution of genres in the data set?
of

# 2. Who are the authors with the best average reviews? The lowest?
# 3. Can the publications of an author be visualized via time series?
ty

# 4. What are the average ratings of publishers present in the data set?
er

# 5. Can the number of reviews be used to predict average rating?


op

################################################################################h
Pr

[1]: !apt-get update


!apt-get install texlive-xetex texlive-fonts-recommended␣
↪texlive-generic-recommended

'apt-get' is not recognized as an internal or external command,


operable program or batch file.
'apt-get' is not recognized as an internal or external command,
operable program or batch file.

[39]: ### Essential


import pandas as pd
import requests
from tqdm import tqdm
from IPython.display import display

1
# Plotting
import matplotlib.pyplot as plt
import numpy as np

# Predictive models
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

[40]: # This data set is a set pulled from Kaggle. It contains data collected from␣
↪the website GoodReads, a site where users

# can leave reviews for books they have read. On this site, users can leave␣

tie
↪reviews in the form of a star rating, which

# ranges from 0 stars to 5 stars, along with text reviews where users can write␣

at
↪their thoughts and opinions about the

Be
# book. The data also contains relevant information about each book such as the␣
↪author, publisher, publication
n
# information. In total, the data set contains 45,000 observations of 12␣
va
↪attributes.
Jo

# Load the raw GoodReads export and preview the first rows to sanity-check
# the parse (column names, stray columns, obvious type problems).
df = pd.read_csv("books.csv")
print(df.head(10))
ty

bookID title \
er

0 1.0 Harry Potter and the Half-Blood Prince (Harry …


1 2.0 Harry Potter and the Order of the Phoenix (Har…
op

2 4.0 Harry Potter and the Chamber of Secrets (Harry…


Pr

3 5.0 Harry Potter and the Prisoner of Azkaban (Harr…


4 8.0 Harry Potter Boxed Set Books 1-5 (Harry Potte…
5 9.0 Unauthorized Harry Potter Book Seven News: "Ha…
6 10.0 Harry Potter Collection (Harry Potter #1-6)
7 12.0 The Ultimate Hitchhiker's Guide: Five Complete…
8 13.0 The Ultimate Hitchhiker's Guide to the Galaxy …
9 14.0 The Hitchhiker's Guide to the Galaxy (Hitchhik…

authors average_rating isbn isbn13 \


0 J.K. Rowling/Mary GrandPré 4.57 439785960 9.78E+12
1 J.K. Rowling/Mary GrandPré 4.49 439358078 9.78E+12
2 J.K. Rowling 4.42 439554896 9.78E+12
3 J.K. Rowling/Mary GrandPré 4.56 043965548X 9.78E+12
4 J.K. Rowling/Mary GrandPré 4.78 439682584 9.78E+12
5 W. Frederick Zimmerman 3.74 976540606 9.78E+12
6 J.K. Rowling 4.73 439827604 9.78E+12

2
7 Douglas Adams 4.38 517226952 9.78E+12
8 Douglas Adams 4.38 345453743 9.78E+12
9 Douglas Adams 4.22 1400052920 9.78E+12

language_code num_pages ratings_count text_reviews_count \


0 eng 652 2095690.0 27591.0
1 eng 870 2153167.0 29221.0
2 eng 352 6333.0 244.0
3 eng 435 2339585.0 36325.0
4 eng 2690 41428.0 164.0
5 en-US 152 19.0 1.0
6 eng 3342 28242.0 808.0
7 eng 815 3628.0 254.0
8 eng 815 249558.0 4080.0
9 eng 215 4930.0 460.0

tie
publication_date publisher Unnamed: 12
0 9/16/2006 Scholastic Inc. NaN

at
1 9/1/2004 Scholastic Inc. NaN

Be
2 11/1/2003 Scholastic NaN
3 5/1/2004 Scholastic Inc. NaN n
4 9/13/2004 Scholastic NaN
va
5 4/26/2005 Nimble Books NaN
6 9/12/2005 Scholastic NaN
Jo

7 11/1/2005 Gramercy Books NaN


8 4/30/2002 Del Rey Books NaN
of

9 8/3/2004 Crown NaN


ty

[42]: # The first step in cleaning this data is to check for NA values. In this case,␣
er

↪there are a significant number of NA values in the


op

# last column, which is perhaps a column that was mistakenly added during data␣
Pr

↪collection. Regardless, it can be dropped completely

# as it contains no relevant data. Then, any other rows containing NA values in␣
↪the other columns are dropped, with a final check to

# ensure that there are no rows with NA values remaining.

# Tally the missing values in every column to see what cleaning is needed.
nan_count = df.isnull().sum()
print(nan_count)

bookID 1
title 1
authors 1
average_rating 1
isbn 1
isbn13 1
language_code 1
num_pages 1

3
ratings_count 1
text_reviews_count 1
publication_date 1
publisher 1
Unnamed: 12 11126
dtype: int64

# Remove the junk trailing column, then discard any rows still holding NAs.
df = df.drop(columns=['Unnamed: 12']).dropna()

# Re-count missing values to confirm the frame is now fully populated.
nan_count = df.isna().sum()
print(nan_count)

bookID 0
title 0

tie
authors 0

at
average_rating 0
isbn 0

Be
isbn13 0
language_code 0
n
num_pages 0
va
ratings_count 0
Jo

text_reviews_count 0
publication_date 0
of

publisher 0
dtype: int64
ty
er

[45]: # Next, to make the data set more manageable and to develop visualizations/
op

↪predictions more tuned towards data regarding specific

# authors, books that have two or more authors credited to them are dropped␣
Pr

↪from the data set. This is done by removing rows with a

# "/", which are only present in observations that have two or more authors.

# Keep only single-author books: multi-author rows separate their authors
# with "/", so any title whose authors field contains that character is
# dropped.
# NOTE: the mask variable previously shadowed the builtin `filter`; it is
# renamed, and regex=False makes the literal "/" match explicit (the old
# call went through the regex engine unnecessarily).
substring = '/'
multi_author_mask = df['authors'].str.contains(substring, regex=False)
df = df[~multi_author_mask]

# Check
display(df)

bookID title \
2 4.0 Harry Potter and the Chamber of Secrets (Harry…
5 9.0 Unauthorized Harry Potter Book Seven News: "Ha…
6 10.0 Harry Potter Collection (Harry Potter #1-6)
7 12.0 The Ultimate Hitchhiker's Guide: Five Complete…
8 13.0 The Ultimate Hitchhiker's Guide to the Galaxy …

4
… … …
11120 45630.0 Whores for Gloria
11122 45633.0 You Bright and Risen Angels
11123 45634.0 The Ice-Shirt (Seven Dreams #1)
11124 45639.0 Poor People
11125 45641.0 Las aventuras de Tom Sawyer

authors average_rating isbn isbn13 \


2 J.K. Rowling 4.42 439554896 9.78E+12
5 W. Frederick Zimmerman 3.74 976540606 9.78E+12
6 J.K. Rowling 4.73 439827604 9.78E+12
7 Douglas Adams 4.38 517226952 9.78E+12
8 Douglas Adams 4.38 345453743 9.78E+12
… … … … …
11120 William T. Vollmann 3.69 140231579 9.78E+12
11122 William T. Vollmann 4.08 140110879 9.78E+12

tie
11123 William T. Vollmann 3.96 140131965 9.78E+12
11124 William T. Vollmann 3.72 60878827 9.78E+12

at
11125 Mark Twain 3.91 8497646983 9.79E+12

Be
language_code num_pages ratings_countn text_reviews_count \
2 eng 352 6333.0 244.0
va
5 en-US 152 19.0 1.0
6 eng 3342 28242.0 808.0
Jo

7 eng 815 3628.0 254.0


8 eng 815 249558.0 4080.0
of

… … … … …
ty

11120 en-US 160 932.0 111.0


er

11122 eng 635 783.0 56.0


11123 eng 415 820.0 95.0
op

11124 eng 434 769.0 139.0


Pr

11125 spa 272 113.0 12.0

publication_date publisher
2 11/1/2003 Scholastic
5 4/26/2005 Nimble Books
6 9/12/2005 Scholastic
7 11/1/2005 Gramercy Books
8 4/30/2002 Del Rey Books
… … …
11120 2/1/1994 Penguin Books
11122 12/1/1988 Penguin Books
11123 8/1/1993 Penguin Books
11124 2/27/2007 Ecco
11125 5/28/2006 Edimat Libros

[6564 rows x 12 columns]

5
[47]: # With the data set narrowed down, the next cleaning step is to ensure that␣
↪columns are of the correct data type. An initial

# check shows that most columns in this data set are being treated as objects,␣
↪which is incorrect. Several columns need to be

# converted to their appropriate data types which is done through a series of␣
↪conversion functions. A final check shows that

# after conversion, the columns are of the appropriate data types.

# Snapshot the dtype pandas inferred for every column; most fields load
# as generic "object" and must be converted before analysis.
result = df.dtypes
print("Output:", result, sep="\n")

Output:
bookID float64

tie
title object
authors object

at
average_rating object

Be
isbn object
isbn13 object n
language_code object
va
num_pages object
ratings_count float64
Jo

text_reviews_count float64
publication_date object
of

publisher object
ty

dtype: object
er

# Fix the leading-space typo in the page-count column name, coerce it to
# numeric (non-numeric entries become NaN), then drop rows that failed
# conversion.
df = df.rename(columns={' num_pages': 'num_pages'})
df['num_pages'] = pd.to_numeric(df['num_pages'], errors='coerce')
df = df.dropna(subset=['num_pages'])

# One row holds an unparseable date; drop it positionally, then parse the
# remaining publication dates, keeping plain dates (not timestamps) for
# clarity.
df = df.drop(df.index[4757])
parsed_dates = pd.to_datetime(df['publication_date'], format='%m/%d/%Y')
df['publication_date'] = parsed_dates.dt.date

# Cast the count-like columns to int and the rating to float.
for int_col in ('num_pages', 'ratings_count', 'text_reviews_count', 'bookID'):
    df[int_col] = df[int_col].astype(int)
df['average_rating'] = df['average_rating'].astype(float)

# Confirm the conversions took effect.
result = df.dtypes
print("Output:")
print(result)

Output:
bookID int32
title object
authors object
average_rating float64
isbn object
isbn13 object
language_code object
num_pages int32
ratings_count int32
text_reviews_count int32

tie
publication_date object

at
publisher object

Be
dtype: object

[50]: # With the data set cleaned, it can now be augmented with further data scraped␣
n
↪from relevant APIs. In this case, to supplement
va

# the data, the program will use OpenLibrary's API to find the genre of each␣
Jo

↪book in the data set using the ISBN number. It will

# place this genre data in a new column named "genres".


of

# Function to fetch genre information based on ISBN
def get_genre(isbn):
    """Look up a book's subject/genre list on OpenLibrary by ISBN.

    Parameters
    ----------
    isbn : str
        The ISBN to query.

    Returns
    -------
    list | None
        The 'subjects' list from the OpenLibrary record, or None when the
        book is not found, has no subjects, or the request fails.
    """
    url = f"https://openlibrary.org/isbn/{isbn}.json"
    try:
        # A timeout keeps one slow/hung request from stalling the whole
        # multi-thousand-row scrape; the original call could block
        # indefinitely and any network error aborted the entire loop.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        data = response.json()
        if 'subjects' in data:
            return data['subjects']
    return None

# Add an (initially empty) genres column.
df['genres'] = None

# Walk every row, query OpenLibrary for its ISBN, and store any genres
# found; tqdm renders a progress bar for the long-running scrape.
progress = tqdm(df.iterrows(), total=len(df))
for idx, book in progress:
    found = get_genre(book['isbn'])
    if found:
        df.at[idx, 'genres'] = found

7
100%|����������| 6560/6560 [50:40<00:00, 2.16it/s]

[52]: # Once genre data has been collected, the program will drop rows where genre␣
↪data was unable to be found.

# Keep only the books whose OpenLibrary lookup returned genre data.
# .copy() gives an independent frame: later in-place genre renaming on
# this slice otherwise triggers pandas' SettingWithCopyWarning and the
# write is not guaranteed to stick.
df_with_genres = df[df['genres'].notnull()].copy()
print(df_with_genres)

---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[1], line 1
----> 1 df_with_genres = df[df['genres'].notnull()]
2 print(df_with_genres)

NameError: name 'df' is not defined

tie
[54]: # With this new book data set that has been augmented with genre information,␣

at
↪initial visualizations can begin.

Be
# First, the program will generate a pie chart showing the distribution of the␣
↪top 10 genres present in the data.

# To do this, it will first rename certain genres to consolidate them into more␣
n
va
↪general categories, i.e. renaming

# the various instances and spellings of general fiction to a single "General␣


Jo

↪Fiction." Then, the program creates a

# list of genres and counts the number of each in the data set, sorting by the␣
of

↪top 10 and plotting them in a pie


ty

# chart.
er

# Maps the assorted spellings of the same genre onto a single canonical
# label so counts aren't split across near-duplicate categories.
_general_fiction_spellings = ("Fiction", "Fiction - General", "Fiction / General")
replacements = {spelling: "General Fiction" for spelling in _general_fiction_spellings}
replacements.update({
    "Comics & Graphic Novels / Graphic Novels / Manga": "Comics & Graphic Novels",
    "Biography / Autobiography": "Biography & Autobiography",
    "Graphic Novels-Manga": "Graphic Novels - Manga",
    "Fiction - Fantasy": "Fantasy",
})

# Canonicalize each book's genre list, then flatten every list into one
# sequence so individual genres can be counted.
# Work on an explicit copy: assigning into the slice taken from `df`
# raised SettingWithCopyWarning (see the warning in the original output)
# and the write was not guaranteed to propagate.
df_with_genres = df_with_genres.copy()
df_with_genres['genres'] = df_with_genres['genres'].apply(
    lambda genres: [replacements.get(genre, genre) for genre in genres]
)
genres_list = [genre for sublist in df_with_genres['genres'] for genre in sublist]

# Count occurrences of each genre and keep the ten most common.
genre_counts = pd.Series(genres_list).value_counts()
top_10_genres = genre_counts.head(10)

# Plots pie chart
plt.figure(figsize=(8, 8))
top_10_genres.plot(kind='pie', autopct='%1.1f%%', startangle=140)
plt.title('Top 10 Genres')
plt.ylabel('')
plt.show()

C:\Users\jbeat\AppData\Local\Temp\ipykernel_2532\2300555406.py:13:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-


docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

tie
df_with_genres['genres'] = df_with_genres['genres'].apply(lambda x:

at
[replacements.get(genre, genre) for genre in x])

Be
n
va
Jo
of
ty
er
op
Pr

9
[56]: # ---------- ANALYSIS 1 ---------- #

# This pie chart shows an interesting result. As expected, fiction is generally␣


↪the most represented genre in the data set,

# but another significant genre that may not be as expected is the comics and␣
↪graphic novels genre. This would include comics,

# graphic novels, japanese manga, and other visual books that do not␣
↪necessarily fall in line with the traditional idea of a

# book.

[57]: # Next, the program finds authors with the highest and lowest average rating.␣
↪To do this, it first takes the original cleaned

# data frame into a new one with only authors that have more than 100,000␣
↪reviews. This will exclude authors that have a lower

# number of ratings, which could affect the accuracy of the output if included.␣
↪Then, after retrieving the top authors under

tie
# this criteria, the program plots them on a scatterplot for visualization.

at
# Restrict to widely-reviewed books (>100k ratings) so a handful of
# reviews can't dominate an author's mean, then rank authors by their
# average rating and keep the ten best.
df_100k = df[df['ratings_count'] > 100000]
author_ratings = df_100k.groupby('authors')['average_rating'].mean()
top_authors = author_ratings.nlargest(10).round(2)
print(top_authors)
ty

authors
er

Art Spiegelman 4.44


op

George R.R. Martin 4.41


J.R.R. Tolkien 4.41
Pr

Edgar Allan Poe 4.38


Rohinton Mistry 4.36
Dr. Seuss 4.35
Diana Gabaldon 4.35
Shel Silverstein 4.34
J.R. Ward 4.32
Diana Wynne Jones 4.29
Name: average_rating, dtype: float64

# Total number of ratings each qualifying author received.
total_reviews = df_100k.groupby('authors')['ratings_count'].sum()

# Pair each top author's mean rating with their total review count.
top_authors_info = pd.DataFrame({'Average Rating': top_authors,
                                 'Total Reviews': total_reviews[top_authors.index]})

# Scatter the ten authors: review volume on x, mean rating on y.
plt.figure(figsize=(10, 6))
plt.scatter(top_authors_info['Total Reviews'],
            top_authors_info['Average Rating'], color='skyblue')

# Label each point with the author's name.
for i, author in enumerate(top_authors_info.index):
    plt.annotate(author,
                 (top_authors_info['Total Reviews'].iloc[i],
                  top_authors_info['Average Rating'].iloc[i]),
                 fontsize=8, ha='left')

plt.title('Average Ratings vs Total Reviews for Top Authors')
# Fixed label: the axis plots raw counts, not values scaled to millions,
# so the old "(in millions)" suffix was misleading.
plt.xlabel('Total Reviews')
plt.ylabel('Average Rating')
plt.grid(True)
plt.tight_layout()
plt.show()

at
Be
n
va
Jo
of
ty
er
op
Pr

[60]: # ---------- ANALYSIS 2 ---------- #

# The results of this inquiry are also interesting. Some immediately␣


↪recognizeable names are present in this top 10 list,

# such as George R.R. Martin, J.R.R. Tolkein, and Diana Gabaldon, but there are␣
↪some others present that may not immediately

11
# jump to mind when thinking of popular and highly rated authors. For example,␣
↪Art Spiegelman is mostly known for his book

# "Maus", a graphic novel about his father's experience during the Holocaust.␣
↪Such a mix of authorship shows that there is

# a wider breadth of audience that uses GoodReads than simply consumers of␣
↪fiction.

[61]: # The program then performs the same process for authors that have the lowest␣
↪average rating at over 100,000 ratings.

# Mirror of the top-10 query: the ten lowest mean ratings among authors
# whose books cleared the 100k-ratings bar.
bot_authors = author_ratings.nsmallest(10).round(2)
print(bot_authors)

tie
authors
Joseph Conrad 3.42

at
Elizabeth Gilbert 3.55

Be
John Gray 3.55
Stephenie Meyer 3.59 n
Azar Nafisi 3.61
va
Ayn Rand 3.63
Barbara Ehrenreich 3.63
Jo

H.G. Wells 3.64


James Frey 3.65
of

John Steinbeck 3.66


ty

Name: average_rating, dtype: float64


er

# Pair each bottom author's mean rating with their total review count.
bot_authors_info = pd.DataFrame({'Average Rating': bot_authors,
                                 'Total Reviews': total_reviews[bot_authors.index]})

# Creates plot
plt.figure(figsize=(10, 6))
plt.scatter(bot_authors_info['Total Reviews'],
            bot_authors_info['Average Rating'], color='skyblue')

# Label each point with the author's name.
for i, author in enumerate(bot_authors_info.index):
    plt.annotate(author,
                 (bot_authors_info['Total Reviews'].iloc[i],
                  bot_authors_info['Average Rating'].iloc[i]),
                 fontsize=8, ha='left')

# Fixed title: this chart shows the LOWEST-rated authors; the previous
# title was copy-pasted from the top-authors plot. The "(in millions)"
# suffix is likewise dropped because the axis shows raw counts.
plt.title('Average Ratings vs Total Reviews for Lowest-Rated Authors')
plt.xlabel('Total Reviews')
plt.ylabel('Average Rating')
plt.grid(True)
plt.tight_layout()
plt.show()

tie
at
Be
n
va
Jo
of

[64]: # ---------- ANALYSIS 3 ---------- #


ty
er

# These results also provide interesting insights. Here, there are again a␣
↪decent amount of recognizeable authors
op

# such as Stephenie Meyer and John Steinbeck. In fact, many of these authors␣
Pr

↪are very popular best-sellers, and yet

# they have appeared in a list of the lowest rated authors in this particular␣
↪data set. This could primarily be

# explained by the fact this process limited the search to authors with over␣
↪100,000 reviews, as authors who no

# doubt have even lower average ratings probably don't have more than that␣
↪amount of reviews. However, it is still

# interesting to see that among popular authors these are the ones that appear␣
↪the lowest rated on average.

[65]: # The program then tests whether it is possible to plot the publications of␣
↪certain authors along a time series plot.

# In this case, three authors were randomly selected based on initial␣


↪impression of the potential number of publications

# they could have: J.K. Rowling, Diana Gabaldon, and Agatha Christie. The␣
↪program takes each author and separates their

13
# works into separate data frames, and then plots each book in their respective␣
↪frames onto a plot using the

# "publication_date" column as the source of date data. The program also uses␣
↪the number of ratings of each book as a

# variable to guide the time series analysis.

# All books credited to J.K. Rowling, ordered chronologically.
jk_books = df[df['authors'].str.contains('J.K. Rowling')].sort_values(by='publication_date')
y_values = jk_books['ratings_count']

# Time-series line: one marker per edition, height = number of ratings.
plt.figure(figsize=(10, 6))
plt.plot(jk_books['publication_date'], y_values, marker='o', linestyle='-')
plt.title('Time Series of Books Published by J.K. Rowling')
plt.xlabel('Publication Date')
plt.ylabel('Ratings Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
n
va
Jo
of
ty
er
op
Pr

# All books credited to Diana Gabaldon, ordered chronologically.
dia_books = df[df['authors'].str.contains('Diana Gabaldon')].sort_values(by='publication_date')
y_values = dia_books['ratings_count']

# Time-series line: one marker per edition, height = number of ratings.
plt.figure(figsize=(10, 6))
plt.plot(dia_books['publication_date'], y_values, marker='o', linestyle='-')
plt.title('Time Series of Books Published by Diana Gabaldon')
plt.xlabel('Publication Date')
plt.ylabel('Ratings Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

tie
at
Be
n
va
Jo
of
ty
er
op
Pr

# All books credited to Agatha Christie, ordered chronologically.
ac_books = df[df['authors'].str.contains('Agatha Christie')].sort_values(by='publication_date')
y_values = ac_books['ratings_count']

# Time-series line: one marker per edition, height = number of ratings.
plt.figure(figsize=(10, 6))
plt.plot(ac_books['publication_date'], y_values, marker='o', linestyle='-')
plt.title('Time Series of Books Published by Agatha Christie')
plt.xlabel('Publication Date')
plt.ylabel('Ratings Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

15
tie
at
Be
n
va
[69]: # ---------- ANALYSIS 4 ---------- #
Jo

# The output of this portion of the program shows that it is very possible to␣
↪plot the publications of each author along
of

# a time series plot using this data. In the case of more prolific authors,␣
ty

↪such as Agatha Christie, it is very insightful


er

# to see what is essentially her publication history represented in visual␣


↪format. The output shows that she has been
op

# published many times and for quite a while, even after her death, and had a␣
Pr

↪surge of popularity in the early 2000s. Such

# inquiries would be interesting to explore in a larger-scale project.

[70]: # Next, the program analyzes publishers present in the data set and plots their␣
↪average ratings. It groups publishers,

# counting them, then finding the average rating for each and consolidating it␣
↪into a new data frame. The results are

# then plotted. It also finds the top 10 publishers with more than five␣
↪publications and prints their average ratings.

# Publishers with more than five books in the set; smaller publishers are
# excluded so one or two titles can't skew their average.
publisher_counts = df['publisher'].value_counts()
frequent_publishers = publisher_counts[publisher_counts > 5]
df_filtered = df[df['publisher'].isin(frequent_publishers.index)]

# Mean rating per qualifying publisher, best first.
publisher_avg_ratings = df_filtered.groupby('publisher')['average_rating'].mean()
top_publishers = publisher_avg_ratings.sort_values(ascending=False).round(2)

# Prints result
print(top_publishers.head(10))

publisher
Andrews McMeel Publishing 4.43
Tyndale House Publishers 4.38
VIZ Media 4.31
��� 4.27
Warne 4.27
Glénat 4.27
Harvest House Publishers 4.24

tie
Crossway Books 4.23
Gramercy Books 4.23

at
Everyman's Library 4.22

Be
Name: average_rating, dtype: float64

# Number of books each qualifying publisher has in the filtered frame,
# paired with its mean rating.
publisher_book_counts = df_filtered['publisher'].value_counts()
publisher_info = pd.DataFrame({'Average Rating': publisher_avg_ratings,
                               'Book Count': publisher_book_counts})

# Scatter publishers: mean rating on x, catalogue size on y.
plt.figure(figsize=(10, 6))
plt.scatter(publisher_info['Average Rating'],
            publisher_info['Book Count'], color='skyblue')
plt.title('Average Rating vs Number of Books Published by Selected Publishers')
plt.xlabel('Average Rating')
plt.ylabel('Number of Books Published')
plt.grid(True)
plt.tight_layout()
plt.show()

17
tie
at
Be
n
va
[73]: # ---------- ANALYSIS 5 ---------- #
Jo

# Based on the results of this inquiry, it appears that most publishers fall␣
↪within an average rating range of 3.6 to 4.2.
of

# There are some that settle outside of that range, but the majority of␣
ty

↪publishers appear to fall within it. This is intuitive


er

# because the combination of books across different authors likely leads to a␣


↪wider range of ratings than when looking at
op

# authors alone. Therefore, it is probably more likely that publishers have a␣


Pr

↪more middling rating across their collections.

# This could also mean that the authors have more of an impact on book rating␣
↪than one might expect, as one might assume

# publishers have a strong impact on the final manuscript of a book after␣


↪editing.

[74]: # In the final portion of this program, several predictive models are tested to␣
↪determine if it is possible to predict the

# average rating of a book based on the number of reviews it has. Three models␣
↪are tested: Naive Bayes, Random Forest, and

# Linear regression.

# To develop the Naive Bayes model, the program first creates a function that␣
↪assesses each book and, depending on its average

# rating, marks it as having either a "high" rating, a "medium" rating, or a␣


↪"low" rating. These become the classes used for

18
# the Naive Bayes and Random Forest models. The program then creates testing␣
↪and training data sets to develop the Naive Bayes

# model.

# Then, the program uses the same process, but instead applies the Random␣
↪Forest algorithm to the training and test data. After

# generating the model, the importance of each of the features is plotted on a␣


↪bar graph for visualization.

# Finally,the program develops a Linear Regression model, using the raw average␣
↪ratings rather than the classifications like

# the other two. It then plots the Linear Regression model for visualization.

# Labels books based on their rating into high, medium, or low
def label_rating(rating):
    """Bucket a 0-5 star average rating into a class label.

    Parameters
    ----------
    rating : float
        A book's average rating.

    Returns
    -------
    str
        'high' for ratings >= 4.5, 'medium' for [3, 4.5), 'low' below 3.
    """
    if rating >= 4.5:
        return 'high'
    # The branch above already excluded >= 4.5, so the original
    # `rating >= 3 and rating < 4.5` upper bound was redundant.
    if rating >= 3:
        return 'medium'
    return 'low'
va
# Apply the bucketing function to create the classification target column.
df['rating_label'] = df['average_rating'].map(label_rating)

# Check
print(df)
ty
er

bookID title \
op

2 4 Harry Potter and the Chamber of Secrets (Harry…


5 9 Unauthorized Harry Potter Book Seven News: "Ha…
Pr

6 10 Harry Potter Collection (Harry Potter #1-6)


7 12 The Ultimate Hitchhiker's Guide: Five Complete…
8 13 The Ultimate Hitchhiker's Guide to the Galaxy …
… … …
11120 45630 Whores for Gloria
11122 45633 You Bright and Risen Angels
11123 45634 The Ice-Shirt (Seven Dreams #1)
11124 45639 Poor People
11125 45641 Las aventuras de Tom Sawyer

authors average_rating isbn isbn13 \


2 J.K. Rowling 4.42 439554896 9.78E+12
5 W. Frederick Zimmerman 3.74 976540606 9.78E+12
6 J.K. Rowling 4.73 439827604 9.78E+12
7 Douglas Adams 4.38 517226952 9.78E+12
8 Douglas Adams 4.38 345453743 9.78E+12

19
… … … … …
11120 William T. Vollmann 3.69 140231579 9.78E+12
11122 William T. Vollmann 4.08 140110879 9.78E+12
11123 William T. Vollmann 3.96 140131965 9.78E+12
11124 William T. Vollmann 3.72 60878827 9.78E+12
11125 Mark Twain 3.91 8497646983 9.79E+12

language_code num_pages ratings_count text_reviews_count \


2 eng 352 6333 244
5 en-US 152 19 1
6 eng 3342 28242 808
7 eng 815 3628 254
8 eng 815 249558 4080
… … … … …
11120 en-US 160 932 111
11122 eng 635 783 56

tie
11123 eng 415 820 95
11124 eng 434 769 139

at
11125 spa 272 113 12

Be
publication_date publisher \ n
2 2003-11-01 Scholastic
va
5 2005-04-26 Nimble Books
6 2005-09-12 Scholastic
Jo

7 2005-11-01 Gramercy Books


8 2002-04-30 Del Rey Books
of

… … …
ty

11120 1994-02-01 Penguin Books


er

11122 1988-12-01 Penguin Books


11123 1993-08-01 Penguin Books
op

11124 2007-02-27 Ecco


Pr

11125 2006-05-28 Edimat Libros

genres rating_label
2 None medium
5 None medium
6 None high
7 None medium
8 None medium
… … …
11120 None medium
11122 None medium
11123 None medium
11124 None medium
11125 [Twain, Mark 1835-1910, Fiction, Children's 9-… medium

[6560 rows x 14 columns]

20
# Features: the two review-volume columns; target: the rating bucket.
X = df[['ratings_count', 'text_reviews_count']]
y = df['rating_label']

# Hold out 20% for testing; a fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Fit a Gaussian Naive Bayes classifier on the training set.
nb_model = GaussianNB().fit(X_train, y_train)

# Score it on the held-out data.
y_pred = nb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

at
Be
Accuracy: 0.3833841463414634

# Random Forest on the same train/test split: an ensemble of 100 trees,
# seeded for reproducibility.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train)

# Score it on the held-out data.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Pr

Accuracy: 0.979420731707317

# Rank the forest's features from most to least important.
feature_importances = rf_model.feature_importances_
order = np.argsort(feature_importances)[::-1]
ranked_names = [X.columns[i] for i in order]
positions = range(X.shape[1])

# Bar chart, most important feature first.
plt.figure(figsize=(10, 6))
plt.title("Feature Importance")
plt.bar(positions, feature_importances[order])
plt.xticks(positions, ranked_names, rotation=45, ha='right')
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.tight_layout()
plt.show()

21
tie
at
Be
n
va
# Switch the target to the raw (continuous) average rating; X keeps the
# same two review-count features from the classifiers.
y = df['average_rating']

# Fresh 80/20 split with the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Ordinary least-squares fit, then evaluate with mean squared error.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.12412191092066142

# Actual vs predicted ratings; the dashed diagonal marks where a perfect
# prediction would fall.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', label='Actual vs Predicted')
lo, hi = y_test.min(), y_test.max()
plt.plot([lo, hi], [lo, hi], color='red', linestyle='--', lw=2,
         label='Perfect Prediction')
plt.title('Actual vs Predicted Average Rating')
plt.xlabel('Actual Rating')
plt.ylabel('Predicted Rating')
plt.legend()
plt.show()

tie
at
Be
n
va
Jo
of
ty
er
op
Pr

[ ]: # ---------- ANALYSIS 6 ---------- #

# The final results of these models are interesting. Of the three, the most␣
↪accurate one was the Random Forest model.

# Following it was the Naive Bayes model, with a very trailing third place␣
↪awarded to the Linear Regression model.

# This can mean several things. For example, it is likely that the Random␣
↪Forest model was extremely accurate due to

# the simplistic decision-making set up by the data preparation stage. That is,␣
↪the model easily caught on to the

# trend that certain books with certain amount of reviews had were marked as␣
↪"high", and so on. However, this is still

23
# interesting because it means that there is a relationship between the number␣
↪of reviews and the average rating. This

# is further demonstrated by the lacking accuracy of the Naive Bayes model,␣


↪because the Naive Bayes model does not take

# into account inter-attribute relationships when making predictions. The␣


↪number of reviews, in a purely numeric sense,

# does not provide accurate predictions. Additionally, the Linear Regression␣


↪model, which did not use classification and

# used purely raw average ratings, also performed poorly, lending credit to␣
↪this conclusion.

[ ]: # Conclusion

# In conclusion, this program successfully takes a somewhat messy data set␣


↪containing book data from GoodReads, cleans it,

# explores it, and attempts to come up with predictive models that would be␣

tie
↪beneficial to an interested party. Based on

at
# the results of this exploration, it is fair to say that while there is plenty␣
↪of interesting data that can be

Be
# explored, such as the relationships between authors, publishers, and their␣
↪ratings, the ability of the data set to act
n
va
# as a foundation for predictive modeling is not as strong as it could be. If␣
↪the data set contained more data regarding
Jo

# how users chose which books to review, or what they enjoyed about books, then␣
↪more classes could be developed that could
of

# lead to more interesting predictions. As it stands now, the data still␣


ty

↪provides very interesting insights, and should be


er

# further explored at a future point.


op
Pr

24

You might also like