FIT5202 Data Processing for Big Data
Assignment 1 - Part A
Student Name : Pooja Vishal Pancholi
Student ID : 29984939
Tutorial Day and Time : Thursday 6 to 8 PM
Tutor Name: Huashun Li
Step 01: Import pyspark and initialize SparkIn [2]:
In [5]:
# importing pyspark API Libraries
from pyspark import SparkContext, SparkConf # Spark
from pyspark.sql import SparkSession # Spark SOL
context = SparkContext.getOrCreate()
if (context is None):
conf = SparkConf().setAppName(""AssignnentiA Application"). setMaster(" local [*
context = SparkContext (conf=conf)
# in order to check all configurations
context. _conf.getall()
# import re
import re
# for the stopwords
import nltk
from nltk.corpus import stopwords
# uncomment and download it before running step 5
# nLtk. download('stopwords')
# for plotting
# Ipip install matplotlib
# Ipip install numpy
import matplotlib.pyplot as plt
import numpy as np
Ymatplotlib inline
Xpylab inline
Populating the interactive namespace from numpy and matplotlib
Step 02: Create Resilient Distributed Datasets (RDDs)
# RD for Scrum Handbook. txt file
rddScrum = context.textFile('Scrum Handbook. txt")
print("\n Number of total Lines in "Scrum Handbook.txt’ file is: ", rddScrum.coun
# ROD for Agile Processes in Software Engineering and Extreme Progranming. txt fi
rddAgile = context.textFile('Agile Processes in Software Engineering and Extrem,
print("\n Number of total lines in ‘Agile Processes in Software Engineering and E
rddagile.count())
Number of total lines in ‘Scrum Handbook.txt' file is: 4617
Number of total lines in ‘Agile Processes in Software Engineering and Extreme
Programming.txt' file is: 21569In [6]:
# Function to clean and format the RDD data
def formatRdd(rdd):
rddFile = rdd.map(lambda lines: re.sub(‘'[*a-zA-Z]+",
-filter(lambda sublines: re.sub("\[2-9]',
-map(lambda x: x.lower(), rddScrum)
return rddFile
# Formatting the ROD data
rddScrum = formatRdd(rddScrum)
rddAgile = formatRdd(rddagile)
# Printing the top 5 results after formatting the ROD's
index = 0
print("The first five tuples in Scrum Handbook are: \n")
for val in rddScrum.take(5):
index = index+1
print(index, ": ", val)
index = 0
print("\nthe first five tuples in Agile Processes Book are:
for val in rddAgile.take(5):
index = index+1
print(index, “: ", val)
The first five tuples in Scrum Handbook are:
jeff sutherland s
Scrum handbook
everything
you need
‘to know
The first five tuples in Agile Processes Book aré
Anbip
i helen sharp
tracy hall eds
agile processes
in software engineering
wane
Step 04: Transforming the Data/Counting the words
*,Lines))\
", sublines))\
\n")In [7]:
# Function to transform ROD to (word, 1) form
def filterkdd(rdd):
rddFiltered = rdd.flatmap(Lanbda x: re.split("\s+', x))
-filter(Lanbda enpty: len(enpty) > 0)\
smap(lanbda finalwords: (FinalWords, 1))
return rddFiltered
# Transforming ROD's to (word, 1) form
rddScrumPair = filterRdd(rddScrum)
rddagilepair = filterRdd(rddagile)
# printing the top 5 results after transformations are done
print("The transformed RDD for Scrum Handbook is:")
for val in rddScrumPair.take(5):
print(val)
print("\nthe transformed ROD for Agile Processes Book is:")
for val in rddagilePair. take(5):
print (val)
The transformed RDD for Scrum Handbook is:
Cet", 1)
(Sutherland, 1)
Cs', 1)
('scrum’, 1)
handbook’, 1)
The transformed RDD for Agile Processes Book is:
Cnbip’, 1)
ci, D
Chelen’, 1)
(sharp", 1)
tracy’, 1)In [8]:
# function to reduce rdd according to word frequency
def countFrequency(rdd):
count = rdd.reduceBykey (Lambda val2, vali: val2 + val1)\
smap(lambda a: (a[1], a[0]))\
-sortBykey (ascending=False)\
smap(lambda a: (a[1], a[0]))
return count
# reducing RDD's accoridng to word frequency
scrumCount = countFrequency(rddScrunPair)
agilecount = countFrequency(rddAgilePain)
# printing the top 20 words with most frequencies in both the books
print("The top 20 words with most frequency in Scrum Handbook are:
for val in scrumCount.take(2@):
print(val)
print("\nthe top 20 words with most frequency in Agile Processes Book are:
for val in agilecount.take(2@):
print(val)
The top 2 words with most frequency in Scrum Handbook are:
the’, 1238)
of", 538)
Cand", 534)
(to", 478)
(rat, 454)
(C’serum’, 399)
Cin", 363)
(Cis', 348)
(team’, 273)
Ciproduct", 233)
for", 195)
that", 182)
(at', 172)
Con", 149)
Csprint', 147)
(Cthis', 142)
Cwith’, 132)
Cas", 124)
Cat’, 119)
Care’, 119)
The top 2¢ words with most frequency in Agile Processes Book are:
(the', 8161)
Cand", 3975)
oF", 3954)
(to, 3751)
(‘in', 3101)
(at, 2755)
(is', 1541)
Cthat', 1356)
(for', 1195)
(‘on', 1027)
(as, 1023)
(we', 980)
with’, 978)
software’, 931)
C this", 915)
Care’, 785)
Cagile’, 784)
(Cit, 775)In [9]:
development", 748)
(Cwas', 711) g
Step 05: Removing Stop Words
stophordsList = set(stopwords.words( ‘english’ )) # getting the stopwords
# Function to remove stop words from both RODS
def renoveStopWords(rdd) :
removeStop = rdd.filter(lambda x: x[@] not in stopWordsList)
return renoveStop
# Removing the stopwords from the ROD
scrumCount = renoveStophiords(scrumCount)
agileCount = renoveStophords(agilecount)
# storing the count of words
totalCountScrum = scrumCount.count()
totalcountagile = agileCount.count()
# printing the count of unique words after removal of stopwords
print ("After the removal of stopwords: \n")
print("There are \"",totalCountScrum,"\" unique words in Scrum Handbook. ")
print("There are \"", totalCountagile,"\" unique words in Agile Processes Book
After the renoval of stopwords:
There are“ 2857 " unique words in Scrum Handbook.
There are " 8962 " unique words in Agile Processes Book.
Step 06: Find the average ocourrence of a wordIn [10]:
# Function to find the average occurences for each book
def averagedcc(rdd):
average = rdd.map(lambda finalWords: (finalWords[@], finalWords(1] / totalcou
return average
# Getting the average occurence for each book
scrumCountAverage = average0cc(scrumCount)
agileCountAverage = averageOcc(agileCount)
# Printing the average occurences of each book
print("The average occurence of the top 5 words in Scrum Handbook are :
for val in scrumCountAverage.take(5):
print(val)
print("\nThe average occurence of the top 5 words in Agile Processes Book are
for val in agileCountaverage.take(5):
print(val)
The average occurence of the top 5 words in Scrum Handbook are :
(‘scrum’, @.13965698284914246)
(Cteam’, @.09555477773888695)
(C product", @.08155407770388519)
Csprint’, @.051452572628631434)
(development, @.03430171508575429)
The average occurence of the top 5 words in Agile Processes Book are :
(C'software’, @.3258662933146657)
(Cagile’, @.2744137206860343)
‘development, @.26181309065453273)
(‘team', @.2072103605180259)
(Cwork", @.16135806790339516)
Step 7: Exploratory data analysisIn [13]:
index = np.arange(15)
bar_width = 0.25
# Fetching the first 15 book values
scrunkey, scrunValue = zip(*scrumCount.take(15))
agilekey, agileValue = zip(*agileCount.take(15))
pylab.rcParams[“figure.figsize’] = (15, 9) # setting the figure size
fig, ax = plt.subplots() # plotting the subplot
# Setting the Label and Title formatting
ax.set_xlabel(‘Words', size=14)
ax.set_ylabel(*Counts', size=14)
ax.set_title( Scrum Handbook vs Agile Processes Book’, size=18)
# plotting the bars for each book
scrunbata = ax.bar(index, scrunValue, bar_width, label="Scrum Handbook")
agilebata = ax.bar(index + bar_width, agileValue, bar_width, label="Agile Process.
# setting the tick values for X as Words
ax.set_xticks(index, minor=False)
ax.set_xticks(index + bar_width, minor=True)
ax.set_xticklabels(scrumkey, rotation=92, minor=False, ha=
ax.set_xticklabels(agilekey, rotation=98, minor=True, ha
ax.tick_params(axis="both', which='major’, labelsize=13)
ax.tick_params(axis="both’, which="minor", labelsize=13)
enter")
center")
# Show the plot
ax. legend()
plt.show()
‘Scrum Handbook vs Agile Processes Book
| NUN
55 HoH GH EE GE oH OH gb ub oH
4 PRR ENP ae Be
: i 5 i
a gEnd of Part A. I hope you like my work :)