Professional Documents
Culture Documents
ipynb - Colaboratory
Đồ án giữa kỳ
# Mount Google Drive into the Colab VM so files under /content/drive
# are accessible to this notebook (prompts for OAuth on first run).
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# --- One-time Colab environment setup for Spark 3.1.1 ---
# Install a headless JDK 8 (required by this Spark build), download and
# unpack the Spark 3.1.1 / Hadoop 3.2 distribution, and install findspark.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark
import os
# Point JAVA_HOME / SPARK_HOME at the freshly installed toolchain so
# findspark can locate the Spark installation.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"
import findspark
findspark.init()  # adds pyspark to sys.path using SPARK_HOME
Yêu cầu
Spark Context
from pyspark import SparkContext
from pyspark.sql import SQLContext
https://colab.research.google.com/drive/1whA-fOG_PaoEV-PMnlhMXdY3BRqAtenA#scrollTo=sWo0HJZ2wQsb&printMode=true 1/10
11:46, 29/10/2022 52000742_52000759_52000042.ipynb - Colaboratory
# Create a local SparkContext named "mid-term" and wrap it in a SQLContext.
# NOTE(review): SQLContext is deprecated in Spark 3.x in favour of
# SparkSession — left as-is since the rest of the notebook uses sqlc.
sc = SparkContext("local","mid-term")
sqlc = SQLContext(sc)
# Load the raw transaction CSV as an RDD of text lines.
data = sc.textFile('/content/data.csv')
'''
Câu 1
'''
def preprocessLine(x):
    """Map one raw CSV line to a ("member,date" composite key, 1) pair.

    Only the first two comma-separated fields (member id and date) are
    kept; the constant 1 lets a later reduceByKey tally how many items
    each member bought on each day.
    """
    member, date = x.strip().split(',')[:2]
    return (member + ',' + date, 1)
def f(x):
    """Flatten a ("member,date", count) pair back into one CSV line."""
    parts = x[0].split(',')
    return '{0},{1},{2}'.format(parts[0], parts[1], x[1])
# Question 1: count purchases per (member, date) — key each CSV line by
# "member,date", sum the 1-counts, flatten back to CSV text and persist.
data_F = data.map(preprocessLine).reduceByKey(lambda x, y: x + y).map(f)
data_F.saveAsTextFile('/content/counters')
import os
# Display the per-(member,date) counts written by saveAsTextFile.
# BUG FIX: the part-file has no header row, but the original read used
# header=True, which silently consumed the first data record as column
# names (the printed table showed "Member_number,Date,1" — an actual data
# row — as the header). Read with header=False so every record is kept.
if os.path.exists('/content/counters/part-00000'):
    sqlc.read.csv('/content/counters/part-00000', header=False).show()
+-------------+----------+---+
|Member_number| Date| 1|
+-------------+----------+---+
| 1249|01/01/2014| 2|
| 1381|01/01/2014| 2|
| 1440|01/01/2014| 2|
| 1659|01/01/2014| 2|
| 1789|01/01/2014| 2|
| 1922|01/01/2014| 2|
| 2226|01/01/2014| 2|
| 2237|01/01/2014| 2|
| 2351|01/01/2014| 2|
| 2542|01/01/2014| 2|
| 2610|01/01/2014| 3|
| 2709|01/01/2014| 2|
| 2727|01/01/2014| 2|
| 2943|01/01/2014| 2|
| 2974|01/01/2014| 3|
| 3681|01/01/2014| 3|
https://colab.research.google.com/drive/1whA-fOG_PaoEV-PMnlhMXdY3BRqAtenA#scrollTo=sWo0HJZ2wQsb&printMode=true 2/10
11:46, 29/10/2022 52000742_52000759_52000042.ipynb - Colaboratory
| 3797|01/01/2014| 2|
| 3942|01/01/2014| 3|
| 3956|01/01/2014| 4|
| 4260|01/01/2014| 2|
+-------------+----------+---+
'''
Câu 2
'''
def get_key_value(x):
    """Map one CSV line to ("member;date;", item).

    The semicolon-joined key groups rows by member and day so a later
    reduceByKey can concatenate all items of one basket; only the first
    three comma-separated fields are used.
    """
    member, date, item = x.strip().split(',')[:3]
    return (member + ';' + date + ';', item)
def f2(x):
    """Flatten a ("member;date;", joined-items) pair into one ';'-separated line."""
    parts = x[0].split(';')
    return '{0};{1};{2}'.format(parts[0], parts[1], x[1])
# Question 2: build per-(member,date) baskets — concatenate every item a
# member bought on one day into a single comma-joined string, then persist.
data_2 = data.map(get_key_value).reduceByKey(lambda x, y:x+','+y).map(f2)
data_2.saveAsTextFile('/content/baskets')
import os
# NOTE(review): the part-file written by saveAsTextFile has no header row,
# so header=True makes the first basket display as column names — confirm
# this is intended.
if os.path.exists('/content/baskets/part-00000'):
    sqlc.read.csv('/content/baskets/part-00000', header=True, sep=';').show()
+-------------+----------+--------------------+
+-------------+----------+--------------------+
| 1381|01/01/2014| curd,soda|
| 1440|01/01/2014|other vegetables,...|
| 1659|01/01/2014|specialty chocola...|
| 1789|01/01/2014|hamburger meat,ca...|
| 1922|01/01/2014|tropical fruit,ot...|
| 2226|01/01/2014|sausage,bottled w...|
| 2237|01/01/2014|bottled water,Ins...|
| 2351|01/01/2014|cleaner,shopping ...|
| 2542|01/01/2014|sliced cheese,bot...|
| 2610|01/01/2014|hamburger meat,bo...|
| 2709|01/01/2014|yogurt,frozen veg...|
| 2727|01/01/2014|hamburger meat,fr...|
| 2943|01/01/2014|whole milk,flower...|
| 2974|01/01/2014|berries,whipped/s...|
| 3681|01/01/2014|onions,whipped/so...|
| 3942|01/01/2014|other vegetables,...|
https://colab.research.google.com/drive/1whA-fOG_PaoEV-PMnlhMXdY3BRqAtenA#scrollTo=sWo0HJZ2wQsb&printMode=true 3/10
11:46, 29/10/2022 52000742_52000759_52000042.ipynb - Colaboratory
| 3956|01/01/2014|yogurt,shopping b...|
+-------------+----------+--------------------+
def convertToItemsList(x):
    """Parse a "member;date;item1,item2,..." line into (member, date, [unique items]).

    BUG FIX: the original deduplicated with list(set(...)), whose ordering
    depends on the interpreter's hash seed, so repeated runs could produce
    differently ordered item lists (non-reproducible DataFrames/models).
    dict.fromkeys deduplicates while keeping first-occurrence order.
    Also renamed the local `id`, which shadowed the builtin.
    """
    member, date, raw_items = x.strip().split(';')[:3]
    items = list(dict.fromkeys(raw_items.split(',')))
    return (member, date, items)
# Load the persisted baskets and turn them into a DataFrame for FP-Growth.
# BUG FIX 1: the part-file written by saveAsTextFile has no header row, so
# the original's [1:] slice silently dropped the first basket record.
# BUG FIX 2: the file was opened without a context manager, leaking the
# handle; `with` closes it deterministically.
with open('/content/baskets/part-00000', 'r') as fh:
    tmp = [line.strip() for line in fh]
tmp = sc.parallelize(tmp)
tmp = tmp.map(convertToItemsList)
dfBaskets = sqlc.createDataFrame(tmp,["Member_number","Date","Items"])
dfBaskets.show()
from pyspark.ml.fpm import FPGrowth
# Mine frequent itemsets and association rules from the baskets with
# FP-Growth: itemsets are kept if they appear in at least 1% of baskets,
# rules are kept if their confidence is at least 0.1.
fpGrowth = FPGrowth(itemsCol="Items",
                    minSupport=0.01,
                    minConfidence=0.1)
model = fpGrowth.fit(dfBaskets)
model.freqItemsets.show()
model.associationRules.show()
+-------------+----------+--------------------+
+-------------+----------+--------------------+
| 1659|01/01/2014|[specialty chocol...|
| 1922|01/01/2014|[other vegetables...|
| 2226|01/01/2014|[sausage, bottled...|
| 3681|01/01/2014|[onions, whipped/...|
https://colab.research.google.com/drive/1whA-fOG_PaoEV-PMnlhMXdY3BRqAtenA#scrollTo=sWo0HJZ2wQsb&printMode=true 4/10
11:46, 29/10/2022 52000742_52000759_52000042.ipynb - Colaboratory
+-------------+----------+--------------------+
+--------------------+----+
| items|freq|
+--------------------+----+
| [beef]| 508|
| [napkins]| 331|
| [frankfurter]| 565|
| [sausage]| 903|
| [UHT-milk]| 320|
| [coffee]| 473|
| [waffles]| 277|
| [rolls/buns]|1646|
+--------------------+----+
+------------------+------------+-------------------+------------------+-------------
| antecedent| consequent| confidence| lift|
+------------------+------------+-------------------+------------------+-------------
|[other vegetables]|[whole milk]|0.12151067323481117|0.7694304712706219|0.014836596939
| [yogurt]|[whole milk]|0.12996108949416343|0.8229402378760761|0.01116086346
from pyspark.ml.linalg import Vectors
# Reload the raw CSV as a DataFrame, this time honouring its real header.
tmp = sqlc.read.option("delimiter", ",")\
    .option("header", "true")\
    .csv('/content/data.csv')
# Group ALL purchases per member (ignoring the date): reduceByKey on
# (Member_number, itemDescription) pairs concatenates every item a member
# ever bought into one comma-joined string.
dfMembers = tmp.select(['Member_number','itemDescription'])\
    .rdd\
    .reduceByKey(lambda x, y: x + "," + y)\
    .toDF()\
    .selectExpr("_1 as Member_number", "_2 as Items")
https://colab.research.google.com/drive/1whA-fOG_PaoEV-PMnlhMXdY3BRqAtenA#scrollTo=sWo0HJZ2wQsb&printMode=true 5/10
11:46, 29/10/2022 52000742_52000759_52000042.ipynb - Colaboratory
dfMembers.show()
# Build the global item vocabulary: collect every distinct item name, sort
# for a stable ordering, then map each name to a column index — the basis
# of the sparse-vector encoding used below.
items = tmp.select('itemDescription')\
    .rdd\
    .distinct()\
    .flatMap(lambda x: x)\
    .collect()
items = sorted(items)
dictItems = {v:i for i,v in enumerate(items)}
print(items)
print(dictItems)
def basket2vector(member, basket, dictItems):
    """Encode a member's basket as a sparse 0/1 vector over the item vocabulary.

    Parameters
    ----------
    member : unused; kept so existing callers' argument lists still work.
    basket : comma-joined string of item names (as built by the
        reduceByKey concatenation above); a list/iterable of names also works.
    dictItems : {item name -> column index} built from the sorted item
        list, so iterating it yields indices in ascending order, as
        Vectors.sparse requires.

    BUG FIX: the original tested `k in basket` directly on the raw string,
    which is a substring match — an item name contained inside another
    (e.g. "milk" within "UHT-milk") produced false-positive 1s. Split the
    basket into an exact-match set first.
    """
    basket_items = set(basket.split(',')) if isinstance(basket, str) else set(basket)
    index = []
    value = []
    for name, col_idx in dictItems.items():
        if name in basket_items:
            index.append(col_idx)
            value.append(1.0)
    return Vectors.sparse(len(dictItems), index, value)
# Sanity check: encode the first member's basket and print the sparse vector.
print(basket2vector(dfMembers.first()['Member_number'],
                    dfMembers.first()['Items'],
                    dictItems))
+-------------+--------------------+
|Member_number| Items|
+-------------+--------------------+
| 1249|citrus fruit,coff...|
| 1381|curd,soda,coffee,...|
| 1440|other vegetables,...|
| 1659|specialty chocola...|
| 1789|hamburger meat,ca...|
| 1922|tropical fruit,ot...|
| 2226|sausage,bottled w...|
| 2237|bottled water,Ins...|
| 2351|cleaner,shopping ...|
| 2542|sliced cheese,bot...|
| 2610|hamburger meat,bo...|
| 2709|yogurt,frozen veg...|
| 2727|hamburger meat,fr...|
| 2943|whole milk,flower...|
| 2974|berries,whipped/s...|
| 3681|onions,whipped/so...|
| 3797|waffles,whole mil...|
| 3942|other vegetables,...|
| 3956|yogurt,shopping b...|
| 4260|soda,brown bread,...|
+-------------+--------------------+
['Instant food products', 'UHT-milk', 'abrasive cleaner', 'artif. sweetener', 'baby cosm
https://colab.research.google.com/drive/1whA-fOG_PaoEV-PMnlhMXdY3BRqAtenA#scrollTo=sWo0HJZ2wQsb&printMode=true 6/10
11:46, 29/10/2022 52000742_52000759_52000042.ipynb - Colaboratory
from pyspark.ml.feature import MinHashLSH
from pyspark.sql.functions import col,monotonically_increasing_id
'''preprocess'''
# Re-encode every member's basket string as a sparse 0/1 vector so MinHash
# can estimate Jaccard similarity between members' purchase sets.
new_dfMembers = dfMembers.rdd\
    .map(lambda x: (x[0],basket2vector(x[0],x[1],dictItems)))\
    .toDF()\
    .select(col('_1').alias('Member_number'), col('_2').alias('Items'))
mh_lsh = MinHashLSH(inputCol="Items", outputCol="Hashes", numHashTables=5)
model = mh_lsh.fit(new_dfMembers)
model.transform(new_dfMembers).show()
# Self-join: report all member pairs with estimated Jaccard distance <= 0.3,
# excluding identical baskets via the distance > 0 filter.
model.approxSimilarityJoin(new_dfMembers, new_dfMembers, 0.3, distCol="JaccardDistance")\
    .select(col("datasetA.Member_number").alias("idA"),
            col("datasetB.Member_number").alias("idB"),
            col("JaccardDistance")).filter(col("JaccardDistance") > 0).show()
+-------------+--------------------+--------------------+
+-------------+--------------------+--------------------+
| 1249|(167,[11,30,34,61...|[[5.3883012E7], [...|
| 1381|(167,[1,10,11,28,...|[[7550144.0], [2....|
| 1440|(167,[5,28,64,102...|[[4.57586438E8], ...|
| 1659|(167,[12,14,26,28...|[[4.7236819E7], [...|
| 1789|(167,[8,18,30,44,...|[[1.4196337E7], [...|
| 1922|(167,[10,12,15,16...|[[7550144.0], [5....|
| 2226|(167,[9,12,23,40,...|[[7550144.0], [5....|
| 2237|(167,[0,12,27,34,...|[[4.04607377E8], ...|
| 2351|(167,[5,31,34,49,...|[[4.50940245E8], ...|
| 2542|(167,[12,82,88,94...|[[1.0021588E8], [...|
| 2610|(167,[5,11,49,67,...|[[3.58274509E8], ...|
| 2709|(167,[12,30,40,44...|[[7550144.0], [6....|
| 2727|(167,[11,38,52,56...|[[5.3883012E7], [...|
| 2943|(167,[20,28,29,33...|[[3.51628316E8], ...|
| 2974|(167,[9,12,38,63,...|[[1.33256362E8], ...|
| 3681|(167,[38,47,63,88...|[[4.04607377E8], ...|
| 3797|(167,[15,38,64,10...|[[4.37647859E8], ...|
| 3942|(167,[0,5,21,31,3...|[[9.3569687E7], [...|
| 3956|(167,[0,5,15,16,2...|[[1.53194941E8], ...|
| 4260|(167,[14,20,27,38...|[[4.7236819E7], [...|
+-------------+--------------------+--------------------+
https://colab.research.google.com/drive/1whA-fOG_PaoEV-PMnlhMXdY3BRqAtenA#scrollTo=sWo0HJZ2wQsb&printMode=true 7/10
11:46, 29/10/2022 52000742_52000759_52000042.ipynb - Colaboratory
+----+----+-------------------+
+----+----+-------------------+
|3124|4174| 0.25|
|2347|1401| 0.25|
|1976|2342| 0.25|
|4730|1164|0.19999999999999996|
|4383|4885| 0.25|
|1161|1367| 0.2857142857142857|
|1164|4730|0.19999999999999996|
|3714|2911| 0.25|
|3715|4805| 0.25|
|3376|2442| 0.25|
|4805|3715| 0.25|
|2342|1976| 0.25|
|4885|4186| 0.25|
|4535|1643|0.19999999999999996|
|2442|3376| 0.25|
|4885|1117| 0.25|
|4174|2199| 0.25|
|1643|4535|0.19999999999999996|
|3008|3498| 0.2857142857142857|
|3598|4558| 0.2857142857142857|
+----+----+-------------------+
# Take the first member's basket vector as the query and find its 5
# approximate nearest neighbours under MinHash/Jaccard.
key = new_dfMembers.collect()[0]['Items']
model.approxNearestNeighbors(new_dfMembers, key, 5).show()
+-------------+--------------------+--------------------+------------------+
+-------------+--------------------+--------------------+------------------+
| 2708|(167,[3,34,61,75,...|[[5.3883012E7], [...|0.5714285714285714|
| 4327|(167,[30,34,63,76...|[[1.7958923E8], [...|0.5714285714285714|
+-------------+--------------------+--------------------+------------------+
from pyspark.ml.clustering import KMeans, KMeansModel
# Cluster members into 5 groups based on their sparse basket vectors.
kmeans = KMeans(k=5)
kmeans.setSeed(1)  # fixed seed for reproducible cluster assignments
# KMeans reads its input from the 'features' column by default.
# NOTE(review): the alias for Items was truncated to ".ali" in the export;
# reconstructed as .alias('features') from the label|features|prediction
# columns in the printed output — confirm against the original notebook.
new_dfMembers_km = new_dfMembers.select(col('Member_number').alias('label'),
                                        col('Items').alias('features'))
model = kmeans.fit(new_dfMembers_km)
model.setPredictionCol('prediction')
# BUG FIX: the original line read "model.transform(new dfMembers km)" —
# spaces where underscores belong, which is a SyntaxError; restored the
# intended variable name.
model.transform(new_dfMembers_km).show()
https://colab.research.google.com/drive/1whA-fOG_PaoEV-PMnlhMXdY3BRqAtenA#scrollTo=sWo0HJZ2wQsb&printMode=true 8/10
11:46, 29/10/2022 52000742_52000759_52000042.ipynb - Colaboratory
( _ _ ) ()
+-----+--------------------+----------+
|label| features|prediction|
+-----+--------------------+----------+
| 1249|(167,[11,30,34,61...| 1|
| 1381|(167,[1,10,11,28,...| 3|
| 1440|(167,[5,28,64,102...| 2|
| 1659|(167,[12,14,26,28...| 0|
| 1789|(167,[8,18,30,44,...| 0|
| 1922|(167,[10,12,15,16...| 4|
| 2226|(167,[9,12,23,40,...| 0|
| 2237|(167,[0,12,27,34,...| 1|
| 2351|(167,[5,31,34,49,...| 2|
| 2542|(167,[12,82,88,94...| 0|
| 2610|(167,[5,11,49,67,...| 2|
| 2709|(167,[12,30,40,44...| 0|
| 2727|(167,[11,38,52,56...| 4|
| 2943|(167,[20,28,29,33...| 0|
| 2974|(167,[9,12,38,63,...| 4|
| 3681|(167,[38,47,63,88...| 4|
| 3797|(167,[15,38,64,10...| 4|
| 3942|(167,[0,5,21,31,3...| 2|
| 3956|(167,[0,5,15,16,2...| 2|
| 4260|(167,[14,20,27,38...| 4|
+-----+--------------------+----------+
https://colab.research.google.com/drive/1whA-fOG_PaoEV-PMnlhMXdY3BRqAtenA#scrollTo=sWo0HJZ2wQsb&printMode=true 9/10
11:46, 29/10/2022 52000742_52000759_52000042.ipynb - Colaboratory
https://colab.research.google.com/drive/1whA-fOG_PaoEV-PMnlhMXdY3BRqAtenA#scrollTo=sWo0HJZ2wQsb&printMode=true 10/10