Professional Documents
Culture Documents
[2]:
In [3]:
import pyspark
In [4]:
import pyspark
from pyspark.sql import SparkSession
# Reuse the active SparkSession if one exists, otherwise start a new one.
spark=SparkSession.builder.getOrCreate()
# Bare expression: the notebook renders the session summary as the cell output.
spark
Out[4]:
SparkSession - in-memory
SparkContext
Spark UI (http://10.1.41.126:4040)
Version
v3.1.2
Master
local[*]
AppName
pyspark-shell
a) Read Data
In [5]:
import pyspark
from pyspark.sql import SparkSession

# Reuse the running session (or start one) so this cell also works on its own.
spark = SparkSession.builder.getOrCreate()

# Load the raw CSV: the first row supplies the column names and Spark infers
# each column's type from the data instead of reading everything as strings.
SD = spark.read.csv(
    "Suicide Rates Overview 1985 to 2016_new.csv",
    header=True,
    inferSchema=True,
)

# Preview the first five rows.
SD.show(5)
+-------+----+------+-----------+-----------+----------+-----------------+--
----------+------------------+------------------+---------------+
+-------+----+------+-----------+-----------+----------+-----------------+--
----------+------------------+------------------+---------------+
+-------+----+------+-----------+-----------+----------+-----------------+--
----------+------------------+------------------+---------------+
SD.printSchema()
root
In [7]:
# Replace headers containing spaces, slashes or symbols with plain
# identifier-style names, one (old, new) pair at a time.
# withColumnRenamed does nothing when the old name is absent, so each old name
# (including the spaces around " gdp_for_year ($) ") is written exactly as in
# the raw header.
# NOTE(review): "gdp_per_capita ($)" becomes "gdp_for_capita" (not "per");
# later cells refer to the column by that name, so it is kept.
column_renames = [
    ("suicides/100k pop", "suicide_100k_pop"),
    ("HDI for year", "HDI_for_year"),
    (" gdp_for_year ($) ", "gdp_for_year"),
    ("gdp_per_capita ($)", "gdp_for_capita"),
]

CSD = SD
for old_name, new_name in column_renames:
    CSD = CSD.withColumnRenamed(old_name, new_name)

CSD.printSchema()
root
In [8]:
c) Check Duplicates
In [9]:
# Compare the row count before and after de-duplication. Only the last
# expression of a cell is displayed, so the "before" count is captured and
# printed explicitly instead of being computed and thrown away.
rows_before = DCSD.count()
CCSD = DCSD.dropDuplicates()
rows_after = CCSD.count()

print("Rows before dropDuplicates:", rows_before)
print("Duplicate rows removed:", rows_before - rows_after)

# Final expression: the de-duplicated row count remains the cell's output.
rows_after
Out[9]:
27820
+-------+----+---+---+-----------+----------+----------------+------------+-
-----------+--------------+----------+
|country|year|sex|age|suicides_no|population|suicide_100k_pop|HDI_for_year|g
dp_for_year|gdp_for_capita|generation|
+-------+----+---+---+-----------+----------+----------------+------------+-
-----------+--------------+----------+
| 0| 0| 0| 0| 0| 0| 0| 19456|
0| 0| 0|
+-------+----+---+---+-----------+----------+----------------+------------+-
-----------+--------------+----------+
In [11]:
# Remove the HDI_for_year column (the table printed above appears to report
# 19456 missing values for it) and show the remaining schema.
# NOTE(review): this starts from DCSD, not from the de-duplicated CCSD built in
# the duplicate check, so CCSD is unused here - confirm that is intended.
NCSD=DCSD.drop('HDI_for_year')
NCSD.printSchema()
root
In [12]:
NCSD.describe().show(truncate=False)
+-------+----------+------------------+------+-----------+------------------
+------------------+------------------+---------------------+---------------
---+----------+
+-------+----------+------------------+------+-----------+------------------
+------------------+------------------+---------------------+---------------
---+----------+
+-------+----------+------------------+------+-----------+------------------
+------------------+------------------+---------------------+---------------
---+----------+
In [13]:
NCSD.select("country").distinct().count()
Out[13]:
101
In [14]:
# Add up suicide_100k_pop over all rows of each country, then show the five
# countries with the largest totals.
A = NCSD.groupBy("country").sum("suicide_100k_pop")
A.orderBy("sum(suicide_100k_pop)", ascending=False).show(5)
+------------------+---------------------+
| country|sum(suicide_100k_pop)|
+------------------+---------------------+
| Lithuania| 10588.879999999997|
| Hungary| 10156.069999999994|
| Kazakhstan| 9519.519999999995|
+------------------+---------------------+
Data Visualization
In [15]:
# Total suicide_100k_pop for every (sex, year) pair.
G = (
    NCSD.select("year", "sex", "suicide_100k_pop")
    .groupBy("sex", "year")
    .sum()
)

# Show the five earliest rows (orderBy sorts ascending by default).
(
    G.select("sex", "sum(suicide_100k_pop)", "year")
    .orderBy("year")
    .show(5)
)
+------+---------------------+----+
| sex|sum(suicide_100k_pop)|year|
+------+---------------------+----+
|female| 1663.7499999999993|1985|
| male| 5148.139999999999|1985|
| male| 4905.789999999998|1986|
|female| 1674.049999999999|1986|
|female| 1857.2099999999991|1987|
+------+---------------------+----+
In [16]:
NCSD.select(['age','suicides_no']).groupBy("age").sum()
Out[16]:
# Total suicide_100k_pop per age group, largest total first.
age = NCSD.groupBy("age").sum("suicide_100k_pop")
age.orderBy("sum(suicide_100k_pop)", ascending=False).show()
+-----------+---------------------+
| age|sum(suicide_100k_pop)|
+-----------+---------------------+
+-----------+---------------------+
In [18]:
NCSD.select(['year','suicides_no'])
Out[18]:
# Keep only the year, country and suicides_no columns.
# NOTE(review): `A` was bound to the per-country rate totals in an earlier
# cell; this assignment replaces that value.
A=NCSD.select(['year','country','suicides_no'])
# Disabled correlation checks of suicides_no against other columns.
# NOTE(review): most of them read NCSD_index, which is not defined in any
# visible cell before this one - confirm it exists before re-enabling them.
# print(NCSD.corr("suicides_no","gdp_for_capita"))
# print(NCSD_index.corr("suicides_no","sex_index"))
# print(NCSD_index.corr("suicides_no","age_index"))
# print(NCSD_index.corr("suicides_no","generation_index"))
# print(NCSD_index.corr("suicides_no","population"))
# print(NCSD_index.corr("suicides_no","suicide_100k_pop"))
# print(NCSD_index.corr("suicides_no","gdp_for_capita"))
# print(NCSD_index.corr("suicides_no","country_index"))
# print(NCSD_index.corr("suicides_no","gdp_for_year"))
# NCSD_index.corr()
In [20]:
# Per-country totals of suicides_no and gdp_for_capita; show the ten countries
# with the highest suicide counts.
AB = NCSD.groupBy('country').sum('suicides_no', 'gdp_for_capita')
AB.orderBy("sum(suicides_no)", ascending=False).show(10)
+------------------+----------------+-------------------+
| country|sum(suicides_no)|sum(gdp_for_capita)|
+------------------+----------------+-------------------+
+------------------+----------------+-------------------+
NCSD.printSchema()
root
In [22]:
root
In [23]:
+--------------------+-----------+
| features|suicides_no|
+--------------------+-----------+
|[312900.0,6.71,79...| 21|
|[308000.0,5.19,79...| 16|
|[289700.0,4.83,79...| 14|
|[21800.0,4.59,796...| 1|
|[274300.0,3.28,79...| 9|
+--------------------+-----------+
In [28]:
In [29]:
In [31]:
# Print the RMSE and r2 values exposed by lin_reg.summary, two decimals each.
lin_summary = lin_reg.summary
for _template, _value in (
    ("Linear Regression RMSE= %.2f", lin_summary.rootMeanSquaredError),
    ("Linear Regression r2= %.2f", lin_summary.r2),
):
    print(_template % _value)
In [1388]:
In [1425]:
def _regression_metric_evaluator(metric_name):
    """Build a RegressionEvaluator comparing `prediction` with `suicides_no`.

    metric_name is passed through as the evaluator's metricName; this cell
    uses 'r2', 'rmse' and 'mae'.
    """
    return RegressionEvaluator(
        labelCol='suicides_no',
        predictionCol='prediction',
        metricName=metric_name,
    )


# Metrics for the predictions held in ran_final.
ran_eval_r2 = _regression_metric_evaluator('r2')
r2_ran = ran_eval_r2.evaluate(ran_final)
ran_eval_rmse = _regression_metric_evaluator('rmse')
rmse_ran = ran_eval_rmse.evaluate(ran_final)
ran_eval_mae = _regression_metric_evaluator('mae')
mae_ran = ran_eval_mae.evaluate(ran_final)

# Metrics for the predictions held in gbt_final.
gbt_eval_r2 = _regression_metric_evaluator('r2')
r2_gbt = gbt_eval_r2.evaluate(gbt_final)
gbt_eval_rmse = _regression_metric_evaluator('rmse')
rmse_gbt = gbt_eval_rmse.evaluate(gbt_final)
gbt_eval_mae = _regression_metric_evaluator('mae')
mae_gbt = gbt_eval_mae.evaluate(gbt_final)
NCSD_index.describe()
Out[1051]:
In [1052]:
NCSD_index.columns
Out[1052]:
['country',
'year',
'sex',
'age',
'suicides_no',
'population',
'suicide_100k_pop',
'gdp_for_year',
'gdp_for_capita',
'generation',
'country_index',
'sex_index',
'age_index',
'generation_index']
In [1435]:
Scaling Data
In [1436]:
In [1437]:
In [1438]:
# silhouette_score=[]
print(""" Silhouette Score for k Mean Clustering
==========================================
Model\tScore\t
=====\t=====\t""")
# Fit one k-means model for each cluster count from 2 to 10 and print the
# score returned by eval_model (created in an earlier cell) for the clustered
# data, one line per k.
for k in range(2,11):
    kmeans=KMeans(featuresCol='clus_scaled_features',k=k)
    kmeans_fit=kmeans.fit(cluster_data)
    final_data=kmeans_fit.transform(cluster_data)
    score=eval_model.evaluate(final_data)
    # silhouette_score.append(score)
    print("K=",k,"\t",score,"\t")
==========================================
Model Score
===== =====
K= 2 0.23044922043506058
K= 3 0.18251790521343167
K= 4 0.20159108306819562
K= 5 0.25716451288939984
K= 6 0.26436989889004076
K= 7 0.28181745942654746
K= 8 0.28772092069916644
K= 9 0.2760273036768222
K= 10 0.2818515831357536
Classification Models
In [1614]:
File "C:\Users\pagid\anaconda3\lib\site-packages\pyspark\mllib\common.py",
line 137, in __del__
self._sc._gateway.detach(self._java_model)
File "C:\Users\pagid\anaconda3\lib\site-packages\pyspark\mllib\common.py",
line 137, in __del__
self._sc._gateway.detach(self._java_model)
File "C:\Users\pagid\anaconda3\lib\site-packages\pyspark\mllib\common.py",
line 137, in __del__
self._sc._gateway.detach(self._java_model)
File "C:\Users\pagid\anaconda3\lib\site-packages\pyspark\mllib\common.py",
line 137, in __del__
self._sc._gateway.detach(self._java_model)
+---------------------+---------+
|scaled_class_features|sex_index|
+---------------------+---------+
| [234.382701998199...| 0.0|
| [234.382701998199...| 0.0|
| [234.500778926157...| 0.0|
| [234.736932782075...| 1.0|
| [234.736932782075...| 1.0|
+---------------------+---------+
In [1617]:
Logistic Regression
In [1639]:
In [1640]:
log_reg=LogisticRegression(labelCol='sex_index',featuresCol='scaled_class_features',maxIter
log_model=log_reg.fit(class_train)
In [1641]:
log_prediction=log_model.transform(class_test)
In [1642]:
# Accuracy of the logistic-regression predictions against sex_index.
eval_lr = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="sex_index",
    predictionCol="prediction",
)
lr_accuracy = eval_lr.evaluate(log_prediction)
print("Logistic Regression accuracy= %.2f" % lr_accuracy)
Random Forest
In [1209]:
ran_for=RandomForestClassifier(labelCol='sex_index',featuresCol='scaled_class_features',num
ran_model=ran_for.fit(class_train)
In [1210]:
ran_prediction=ran_model.transform(class_test)
In [1413]:
# Accuracy of the random-forest predictions against sex_index.
eval_ran = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="sex_index",
    predictionCol="prediction",
)
ran_accuracy = eval_ran.evaluate(ran_prediction)
print("Random Forest accuracy= %.2f" % ran_accuracy)
In [1213]:
ran_model.featureImportances
Out[1213]:
decision_class= DecisionTreeClassifier(labelCol='sex_index',featuresCol='scaled_class_featu
decision_model=decision_class.fit(class_train)
In [1215]:
decision_prediction=decision_model.transform(class_test)
In [1414]:
# Accuracy of the decision-tree predictions against sex_index, configured the
# same way as the other classifiers' evaluators in this notebook.
eval_decision = MulticlassClassificationEvaluator(
    labelCol="sex_index",
    predictionCol="prediction",
    metricName="accuracy",
)
decision_accuracy = eval_decision.evaluate(decision_prediction)
print("Decision Tree accuracy= %.2f" % decision_accuracy)
In [1217]:
decision_model.featureImportances
Out[1217]:
Naive Bayes
In [1218]:
naive_bayes=NaiveBayes(labelCol='sex_index',featuresCol='scaled_class_features',smoothing=1
naive_model=naive_bayes.fit(class_train)
In [1219]:
naive_prediction=naive_model.transform(class_test)
In [1415]:
# Accuracy of the naive-Bayes predictions against sex_index, configured the
# same way as the other classifiers' evaluators in this notebook.
eval_naive = MulticlassClassificationEvaluator(
    labelCol="sex_index",
    predictionCol="prediction",
    metricName="accuracy",
)
naive_accuracy = eval_naive.evaluate(naive_prediction)
print("Naive Bayes accuracy= %.2f" % naive_accuracy)