
Cheat Sheet for PySpark

Wenqiang Feng
E-mail: von198@gmail.com, Web: http://web.utk.edu/~wfeng1; https://runawayhorse001.github.io/LearningApacheSpark

Spark Configuration

from pyspark.sql import SparkSession
spark = SparkSession.builder \
        .appName("Python Spark regression example") \
        .config("config.option", "value") \
        .getOrCreate()
Loading Data

From RDDs

# Using parallelize( )
df = spark.sparkContext.parallelize([('1', 'Joe', '70000', '1'),
        ('2', 'Henry', '80000', None)]) \
        .toDF(['Id', 'Name', 'Sallary', 'DepartmentId'])

# Using createDataFrame( )
df = spark.createDataFrame([('1', 'Joe', '70000', '1'),
        ('2', 'Henry', '80000', None)],
        ['Id', 'Name', 'Sallary', 'DepartmentId'])

+---+-----+-------+------------+
| Id| Name|Sallary|DepartmentId|
+---+-----+-------+------------+
|  1|  Joe|  70000|           1|
|  2|Henry|  80000|        null|
+---+-----+-------+------------+

From .csv

ds = spark.read.csv(path='Advertising.csv',
        sep=',', encoding='UTF-8', comment=None,
        header=True, inferSchema=True)

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
+-----+-----+---------+-----+
From .json

df = spark.read.json('/home/feng/Desktop/data.json')

+----------+--------------------+-------------------+
|        id|            location|          timestamp|
+----------+--------------------+-------------------+
|2957256202|[72.1,DE,8086,52....|2019-02-23 22:36:52|
|2957256203|[598.5,BG,3963,42...|2019-02-23 22:36:52|
+----------+--------------------+-------------------+
From Database

user = 'username'; pw = 'password'
table_name = 'table_name'
url = 'jdbc:postgresql://##.###.###.##:5432/dataset?user=' \
      + user + '&password=' + pw
p = {'driver': 'org.postgresql.Driver', 'password': pw, 'user': user}
df = spark.read.jdbc(url=url, table=table_name, properties=p)

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
+-----+-----+---------+-----+
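The same table can also be read through the generic DataFrameReader JDBC interface. A minimal sketch (not from the original sheet), assuming the PostgreSQL driver jar is already on the Spark classpath:

# Sketch: generic JDBC reader, reusing url, table_name, user, pw from above
df = spark.read.format('jdbc') \
        .option('url', url) \
        .option('dbtable', table_name) \
        .option('user', user) \
        .option('password', pw) \
        .option('driver', 'org.postgresql.Driver') \
        .load()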
From HDFS

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import HiveContext
sc = SparkContext('local', 'example')
hc = HiveContext(sc)
tf1 = sc.textFile("hdfs://###/user/data/file_name")

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
+-----+-----+---------+-----+

Auditing Data

Checking schema

df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)

Checking missing value

from pyspark.sql.functions import count
def my_count(df_in):
    df_in.agg(*[count(c).alias(c) for c in df_in.columns]).show()

my_count(df_raw)

+---------+---------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------+-----------+---------+----------+-------+
|   541909|   541909|  541909|     541909|   541909|    406829| 541909|
+---------+---------+--------+-----------+---------+----------+-------+
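my_count reports non-null counts per column. A complementary sketch (not from the original sheet) counts the nulls directly, assuming the same df_raw:

# Count null values per column
from pyspark.sql import functions as F
df_raw.select([F.count(F.when(F.col(c).isNull(), c)).alias(c)
               for c in df_raw.columns]).show()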
Checking statistical results

# function from my pyspark
df_raw.describe().show()

+-------+-----------------+------------------+------------------+
|summary|               TV|             Radio|         Newspaper|
+-------+-----------------+------------------+------------------+
|  count|              200|               200|               200|
|   mean|         147.0425|23.264000000000024|30.553999999999995|
| stddev|85.85423631490805|14.846809176168728| 21.77862083852283|
|    min|              0.7|               0.0|               0.3|
|    max|            296.4|              49.6|             114.0|
+-------+-----------------+------------------+------------------+

Manipulating Data (More details on next page)

Fixing missing value

Function        Description
df.na.fill()    #Replace null values
df.na.drop()    #Drop any rows with null values
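df.na.fill() accepts either a single value or a per-column dict. A minimal sketch, reusing the Id/Name/Sallary/DepartmentId frame from the Loading Data section (fill values are purely illustrative):

# Replace nulls column-by-column
df.na.fill({'Sallary': '0', 'DepartmentId': 'unknown'}).show()
# Drop rows that have a null in any column
df.na.drop().show()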
Joining data

#Data join
left.join(right, key, how='*')   # * = left, right, inner, full
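A minimal sketch of the join pattern with two hypothetical DataFrames keyed on 'Id':

# Illustrative inner join of two small DataFrames
left = spark.createDataFrame([('1', 'Joe'), ('2', 'Henry')], ['Id', 'Name'])
right = spark.createDataFrame([('1', 'IT'), ('2', 'Sales')], ['Id', 'Dept'])
left.join(right, 'Id', how='inner').show()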
Wrangling with UDF

from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
# user defined function
def complexFun(x):
    return results

Fn = F.udf(lambda x: complexFun(x), DoubleType())
df.withColumn('2col', Fn(df.col))
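complexFun above is a placeholder. A concrete, runnable sketch of the same pattern (squaring a numeric column of the Advertising frame; column name assumed):

# Example UDF: square the TV column, returning a DoubleType
square = F.udf(lambda x: float(x) ** 2 if x is not None else None, DoubleType())
df.withColumn('TV_sq', square(df.TV)).show(2)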
Reducing features

df.select(featureNameList)

Modeling Pipeline

Deal with categorical feature and label data

# Deal with categorical feature data
from pyspark.ml.feature import VectorIndexer
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(data)
featureIndexer.transform(data).show(2, True)

+--------------------+-----+--------------------+
|            features|label|     indexedFeatures|
+--------------------+-----+--------------------+
|(29,[1,11,14,16,1...|   no|(29,[1,11,14,16,1...|
+--------------------+-----+--------------------+

# Deal with categorical label data
from pyspark.ml.feature import StringIndexer
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)
labelIndexer.transform(data).show(2, True)

+--------------------+-----+------------+
|            features|label|indexedLabel|
+--------------------+-----+------------+
|(29,[1,11,14,16,1...|   no|         0.0|
+--------------------+-----+------------+

Splitting the data into training and test data sets

(trainingData, testData) = data.randomSplit([0.6, 0.4])

Importing the model

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='indexedFeatures',
                        labelCol='indexedLabel')

Converting indexed labels back to original labels

from pyspark.ml.feature import IndexToString
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=labelIndexer.labels)

Wrapping Pipeline

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer,
                            lr, labelConverter])

Training model and making predictions

model = pipeline.fit(trainingData)
predictions = model.transform(testData)
predictions.select("features", "label", "predictedLabel").show(2)

+--------------------+-----+--------------+
|            features|label|predictedLabel|
+--------------------+-----+--------------+
|(29,[0,11,13,16,1...|   no|            no|
+--------------------+-----+--------------+

Evaluating

from pyspark.ml.evaluation import *
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction", metricName="accuracy")
accu = evaluator.evaluate(predictions)
print("Test Error: %g, AUC: %g" % (1 - accu, Summary.areaUnderROC))

Test Error: 0.0986395, AUC: 0.886664269877
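Summary in the print statement is not defined in the snippet above. A hedged sketch of one way to obtain the test-set AUC, assuming a binary label and the pipeline output shown above:

# Sketch: compute AUC with a separate evaluator on the predictions
from pyspark.ml.evaluation import BinaryClassificationEvaluator
auc_eval = BinaryClassificationEvaluator(labelCol="indexedLabel",
                                         rawPredictionCol="rawPrediction",
                                         metricName="areaUnderROC")
print("AUC: %g" % auc_eval.evaluate(predictions))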
Data Wrangling: Combining DataFrame

Mutating Joins

A: X1 X2 = (a,1), (b,2), (c,3)     B: X1 X3 = (a,T), (b,F), (d,T)

#Join matching rows from B to A
#dplyr::left_join(A, B, by = "x1")
A.join(B, 'X1', how='left').orderBy('X1', ascending=True).show()

#Join matching rows from A to B
#dplyr::right_join(A, B, by = "x1")
A.join(B, 'X1', how='right').orderBy('X1', ascending=True).show()

#Retain only rows in both sets
#dplyr::inner_join(A, B, by = "x1")
A.join(B, 'X1', how='inner').orderBy('X1', ascending=True).show()

#Retain all values, all rows
#dplyr::full_join(A, B, by = "x1")
A.join(B, 'X1', how='full').orderBy('X1', ascending=True).show()
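A hedged sketch that builds the A and B frames above so the join snippets can be run as-is (the T/F values are modelled as booleans here):

A = spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ['X1', 'X2'])
B = spark.createDataFrame([('a', True), ('b', False), ('d', True)], ['X1', 'X3'])
A.join(B, 'X1', how='left').orderBy('X1', ascending=True).show()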
Filtering Joins

#All rows in A that have a match in B
#dplyr::semi_join(A, B, by = "x1")
A.join(B, 'X1', how='left_semi').orderBy('X1', ascending=True).show()

#All rows in A that don't have a match in B
#dplyr::anti_join(A, B, by = "x1")
A.join(B, 'X1', how='left_anti').orderBy('X1', ascending=True).show()

DataFrame Operations

Y: X1 X2 = (a,1), (b,2), (c,3)     Z: X1 X2 = (b,2), (c,3), (d,4)

#Rows that appear in both Y and Z
#dplyr::intersect(Y, Z)
Y.intersect(Z).show()

#Rows that appear in either or both Y and Z
#dplyr::union(Y, Z)
Y.union(Z).dropDuplicates().orderBy('X1', ascending=True).show()

#Rows that appear in Y but not Z
#dplyr::setdiff(Y, Z)
Y.subtract(Z).show()

Binding

#Append Z to Y as new rows
#dplyr::bind_rows(Y, Z)
Y.union(Z).orderBy('X1', ascending=True).show()

#Append Z to Y as new columns
#Caution: zipDataFrames from my package
#dplyr::bind_cols(Y, Z)
zipDataFrames(Y, Z).show()
Data Wrangling: Reshaping Data

Splitting

#Separate one column into several (tidyr::separate)

#ArrayType() column, e.g. key='a', value=[1,2,3]
df.select("key", df.value[0], df.value[1], df.value[2]).show()

#StructType() column
df2.select('key', 'value.*').show()

#Splitting one column into rows, e.g. key='a', values='1,2,3'
df.select("key",
          F.split("values", ",").alias("values"),
          F.posexplode(F.split("values", ",")).alias("pos", "val")) \
  .drop("val") \
  .select("key", F.expr("values[pos]").alias("val")).show()
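A minimal sketch of input frames for the three snippets above, using the a/[1,2,3], b/[2,3,4] example shown on the sheet (the struct field names and the df3 name are assumptions for illustration):

from pyspark.sql import Row
from pyspark.sql import functions as F
# ArrayType column for the first snippet
df = spark.createDataFrame([('a', [1, 2, 3]), ('b', [2, 3, 4])],
                           ['key', 'value'])
# StructType column for the second snippet
df2 = spark.createDataFrame([Row(key='a', value=Row(x=1, y=2, z=3)),
                             Row(key='b', value=Row(x=2, y=3, z=4))])
# Comma-separated string column; run the posexplode snippet against this frame
df3 = spark.createDataFrame([('a', '1,2,3'), ('b', '2,3,4')],
                            ['key', 'values'])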

#Gather columns into rows
from pyspark.sql.functions import array, col, explode, lit, struct

def to_long(df, by):
    cols, dtypes = zip(*((c, t) for (c, t) in df.dtypes if c not in by))
    # Spark SQL supports only homogeneous columns
    assert len(set(dtypes)) == 1, "All columns have to be of the same type"
    # Create and explode an array of (column_name, column_value) structs
    kvs = explode(array([
        struct(lit(c).alias("key"), col(c).alias("val")) for c in cols
    ])).alias("kvs")
    return df.select(by + [kvs]).select(by + ["kvs.key", "kvs.val"])
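A sketch of to_long on the wide frame shown in the sheet's illustration, assuming string-typed value columns so the homogeneity assertion holds:

# (a,1,2,3), (b,4,5,6) gathered into (A, key, val) rows
wide = spark.createDataFrame([('a', '1', '2', '3'), ('b', '4', '5', '6')],
                             ['A', 'col0', 'col1', 'col2'])
to_long(wide, ['A']).show()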
Pivot

#Spread rows into columns
df.groupBy(['key']).pivot('col1').sum('col1').show()
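A hedged sketch of the pivot pattern on a small key/col1 frame (data and names assumed), spreading the distinct values of col1 into columns:

kv = spark.createDataFrame([('a', 1), ('a', 2), ('a', 3), ('b', 1), ('b', 2)],
                           ['key', 'col1'])
# Each distinct col1 value becomes a column; cells hold sum(col1) per key
kv.groupBy(['key']).pivot('col1').sum('col1').show()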


Subset Observations (Rows)

Function         Description
df.na.drop()     #Omitting rows with null values
df.where()       #Filters rows using the given condition
df.filter()      #Filters rows using the given condition
df.distinct()    #Returns distinct rows in this DataFrame
df.sample()      #Returns a sampled subset of this DataFrame
df.sampleBy()    #Returns a stratified sample without replacement
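For instance, a hedged filtering sketch on the Advertising frame from page 1 (column names as shown there):

# Keep rows with Sales above 20, drop duplicates, then sample about 10%
df.where(df.Sales > 20).distinct().sample(fraction=0.1, seed=42).show()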
Subset Variables (Columns)

Function         Description
df.select()      #Applies expressions and returns a new DataFrame

Make New Variables

Function          Examples
df.withColumn()   df.withColumn('new', 1/df.col)
                  df.withColumn('new', F.log(df.col))
                  df.withColumn('id', F.monotonically_increasing_id())
                  df.withColumn('new', Fn('col'))   #Fn: F.udf()
                  df.withColumn('new', F.when((df.c1>1) & (df.c2<2), 1)
                                        .when((df.c3>3), 2).otherwise(3))
Rename Variables

Function                 Description
df.withColumnRenamed()   #Renaming an existing column
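For example (reusing the column name from the Loading Data section; purely illustrative):

df.withColumnRenamed('Sallary', 'Salary')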
Summarise Data

Function               Description
df.describe()          #Computes simple statistics
Correlation.corr(df)   #Computes the correlation matrix
df.count()             #Count the number of rows

Summary Function

df.agg(F.max(df.C)).head()[0]   #Similar for: F.min, F.avg, F.stddev
Group Data

A B C: (m,1,4), (m,2,5), (n,3,7), (n,4,8)

#GroupBy and aggregate
df.groupBy(['A']) \
  .agg(F.min('B').alias('min_b'),
       F.max('B').alias('max_b'),
       F.avg('C').alias('avg_c')).show()

+---+-----+-----+-----+
|  A|min_b|max_b|avg_c|
+---+-----+-----+-----+
|  m|    1|    2|  4.5|
|  n|    3|    4|  7.5|
+---+-----+-----+-----+

#GroupBy and aggregate with a UDF
import numpy as np
from pyspark.sql.functions import col
from pyspark.sql.types import ArrayType, FloatType

def quant_pd(val_list):
    quant = np.round(np.percentile(val_list, [20, 50, 75]), 2)
    return list(map(float, quant))

Fn = F.udf(quant_pd, ArrayType(FloatType()))

df.groupBy(['A']) \
  .agg(F.min('B').alias('min_b'),
       F.max('B').alias('max_b'),
       Fn(F.collect_list(col('C'))).alias('list_c')).show()

+---+-----+-----+----------------+
|  A|min_b|max_b|          list_c|
+---+-----+-----+----------------+
|  m|    1|    2|[4.2, 4.5, 4.75]|
|  n|    3|    4|[7.2, 7.5, 7.75]|
+---+-----+-----+----------------+
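On Spark 3.1 or later, a similar per-group percentile summary can be sketched without a Python UDF via the built-in approximate percentile aggregate (an alternative, not from the original sheet):

# Approximate 20th/50th/75th percentiles of C per group
df.groupBy('A') \
  .agg(F.percentile_approx('C', [0.2, 0.5, 0.75]).alias('list_c')).show()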
Windows

from pyspark.sql import Window

#Define windows for difference
w = Window.partitionBy(df.B)
D = df.C - F.max(df.C).over(w)
df.withColumn('D', D).show()

#Define windows for row_number
df = df.withColumn("D", F.monotonically_increasing_id())
w = Window.orderBy("D")
df.withColumn("D", F.row_number().over(w))

#Define windows for rank
w = Window.partitionBy('B').orderBy(df.C.desc())
df.withColumn("D", F.rank().over(w)).show()
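A minimal runnable sketch of the rank window on the small frame from the illustration (columns A, B, C as shown there; D holds the rank within each B partition):

from pyspark.sql import Window
from pyspark.sql import functions as F

df = spark.createDataFrame([('a', 'm', 1), ('b', 'm', 2),
                            ('c', 'n', 3), ('d', 'n', 6)],
                           ['A', 'B', 'C'])
w = Window.partitionBy('B').orderBy(df.C.desc())
# Within B='m': b ranks 1, a ranks 2; within B='n': d ranks 1, c ranks 2
df.withColumn('D', F.rank().over(w)).show()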
©All Rights Reserved by Dr. Wenqiang Feng. Powered by LaTeX. Updated: 02-26-2019. von198@gmail.com
