Wenqiang Feng
E-mail: von198@gmail.com, Web: http://web.utk.edu/~wfeng1; https://runawayhorse001.github.io/LearningApacheSpark
#Splitting one column into multiple columns
#StructType(): select every field of the struct column 'value' as its own column
df2.select('key', 'value.*').show()
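A minimal sketch of the StructType expansion above, assuming an active SparkSession `spark`; the DataFrame df2 and its key/value columns are illustrative, not taken from the original sheet:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.getOrCreate()
schema = StructType([
    StructField('key', StringType()),
    StructField('value', StructType([
        StructField('x', IntegerType()),
        StructField('y', IntegerType())]))])
df2 = spark.createDataFrame([('a', (1, 2)), ('b', (3, 4))], schema)
# 'value.*' pulls every struct field out into its own top-level column
df2.select('key', 'value.*').show()  # columns: key, x, y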
#Gather columns into rows
from pyspark.sql.functions import array, col, explode, lit, struct

def to_long(df, by):
    cols, dtypes = zip(*((c, t) for (c, t) in df.dtypes if c not in by))
    # Spark SQL supports only homogeneous columns
    assert len(set(dtypes)) == 1, "All columns have to be of the same type"
    # Create and explode an array of (column_name, column_value) structs
    kvs = explode(array([
        struct(lit(c).alias("key"), col(c).alias("val")) for c in cols
    ])).alias("kvs")
    # Keep the 'by' columns and unpack the exploded structs into key/val columns
    return df.select(by + [kvs]).select(by + ["kvs.key", "kvs.val"])
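A minimal usage sketch for to_long, assuming an active SparkSession `spark`; the sample rows and the col0/col1/col2 names are illustrative:

df_wide = spark.createDataFrame(
    [('a', 1, 2, 3), ('b', 4, 5, 6)],
    ['X1', 'col0', 'col1', 'col2'])
# Keep X1 as the identifier; gather col0..col2 into (key, val) rows
to_long(df_wide, by=['X1']).show()
# Each input row becomes three output rows, e.g. (a, col0, 1), (a, col1, 2), (a, col2, 3)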
#Spread rows into columns
df.groupBy(['key'])
  .pivot('col1').sum('col1').show()
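A minimal sketch of the spread/pivot call above, assuming an active SparkSession `spark`; the sample data are illustrative:

df_kv = spark.createDataFrame(
    [('a', 1), ('a', 2), ('b', 3)], ['key', 'col1'])
# One output row per key and one output column per distinct value of col1
df_kv.groupBy(['key']).pivot('col1').sum('col1').show()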
#Join matching rows from A to B
#dplyr::right_join(A, B, by = "x1")
A.join(B,'X1',how='right')
 .orderBy('X1', ascending=True).show()

#Retain only rows in both sets
#dplyr::inner_join(A, B, by = "x1")
A.join(B,'X1',how='inner')
 .orderBy('X1', ascending=True).show()

#Retain all rows from both A and B
A.join(B,'X1',how='full')
 .orderBy('X1', ascending=True).show()

Filtering Joins

#All rows in A that have a match in B
#dplyr::semi_join(A, B, by = "x1")
A.join(B,'X1',how='left_semi')
 .orderBy('X1', ascending=True).show()

#All rows in A that don't have a match in B
#dplyr::anti_join(A, B, by = "x1")
A.join(B,'X1',how='left_anti')
 .orderBy('X1', ascending=True).show()
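A minimal sketch of the A and B DataFrames the join snippets refer to, assuming an active SparkSession `spark`; the sample values are illustrative:

A = spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ['X1', 'X2'])
B = spark.createDataFrame([('a', True), ('b', False), ('d', True)], ['X1', 'X3'])
# A full join keeps keys from either side and fills the missing side with null
A.join(B, 'X1', how='full').orderBy('X1', ascending=True).show()
# A left_anti join keeps only the A rows whose X1 has no match in B (here: 'c')
A.join(B, 'X1', how='left_anti').show()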
Group Data

#GroupBy and aggregate
df.groupBy(['A'])
  .agg(F.min('B').alias('min_b'),
       F.max('B').alias('max_b'),
       F.avg('C').alias('avg_c')).show()

#GroupBy and aggregate with a user-defined function
def quant_pd(val_list):
    quant = np.round(np.percentile(val_list, [20, 50, 75]), 2)
    return list(map(float, quant))

Fn = F.udf(quant_pd, ArrayType(FloatType()))

df.groupBy(['A'])
  .agg(F.min('B').alias('min_b'),
       F.max('B').alias('max_b'),
       Fn(F.collect_list(col('C'))).alias('list_c'))
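A minimal sketch of running the aggregation with the UDF above, assuming an active SparkSession `spark`; the sample df is illustrative, and the imports are the ones the snippets rely on:

import numpy as np
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import ArrayType, FloatType

df = spark.createDataFrame(
    [('m', 1, 4.0), ('m', 2, 4.5), ('n', 3, 7.0), ('n', 4, 8.0)],
    ['A', 'B', 'C'])
# quant_pd and Fn as defined above
(df.groupBy(['A'])
   .agg(F.min('B').alias('min_b'),
        F.max('B').alias('max_b'),
        Fn(F.collect_list(col('C'))).alias('list_c'))
   .show(truncate=False))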
Subset Observations (Rows)

Function        Description
df.na.drop()    #Omitting rows with null values
df.where()      #Filters rows using the given condition
df.filter()     #Filters rows using the given condition
df.distinct()   #Returns distinct rows in this DataFrame
df.sample()     #Returns a sampled subset of this DataFrame

Summary Function

df.describe()                  #Computes simple statistics
df.agg(F.max(df.C)).head()[0]  #Similar for: F.min, F.avg, F.stddev
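A minimal sketch of the subsetting and summary calls, assuming an active SparkSession `spark`; the sample df is illustrative:

from pyspark.sql import functions as F

df = spark.createDataFrame([('a', 1), ('b', None), ('a', 3)], ['key', 'C'])
df.na.drop().show()              # drops the row whose C is null
df.filter(df.C > 1).show()       # the same condition could be given to df.where()
df.sample(False, 0.5).show()     # random subset, no replacement, roughly 50% of rows
df.describe().show()             # count, mean, stddev, min, max
df.agg(F.max(df.C)).head()[0]    # a single aggregated value (here 3)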
DataFrame Operations

[Illustration: two DataFrames Y and Z, each with columns X1 and X2, combined into one DataFrame]
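A hedged sketch of set-style operations on two DataFrames with matching schemas, assuming an active SparkSession `spark`; the names Y and Z come from the illustration, and the specific calls are assumptions, not taken from the original sheet:

Y = spark.createDataFrame([('a', 1), ('b', 2)], ['X1', 'X2'])
Z = spark.createDataFrame([('b', 2), ('c', 3)], ['X1', 'X2'])
Y.union(Z).show()      # append the rows of Z to Y (duplicates kept)
Y.intersect(Z).show()  # rows that appear in both Y and Z
Y.subtract(Z).show()   # rows of Y that do not appear in Z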
Windows

[Illustration: a DataFrame with columns A, B, C gains a new window-computed column D]
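A hedged sketch of the window pattern the illustration suggests, assuming an active SparkSession `spark`; the choice of F.rank() over a window partitioned by B and ordered by C is an assumption, not taken from the original sheet:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

df = spark.createDataFrame(
    [('a', 'm', 1), ('b', 'm', 2), ('c', 'n', 3), ('d', 'n', 6)],
    ['A', 'B', 'C'])
w = Window.partitionBy('B').orderBy('C')
# D is a window-computed column: the rank of C within each group of B
df.withColumn('D', F.rank().over(w)).show()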