Professional Documents
Culture Documents
Wide Transformation Key-Value Pair in PySpark
Wide Transformation Key-Value Pair in PySpark
Out[4]:
In [9]:
In [12]:
# Create a pair (key-value) RDD of (int, int) tuples; note key 3 appears three
# times, including the duplicate pair (3, 4) — used below to illustrate
# countByValue vs. countByKey-style behavior.
data = sc.parallelize([(1,2),(3,4),(3,6),(3,4)])
data.collect()
Out[12]:
[(1, 2), (3, 4), (3, 6), (3, 4)]
In [13]:
type(data)
Out[13]:
pyspark.rdd.RDD
In [14]:
data.count()
Out[14]:
4
In [15]:
data.countByValue()
Out[15]:
defaultdict(int, {(1, 2): 1, (3, 4): 2, (3, 6): 1})
In [16]:
dataStr = sc.parallelize([(1,'mike'),(2,'john'),(3,'rambo'),(4,'bill')])
dataStr.collect()
Out[16]:
[(1, 'mike'), (2, 'john'), (3, 'rambo'), (4, 'bill')]
In [17]:
dataStr.count()
Out[17]:
4
In [21]:
dataStr.countByValue()
Out[21]:
defaultdict(int,
{(1, 'mike'): 1, (2, 'john'): 1, (3, 'rambo'): 1, (4, 'bill'): 1})
In [5]:
data.top(2)
Out[5]:
[(3, 6), (3, 4)]
In [18]:
data.collect()
Out[18]:
[(1, 2), (3, 4), (3, 6), (3, 4)]
sortByKey
In [6]:
data.sortByKey().collect()
Out[6]:
[(1, 2), (3, 4), (3, 6), (3, 4)]
In [7]:
In [8]:
data.keys().collect()
Out[8]:
[1, 3, 3, 3]
In [9]:
data.values().collect()
Out[9]:
[2, 4, 6, 4]
In [10]:
data.mapValues(lambda a : a*a).collect()
Out[10]:
[(1, 4), (3, 16), (3, 36), (3, 16)]
In [11]:
In [5]:
Image(filename='ReduceByKey.jpg')
Out[5]:
In [19]:
data.collect()
Out[19]:
[(1, 2), (3, 4), (3, 6), (3, 4)]
In [20]:
data.reduceByKey(lambda x, y : x+y).collect()
Out[20]:
[(1, 2), (3, 14)]
In [21]:
data.reduceByKey(max).collect()
Out[21]:
[(1, 2), (3, 6)]
In [6]:
Image(filename='GroupByKey.jpg')
Image(filename='GroupByKey.jpg')
Out[6]:
In [27]:
# groupByKey: This transformation groups all the values with the same key
# into a single (key, iterable-of-values) row.
result = data.groupByKey().collect()
In [28]:
result
Out[28]:
[(1, <pyspark.resultiterable.ResultIterable at 0x1a4b23c4048>),
(3, <pyspark.resultiterable.ResultIterable at 0x1a4b23c4bc8>)]
In [29]:
# Print each key with its grouped values materialized as a list —
# groupByKey yields lazy ResultIterable objects whose repr is unreadable.
# (The loop body's indentation was lost in the transcript; restored here.)
for (k, v) in result:
    print(k, list(v))
1 [2]
3 [4, 6, 4]
In [ ]:
In [30]:
aa = data.groupByKey().mapValues(sum)
In [31]:
aa.collect()
Out[31]:
[(1, 2), (3, 14)]
In [32]:
bb = data.groupByKey().mapValues(max)
In [33]:
bb.collect()
Out[33]:
[(1, 2), (3, 6)]
In [34]:
# reduceByKey = groupByKey().mapValues(), then what is the difference ??
# Answer: reduceByKey combines values on the map side before shuffling,
# so far less data crosses the network than with groupByKey.
In [35]:
data.collect()
Out[35]:
[(1, 2), (3, 4), (3, 6), (3, 4)]
In [36]:
data.flatMapValues(lambda x: range(1, x)).collect()
Out[36]:
[(1, 1),
(3, 1),
(3, 2),
(3, 3),
(3, 1),
(3, 2),
(3, 3),
(3, 4),
(3, 5),
(3, 1),
(3, 2),
(3, 3)]
In [37]:
data.collect()
Out[37]:
[(1, 2), (3, 4), (3, 6), (3, 4)]
In [38]:
data2 = sc.parallelize([(3,9)])
data2.collect()
Out[38]:
[(3, 9)]
In [39]:
data.subtractByKey(data2).collect()
Out[39]:
[(1, 2)]
In [40]:
data2.subtractByKey(data).collect()
Out[40]:
[]
In [41]:
Image(filename='Join.jpg')
Out[41]:
In [47]:
data.collect()
Out[47]:
[(1, 2), (3, 4), (3, 6), (3, 4)]
In [43]:
data2 = sc.parallelize([(3,9),(4,15)])
data2.collect()
Out[43]:
[(3, 9), (4, 15)]
In [44]:
data.join(data2).collect()
Out[44]:
[(3, (4, 9)), (3, (6, 9)), (3, (4, 9))]
In [48]:
data2.join(data).collect()
Out[48]:
[(3, (9, 4)), (3, (9, 6)), (3, (9, 4))]
In [45]:
# rightOuterJoin: Perform a join between two RDDs where every key of the
# OTHER (right) RDD is kept; keys missing from self appear with None on
# the left side — see (4, (None, 15)) in the output below.
data.rightOuterJoin(data2).collect()
Out[45]:
[(3, (4, 9)), (3, (6, 9)), (3, (4, 9)), (4, (None, 15))]
In [49]:
data2.rightOuterJoin(data).collect()
Out[49]:
[(1, (None, 2)), (3, (9, 4)), (3, (9, 6)), (3, (9, 4))]
In [46]:
# leftOuterJoin: Perform a join between two RDDs where every key of the
# FIRST (self) RDD is kept; keys missing from the other RDD appear with
# None on the right side — see (1, (2, None)) in the output below.
data.leftOuterJoin(data2).collect()
Out[46]:
[(1, (2, None)), (3, (4, 9)), (3, (6, 9)), (3, (4, 9))]
In [50]:
data2.leftOuterJoin(data).collect()
Out[50]:
[(3, (9, 4)), (3, (9, 6)), (3, (9, 4)), (4, (15, None))]
In [8]:
sc.stop()
In [ ]: