
In [4]:

from IPython.display import Image


Image(filename='RDD_Actions_Transformations.jpg')

Out[4]:

In [9]:

from pyspark import SparkContext, SparkConf


sc = SparkContext()
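
SparkConf is imported above but never used; as an aside, the context can also be configured explicitly. A minimal commented-out sketch (the app name and master below are illustrative, not from this notebook):

# conf = SparkConf().setAppName('pair-rdd-demo').setMaster('local[*]')
# sc = SparkContext(conf=conf)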

In [12]:

data = sc.parallelize([(1,2),(3,4),(3,6),(3,4)])
data.collect()

Out[12]:
[(1, 2), (3, 4), (3, 6), (3, 4)]

In [13]:
type(data)

Out[13]:
pyspark.rdd.RDD

In [14]:
data.count()
Out[14]:
4

In [15]:

data.countByValue()
Out[15]:
defaultdict(int, {(1, 2): 1, (3, 4): 2, (3, 6): 1})
In [16]:
dataStr = sc.parallelize([(1,'mike'),(2,'john'),(3,'rambo'),(4,'bill')])
dataStr.collect()
Out[16]:
[(1, 'mike'), (2, 'john'), (3, 'rambo'), (4, 'bill')]

In [17]:
dataStr.count()

Out[17]:
4

In [21]:
dataStr.countByValue()
Out[21]:
defaultdict(int,
            {(1, 'mike'): 1, (2, 'john'): 1, (3, 'rambo'): 1, (4, 'bill'): 1})

In [5]:
data.top(2)

Out[5]:
[(3, 6), (3, 4)]

In [18]:
data.collect()
Out[18]:

[(1, 2), (3, 4), (3, 6), (3, 4)]

sortByKey

In [6]:
data.sortByKey().collect()
Out[6]:
[(1, 2), (3, 4), (3, 6), (3, 4)]
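
sortByKey also accepts an ascending flag, and sortBy sorts by an arbitrary key function; a short sketch against the same data (outputs omitted):

data.sortByKey(ascending=False).collect()  # keys in descending order
data.sortBy(lambda kv: kv[1]).collect()    # sort by value instead of key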

In [7]:

# lookup: Return all values associated with the given key.


data.lookup(3)
Out[7]:
[4, 6, 4]
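
lookup is an action, so the matching values come straight back to the driver. A minimal sketch of the same result kept as an RDD until the final collect (the expected-output comment is my own):

# Equivalent as a transformation chain instead of an action:
data.filter(lambda kv: kv[0] == 3).values().collect()  # expected: [4, 6, 4]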

In [8]:
data.keys().collect()
Out[8]:
[1, 3, 3, 3]

In [9]:
data.values().collect()
Out[9]:
[2, 4, 6, 4]

In [10]:
# mapValues: apply a function to each value, leaving the keys untouched.
data.mapValues(lambda a: a * a).collect()
Out[10]:
[(1, 4), (3, 16), (3, 36), (3, 16)]

In [11]:

# map() and reduce() are not key-aware: they hand your function the whole
# (key, value) tuple instead of unpacking it, so prefer the *Values and
# *ByKey variants on pair RDDs.
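
A minimal sketch of the manual unpacking this forces; both lines use only the data RDD defined above, and the result comments are my own expectations:

# map() receives each pair as one tuple, so index into it yourself:
data.map(lambda kv: (kv[0], kv[1] * kv[1])).collect()  # same result as mapValues above

# reduce() folds whole tuples and ignores keys; project out the values first:
data.map(lambda kv: kv[1]).reduce(lambda a, b: a + b)  # 16, the sum of all values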

In [5]:
Image(filename='ReduceByKey.jpg')
Out[5]:

In [19]:
data.collect()
Out[19]:
[(1, 2), (3, 4), (3, 6), (3, 4)]

In [20]:
# reduceByKey: merge the values for each key with the given function.
data.reduceByKey(lambda x, y: x + y).collect()
Out[20]:
[(1, 2), (3, 14)]

In [21]:
data.reduceByKey(max).collect()
Out[21]:
[(1, 2), (3, 6)]

In [6]:
Image(filename='GroupByKey.jpg')
Out[6]:

In [27]:
# groupByKey: This transformation groups all the values with the same key into a single iterable.
result = data.groupByKey().collect()

In [28]:
result
Out[28]:
[(1, <pyspark.resultiterable.ResultIterable at 0x1a4b23c4048>),
(3, <pyspark.resultiterable.ResultIterable at 0x1a4b23c4bc8>)]

In [29]:
for (k, v) in result:
    print(k, list(v))

1 [2]
3 [4, 6, 4]

In [ ]:

In [30]:
aa = data.groupByKey().mapValues(sum)

In [31]:
aa.collect()
Out[31]:
[(1, 2), (3, 14)]

In [32]:
bb = data.groupByKey().mapValues(max)

In [33]:
bb.collect()
Out[33]:
[(1, 2), (3, 6)]

In [34]:
# reduceByKey looks the same as groupByKey().mapValues(...), so what is the difference?
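
The difference is where the combining happens: reduceByKey merges values on each partition before the shuffle (like a MapReduce combiner), while groupByKey ships every value across the network first and only then lets you fold them. As a minimal sketch, the classic per-key average carries (sum, count) pairs through reduceByKey; the expected-output comment is my own, derived from the data above:

# Carry (sum, count) pairs so the merge stays associative and runs map-side:
sum_count = data.mapValues(lambda v: (v, 1)) \
                .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
sum_count.mapValues(lambda s: s[0] / s[1]).collect()
# expected: [(1, 2.0), (3, 4.666...)]   (key 3: 14 / 3)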

In [35]:
data.collect()
Out[35]:
[(1, 2), (3, 4), (3, 6), (3, 4)]

In [36]:
# flatMapValues: expand each value v into the pairs (k, 1), ..., (k, v - 1),
# keeping the key for every emitted element.
data.flatMapValues(lambda x: range(1, x)).collect()
Out[36]:
[(1, 1),
(3, 1),
(3, 2),
(3, 3),
(3, 1),
(3, 2),
(3, 3),
(3, 4),
(3, 5),
(3, 1),
(3, 2),
(3, 3)]

In [37]:
data.collect()
Out[37]:
[(1, 2), (3, 4), (3, 6), (3, 4)]

In [38]:
data2 = sc.parallelize([(3,9)])
data2.collect()
Out[38]:
[(3, 9)]

In [39]:
# subtractByKey: keep only the pairs whose key has no match in the other RDD.
data.subtractByKey(data2).collect()
Out[39]:
[(1, 2)]

In [40]:
data2.subtractByKey(data).collect()
Out[40]:
[]

In [41]:
Image(filename='Join.jpg')
Out[41]:

In [47]:
data.collect()
Out[47]:
[(1, 2), (3, 4), (3, 6), (3, 4)]

In [43]:
data2 = sc.parallelize([(3,9),(4,15)])
data2.collect()
Out[43]:
[(3, 9), (4, 15)]

In [44]:
# join: inner join on key; only keys present in both RDDs survive.
data.join(data2).collect()
Out[44]:
[(3, (4, 9)), (3, (6, 9)), (3, (4, 9))]

In [48]:
data2.join(data).collect()
Out[48]:
[(3, (9, 4)), (3, (9, 6)), (3, (9, 4))]

In [45]:
# rightOuterJoin: join two RDDs keeping every key of the OTHER (right) RDD;
# missing left-side values become None.
data.rightOuterJoin(data2).collect()
Out[45]:
[(3, (4, 9)), (3, (6, 9)), (3, (4, 9)), (4, (None, 15))]

In [49]:
data2.rightOuterJoin(data).collect()
Out[49]:
[(1, (None, 2)), (3, (9, 4)), (3, (9, 6)), (3, (9, 4))]

In [46]:
# leftOuterJoin: join two RDDs keeping every key of the FIRST (left) RDD;
# missing right-side values become None.
data.leftOuterJoin(data2).collect()
Out[46]:
[(1, (2, None)), (3, (4, 9)), (3, (6, 9)), (3, (4, 9))]

In [50]:
data2.leftOuterJoin(data).collect()
Out[50]:
[(3, (9, 4)), (3, (9, 6)), (3, (9, 4)), (4, (15, None))]
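
All of these join variants are built on cogroup, which pairs up the complete value iterables from both RDDs for each key. A minimal sketch; the expected-output comment is derived from the data above, and key order may vary:

# cogroup: per key, a pair of iterables (values from data, values from data2):
[(k, (sorted(a), sorted(b))) for k, (a, b) in data.cogroup(data2).collect()]
# expected: [(1, ([2], [])), (3, ([4, 4, 6], [9])), (4, ([], [15]))]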

In [8]:
sc.stop()

In [ ]:
