Professional Documents
Culture Documents
data: List[String] = List(spark, scala, spark, spark, spark, scala, java, scala)
reduceByKey
scala> mapData.reduceByKey(_+_).collect.foreach(println)
(spark,4)
(scala,3)
(java,1)
groupByKey
scala> mapData.groupByKey().map(x => (x._1 , x._2.sum) ).collect.foreach(println)
(spark,4)
(scala,3)
(java,1)
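Both of the above transformations (reduceByKey and groupByKey) produce the same
output. However, they do not do the same amount of work: reduceByKey combines the
values for each key within every partition before shuffling, whereas groupByKey
shuffles every key-value pair across the network and only then aggregates, so
reduceByKey is usually the more efficient choice on large datasets.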
scala> data.partitions.length
res3: Int = 4
scala> data.glom().collect
res4: Array[Array[Double]] = Array(Array(1.0, 2.0, 3.0, 4.0), Array(5.0,
6.0, 7.0, 7.0, 8.9), Array(12.0, 34.0, 5.0, 4.0, 76.0), Array(90.0, 87.0, 87.0,
65.0, 36.0))
scala> data.glom().collect
res6: Array[Array[Double]] = Array(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0),
Array(7.0, 7.0, 8.9, 12.0, 34.0, 5.0), Array(4.0, 76.0, 90.0, 87.0, 87.0, 65.0,
36.0))
scala>