
val x = sc.parallelize(List("spark rdd example", "sample example"))

Or with partition

val x = sc.parallelize(List("spark rdd example", "sample example"), 2)

x.collect()

val textFileLocalTest = sc.textFile("/Users/syedrizvi/Desktop/HadoopExamples/file.txt");

val textFile = sc.textFile("hdfs://localhost:9000/test.txt")

FlatMap
val x = sc.parallelize(List("spark rdd example", "sample example"))
val y = x.flatMap(x => x.split(" "))
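Collecting y shows the split-up words; with this input the result should be (spark-shell formatting may differ slightly):

y.collect()
// expected: Array(spark, rdd, example, sample, example)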

Map
val z = y.map(x => (x, 1));
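Collecting z pairs each word with a count of 1; with the input above the result should be:

z.collect()
// expected: Array((spark,1), (rdd,1), (example,1), (sample,1), (example,1))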

Filter
val x = sc.parallelize(1 to 10)

Or with partition

val x = sc.parallelize(1 to 10, 2)


val y = x.filter(num => num%2==0)
y.collect();
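Only the even numbers pass the filter, so the collect above should return:

// expected: Array(2, 4, 6, 8, 10)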

Reduce
val x = sc.parallelize(1 to 10, 2)
val y = x.reduce((a, b) => (a+b))
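reduce sums the elements here, so for 1 to 10 the result should be:

// y: Int = 55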

Pair RDD Operations


GroupBy
val x = sc.parallelize(Array("Joseph", "Jimmy", "Tina","Thomas", "James", "Cory","Christine", "Jackeline",
"Juan"))
val y = x.groupBy(word => word.charAt(0))

y.collect();
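groupBy keys each name by its first character, so the collect should return something like the following (the ordering of groups, and of names within a group, may vary):

// expected: Array((T,CompactBuffer(Tina, Thomas)), (C,CompactBuffer(Cory, Christine)), (J,CompactBuffer(Joseph, Jimmy, James, Jackeline, Juan)))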

ReduceByKey
val x = sc.parallelize(Array(("a", 1), ("b", 1), ("a", 1),("a", 1), ("b", 1),("b", 1),("b", 1), ("b", 1)))
val y = x.reduceByKey((a, b) => a + b)
y.collect()
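With three ("a", 1) pairs and five ("b", 1) pairs in the input, the per-key sums should be (order may vary):

// expected: Array((a,3), (b,5))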

SortByKey
val y = x.sortByKey()
y.collect()
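sortByKey orders the pairs by key, so every "a" entry should come before the "b" entries:

// expected: Array((a,1), (a,1), (a,1), (b,1), (b,1), (b,1), (b,1), (b,1))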

Joins
val salesprofit = sc.parallelize(Array(("Cadbury's", 3.5),("Nestle", 2.8),("Mars", 2.5), ("Thorton's", 2.2)));

val salesyear = sc.parallelize(Array(("Cadbury's", 2015),("Nestle", 2014),("Mars", 2014), ("Thorton's", 2013)));

val join = salesprofit.join(salesyear);

join.collect();
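join matches the two pair RDDs on their keys, producing a (profit, year) tuple per company (order may vary):

// expected: Array((Nestle,(2.8,2014)), (Cadbury's,(3.5,2015)), (Mars,(2.5,2014)), (Thorton's,(2.2,2013)))
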
Spark SQL

val sqlContext = new org.apache.spark.sql.SQLContext(sc);

val df = sqlContext.read.json("/Users/syedrizvi/Desktop/HadoopExamples/Spark/sample.json")
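read.json expects one JSON object per line. The actual sample.json isn't shown here, but for the queries below to work it would need name and age fields, e.g. (my assumption):

{"name":"Michael", "age":25}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}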

df.show();

df.printSchema();

df.select("name").show();

df.select(df("name"),df("age")+1).show();

df.filter(df("age")>21).show()

df.groupBy("age").count().show();

Creating Temp Views


df.createOrReplaceTempView("people")
val sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show();

Creating Datasets on the fly

case class Person(name: String, age: Long)


val caseClassDS = Seq(Person("Andy", 32)).toDS()
caseClassDS.show()

val primitiveDS = Seq(1, 2, 3).toDS()


primitiveDS.map(_ + 1).collect()
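Mapping _ + 1 over Seq(1, 2, 3) should return:

// expected: Array(2, 3, 4)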

Creating Schemas with Reflection

val sqlContext = new org.apache.spark.sql.SQLContext(sc);

case class Person(name: String, age: Long)
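The code below assumes people.txt contains comma-separated name,age lines; the file isn't shown here, but contents like these would work (my assumption):

Michael, 29
Andy, 30
Justin, 19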

val peopleDF = spark.sparkContext.textFile("/Users/syedrizvi/Desktop/HadoopExamples/Spark/people.txt").map(_.split(",")).map(attributes => Person(attributes(0), attributes(1).trim.toInt)).toDF();

peopleDF.createOrReplaceTempView("people")

val teenagersDF = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")

teenagersDF.map(teenager => "Name: " + teenager(0)).show()

teenagersDF.map(teenager => "Name: " + teenager.getAs[String]("name")).show()


Interacting with Hive
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession

val warehouseLocation = "spark-warehouse"

val spark = SparkSession.builder().appName("Spark Hive Example").config("spark.sql.warehouse.dir", warehouseLocation).enableHiveSupport().getOrCreate()

import spark.implicits._
import spark.sql

sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")

sql("LOAD DATA LOCAL INPATH '/Users/syedrizvi/Desktop/HadoopExamples/Spark/kv1.txt' INTO TABLE


src")

sql("SELECT * FROM src").show()

sql("select current_database()").show(false)

Spark Streaming
To run netcat (the input source for the example):

nc -lk 9999

To run the bundled NetworkWordCount example:

/usr/local/Cellar/apache-spark/2.1.0/bin/run-example streaming.NetworkWordCount localhost 9999

Your own word count

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

val ssc = new StreamingContext(sc, Seconds(1))


val lines = ssc.socketTextStream("localhost", 9999)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
wordCounts.print()
ssc.start()
ssc.awaitTermination()
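Once the context is started, type some words into the nc session; each one-second batch should print the counts, e.g. typing "hello world hello" should produce something like:

// (hello,2)
// (world,1)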
