
Actions-RDD

foreach example
from pyspark import SparkContext

sc = SparkContext("local", "ForEachExample")
rdd = sc.parallelize([1, 2, 3, 4, 5])

def my_function(x):
    print(x)

rdd.foreach(my_function)

sc.stop()
foreachPartition example
from pyspark import SparkContext

sc = SparkContext("local", "ForEachPartitionExample")
rdd = sc.parallelize([1, 2, 3, 4, 5], 2) # Creating 2 partitions

def my_partition_function(iterator):
    for x in iterator:
        print(x)

rdd.foreachPartition(my_partition_function)

sc.stop()
Fold() example
from pyspark.sql import SparkSession
# Create a Spark session
spark = SparkSession.builder.appName("FoldExample").getOrCreate()
# Create an RDD of numbers
numbers_rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])
# Define the binary function for multiplication
def multiply(x, y):
    return x * y
# Use the fold function
product_result = numbers_rdd.fold(1, multiply)
# Print the result
print("Product using fold:", product_result)
# Stop the Spark session
spark.stop()
Reduce() example
from pyspark.sql import SparkSession
# Create a Spark session
spark = SparkSession.builder.appName("ReduceExample").getOrCreate()
# Create an RDD of numbers
numbers_rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])
# Define the binary function for addition
def add(x, y):
    return x + y
# Use the reduce function
sum_result = numbers_rdd.reduce(add)
# Print the result
print("Sum using reduce:", sum_result)
# Stop the Spark session
spark.stop()
Aggregate() example
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Create a Spark session
spark = SparkSession.builder.appName("AggregateExample").getOrCreate()
# Create an RDD of numbers
numbers_rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 2)
# Define the zero value and the aggregate functions
zero_value = (0, 1)  # Accumulator for (sum, product)
def seq_op(accumulator, element):
    # Update the accumulator by adding the element to the sum and multiplying it into the product
    return (accumulator[0] + element, accumulator[1] * element)
def comb_op(acc1, acc2):
    # Combine two accumulators by adding their sums and multiplying their products
    return (acc1[0] + acc2[0], acc1[1] * acc2[1])
# Use the aggregate function
(sum_result, product_result) = numbers_rdd.aggregate(zero_value, seq_op, comb_op)
# Print the results
print("Sum:", sum_result)
print("Product:", product_result)
# Stop the Spark session
spark.stop()
takeOrdered() example

from pyspark.sql import SparkSession


# Create a Spark session
spark = SparkSession.builder.appName("takeOrderedExample").getOrCreate()
# Sample data
data = [(3, "Alice"), (1, "Bob"), (5, "Charlie"), (2, "David"), (4, "Eve")]
# Create an RDD from the sample data
rdd = spark.sparkContext.parallelize(data)
# Take the top 3 elements based on the first element of each tuple (ascending order)
top_elements = rdd.takeOrdered(3, key=lambda x: x[0])
# Print the top elements
for element in top_elements:
    print(element)
# Stop the Spark session
spark.stop()
Sampling from an RDD
from pyspark.sql import SparkSession
# Create a Spark session
spark = SparkSession.builder.appName("takeSampleExample").getOrCreate()
# Sample data
data = list(range(1, 20))
# Create an RDD from the sample data
rdd = spark.sparkContext.parallelize(data)
# Take a random sample of 5 elements without replacement
sample_without_replacement = rdd.takeSample(False, 5)
# Take a random sample of 5 elements with replacement
sample_with_replacement = rdd.takeSample(True, 5)
# Print the samples
print("Sample without replacement:", sample_without_replacement)
print("Sample with replacement:", sample_with_replacement)
# Stop the Spark session
spark.stop()
Persistence in RDD
• Spark RDDs are lazily evaluated
• Hence Spark will recompute an RDD and its dependencies every time an action is called
• This can become expensive for iterative algorithms
• Persisting the data is a better option (see the sketch below)
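A minimal sketch of the recomputation cost, assuming a deliberately slow map function (slow_square, invented here) so the effect is visible; the printed timings will vary by machine.

import time
import findspark
findspark.init()
from pyspark import SparkContext

sc = SparkContext("local", "Recomputation Example")

def slow_square(x):
    time.sleep(0.1)  # simulate an expensive computation
    return x * x

squares = sc.parallelize(range(20)).map(slow_square)

# Without persist(), each action re-runs slow_square on every element
start = time.time()
squares.count()
squares.sum()
print("two actions without persist: %.2f s" % (time.time() - start))

# With persist(), later actions read cached results instead of recomputing
squares.persist()
squares.count()  # materializes the cache
start = time.time()
squares.sum()
print("action on cached RDD: %.2f s" % (time.time() - start))

sc.stop()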
Persist()
RDD.persist(storageLevel)
• storageLevel specifies where and how to persist the RDD. It is an optional argument that determines the storage level.
• Common storage levels include:
• MEMORY_ONLY: Cache the RDD in memory as deserialized Java objects (default).
• MEMORY_ONLY_SER: Cache the RDD in memory as serialized Java objects.
• MEMORY_AND_DISK: Cache the RDD in memory, and spill to disk if memory is not sufficient.
• MEMORY_AND_DISK_SER: Cache the RDD in memory as serialized Java objects, and spill to disk if memory is not sufficient.
• DISK_ONLY: Cache the RDD on disk.
Example of persist()
import findspark
findspark.init()
from pyspark.storagelevel import StorageLevel
from pyspark import SparkContext
sc = SparkContext("local", "RDD Persistence Example")
# Create an RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)
# Persist the RDD in memory as deserialized Java objects
rdd.persist(storageLevel=StorageLevel.MEMORY_ONLY)
# Perform some operations on the RDD
sum_result = rdd.reduce(lambda x, y: x + y)
print("Sum of elements:", sum_result)
# The RDD is cached in memory, so it can be reused without recomputation
product_result = rdd.map(lambda x: x * 2).collect()
print("Doubled elements:", product_result)
# Stop the SparkContext
sc.stop()
More about persist() in Spark
• If memory overflows, Spark evicts cached data based on an LRU (least recently used) policy
• unpersist() can be used to remove a cached RDD manually (see the sketch below)
• rdd.unpersist()
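A minimal sketch of unpersist(), assuming a small RDD; the MEMORY_AND_DISK level is chosen only for illustration.

import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.storagelevel import StorageLevel

sc = SparkContext("local", "Unpersist Example")
rdd = sc.parallelize([1, 2, 3, 4, 5])

# Cache the RDD (MEMORY_AND_DISK picked just as an example level)
rdd.persist(StorageLevel.MEMORY_AND_DISK)
print("Sum:", rdd.reduce(lambda x, y: x + y))  # first action materializes the cache

# Release the cached partitions once the RDD is no longer needed
rdd.unpersist()
print("Is cached:", rdd.is_cached)  # False after unpersist()

sc.stop()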
WORKING WITH (KEY,VALUE) PAIRS
Pair RDD
ETL is performed on an RDD to get it into (key, value) pairs
Special operations are defined on pair RDDs, for example:
reduceByKey()
join() (see the sketch after this list)
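reduceByKey() is demonstrated later in this section, but join() is not, so here is a minimal sketch of joining two pair RDDs on their keys; the fruit/price data is invented for illustration.

import findspark
findspark.init()
from pyspark import SparkContext

sc = SparkContext("local", "Pair RDD join Example")

# Two pair RDDs sharing some keys (sample data invented for this sketch)
names = sc.parallelize([(1, "apple"), (2, "banana"), (3, "cherry")])
prices = sc.parallelize([(1, 100), (2, 40), (4, 25)])

# join() keeps only keys present in both RDDs and pairs up their values
joined = names.join(prices)
for key, (name, price) in joined.collect():
    print(key, name, price)
# Only keys 1 and 2 appear in the output; keys 3 and 4 have no match

sc.stop()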
Creating pair rdd
import findspark
findspark.init()
from pyspark import SparkContext
# Create a SparkContext
sc = SparkContext("local", "Pair RDD Example")
# Create an RDD with tuples (key, value)
data = [(1, "apple"), (2, "banana"), (3, "cherry"), (4, "date"), (5, "elderberry")]
rdd = sc.parallelize(data)
# Now, 'rdd' is a Pair RDD
# Perform operations on the Pair RDD
# For example, let's filter the fruits with keys greater than 2
filtered_rdd = rdd.filter(lambda x: x[0] > 2)
# Collect and print the results
results = filtered_rdd.collect()
for result in results:
    print(result)
# Stop the SparkContext
sc.stop()
Note: Other programming languages such as Scala and Java require the RDD's data type to change (to a pair-RDD type) before key-based aggregate functions can be applied.
Transformations on Pair RDDs
reduceByKey()
import findspark
findspark.init()
from pyspark import SparkContext
# Create a SparkContext
sc = SparkContext("local", "reduceByKey Example")
# Create a Pair RDD with key-value pairs
data = [(1, 2), (2, 4), (1, 6), (2, 8), (3, 1)]
pair_rdd = sc.parallelize(data)

# Use reduceByKey to calculate the sum of values for each key


sum_rdd = pair_rdd.reduceByKey(lambda x, y: x + y)
# Collect and print the results
results = sum_rdd.collect()
for result in results:
    print("Key:", result[0], "Sum:", result[1])
# Stop the SparkContext
sc.stop()
groupByKey()
import findspark
findspark.init()
from pyspark import SparkContext
# Create a SparkContext
sc = SparkContext("local", "groupByKey Example")
# Create a Pair RDD with key-value pairs
data = [(1, 'apple'), (2, 'banana'), (1, 'cherry'), (2, 'date'), (3, 'elderberry')]
pair_rdd = sc.parallelize(data)
# Use groupByKey to group values by key
grouped_rdd = pair_rdd.groupByKey()
# Iterate through the grouped results and print them
for key, values in grouped_rdd.collect():
    print(f"Key: {key}, Values: {list(values)}")
# Stop the SparkContext
sc.stop()

Output:
Key: 1, Values: ['apple', 'cherry']
Key: 2, Values: ['banana', 'date']
Key: 3, Values: ['elderberry']
combineByKey()
from pyspark import SparkContext
# Create a SparkContext
sc = SparkContext("local", "combineByKey Example")
# Create a Pair RDD with student scores
data = [("Alice", 85), ("Bob", 90), ("Alice", 78), ("Bob", 88), ("Alice", 92)]
pair_rdd = sc.parallelize(data)
# Use combineByKey to calculate the average score for each student
# - createCombiner initializes an accumulator (sum, count) for each key
# - mergeValue adds a new score to the accumulator
# - mergeCombiners combines the accumulators from different partitions
def createCombiner(score):
    return (score, 1)
def mergeValue(accumulator, score):
    total_score, count = accumulator
    return (total_score + score, count + 1)
def mergeCombiners(accumulator1, accumulator2):
    total_score1, count1 = accumulator1
    total_score2, count2 = accumulator2
    return (total_score1 + total_score2, count1 + count2)
average_scores_rdd = pair_rdd.combineByKey(createCombiner, mergeValue, mergeCombiners)
# Calculate the average score for each student
average_scores = average_scores_rdd.map(lambda x: (x[0], x[1][0] / x[1][1]))
# Collect and print the results
results = average_scores.collect()
for result in results:
    print("Student:", result[0], "Average Score:", result[1])
# Stop the SparkContext
sc.stop()

OUTPUT:
Student: Alice Average Score: 85.0
Student: Bob Average Score: 89.0
