
Creating RDD

sample_data = sc.parallelize([('pk', 26), ('sm', 24), ('sr', 26), ('na', 23), ('pm', 26)])
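
A quick way to sanity-check the RDD (collect() pulls every record to the driver, so only use it on small samples like this):

sample_data.collect()   # [('pk', 26), ('sm', 24), ('sr', 26), ('na', 23), ('pm', 26)]
sample_data.count()     # 5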

sc.textFile('/local/folder/filename.csv'): from the local file system

sc.textFile('hdfs:///folder/filename.csv'): from HDFS

sc.textFile('s3://bucket/folder/filename.csv'): from an AWS S3 bucket

sc.textFile('dbfs:/folder/filename.csv'): from the Databricks file system (DBFS)
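
sc.textFile returns an RDD of raw text lines, so any CSV parsing has to be done by hand. A minimal sketch, assuming a comma-delimited file with a header row (the path and delimiter are illustrative):

lines = sc.textFile('/local/folder/filename.csv')
header = lines.first()                       # first line holds the column names
rows = (
    lines
    .filter(lambda line: line != header)     # drop the header line
    .map(lambda line: line.split(','))       # split each record into its fields
)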

DataFrame

sample_df = spark.createDataFrame(
    sample_data,
    ['Name', 'Age']   # column names must match the number of fields in each tuple
)
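
To inspect the result:

sample_df.printSchema()   # column names and types
sample_df.show()          # prints the first rows as a table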

From CSV

sample_data_csv = (
    spark
    .read
    .csv(
        '../Data/DataFrames_sample.csv',
        header=True,
        inferSchema=True
    )
)
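
inferSchema makes Spark scan the data an extra time to guess the column types; for larger files an explicit schema can be passed instead. A sketch with hypothetical columns (the real field names depend on the CSV):

import pyspark.sql.types as T

schema = T.StructType([
    T.StructField('Model', T.StringType(), True),   # hypothetical column
    T.StructField('Year', T.IntegerType(), True)    # hypothetical column
])

sample_data_csv = (
    spark
    .read
    .csv('../Data/DataFrames_sample.csv', header=True, schema=schema)
)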

From JSON

sample_data_json_df = (
    spark
    .read
    .json('../Data/DataFrames_sample.json')
)
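
By default spark.read.json expects one JSON object per line (JSON Lines). If the file is a single multi-line JSON document, the multiLine option is needed:

sample_data_json_df = (
    spark
    .read
    .json('../Data/DataFrames_sample.json', multiLine=True)
)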

Transforming a DataFrame

import pyspark.sql as sql
import pyspark.sql.functions as f

sample_data_transformed = (
    sample_data_csv
    .rdd
    # split the HDD column on the space into its size and type parts
    .map(lambda row: sql.Row(**row.asDict(), HDD_size=row.HDD.split(' ')[0]))
    .map(lambda row: sql.Row(**row.asDict(), HDD_type=row.HDD.split(' ')[1]))
    # derive the volume from the height, depth and width columns
    .map(lambda row: sql.Row(**row.asDict(), Volume=row.H * row.D * row.W))
    .toDF()
    .select(
        sample_data_csv.columns +
        [
            'HDD_size',
            'HDD_type',
            f.round(f.col('Volume')).alias('Volume_cuIn')
        ]
    )
)
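
The same derived columns can be produced without the round trip through the RDD API. A sketch using withColumn and the built-in split function, reusing the f alias imported above (this assumes, as above, that HDD is a space-separated string and that H, D, W are numeric):

sample_data_transformed = (
    sample_data_csv
    .withColumn('HDD_size', f.split(f.col('HDD'), ' ').getItem(0))   # first token, e.g. the size
    .withColumn('HDD_type', f.split(f.col('HDD'), ' ').getItem(1))   # second token, e.g. the type
    .withColumn('Volume_cuIn', f.round(f.col('H') * f.col('D') * f.col('W')))
)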
