You are on page 1 of 1

# PySpark pipeline: read the H-1B parquet dataset, keep a subset of columns,
# rename the wage/year columns, keep rows whose CASE_STATUS is "CERTIFIED",
# drop rows containing nulls, cast the columns to explicit types, remove
# employers whose name ends with "LLC", and write the result as CSV.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("H1B")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

parquet1DF = spark.read.parquet("h1_b_dataset.parquet")

# Keep only the columns used by the rest of the pipeline.
PARQUETdf11 = parquet1DF.select(
    "CASE_STATUS",
    "VISA_CLASS",
    "EMPLOYER_NAME",
    "JOB_TITLE",
    "PREVAILING_WAGE",
    "PW_SOURCE_YEAR",
    "WORKSITE_STATE",
)

# The original referenced `parquetDF11`, a name that was never defined;
# the DataFrame created above is `PARQUETdf11`.
p1 = (
    PARQUETdf11
    .withColumnRenamed("PREVAILING_WAGE", "SALARY")
    .withColumnRenamed("PW_SOURCE_YEAR", "FINANCIAL_YEAR")
)

p2 = p1.where(p1.CASE_STATUS == "CERTIFIED")
p3 = p2.na.drop()

# Cast every column to an explicit type.
p4 = p3.selectExpr(
    "cast(CASE_STATUS as string) CASE_STATUS",
    "cast(VISA_CLASS as string) VISA_CLASS",
    "cast(EMPLOYER_NAME as string) EMPLOYER_NAME",
    "cast(JOB_TITLE as string) JOB_TITLE",
    "cast(SALARY as double) SALARY",
    "cast(FINANCIAL_YEAR as integer) FINANCIAL_YEAR",
    "cast(WORKSITE_STATE as string) WORKSITE_STATE",
)

# Exclude employers whose name ends with "LLC". The predicate is built from
# p4's own column (the original used p3's column while filtering p4), and the
# earlier stand-alone `p3.filter(...)` was removed because its result was
# discarded.
p5 = p4.filter(~p4.EMPLOYER_NAME.endswith("LLC"))

# The original wrote `p6`, which was never defined; p5 is the final DataFrame.
p5.write.format('csv').option('header', True).option('sep', ',').save('c.csv')

m
er as
// Scala version of the H-1B pipeline: read the parquet dataset, keep a subset
// of columns, rename the wage/year columns, and keep rows whose CASE_STATUS is
// "CERTIFIED".
// NOTE(review): `spark` is not defined in the visible code - presumably the
// session provided by spark-shell; confirm.

// "header" and "InferSchema" are CSV reader options, so they are dropped from
// this parquet read.
val data = spark.read.parquet("h1_b_dataset.parquet")

val raw = data.select(
  "CASE_STATUS",
  "VISA_CLASS",
  "EMPLOYER_NAME",
  "JOB_TITLE",
  "PREVAILING_WAGE",
  "PW_SOURCE_YEAR",
  "WORKSITE_STATE"
)

val info = raw
  .withColumnRenamed("PREVAILING_WAGE", "SALARY")
  .withColumnRenamed("PW_SOURCE_YEAR", "FINANCIAL_YEAR")

val value = info.filter(info("CASE_STATUS") === "CERTIFIED")

// Rows whose EMPLOYER_NAME does not end with "LLC". Scala's Column negation is
// `!` (not Python's `~`) and the method is `endsWith`; the column-name string
// literal was also missing its closing quote.
val raws = value.filter(!value("EMPLOYER_NAME").endsWith("LLC"))

// True when at least one column of the row ends with "LLC".
val conditions = value.columns.map(value(_).endsWith("LLC")).reduce(_ or _)

// Rows for which `conditions` holds. Filtering on the predicate directly is
// equivalent to adding a temporary "condition" column, filtering on
// `=== true`, and dropping it again, and does not need the `$"..."` syntax
// from spark.implicits._.
// NOTE(review): this keeps the "LLC" rows, whereas `raws` removes them -
// confirm which result is the intended one.
val output = value.filter(conditions)
ed d
ar stu
is
Th
sh

This study source was downloaded by 100000834862076 from CourseHero.com on 10-19-2021 02:39:43 GMT -05:00

https://www.coursehero.com/file/101518475/spark-mini-projecttxt/
Powered by TCPDF (www.tcpdf.org)

You might also like