
val orders = spark.read.
  schema("order_id INT, order_date TIMESTAMP, order_customer_id INT, order_status STRING").
  csv("/FileStore/tables/retail_db/orders")

orders.printSchema
orders.show
orders.count

val orderItems = spark.read.
  schema("""order_item_id INT,
    order_item_order_id INT,
    order_item_product_id INT,
    order_item_quantity INT,
    order_item_subtotal FLOAT,
    order_item_product_price FLOAT""").
  csv("/FileStore/tables/retail_db/order_items")

orderItems.printSchema
orderItems.show

val products = spark.read.
  schema("""product_id INT,
    product_category_id INT,
    product_name STRING,
    product_description STRING,
    product_price FLOAT,
    product_image STRING""").
  csv("/FileStore/tables/retail_db/products")

products.printSchema
products.show
import org.apache.spark.sql.functions.{count, lit}

val orderStatusCount = orders.
  groupBy("order_status").
  agg(count(lit(1)).alias("order_count"))

display(orderStatusCount)

val ordersCompleted = orders.
  filter("order_status IN ('COMPLETE', 'CLOSED')")

orders.count
ordersCompleted.show
ordersCompleted.count
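
The joinResults DataFrame used in the next step is never defined in this extract. A minimal sketch of the likely join, assuming the standard retail_db key relationships (orders.order_id matches order_items.order_item_order_id, and order_items.order_item_product_id matches products.product_id):

// Bring order dates, product names, and line-item subtotals together
// so revenue can be aggregated per date and product below.
val joinResults = ordersCompleted.
  join(orderItems, ordersCompleted("order_id") === orderItems("order_item_order_id")).
  join(products, orderItems("order_item_product_id") === products("product_id")).
  select("order_date", "product_name", "order_item_subtotal")

joinResults.show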
import org.apache.spark.sql.functions.{sum, round}

val dailyProductRevenue = joinResults.
  groupBy("order_date", "product_name").
  agg(round(sum("order_item_subtotal"), 2).alias("revenue"))

dailyProductRevenue.show
import org.apache.spark.sql.functions.col
import spark.implicits._

val dailyProductRevenueSorted = dailyProductRevenue.
  orderBy($"order_date", col("revenue").desc)

dailyProductRevenueSorted.show(false)
// Reduce shuffle partitions so the sorted result is written as only a couple of files
spark.conf.set("spark.sql.shuffle.partitions", "2")

dailyProductRevenueSorted.
  write.
  mode("overwrite").
  csv("/FileStore/tables/retail_db/daily_product_revenue")

val dailyProductRevenue = spark.read.
  schema("order_date STRING, product_name STRING, revenue FLOAT").
  csv("/FileStore/tables/retail_db/daily_product_revenue")

display(dailyProductRevenue)
