
=======-

from pyspark.sql.functions import col, to_date, lag, when, lit
from pyspark.sql.window import Window

payments = [('15-02-2022',100),('16-02-2022',500),('17-02-2022',900),('18-02-2022',300),
            ('19-02-2022',400),('20-02-2022',120),('21-02-2022',1000)]

df_payments = spark.createDataFrame(payments,['Date','Payments'])
df_payments = df_payments.withColumn('Date', to_date(col('Date'),'dd-MM-yyyy'))

# window without partitionBy is fine for this small example (Spark will warn about a single partition)
windowSpec = Window.orderBy(col('Date'))

# lag gives the previous day's payment; comparing it with the current payment yields the flag
df_payments = df_payments.withColumn('Lag_Payments', lag('Payments',1).over(windowSpec)) \
    .withColumn('variance_flag', when(col('Lag_Payments').isNull(), lit(None))
                .when(col('Lag_Payments') > col('Payments'), -1)
                .when(col('Lag_Payments') < col('Payments'), 1)) \
    .drop('Lag_Payments')

df_payments.show()

+----------+--------+-------------+
| Date|Payments|variance_flag|
+----------+--------+-------------+
|2022-02-15| 100| null|
|2022-02-16| 500| 1|
|2022-02-17| 900| 1|
|2022-02-18| 300| -1|
|2022-02-19| 400| 1|
|2022-02-20| 120| -1|
|2022-02-21| 1000| 1|
+----------+--------+-------------+

=============-

with mytable as (select *, lag(Payments,1) over (order by date) as temp_pay from t1)
select date, Payments,
       case when temp_pay is null then null
            when Payments > temp_pay then 1
            else -1 end as variance_flag
from mytable;

=========-=

Using "lag" window function on "payments" column we can achieve this output.

==========-=

select *, case when lag(payment) over (order by date) is null then null
               when payment > lag(payment) over (order by date) then 1
               else -1 end as variance_flag
from payment;
========================-====================-======================================

Data Engineer Interview Question:


======================
Write a Spark program to produce the Output below from the given Input
Input :
=====
Team1,Team2,Winner
-----------------
India,Aus,India
Srilanka,Aus,Aus
Srilanka,India,India

Output :
======
Team,Total_match,Total_win,Total_loss
--------------------------------------
India,2,2,0
Srilanka,2,0,2
Aus,2,1,1

==========-==========

from pyspark.sql.functions import col, sum, coalesce, lit

lst_data = [("India","Aus","India"),("Srilanka","Aus","Aus"),("Srilanka","India","India")]
schema = ["Team1","Team2","Winner"]
df = spark.createDataFrame(lst_data, schema)

# matches played as team1 and as team2
df1 = df.groupBy("team1").count().withColumnRenamed("team1","team")
df2 = df.groupBy("team2").count().withColumnRenamed("team2","team")
# total matches per team
df3 = df1.unionAll(df2).groupBy("team").sum("count").withColumnRenamed("sum(count)","total_match")
# total wins per team
df4 = df.groupBy("Winner").count().withColumnRenamed("count","total_win")
# left join so teams without a win get 0, then derive losses
df5 = df3.join(df4, df3.team == df4.Winner, "left") \
    .withColumn("total_win", coalesce(df4.total_win, lit(0))) \
    .select("team", "total_match", "total_win") \
    .withColumn("total_loss", col("total_match") - col("total_win"))

===========-=========

select t.team, count(*) as total_match, sum(t.win) as total_win, sum(t.loss) as total_loss
from (select Team2 as team,
             case when Team2 = Winner then 1 else 0 end as win,
             case when Team2 != Winner then 1 else 0 end as loss
      from temp
      union all
      select Team1 as team,
             case when Team1 = Winner then 1 else 0 end as win,
             case when Team1 != Winner then 1 else 0 end as loss
      from temp) t
group by t.team;

=========-======

val a=List(("India","Aus","India"),("Japan","Aus","Aus"),("Japan","India","India"))
val df=a.toDF("team1","team2","win")
+-----+-----+-----+
|team1|team2|  win|
+-----+-----+-----+
|India|  Aus|India|
|Japan|  Aus|  Aus|
|Japan|India|India|
+-----+-----+-----+
val df2=df.select("team1").union(df.select("team2"))
val df3=df2.groupBy("team1").count().withColumnRenamed("count","Total_Matches")
+-----+-------------+
|team1|Total_Matches|
+-----+-------------+
|India|            2|
|  Aus|            2|
|Japan|            2|
+-----+-------------+
val df4=df.groupBy("win").count().withColumnRenamed("count","winner")
+-----+------+
|  win|winner|
+-----+------+
|India|     2|
|  Aus|     1|
+-----+------+
df3.join(df4, col("team1") === col("win"), "left").drop("win").na.fill(0)
   .withColumn("loss", col("Total_Matches") - col("winner")).show
+-----+-------------+------+----+
|team1|Total_Matches|winner|loss|
+-----+-------------+------+----+
|India|            2|     2|   0|
|  Aus|            2|     1|   1|
|Japan|            2|     0|   2|
+-----+-------------+------+----+
==============================-======

WITH gro AS (SELECT team1 team FROM tri
             UNION
             SELECT team2 FROM tri)
SELECT team, tm.total_match, NVL(total_win, 0) tot_win,
       (tm.total_match - NVL(total_win, 0)) total_loss
  FROM gro,
       (SELECT winner, COUNT(*) total_win FROM tri GROUP BY winner) win,
       (SELECT team1, COUNT(*) total_match
          FROM (SELECT team1 FROM tri UNION ALL SELECT team2 FROM tri)
         GROUP BY team1) tm
 WHERE gro.team = win.winner(+) AND tm.team1 = gro.team;

==============-=

Hope the query below gives the solution if you use Spark SQL:

With teams_cte as
(Select team1 as team, (case when team1 = winner then 1 else 0 end) as won from table
 Union all
 Select team2 as team, (case when team2 = winner then 1 else 0 end) as won from table)
Select team, count(*) as tot_matches, sum(won) as total_won, count(*) - sum(won) as total_loss
From teams_cte
Group by team;
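
For example, one way to run an equivalent query from PySpark (a sketch; the view name "matches" and the DataFrame df from the earlier answer are assumptions here):

df.createOrReplaceTempView("matches")

spark.sql("""
    with teams_cte as
    (select Team1 as team, case when Team1 = Winner then 1 else 0 end as won from matches
     union all
     select Team2 as team, case when Team2 = Winner then 1 else 0 end as won from matches)
    select team, count(*) as tot_matches, sum(won) as total_won, count(*) - sum(won) as total_loss
    from teams_cte
    group by team
""").show()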

==============-==========

This is a very good question. I was asked this in one of my interviews, but in SQL. I just attempted to solve it using PySpark.

from pyspark.sql.functions import col, count

# stack Team1 and Team2 into a single Team column
df_t1 = df.select(col("Team1").alias("Team"))
df_t2 = df.select(col("Team2").alias("Team"))
df_all_teams = df_t1.union(df_t2)

# total matches per team
df_all_teams_Agg = df_all_teams.groupby("Team").agg(count("Team").alias("Total_matchs"))
df_all_teams_Agg.show(truncate=False)

# total wins per team
df_winner = df.groupby("Winner").agg(count("Winner").alias("Total_Won_matchs"))
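
The snippet stops before combining the two aggregates; a possible final step (my addition, not part of the original answer) is a left join with missing win counts defaulted to zero:

from pyspark.sql.functions import coalesce, lit

df_result = df_all_teams_Agg.join(df_winner, df_all_teams_Agg.Team == df_winner.Winner, "left") \
    .withColumn("Total_win", coalesce(col("Total_Won_matchs"), lit(0))) \
    .withColumn("Total_loss", col("Total_matchs") - col("Total_win")) \
    .select("Team", "Total_matchs", "Total_win", "Total_loss")
df_result.show(truncate=False)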
