
=======-

from pyspark.sql.functions import col, to_date, lag, when, lit
from pyspark.sql.window import Window

payments = [('15-02-2022',100),('16-02-2022',500),('17-02-2022',900),('18-02-2022',300),
            ('19-02-2022',400),('20-02-2022',120),('21-02-2022',1000)]

df_payments = spark.createDataFrame(payments,['Date','Payments'])
df_payments = df_payments.withColumn('Date', to_date(col('Date'),'dd-MM-yyyy'))

# window without partitionBy is fine for this small example (Spark will warn about a single partition)
windowSpec = Window.orderBy(col('Date'))

# lag gives the previous day's payment; comparing it with the current payment yields the flag
df_payments = df_payments.withColumn('Lag_Payments', lag('Payments',1).over(windowSpec)) \
    .withColumn('variance_flag', when(col('Lag_Payments').isNull(), lit(None))
                .when(col('Lag_Payments') > col('Payments'), -1)
                .when(col('Lag_Payments') < col('Payments'), 1)) \
    .drop('Lag_Payments')

df_payments.show()

+----------+--------+-------------+
| Date|Payments|variance_flag|
+----------+--------+-------------+
|2022-02-15| 100| null|
|2022-02-16| 500| 1|
|2022-02-17| 900| 1|
|2022-02-18| 300| -1|
|2022-02-19| 400| 1|
|2022-02-20| 120| -1|
|2022-02-21| 1000| 1|
+----------+--------+-------------+

=============-

with mytable as (select *, lag(Payments,1) over (order by date) as temp_pay from t1)
select date, Payments,
       case when temp_pay is null then null
            when Payments > temp_pay then 1
            else -1 end as variance_flag
from mytable;

=========-=

Using "lag" window function on "payments" column we can achieve this output.

==========-=

select *, case when lag(payment) over (order by date) is null then null
               when payment > lag(payment) over (order by date) then 1
               else -1 end as variance_flag
from payment;
========================-====================-======================================

Data Engineer Interview Question:


======================
Write a Spark program to produce the Output below from the given Input
Input :
=====
Team1,Team2,Winner
-----------------
India,Aus,India
Srilanka,Aus,Aus
Srilanka,India,India

Output :
======
Team,Total_match,Total_win,Total_loss
--------------------------------------
India,2,2,0
Srilanka,2,0,2
Aus,2,1,1

==========-==========

from pyspark.sql.functions import col, sum, coalesce, lit

lst_data = [("India","Aus","India"),("Srilanka","Aus","Aus"),("Srilanka","India","India")]
schema = ["Team1","Team2","Winner"]
df = spark.createDataFrame(lst_data, schema)

# matches played as team1 and as team2
df1 = df.groupBy("team1").count().withColumnRenamed("team1","team")
df2 = df.groupBy("team2").count().withColumnRenamed("team2","team")
# total matches per team
df3 = df1.unionAll(df2).groupBy("team").sum("count").withColumnRenamed("sum(count)","total_match")
# total wins per team
df4 = df.groupBy("Winner").count().withColumnRenamed("count","total_win")
# left join so teams without a win get 0, then derive losses
df5 = df3.join(df4, df3.team == df4.Winner, "left") \
    .withColumn("total_win", coalesce(df4.total_win, lit(0))) \
    .select("team", "total_match", "total_win") \
    .withColumn("total_loss", col("total_match") - col("total_win"))

===========-=========

select t.team, count(*) as total_match, sum(t.win) as total_win, sum(t.loss) as total_loss
from (select Team2 as team,
             case when Team2 = Winner then 1 else 0 end as win,
             case when Team2 != Winner then 1 else 0 end as loss
      from temp
      union all
      select Team1 as team,
             case when Team1 = Winner then 1 else 0 end as win,
             case when Team1 != Winner then 1 else 0 end as loss
      from temp) t
group by t.team;

=========-======

val a=List(("India","Aus","India"),("Japan","Aus","Aus"),("Japan","India","India"))
val df=a.toDF("team1","team2","win")
+-----+-----+-----+
|team1|team2|  win|
+-----+-----+-----+
|India|  Aus|India|
|Japan|  Aus|  Aus|
|Japan|India|India|
+-----+-----+-----+
val df2=df.select("team1").union(df.select("team2"))
val df3=df2.groupBy("team1").count().withColumnRenamed("count","Total_Matches")
+-----+-------------+
|team1|Total_Matches|
+-----+-------------+
|India|            2|
|  Aus|            2|
|Japan|            2|
+-----+-------------+
val df4=df.groupBy("win").count().withColumnRenamed("count","winner")
+-----+------+
|  win|winner|
+-----+------+
|India|     2|
|  Aus|     1|
+-----+------+
df3.join(df4, col("team1") === col("win"), "left").drop("win").na.fill(0)
   .withColumn("loss", col("Total_Matches") - col("winner")).show
+-----+-------------+------+----+
|team1|Total_Matches|winner|loss|
+-----+-------------+------+----+
|India|            2|     2|   0|
|  Aus|            2|     1|   1|
|Japan|            2|     0|   2|
+-----+-------------+------+----+
==============================-======

WITH gro AS (SELECT team1 team FROM tri
             UNION
             SELECT team2 FROM tri)
SELECT team, tm.total_match, NVL(total_win, 0) tot_win,
       (tm.total_match - NVL(total_win, 0)) total_loss
  FROM gro,
       (SELECT winner, COUNT(*) total_win FROM tri GROUP BY winner) win,
       (SELECT team1, COUNT(*) total_match
          FROM (SELECT team1 FROM tri UNION ALL SELECT team2 FROM tri)
         GROUP BY team1) tm
 WHERE gro.team = win.winner(+) AND tm.team1 = gro.team;

==============-=

Hope the query below gives the solution if you use Spark SQL:

With teams_cte as
(Select team1 as team, (case when team1 = winner then 1 else 0 end) as won from table
 Union all
 Select team2 as team, (case when team2 = winner then 1 else 0 end) as won from table)
Select team, count(*) as tot_matches, sum(won) as total_won, count(*) - sum(won) as total_loss
From teams_cte
Group by team;
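
For example, one way to run an equivalent query from PySpark (a sketch; the view name "matches" and the DataFrame df from the earlier answer are assumptions here):

df.createOrReplaceTempView("matches")

spark.sql("""
    with teams_cte as
    (select Team1 as team, case when Team1 = Winner then 1 else 0 end as won from matches
     union all
     select Team2 as team, case when Team2 = Winner then 1 else 0 end as won from matches)
    select team, count(*) as tot_matches, sum(won) as total_won, count(*) - sum(won) as total_loss
    from teams_cte
    group by team
""").show()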

==============-==========

This is a very good question. I was asked this in one of my interviews, but in SQL. I just attempted to solve it using PySpark.

from pyspark.sql.functions import col, count

# stack Team1 and Team2 into a single Team column
df_t1 = df.select(col("Team1").alias("Team"))
df_t2 = df.select(col("Team2").alias("Team"))
df_all_teams = df_t1.union(df_t2)

# total matches per team
df_all_teams_Agg = df_all_teams.groupby("Team").agg(count("Team").alias("Total_matchs"))
df_all_teams_Agg.show(truncate=False)

# total wins per team
df_winner = df.groupby("Winner").agg(count("Winner").alias("Total_Won_matchs"))
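
The snippet stops before combining the two aggregates; a possible final step (my addition, not part of the original answer) is a left join with missing win counts defaulted to zero:

from pyspark.sql.functions import coalesce, lit

df_result = df_all_teams_Agg.join(df_winner, df_all_teams_Agg.Team == df_winner.Winner, "left") \
    .withColumn("Total_win", coalesce(col("Total_Won_matchs"), lit(0))) \
    .withColumn("Total_loss", col("Total_matchs") - col("Total_win")) \
    .select("Team", "Total_matchs", "Total_win", "Total_loss")
df_result.show(truncate=False)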
