You are on page 1 of 8

19L-1871

May 17, 2021

[1]: import pyspark


from pyspark.sql import *
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
import pandas as pd
from pyspark.sql.functions import length

[2]: conf = SparkConf().setMaster("local[*]").setAppName("DataframesPractice")


sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

[3]: moviesdf = spark.read.csv("MoviesA3.csv",sep=";",inferSchema=True,header=True)

[4]: moviesdf.dropna().count()
#moviesdf.show(truncate=False)

[4]: 1128

[5]: moviesdf.columns  # column names taken from the CSV header row

[5]: ['Year',
'Length',
'Title',
'Genre',
'Actor',
'Actress',
'Director',
'Popularity',
'Awards',
'Image']

[6]: p1 = moviesdf.select("Title","Year","Director").filter(moviesdf.Awards=="Yes")
p1.show()

+--------------------+----+--------------------+
| Title|Year| Director|
+--------------------+----+--------------------+
| Fanny and Alexander|1982| Bergman, Ingmar|
| A Man & a Woman|1966| Lelouch, Claude|

1
|Un Hombre y una M…|1966| Lelouch, Claude|
| Official Story, The|1985| Puenzo, Luiz|
| Wild Strawberries|1957| Bergman, Ingmar|
|Through a Glass D…|1962| Bergman, Ingmar|
| Cries & Whispers|1972| Bergman, Ingmar|
| Sound of Music, The|1965| Wise, Robert|
| Babette's Feast|1987| Axel, Gabriel|
|Murder on the Ori…|1974| Lumet, Sidney|
| Elephant Man, The|1980| Lynch, David|
|E. T. The Extra-T…|1982| Spielberg, Steven|
| Misery|1990| Reiner, Rob|
| Gandhi|1982|Attenborough, Ric…|
| Autumn Sonata|1978| Bergman, Ingmar|
| Gaslight|1944| Cukor, George|
|A Woman Called Golda|1982| Gibson, Alan|
| Anastasia|1956| Litvak, Anatole|
| Cactus Flower|1969| Saks, Gene|
| Amadeus|1984| Forman, Milos|
+--------------------+----+--------------------+
only showing top 20 rows

[7]: p2 = moviesdf.select("Title","Awards","Popularity").filter(moviesdf.
,→Awards=="No").orderBy(moviesdf.Popularity.desc())

p2.show(10)

+--------------------+------+----------+
| Title|Awards|Popularity|
+--------------------+------+----------+
| Let It Ride| No| 88|
| Great Race, The| No| 88|
| New Year's Day| No| 88|
| Final Notice| No| 88|
| Fellini Satyricon| No| 88|
| Guilty by Suspicion| No| 88|
| Time Machine, The| No| 88|
| Raw Nerve| No| 88|
|Long Voyage Home,…| No| 88|
| Class Act| No| 88|
+--------------------+------+----------+
only showing top 10 rows

[8]: p3 = moviesdf.select("Title", "Year", "Popularity").filter((moviesdf.Year <␣


,→1980)&(~moviesdf.Popularity.isNull())).orderBy(moviesdf.Popularity)

p3.show(10)

+--------------------+----+----------+

2
| Title|Year|Popularity|
+--------------------+----+----------+
| Airport|1970| 0|
| Anna Christie|1930| 0|
| Shalako|1968| 0|
| Tales of Tomorrow|1953| 0|
| Shout at the Devil|1976| 0|
| Holocaust|1978| 1|
| Stavisky|1974| 1|
| Anderson Tapes, The|1971| 1|
| Indiscreet|1958| 1|
|Law of the Golden…|1949| 1|
+--------------------+----+----------+
only showing top 10 rows

[9]: p4 = moviesdf.groupBy("Genre").avg("Length")
p4.show()

+---------------+------------------+
| Genre| avg(Length)|
+---------------+------------------+
| Crime| 66.0|
| Romance| 127.0|
| Adventure| 119.0|
| null| 120.5|
| Drama|113.30455259026688|
| War| 116.90625|
| Fantasy| 102.0|
| Mystery|103.00990099009901|
| Music|100.48780487804878|
|Science Fiction|106.47368421052632|
| Horror| 93.92727272727272|
| Short| 40.0|
| Western| 93.0091743119266|
| Comedy| 96.50540540540541|
| Action| 104.5|
| Westerns| 124.8|
+---------------+------------------+

[10]: moviesdf.show(2)  # preview the first two rows with every column

+----+------+--------------------+------+-----------------+---------------+-----
-----------+----------+------+------------------+
|Year|Length| Title| Genre| Actor| Actress|
Director|Popularity|Awards| Image|
+----+------+--------------------+------+-----------------+---------------+-----
-----------+----------+------+------------------+

3
|1990| 111|Tie Me Up! Tie Me…|Comedy|Banderas, Antonio|Abril,
Victoria|Almod�var, Pedro| 68| No|NicholasCage.png,,|
|1991| 113| High Heels|Comedy| Bos�, Miguel|Abril,
Victoria|Almod�var, Pedro| 68| No|NicholasCage.png,,|
+----+------+--------------------+------+-----------------+---------------+-----
-----------+----------+------+------------------+
only showing top 2 rows

[11]: p5 = moviesdf.select("Actor","Actress","Genre").filter((moviesdf.
,→Genre=="Comedy"))

p5 = p5.groupBy("Actor","Actress","Genre").count().filter("count > 3")


p5.dropna().show()

+--------------+------------------+------+-----+
| Actor| Actress| Genre|count|
+--------------+------------------+------+-----+
| Allen, Woody| Keaton, Diane|Comedy| 5|
|Tracy, Spencer|Hepburn, Katharine|Comedy| 6|
+--------------+------------------+------+-----+

[12]: p6 = moviesdf.select("Actor","Genre").filter((moviesdf.
,→Genre=="Comedy")&(moviesdf.Genre=="Drama"))

p6.show()

+-----+-----+
|Actor|Genre|
+-----+-----+
+-----+-----+

[13]: p7 = moviesdf.select("Actor","Genre").filter((moviesdf.
,→Genre=="Comedy")|(moviesdf.Genre=="Drama"))

p7.show(10)

+--------------------+------+
| Actor| Genre|
+--------------------+------+
| Banderas, Antonio|Comedy|
| Bos�, Miguel|Comedy|
| Gere, Richard| Drama|
| Bergen, Robert D.| Drama|
|Lambert, Christopher| Drama|
| Depardieu, G�rard| Drama|
| Ahlstedt, B�rje| Drama|
| Tognazzi, Ugo| Drama|
|Trintignant, Jean…| Drama|
|Trintignant, Jean…| Drama|

4
+--------------------+------+
only showing top 10 rows

[14]: p8 = moviesdf.select("Actor","Genre").filter(~(moviesdf.Genre=="Comedy"))
p8.show(10)

+--------------------+------+
| Actor| Genre|
+--------------------+------+
| Walken, Christopher|Horror|
| Connery, Sean|Action|
| Gere, Richard| Drama|
| Moore, Roger|Action|
| Connors, Chuck|Action|
| Bergen, Robert D.| Drama|
|Lambert, Christopher| Drama|
| Depardieu, G�rard| Drama|
| Ahlstedt, B�rje| Drama|
| Tognazzi, Ugo| Drama|
+--------------------+------+
only showing top 10 rows

[15]: p9 = moviesdf.groupBy("Actor").agg(F.min(moviesdf.Popularity),F.max(moviesdf.
,→Popularity),F.avg(moviesdf.Popularity))

p9.show(10)

+-----------------+---------------+---------------+-----------------+
| Actor|min(Popularity)|max(Popularity)| avg(Popularity)|
+-----------------+---------------+---------------+-----------------+
| Din, Ayub Khan| 6| 6| 6.0|
| Ferrer, Jose| 7| 7| 7.0|
| Harrison, Rex| 10| 10| 10.0|
| McCleery, Gary| 41| 41| 41.0|
| Matthau, Walter| 52| 83|67.33333333333333|
|Keillor, Garrison| 6| 6| 6.0|
| Busey, Gary| 74| 74| 74.0|
| Boyer, Charles| 25| 70| 55.0|
| Kime, Jeffrey| 73| 73| 73.0|
|Gauthier, Vincent| 11| 11| 11.0|
+-----------------+---------------+---------------+-----------------+
only showing top 10 rows

[16]: p10 = moviesdf.groupBy(moviesdf.Year).count()


p10.orderBy(p10.Year.desc()).show()

+----+-----+

5
|Year|count|
+----+-----+
|1997| 1|
|1996| 1|
|1993| 21|
|1992| 88|
|1991| 129|
|1990| 105|
|1989| 101|
|1988| 96|
|1987| 102|
|1986| 95|
|1985| 42|
|1984| 38|
|1983| 40|
|1982| 37|
|1981| 26|
|1980| 30|
|1979| 33|
|1978| 18|
|1977| 29|
|1976| 17|
+----+-----+
only showing top 20 rows

[17]: moviesdf.select("Title","Genre", "Year").filter((moviesdf.Genre ==␣


,→"Comedy")&(moviesdf.Year==1975)).show()

+--------------------+------+----+
| Title| Genre|Year|
+--------------------+------+----+
|Alice Doesn't Liv…|Comedy|1975|
| Shampoo|Comedy|1975|
|Adventures of She…|Comedy|1975|
| Love & Death|Comedy|1975|
| Sunshine Boys, The|Comedy|1975|
|Great McGonagall,…|Comedy|1975|
+--------------------+------+----+

[18]: p11 = moviesdf.groupBy("Genre","Year").count()


p11.orderBy("Year").show()

+---------------+----+-----+
| Genre|Year|count|
+---------------+----+-----+
| Drama|1920| 1|
| Western|1923| 1|

6
| Drama|1924| 3|
| Drama|1925| 1|
|Science Fiction|1926| 1|
| Mystery|1926| 1|
| Action|1926| 1|
| Drama|1926| 1|
| Drama|1927| 2|
| Mystery|1927| 1|
| Drama|1928| 4|
| War|1928| 1|
| Mystery|1929| 1|
| Drama|1929| 4|
| Drama|1930| 2|
| Mystery|1930| 1|
| Drama|1931| 5|
| Western|1931| 3|
| Comedy|1931| 1|
| Horror|1932| 1|
+---------------+----+-----+
only showing top 20 rows

[19]: p12 = moviesdf.select("Title").sort("Title")


p12.show()

+--------------------+
| Title|
+--------------------+
|2001: A Space Ody…|
| 48 Hrs.|
| 8 1/2|
|A Big Hand for th…|
| A Certain Sacrifice|
| A Child Is Waiting|
|A Chorus Line, Th…|
|A Chorus of Disap…|
| A Clockwork Orange|
|A Coeur Joie, (He…|
| A Cry in the Dark|
| A Day in October|
| A Dry White Season|
| A Fine Madness|
| A Fish Called Wanda|
|A Fistful of Dollars|
| A Guy Named Joe|
| A Lesson in Love|
|A Little Night Music|
| A Man & a Woman|

7
+--------------------+
only showing top 20 rows

[20]: p13 = moviesdf.select("Title",length("Title").alias("Length"))


p13 = p13.filter(p13.Length > 50)
p13.show()

+--------------------+------+
| Title|Length|
+--------------------+------+
|Fawlty Towers, Go…| 69|
|Unnamable II, The…| 51|
|Industrial Sympho…| 52|
+--------------------+------+

[27]: p14 = moviesdf.filter(moviesdf.Genre == "Comedy")


p14 = p14.groupBy(['Actor', 'Actress']).count()
p14 = p14.filter(p14['count'] > 3)
p14.dropna().show()

+--------------+------------------+-----+
| Actor| Actress|count|
+--------------+------------------+-----+
| Allen, Woody| Keaton, Diane| 5|
|Tracy, Spencer|Hepburn, Katharine| 6|
+--------------+------------------+-----+

You might also like