You are on page 1of 6

Day 9: DataFrames

1. Download the store_locations.json file from Google Drive.

2. Create a directory in HDFS

hadoopuser@hadoopuser-VirtualBox:~$ hdfs dfs -mkdir /sparkLabData

hadoopuser@hadoopuser-VirtualBox:~$ hdfs dfs -ls /sparkLabData

3. Copy the downloaded store_locations.json from the Desktop (local file system) into HDFS

hadoopuser@hadoopuser-VirtualBox:~$ hdfs dfs -copyFromLocal /home/hadoopuser/Desktop/store_locations.json /sparkLabData/

hadoopuser@hadoopuser-VirtualBox:~$ hdfs dfs -ls /sparkLabData

Found 1 items

-rw-r--r-- 1 hadoopuser supergroup 6053 2020-12-29 20:06


/sparkLabData/store_locations.json

hadoopuser@hadoopuser-VirtualBox:~$ hdfs dfs -cat /sparkLabData/store_locations.json

{"city": "Antioch", "state": "CA", "zip_code": 945097911}

{"city": "Woodland", "state": "CA", "zip_code": 957765409}

{"city": "San Jose", "state": "CA", "zip_code": 951311866}

{"city": "Victorville", "state": "CA", "zip_code": 923954216}

{"city": "Chico", "state": "CA", "zip_code": 959284422}

1. Start the Spark shell

hadoopuser@hadoopuser-VirtualBox:~$ spark-shell

scala> sc.setLogLevel("ERROR")

2. Load the data


scala> val storeDF =
spark.read.format("json").load("/sparkLabData/store_locations.json")

storeDF: org.apache.spark.sql.DataFrame = [city: string, state: string ... 1


more field]

3. View the data in the DataFrame

scala> storeDF.collect

res1: Array[org.apache.spark.sql.Row] = Array([Antioch,CA,945097911],


[Woodland,CA,957765409], [San Jose,CA,951311866], [Victorville,CA,923954216],
[Chico,CA,959284422], [San Dimas,CA,917731725], [Visalia,CA,932779527],
[Manteca,CA,953366745], [Redwood City,CA,940632854], [Lakewood,CA,907122409],
[Hayward,CA,945455008], [Pacoima,CA,913312352], [San Marcos,CA,92069],
[Lodi,CA,95240], [Huntington Beach,CA,92647], [Westlake Village,CA,913624063],
[San Leandro,CA,945771209], [Woodland Hills,CA,913672227], [El
Centro,CA,922431323], [Tustin,CA,927828918], [Vista,CA,920814546],
[Eureka,CA,955012121], [Garden Grove,CA,928431206], [Simi
Valley,CA,930656207], [Santa Clara,CA,950503100], [Los Angeles,CA,900391502],
[SandCity,CA,939553051], [Vallejo,CA,945913702], [Redding,CA,960034071],
[Clovis,CA...

OR

scala> storeDF.show(5)

+-----------+-----+---------+

| city|state| zip_code|

+-----------+-----+---------+

| Antioch| CA|945097911|

| Woodland| CA|957765409|

| San Jose| CA|951311866|

|Victorville| CA|923954216|

| Chico| CA|959284422|

+-----------+-----+---------+
only showing top 5 rows

4. To see the schema

scala> storeDF.schema

res3: org.apache.spark.sql.types.StructType =
StructType(StructField(city,StringType,true),
StructField(state,StringType,true), StructField(zip_code,LongType,true))

5. Manually Define Schema

scala> import org.apache.spark.sql.types.Metadata

import org.apache.spark.sql.types.Metadata

scala> import org.apache.spark.sql.types.{StructType,StructField,StringType,LongType}

import org.apache.spark.sql.types.{StructType, StructField, StringType,


LongType}

scala> val manualSchema = StructType(Array(StructField("city",StringType,true),
StructField("state",StringType,true), StructField("zip_code",LongType,true)))

manualSchema: org.apache.spark.sql.types.StructType =
StructType(StructField(city,StringType,true),
StructField(state,StringType,true), StructField(zip_code,LongType,true))

scala> val storeDF = spark.read.format("json").schema(manualSchema).load("/sparkLabData/store_locations.json")

storeDF: org.apache.spark.sql.DataFrame = [city: string, state: string ... 1


more field]

scala> storeDF.show(5)
+-----------+-----+---------+

| city|state| zip_code|

+-----------+-----+---------+

| Antioch| CA|945097911|

| Woodland| CA|957765409|

| San Jose| CA|951311866|

|Victorville| CA|923954216|

| Chico| CA|959284422|

+-----------+-----+---------+

only showing top 5 rows

6. To see all the columns in the DataFrame

scala> storeDF.columns

res12: Array[String] = Array(city, state, zip_code)

scala> storeDF.col("city")

res16: org.apache.spark.sql.Column = city

7. To see rows in the DataFrame

scala> storeDF.take(5)

res21: Array[org.apache.spark.sql.Row] = Array([Antioch,CA,945097911],


[Woodland,CA,957765409], [San Jose,CA,951311866], [Victorville,CA,923954216],
[Chico,CA,959284422])

scala> storeDF.collect()

res17: Array[org.apache.spark.sql.Row] = Array([Antioch,CA,945097911],


[Woodland,CA,957765409], [San Jose,CA,951311866], [Victorville,CA,923954216],
[Chico,CA,959284422], [San Dimas,CA,917731725], [Visalia,CA,932779527],
[Manteca,CA,953366745], [Redwood City,CA,940632854], [Lakewood,CA,907122409],
[Hayward,CA,945455008], [Pacoima,CA,913312352], [San Marcos,CA,92069],
[Lodi,CA,95240], [Huntington Beach,CA,92647], [Westlake Village,CA,913624063],
[San Leandro,CA,945771209], [Woodland Hills,CA,913672227], [El
Centro,CA,922431323], [Tustin,CA,927828918], [Vista,CA,920814546],
[Eureka,CA,955012121], [Garden Grove,CA,928431206], [Simi
Valley,CA,930656207], [Santa Clara,CA,950503100], [Los Angeles,CA,900391502],
[SandCity,CA,939553051], [Vallejo,CA,945913702], [Redding,CA,960034071],
[Clovis,C...

scala> storeDF.show()

+----------------+-----+---------+

| city|state| zip_code|

+----------------+-----+---------+

| Antioch| CA|945097911|

| Woodland| CA|957765409|

| San Jose| CA|951311866|

| Victorville| CA|923954216|

| Chico| CA|959284422|

| San Dimas| CA|917731725|

| Visalia| CA|932779527|

| Manteca| CA|953366745|

| Redwood City| CA|940632854|

| Lakewood| CA|907122409|

| Hayward| CA|945455008|

| Pacoima| CA|913312352|

| San Marcos| CA| 92069|

| Lodi| CA| 95240|

|Huntington Beach| CA| 92647|

|Westlake Village| CA|913624063|

| San Leandro| CA|945771209|

| Woodland Hills| CA|913672227|

| El Centro| CA|922431323|
| Tustin| CA|927828918|

+----------------+-----+---------+

only showing top 20 rows

scala> storeDF.first

res19: org.apache.spark.sql.Row = [Antioch,CA,945097911]

# To see all rows

scala> storeDF.show(storeDF.count().toInt)

+-------------------+-----+---------+

| city|state| zip_code|

+-------------------+-----+---------+

| Antioch| CA|945097911|

| Woodland| CA|957765409|

| San Jose| CA|951311866|

| Victorville| CA|923954216|

You might also like