Spark climate-data notebook (note: "Professional Documents" / "Culture Documents" headers and the duplicated "Spark" title are navigation chrome left over from the site this notebook was scraped from)
Spark
// NOTE(review): this chunk is a scraped Zeppelin notebook. The paragraphs that
// load/define the DataFrames previewed below are missing from view, and several
// "%spark" interpreter markers appear duplicated — likely extraction artifacts.
// TODO: recover the missing load/selection paragraphs from the original notebook.
%spark
%spark
%spark
// Preview the natural-disasters DataFrame (source/schema not visible here).
nat_disasters_df.show()
%spark
%spark
// Preview the fossil-fuels (emissions) DataFrame.
fossil_fuels_df.show()
%spark
%spark
// Preview the global sea-level DataFrame.
global_sea_level_df.show()
%spark
%spark
// Preview the global temperature DataFrame.
global_temperature_df.show()
%spark
// Preview selected_disasters_df — presumably a column projection of
// nat_disasters_df, but the defining paragraph is not visible; verify.
selected_disasters_df.show()
%spark
// Preview selected_fossil_fuels_df (defining paragraph not visible).
selected_fossil_fuels_df.show()
%spark
// Preview selected_sea_level_df (defining paragraph not visible).
selected_sea_level_df.show()
%spark
// Preview selected_temperature_df (defining paragraph not visible).
selected_temperature_df.show()
%spark
// Preview global_temp_df — presumably an aggregated temperature table; verify.
global_temp_df.show()
%spark
// Preview global_mean_sea_level_df — presumably per-year mean sea level; verify.
global_mean_sea_level_df.show()
%spark
// NOTE(review): the head of this join chain was lost in extraction — something
// like `val table_df = selected_temperature_df` presumably precedes the three
// `.join` lines, since `table_df` is used immediately below. As shown, this
// fragment is not valid Scala on its own; TODO recover the original paragraph.
// Seq("year") joins on the shared "year" column without duplicating it; the
// default join type is inner.
.join(selected_fossil_fuels_df, Seq("year"))
.join(global_mean_sea_level_df, Seq("year"))
.join(selected_disasters_df, Seq("year"))
table_df.show()
%spark
// Register the joined DataFrame as a temp view so the %sql paragraphs below
// can query it. "table" is a risky view name (SQL keyword); kept as-is.
table_df.createOrReplaceTempView("table")
%sql
-- List the temp views registered in the current Spark session
-- (sanity check that createOrReplaceTempView above took effect).
SHOW TABLES
%sql
-- NOTE(review): this paragraph is garbled by extraction. Scala statements
-- (the createOrReplaceTempView line and the stray closing `""")`) are
-- interleaved with the SQL text — the original was presumably a Scala
-- paragraph of the form `val plotData = spark.sql("""SELECT ... FROM
-- my_table""")`, selecting log-scaled sea-level and disaster counts for
-- plotting. TODO: recover the original paragraph; as shown it will not run.
select
year,
global_temperature,
carbon_emissions,
LOG(mean_sea_level) AS log_mean_sea_level,
LOG(total_natural_disasters) AS log_total_natural_disasters
table_df.createOrReplaceTempView("my_table")
FROM my_table
""")
plotData.createOrReplaceTempView("plot_data")
%sql
%sql
-- NOTE(review): duplicated %sql marker, and the FROM clause below has lost its
-- table name in extraction — presumably plot_data or my_table. TODO: confirm
-- against the original notebook; the query is incomplete as shown.
SELECT
year,
carbon_emissions
FROM
%spark
// Spark ML: fit a linear regression predicting total_natural_disasters and
// evaluate it with RMSE.
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
// Assuming 'df' is your DataFrame
// NOTE(review): the constructor lines were lost in extraction. The dangling
// setters below presumably belong to `new VectorAssembler()` (features
// column), `new LinearRegression()` (label/features), and
// `new RegressionEvaluator()` (label/prediction/metric) respectively; the
// Pipeline construction and the fit/transform calls that produce
// `predictions` are also missing. As shown this paragraph will not compile.
.setOutputCol("features")
.setLabelCol("total_natural_disasters")
.setFeaturesCol("features")
// Create a pipeline
.setLabelCol("total_natural_disasters")
.setPredictionCol("prediction")
.setMetricName("rmse")
// Root-mean-squared error of the model on the `predictions` DataFrame.
val rmse = evaluator.evaluate(predictions)
%spark
// Expose the model's predictions DataFrame to the %sql paragraph below.
predictions.createOrReplaceTempView("predictions_table")
%sql
-- Compare actual vs. predicted natural-disaster counts per year
-- (same columns, same order as before; table alias only).
SELECT p.year,
       p.total_natural_disasters,
       p.prediction
FROM predictions_table p