
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DataIngestion").getOrCreate()

df = spark.read.csv("path/to/data.csv", header=True)
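Since the numeric steps below assume typed columns, a variant of the same read that also infers column types may be useful (inferSchema is a standard option of spark.read.csv; the path is the same placeholder as above):

df = spark.read.csv("path/to/data.csv", header=True, inferSchema=True)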

import pandas as pd

from pyspark.sql.functions import when

# Collects the full Spark DataFrame onto the driver; only safe for data that fits in memory
pandas_df = df.toPandas()

# Handling missing values (numeric_only avoids errors when non-numeric columns are present)

pandas_df.fillna(pandas_df.mean(numeric_only=True), inplace=True)

# Converting data types

pandas_df['column_name'] = pandas_df['column_name'].astype('float')
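If the column can contain unparseable strings, astype('float') will raise. A more forgiving sketch uses pd.to_numeric, which turns bad values into NaN with errors='coerce':

pandas_df['column_name'] = pd.to_numeric(pandas_df['column_name'], errors='coerce')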

# Scaling numeric features

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

pandas_df['column_name'] = scaler.fit_transform(pandas_df[['column_name']])

# Encoding categorical variables

pandas_df = pd.get_dummies(pandas_df, columns=['column_name'])
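get_dummies creates one indicator column per category. A common variant passes drop_first=True to drop one level, avoiding perfectly collinear features in linear models (shown here as an alternative to the call above, not a second step):

pandas_df = pd.get_dummies(pandas_df, columns=['column_name'], drop_first=True)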

# Feature selection (the target is kept so the train/test split below can use it)

selected_columns = ['column_name_1', 'column_name_2', 'column_name_3', 'target_variable']

pandas_df = pandas_df[selected_columns]

# Feature extraction
from sklearn.decomposition import PCA

pca = PCA(n_components=2)

pandas_df[['pca_1', 'pca_2']] = pca.fit_transform(pandas_df[['column_name_1', 'column_name_2']])
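To judge whether two components are enough, the fitted PCA object exposes the fraction of variance each component captures:

print(pca.explained_variance_ratio_)  # e.g. [0.7, 0.2] would mean 90% of variance retained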

# Feature transformation

pandas_df['column_name'] = pandas_df['column_name'].apply(lambda x: x ** 2)

# Splitting data into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(pandas_df.drop('target_variable', axis=1),
                                                    pandas_df['target_variable'],
                                                    test_size=0.3,
                                                    random_state=42)
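If the target classes are imbalanced, a stratified split keeps the class proportions identical in both sets; it is the same call with one extra argument:

X_train, X_test, y_train, y_test = train_test_split(pandas_df.drop('target_variable', axis=1),
                                                    pandas_df['target_variable'],
                                                    test_size=0.3,
                                                    random_state=42,
                                                    stratify=pandas_df['target_variable'])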

# Fitting the model

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)

# Evaluating the model

from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
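Accuracy alone can be misleading on imbalanced data; scikit-learn's classification_report adds per-class precision, recall, and F1:

from sklearn.metrics import classification_report

print(f"Accuracy: {accuracy:.3f}")
print(classification_report(y_test, y_pred))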

# Re-creating the model as a PySpark estimator
# Note: a fitted scikit-learn model cannot be converted directly. The PySpark
# estimator is configured with equivalent settings and must be retrained on a
# Spark DataFrame. Parameter names differ between the two libraries, so they
# are mapped by hand rather than copied wholesale with model.get_params().

from pyspark.ml.classification import LogisticRegression as PySparkLogisticRegression

py_spark_model = PySparkLogisticRegression(featuresCol="features", labelCol="label")
py_spark_model.setParams(maxIter=model.get_params()['max_iter'],
                         tol=model.get_params()['tol'],
                         fitIntercept=model.get_params()['fit_intercept'])
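To actually train the PySpark estimator, the feature columns must be assembled into a single vector column named "features" and the target exposed as "label". A minimal sketch, assuming the Spark DataFrame df has numeric columns with the same placeholder names used above (e.g. read with inferSchema=True) and a numeric target_variable:

from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['column_name_1', 'column_name_2', 'column_name_3'],
                            outputCol='features')
spark_train = assembler.transform(df.withColumnRenamed('target_variable', 'label'))
fitted_spark_model = py_spark_model.fit(spark_train)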

# Creating a UDF

from pyspark.sql.functions import udf
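A minimal UDF sketch, using hypothetical names: the function squares a numeric column and attaches the result to the Spark DataFrame (udf, DoubleType, and withColumn are standard PySpark APIs):

from pyspark.sql.types import DoubleType

# Wrap a plain Python function as a Spark UDF with an explicit return type
square_udf = udf(lambda x: float(x) ** 2 if x is not None else None, DoubleType())
df = df.withColumn('column_name_squared', square_udf(df['column_name']))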
