# Example: Spark-to-pandas machine-learning pipeline
# (leading page-navigation text from the document extraction removed)
"""End-to-end example: ingest a CSV with Spark, preprocess in pandas, train a model.

Pipeline stages: ingestion (Spark) -> conversion to pandas -> imputation ->
scaling -> feature selection/transformation -> train/test split -> logistic
regression fit and prediction.
"""

# Imports grouped at the top of the script (PEP 8); the original scattered
# them mid-file and omitted SparkSession, StandardScaler, LogisticRegression,
# and train_test_split entirely.
import pandas as pd
from pyspark.sql import SparkSession
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Data ingestion (Spark) ---
spark = SparkSession.builder.appName("DataIngestion").getOrCreate()
# header=True keeps the first CSV row as column names.
df = spark.read.csv("path/to/data.csv", header=True)

# --- Hand off to pandas for sklearn-based preprocessing ---
# NOTE(review): toPandas() materializes the whole dataset on the driver —
# acceptable for an example, not for large data.
pandas_df = df.toPandas()

# Cast the feature column to float BEFORE imputing, so the column actually
# participates in mean(); Spark's CSV reader yields strings without inferSchema.
pandas_df['column_name'] = pandas_df['column_name'].astype('float')
# numeric_only=True prevents a TypeError when non-numeric columns are present.
pandas_df.fillna(pandas_df.mean(numeric_only=True), inplace=True)

# --- Feature scaling (zero mean, unit variance) ---
scaler = StandardScaler()
# Double brackets keep a 2-D frame, as fit_transform expects.
pandas_df['column_name'] = scaler.fit_transform(pandas_df[['column_name']])

# --- Feature selection ---
# The original referenced `selected_columns` without defining it.
selected_columns = ['column_name', 'target_variable']  # TODO: set the real feature list
pandas_df = pandas_df[selected_columns]

# --- Feature transformation ---
pandas_df['column_name'] = pandas_df['column_name'].apply(lambda x: x ** 2)

# --- Train/test split ---
# The original call was truncated (only the trailing arguments survived);
# reconstructed here: features = everything except the target column.
X_train, X_test, y_train, y_test = train_test_split(
    pandas_df.drop(columns=['target_variable']),
    pandas_df['target_variable'],
    test_size=0.3,
    random_state=42,
)

# --- Model training and prediction ---
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# NOTE(review): the original called `py_spark_model.setParams(**model.get_params())`
# on an undefined `py_spark_model`; sklearn hyperparameters are not directly
# transferable to a Spark ML estimator, so that line was removed. Likewise an
# unused `PCA(n_components=2)` instance was dropped — it was never fitted.

# Creating a UDF