Professional Documents
Culture Documents
# 9) Take a dataset with missing values and outliers and perform data
# preprocessing steps such as imputation, outlier treatment, and
# normalization in Python.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import LocalOutlierFactor

# Load the dataset.
df = pd.read_csv("dataset.csv")

# Imputation: fill missing values with the column mean.
# (The original snippet used `df_imp` without ever defining it.)
# NOTE(review): assumes every column is numeric — TODO confirm; non-numeric
# columns would make the "mean" strategy raise.
imputer = SimpleImputer(strategy="mean")
df_imp = pd.DataFrame(imputer.fit_transform(df),
                      columns=df.columns, index=df.index)

# Outlier treatment: LocalOutlierFactor labels inliers as 1, outliers as -1.
clf = LocalOutlierFactor(n_neighbors=20, contamination='auto')
y_pred = clf.fit_predict(df_imp)
# Keep only the inlier rows. (The original used `df_inliers` undefined.)
df_inliers = df_imp[y_pred == 1]

# Normalization: standardize each feature to zero mean and unit variance.
scaler = StandardScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df_inliers))
df_normalized.columns = df_inliers.columns
df_normalized.index = df_inliers.index
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset.
df = pd.read_csv('dataset.csv')

# Correlation matrix over the numeric columns.
# (The original snippet used `correlation_matrix` without defining it,
# and never imported seaborn/matplotlib.)
correlation_matrix = df.corr(numeric_only=True)  # numeric_only: pandas >= 1.5

# Visualize the pairwise correlations as an annotated heatmap.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f',
            linewidths=.5)
plt.title('Correlation Matrix')
plt.show()
# Check whether the features are normally distributed (skewness analysis).
# Measure skewness of each feature and reduce it with a log transform.
from scipy.stats import skew  # the original called `skew` without importing it

# Calculate per-column skewness.
skewness = df.apply(lambda x: skew(x))

# Plot a histogram of the original feature to inspect its distribution.
sns.histplot(df['Feature1'], kde=True)
plt.title('Histogram of Feature1')
plt.show()

# Log-transform Feature1 FIRST, then measure the skewness of the result.
# (The original computed log1p of 'Transformed_Feature1', a column that was
# never created — and would have double-transformed the data if it existed.)
df['Transformed_Feature1'] = np.log1p(df['Feature1'])
transformed_skewness = df['Transformed_Feature1'].skew()
print(f"Skewness after log transformation: {transformed_skewness}")

# Applying log transformations to other features
df['Transformed_Feature2'] = np.log1p(df['Feature2'])
#importing libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Box and violin plots of a numeric attribute grouped by a categorical one.
# The original snippet used these two names without defining them — set them
# to columns that exist in your dataset.
categorical_attribute = 'Category'  # TODO(review): confirm column name
numeric_attribute = 'Value'         # TODO(review): confirm column name

# Box Plot: quartiles and outliers per category.
plt.figure(figsize=(10, 6))
sns.boxplot(x=categorical_attribute, y=numeric_attribute, data=df)
plt.title(f'Box Plot of {numeric_attribute} for {categorical_attribute}')
plt.show()

# Violin Plot: full distribution shape per category.
plt.figure(figsize=(10, 6))
sns.violinplot(x=categorical_attribute, y=numeric_attribute, data=df)
plt.title(f'Violin Plot of {numeric_attribute} for {categorical_attribute}')
plt.show()
import pandas as pd
import seaborn as sns  # the original called sns.scatterplot without importing it
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the iris data into a DataFrame.
# (The original used `iris_df` without defining it.)
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Standardize features to zero mean / unit variance before PCA.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df)

# Project onto the first two principal components.
# (The original imported PCA but never fit it, and used `pca_df` undefined.)
pca = PCA(n_components=2)
components = pca.fit_transform(iris_scaled)
pca_df = pd.DataFrame(components, columns=['PC1', 'PC2'])
pca_df['Target'] = iris.target

# Visualize the 2-D projection, colored by class.
plt.figure(figsize=(10, 8))
sns.scatterplot(x='PC1', y='PC2', hue='Target', data=pca_df, palette='viridis',
                s=70)
plt.show()
# Recursive feature elimination (RFE) on the iris dataset.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the data and hold out a test split.
# (The original used X_train / y_train / clf without ever defining them.)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=42)

# Base estimator whose feature importances drive the elimination.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Create the RFE model and select the number of features to keep
num_features_to_keep = 2
rfe = RFE(estimator=clf, n_features_to_select=num_features_to_keep)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# `rfe.support_` is a boolean mask over the features; iris.feature_names is a
# plain Python list, so it cannot be fancy-indexed as the original attempted
# with the undefined `selected_feature_indices`.
selected_features = [name for name, keep in zip(iris.feature_names, rfe.support_)
                     if keep]
print('Selected features:', selected_features)

# Fit on the reduced feature set and report held-out accuracy
# (accuracy_score was imported but never used in the original).
clf.fit(X_train_rfe, y_train)
accuracy = accuracy_score(y_test, clf.predict(X_test_rfe))
print('Test accuracy with selected features:', accuracy)