# December 23, 2023

# 9) Take a dataset with missing values and outliers and perform data
# preprocessing steps such as imputation, outlier treatment, and normalization
# in Python.

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import LocalOutlierFactor

# Step 1 - read the raw table from disk.
df = pd.read_csv("dataset.csv")

# Step 2 - imputation: every NaN is replaced by the mean of its column.
# The transformed values are wrapped in a DataFrame that reuses df's labels.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_values = imp.fit_transform(df)
df_imp = pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

# Step 3 - outlier treatment with Local Outlier Factor.
# Rows labelled -1 by fit_predict are treated as outliers and dropped.
clf = LocalOutlierFactor(n_neighbors=20, contamination='auto')
y_pred = clf.fit_predict(df_imp)

# True for every row that was NOT flagged with -1.
mask = ~(y_pred == -1)

# Keep only the inlier rows (original index labels are preserved).
df_inliers = df_imp.loc[mask]

# Step 4 - normalization: standardize the remaining rows column by column,
# again re-attaching the column names and row index afterwards.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_inliers)
df_normalized = pd.DataFrame(
    scaled_values,
    columns=df_inliers.columns,
    index=df_inliers.index,
)

# 11) Given a dataset, calculate the correlation matrix and interpret the
# relationships between different features.

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset.
df = pd.read_csv('dataset.csv')

# Calculate the pairwise correlation matrix.
# Only numeric columns are passed to corr(): pandas 2.x no longer drops
# non-numeric columns silently and raises on them instead.
correlation_matrix = df.select_dtypes(include='number').corr()

# Draw the matrix as an annotated heatmap on its own figure.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# Interpreting relationships between different features: look up a single
# pair in the matrix and report its coefficient.
correlation_feature1_feature2 = correlation_matrix.loc['Feature1', 'Feature2']
print(f"Correlation between Feature1 and Feature2: {correlation_feature1_feature2}")

# 12) Identify the presence of skewed distributions in a dataset and apply
# suitable transformations (e.g., log transformation) to make the data more
# normally distributed.

import matplotlib.pyplot as plt

import seaborn as sns
import numpy as np
from scipy.stats import skew

# NOTE(review): `df` is not loaded in this cell; it is presumably the
# DataFrame left over from an earlier cell - confirm before running standalone.

# Calculate the skewness of every column.
skewness = df.apply(skew)

# Plot the original distribution on its own figure so that later histograms
# are not drawn on top of it.
plt.figure()
sns.histplot(df['Feature1'], kde=True)
plt.title('Histogram of Feature1')
plt.show()

# Log transformation of the feature (log1p = log(1 + x)).
df['Transformed_Feature1'] = np.log1p(df['Feature1'])

# Skewness of the transformed column. The column is already log-transformed,
# so it must not be passed through log1p a second time here.
transformed_skewness = df['Transformed_Feature1'].skew()
print(f"Skewness after log transformation: {transformed_skewness}")

# Visualize the transformed feature.
plt.figure()
sns.histplot(df['Transformed_Feature1'], kde=True)
plt.title('Histogram of Transformed_Feature1')
plt.show()

# Apply the same log transformation to other features.
df['Transformed_Feature2'] = np.log1p(df['Feature2'])

# Plot the overall distribution: both transformed features are deliberately
# overlaid on one shared figure for comparison.
plt.figure()
sns.histplot(df['Transformed_Feature1'], kde=True)
sns.histplot(df['Transformed_Feature2'], kde=True)
plt.title('Transformed Distributions')
plt.show()

# 13) Create a scatter plot matrix (pair plot) for a multi-dimensional dataset
# and analyze the relationships between different pairs of features.

#importing libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load your dataset into a pandas DataFrame.
df = pd.read_csv('dataset.csv')

# Create a pair plot (scatter plot matrix) of the DataFrame's columns.
sns.pairplot(df)
plt.show()

# Analyze specific relationships. Each scatter plot gets its own figure;
# otherwise it would be drawn onto whichever axes happen to be current.
plt.figure()
sns.scatterplot(x='Feature1', y='Feature2', data=df)
plt.title('Scatter Plot of Feature1 vs Feature2')
plt.show()

plt.figure()
sns.scatterplot(x='Feature3', y='Feature4', data=df)
plt.title('Scatter Plot of Feature3 vs Feature4')
plt.show()

# 14) Generate a box plot or violin plot to visualize the distribution of a
# numeric attribute for different categories in the dataset.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the data to visualize.
df = pd.read_csv('your_dataset.csv')

# Column names: the numeric attribute goes on the y axis and is grouped by
# the categorical attribute on the x axis.
numeric_attribute = 'NumericAttribute'
categorical_attribute = 'CategoricalAttribute'

# Both plots share exactly the same axis mapping and data.
plot_kwargs = {'x': categorical_attribute, 'y': numeric_attribute, 'data': df}

# Box plot on its own 10x6 figure.
plt.figure(figsize=(10, 6))
box_ax = sns.boxplot(**plot_kwargs)
box_ax.set_title(f'Box Plot of {numeric_attribute} for {categorical_attribute}')

# Violin plot on a second 10x6 figure.
plt.figure(figsize=(10, 6))
violin_ax = sns.violinplot(**plot_kwargs)
violin_ax.set_title(f'Violin Plot of {numeric_attribute} for {categorical_attribute}')

# 15) Select a dataset with a large number of features and apply dimensionality
# reduction techniques (e.g., PCA - Principal Component Analysis) to reduce
# the number of features.

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# seaborn is used for the scatter plot below but is not imported in this cell.
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Load the Iris dataset: feature matrix into a labelled DataFrame, class
# labels into `target`.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
target = iris.target

# Standardize the features before PCA.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df)

# Apply PCA to reduce dimensionality.
pca = PCA(n_components=2)  # You can choose the number of components
iris_pca = pca.fit_transform(iris_scaled)

# Create a DataFrame with the principal components and the class label,
# so the plot can colour points by class.
pca_df = pd.DataFrame(data=iris_pca, columns=['PC1', 'PC2'])
pca_df['Target'] = target

# Visualize the two principal components.
plt.figure(figsize=(10, 8))
sns.scatterplot(x='PC1', y='PC2', hue='Target', data=pca_df, palette='viridis')
plt.title('PCA - Principal Component Analysis')
plt.show()

# 16) Implement a feature selection algorithm (e.g., Recursive Feature
# Elimination) to choose the most relevant features from a dataset.

import pandas as pd

from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Iris dataset: X is the feature matrix, y the class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Split the dataset into training and testing sets (80% / 20%).
# A fixed seed keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Classifier used for feature ranking.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Create the RFE model and select the number of features to keep.
num_features_to_keep = 2
rfe = RFE(estimator=clf, n_features_to_select=num_features_to_keep)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# Fit a classifier on the selected features. RFE fits its own copy of the
# estimator (exposed as rfe.estimator_), so `clf` itself has to be fitted
# explicitly before it can predict.
clf.fit(X_train_rfe, y_train)

# Make predictions and evaluate accuracy on the test set.
y_pred = clf.predict(X_test_rfe)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with {num_features_to_keep} features: {accuracy:.2f}')

# Get the selected feature indices from RFE's boolean support mask.
selected_feature_indices = [
    i for i, selected in enumerate(rfe.support_) if selected
]

# iris.feature_names is a plain list, so it cannot be indexed with a list of
# positions; pick the names one index at a time.
selected_features = [iris.feature_names[i] for i in selected_feature_indices]
print('Selected features:', selected_features)

