
unit1

December 23, 2023

[ ]: # 9) Take a dataset with missing values and outliers and perform data
# preprocessing steps such as imputation, outlier treatment, and
# normalization in Python.

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import LocalOutlierFactor

# Import the dataset (assumed to contain only numeric columns)
df = pd.read_csv("dataset.csv")

# Handle missing values by imputing the column mean
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df_imp = pd.DataFrame(imp.fit_transform(df), columns=df.columns, index=df.index)

# Handling outliers
clf = LocalOutlierFactor(n_neighbors=20, contamination='auto')
y_pred = clf.fit_predict(df_imp)

# Boolean mask for inliers (LOF labels outliers as -1)
mask = y_pred != -1

# Keep only the inlier rows in a new DataFrame
df_inliers = df_imp[mask]

# Normalize (standardize) the features to zero mean and unit variance
scaler = StandardScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df_inliers),
                             columns=df_inliers.columns,
                             index=df_inliers.index)
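
# A minimal alternative sketch: if values scaled to the [0, 1] range are
# preferred over z-scores, MinMaxScaler could be used instead (this assumes
# the same df_inliers frame built above).
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
df_minmax = pd.DataFrame(minmax.fit_transform(df_inliers),
                         columns=df_inliers.columns,
                         index=df_inliers.index)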

[ ]: # 11) Given a dataset, calculate the correlation matrix and interpret the
# relationships between different features.

import pandas as pd

# import dataset
df = pd.read_csv('dataset.csv')

# Calculate the correlation matrix (numeric columns only)
correlation_matrix = df.corr(numeric_only=True)

import seaborn as sns
import matplotlib.pyplot as plt

#visualization
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

plt.title('Correlation Matrix')
plt.show()

# Interpret the relationship between two specific features
correlation_feature1_feature2 = correlation_matrix.loc['Feature1', 'Feature2']
print(f"Correlation between Feature1 and Feature2: {correlation_feature1_feature2:.2f}")
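
# A small, hedged sketch for interpretation: rank off-diagonal feature pairs
# by absolute correlation to see which relationships are strongest (the
# column names depend on whatever dataset.csv contains).
import numpy as np
upper = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
strongest_pairs = (correlation_matrix.mask(upper).abs()
                   .unstack().dropna().sort_values(ascending=False))
print(strongest_pairs.head())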

[ ]: # 12) Identify the presence of skewed distributions in a dataset and apply
# suitable transformations (e.g., log transformation) to make the data more
# normally distributed.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import skew

# Reload the dataset so this cell runs on its own
df = pd.read_csv('dataset.csv')

# Calculate the skewness of each numeric feature
skewness = df.select_dtypes(include=np.number).apply(skew)
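
# A rough, assumption-laden check: flag features whose absolute skewness
# exceeds 1, a common rule-of-thumb threshold for "highly skewed".
skewed_features = skewness[skewness.abs() > 1].index.tolist()
print('Highly skewed features:', skewed_features)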

# Plot histograms
sns.histplot(df['Feature1'], kde=True)
plt.title('Histogram of Feature1')
plt.show()

# Apply a log transformation to the feature
df['Transformed_Feature1'] = np.log1p(df['Feature1'])

transformed_skewness = df['Transformed_Feature1'].skew()
print(f"Skewness after log transformation: {transformed_skewness:.2f}")

# Visualize the transformed feature
sns.histplot(df['Transformed_Feature1'], kde=True)
plt.title('Histogram of Transformed_Feature1')
plt.show()

# Applying log transformations to other features
df['Transformed_Feature2'] = np.log1p(df['Feature2'])

# Plot the transformed distributions together
sns.histplot(df['Transformed_Feature1'], kde=True, label='Transformed_Feature1')
sns.histplot(df['Transformed_Feature2'], kde=True, label='Transformed_Feature2')
plt.legend()
plt.title('Transformed Distributions')
plt.show()

[ ]: # 13) Create a scatter plot matrix (pair plot) for a multi-dimensional dataset
# and analyze the relationships between different pairs of features.

#importing libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load your dataset into a pandas DataFrame
df = pd.read_csv('dataset.csv')

# Create a pair plot
sns.pairplot(df)
plt.show()
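
# Optional variant: if the dataset has a categorical label column (the name
# 'Category' below is only an assumption), colouring the pair plot by it can
# make group-wise relationships easier to see.
# sns.pairplot(df, hue='Category')
# plt.show()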

# Analyze specific relationships
sns.scatterplot(x='Feature1', y='Feature2', data=df)
plt.title('Scatter Plot of Feature1 vs Feature2')
plt.show()

sns.scatterplot(x='Feature3', y='Feature4', data=df)
plt.title('Scatter Plot of Feature3 vs Feature4')
plt.show()

[ ]: # 14) Generate a box plot or violin plot to visualize the distribution of a
# numeric attribute for different categories in the dataset.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load your dataset into a pandas DataFrame
df = pd.read_csv('your_dataset.csv')

# Choose a numeric attribute and a categorical attribute for visualization
numeric_attribute = 'NumericAttribute'
categorical_attribute = 'CategoricalAttribute'

# Box Plot

plt.figure(figsize=(10, 6))
sns.boxplot(x=categorical_attribute, y=numeric_attribute, data=df)
plt.title(f'Box Plot of {numeric_attribute} for {categorical_attribute}')
plt.show()

# Violin Plot
plt.figure(figsize=(10, 6))
sns.violinplot(x=categorical_attribute, y=numeric_attribute, data=df)
plt.title(f'Violin Plot of {numeric_attribute} for {categorical_attribute}')
plt.show()
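
# Optional sketch: overlaying the raw observations on the violin plot (same
# assumed column names as above) makes sparse categories easier to judge.
plt.figure(figsize=(10, 6))
sns.violinplot(x=categorical_attribute, y=numeric_attribute, data=df, inner=None)
sns.stripplot(x=categorical_attribute, y=numeric_attribute, data=df, color='black', size=3)
plt.title(f'Violin Plot of {numeric_attribute} with Raw Points')
plt.show()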

[ ]: # 15) Select a dataset with a large number of features and apply dimensionality
# reduction techniques (e.g., PCA - Principal Component Analysis) to reduce
# the number of features.

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Iris dataset
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
target = iris.target

# Standardize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df)

# Apply PCA to reduce dimensionality
pca = PCA(n_components=2)  # You can choose the number of components
iris_pca = pca.fit_transform(iris_scaled)

# Create a DataFrame with the principal components
pca_df = pd.DataFrame(data=iris_pca, columns=['PC1', 'PC2'])
pca_df['Target'] = target

# Visualize
plt.figure(figsize=(10, 8))
sns.scatterplot(x='PC1', y='PC2', hue='Target', data=pca_df, palette='viridis', s=70)
plt.title('PCA - Principal Component Analysis')
plt.show()
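
# A quick check on how much information the two retained components keep;
# explained_variance_ratio_ comes from the fitted PCA object above.
print('Explained variance ratio:', pca.explained_variance_ratio_)
print('Total variance retained:', pca.explained_variance_ratio_.sum())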

[ ]: # 16) Implement a feature selection algorithm (e.g., Recursive Feature
# Elimination) to choose the most relevant features from a dataset.

import pandas as pd

from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Classifier used by RFE for feature ranking
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Create the RFE model and select the number of features to keep
num_features_to_keep = 2
rfe = RFE(estimator=clf, n_features_to_select=num_features_to_keep)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# Fit a classifier on the selected features
clf.fit(X_train_rfe, y_train)

# Make predictions and evaluate accuracy on the test set
y_pred = clf.predict(X_test_rfe)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with {num_features_to_keep} features: {accuracy:.2f}')

# Get the selected feature indices and names
selected_feature_indices = [i for i, selected in enumerate(rfe.support_) if selected]
selected_features = [iris.feature_names[i] for i in selected_feature_indices]
print('Selected features:', selected_features)
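
# For context, rfe.ranking_ reports the rank assigned to every original
# feature (1 means the feature was selected).
print('Feature ranking:', rfe.ranking_)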
