
unit1

December 23, 2023

[ ]: # 9) Take a dataset with missing values and outliers and perform data
# preprocessing steps such as imputation, outlier treatment, and
# normalization in Python.

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import LocalOutlierFactor

# Import the dataset (assumed to contain only numeric columns)
df = pd.read_csv("dataset.csv")

# Handle missing values by imputing the column mean
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df_imp = pd.DataFrame(imp.fit_transform(df), columns=df.columns, index=df.index)

# Handling outliers
clf = LocalOutlierFactor(n_neighbors=20, contamination='auto')
y_pred = clf.fit_predict(df_imp)

# Boolean mask for inliers (LOF labels outliers as -1)
mask = y_pred != -1

# Keep only the inlier rows in a new DataFrame
df_inliers = df_imp[mask]

# Normalize (standardize) the features to zero mean and unit variance
scaler = StandardScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df_inliers),
                             columns=df_inliers.columns,
                             index=df_inliers.index)
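
# A minimal alternative sketch: if values scaled to the [0, 1] range are
# preferred over z-scores, MinMaxScaler could be used instead (this assumes
# the same df_inliers frame built above).
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
df_minmax = pd.DataFrame(minmax.fit_transform(df_inliers),
                         columns=df_inliers.columns,
                         index=df_inliers.index)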

[ ]: # 11) Given a dataset, calculate the correlation matrix and interpret the
# relationships between different features.

import pandas as pd

# import dataset
df = pd.read_csv('dataset.csv')

# Calculate the correlation matrix (numeric columns only)
correlation_matrix = df.corr(numeric_only=True)

import seaborn as sns
import matplotlib.pyplot as plt

#visualization
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

plt.title('Correlation Matrix')
plt.show()

# Interpret the relationship between two specific features
correlation_feature1_feature2 = correlation_matrix.loc['Feature1', 'Feature2']
print(f"Correlation between Feature1 and Feature2: {correlation_feature1_feature2:.2f}")
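
# A small, hedged sketch for interpretation: rank off-diagonal feature pairs
# by absolute correlation to see which relationships are strongest (the
# column names depend on whatever dataset.csv contains).
import numpy as np
upper = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
strongest_pairs = (correlation_matrix.mask(upper).abs()
                   .unstack().dropna().sort_values(ascending=False))
print(strongest_pairs.head())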

[ ]: # 12) Identify the presence of skewed distributions in a dataset and apply
# suitable transformations (e.g., log transformation) to make the data more
# normally distributed.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import skew

# Reload the dataset so this cell runs on its own
df = pd.read_csv('dataset.csv')

# Calculate the skewness of each numeric feature
skewness = df.select_dtypes(include=np.number).apply(skew)
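
# A rough, assumption-laden check: flag features whose absolute skewness
# exceeds 1, a common rule-of-thumb threshold for "highly skewed".
skewed_features = skewness[skewness.abs() > 1].index.tolist()
print('Highly skewed features:', skewed_features)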

# Plot histograms
sns.histplot(df['Feature1'], kde=True)
plt.title('Histogram of Feature1')
plt.show()

# Apply a log transformation to the feature
df['Transformed_Feature1'] = np.log1p(df['Feature1'])

transformed_skewness = df['Transformed_Feature1'].skew()
print(f"Skewness after log transformation: {transformed_skewness:.2f}")

# Visualize the transformed feature
sns.histplot(df['Transformed_Feature1'], kde=True)
plt.title('Histogram of Transformed_Feature1')
plt.show()

# Applying log transformations to other features
df['Transformed_Feature2'] = np.log1p(df['Feature2'])

# Plot the transformed distributions together
sns.histplot(df['Transformed_Feature1'], kde=True, label='Transformed_Feature1')
sns.histplot(df['Transformed_Feature2'], kde=True, label='Transformed_Feature2')
plt.legend()
plt.title('Transformed Distributions')
plt.show()

[ ]: # 13) Create a scatter plot matrix (pair plot) for a multi-dimensional dataset
# and analyze the relationships between different pairs of features.

#importing libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load your dataset into a pandas DataFrame
df = pd.read_csv('dataset.csv')

# Create a pair plot
sns.pairplot(df)
plt.show()
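
# Optional variant: if the dataset has a categorical label column (the name
# 'Category' below is only an assumption), colouring the pair plot by it can
# make group-wise relationships easier to see.
# sns.pairplot(df, hue='Category')
# plt.show()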

# Analyze specific relationships
sns.scatterplot(x='Feature1', y='Feature2', data=df)
plt.title('Scatter Plot of Feature1 vs Feature2')
plt.show()

sns.scatterplot(x='Feature3', y='Feature4', data=df)
plt.title('Scatter Plot of Feature3 vs Feature4')
plt.show()

[ ]: # 14) Generate a box plot or violin plot to visualize the distribution of a
# numeric attribute for different categories in the dataset.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load your dataset into a pandas DataFrame
df = pd.read_csv('your_dataset.csv')

# Choose a numeric attribute and a categorical attribute for visualization
numeric_attribute = 'NumericAttribute'
categorical_attribute = 'CategoricalAttribute'

# Box Plot

plt.figure(figsize=(10, 6))
sns.boxplot(x=categorical_attribute, y=numeric_attribute, data=df)
plt.title(f'Box Plot of {numeric_attribute} for {categorical_attribute}')
plt.show()

# Violin Plot
plt.figure(figsize=(10, 6))
sns.violinplot(x=categorical_attribute, y=numeric_attribute, data=df)
plt.title(f'Violin Plot of {numeric_attribute} for {categorical_attribute}')
plt.show()
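
# Optional sketch: overlaying the raw observations on the violin plot (same
# assumed column names as above) makes sparse categories easier to judge.
plt.figure(figsize=(10, 6))
sns.violinplot(x=categorical_attribute, y=numeric_attribute, data=df, inner=None)
sns.stripplot(x=categorical_attribute, y=numeric_attribute, data=df, color='black', size=3)
plt.title(f'Violin Plot of {numeric_attribute} with Raw Points')
plt.show()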

[ ]: # 15) Select a dataset with a large number of features and apply dimensionality
# reduction techniques (e.g., PCA - Principal Component Analysis) to reduce
# the number of features.

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Iris dataset
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
target = iris.target

# Standardize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df)

# Apply PCA to reduce dimensionality
pca = PCA(n_components=2)  # You can choose the number of components
iris_pca = pca.fit_transform(iris_scaled)

# Create a DataFrame with the principal components
pca_df = pd.DataFrame(data=iris_pca, columns=['PC1', 'PC2'])
pca_df['Target'] = target

# Visualize
plt.figure(figsize=(10, 8))
sns.scatterplot(x='PC1', y='PC2', hue='Target', data=pca_df, palette='viridis', s=70)
plt.title('PCA - Principal Component Analysis')
plt.show()
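
# A quick check on how much information the two retained components keep;
# explained_variance_ratio_ comes from the fitted PCA object above.
print('Explained variance ratio:', pca.explained_variance_ratio_)
print('Total variance retained:', pca.explained_variance_ratio_.sum())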

[ ]: # 16) Implement a feature selection algorithm (e.g., Recursive Feature
# Elimination) to choose the most relevant features from a dataset.

import pandas as pd

from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Classifier used by RFE for feature ranking
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Create the RFE model and select the number of features to keep
num_features_to_keep = 2
rfe = RFE(estimator=clf, n_features_to_select=num_features_to_keep)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# Fit a classifier on the selected features
clf.fit(X_train_rfe, y_train)

# Make predictions and evaluate accuracy on the test set
y_pred = clf.predict(X_test_rfe)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with {num_features_to_keep} features: {accuracy:.2f}')

# Get the selected feature indices and names
selected_feature_indices = [i for i, selected in enumerate(rfe.support_) if selected]
selected_features = [iris.feature_names[i] for i in selected_feature_indices]
print('Selected features:', selected_features)
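
# For context, rfe.ranking_ reports the rank assigned to every original
# feature (1 means the feature was selected).
print('Feature ranking:', rfe.ranking_)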
