
PRACTICAL FILE

Basics of Machine Learning


PCC-CSE-402G

Submitted by: CSE–A (8th Sem)

Submitted to: Dr. Kamaldeep, CSE Dept., UIET MDU
Index

Sr. No.  Title
1.   Calculating the basic statistical parameters for a normally distributed dataset.
2.   Plotting a Normal Distribution as a Scatter Plot.
3.   Plotting a Normal Distribution as a Histogram.
4.   Plotting a Histogram of a Uniform Distribution.
5.   Predicting the House Prices of a neighborhood using Linear Regression.
6.   Reading a CSV file and plotting subplots for the various variables.
7.   Predicting the average Canadian Per Capita Income using Linear Regression.
8.   Using a multivariate Linear Regression model for predicting House Prices.
9.   Predicting Salaries for possible candidates for a position using Linear Regression.
10.  Training and Testing a simple Logistic Regression Model.
11.  Saving the contents of a model in a pickle file.
12.  Performing K-Means Clustering on a Dataset.
13.  Plotting an ROC curve for a prediction model.
14.  Calculating the F1 Score for a given dataset and results.
15.  Classifying Objects in the Iris Dataset.
1. Calculating the basic statistical parameters for a normally distributed dataset.

import statistics
import numpy as np

# 100 samples from a normal distribution with mean 1 and standard deviation 100000
x = np.random.normal(1, 100000, 100)

print("Dataset:\n", x)
print("\nMean of the dataset:", statistics.mean(x))
print("Median of the dataset:", statistics.median(x))
# Note: every float in a continuous sample is unique, so statistics.mode()
# simply returns the first value here (and raises an error on Python < 3.8).
print("Mode of the dataset:", statistics.mode(x))

Output:
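As a quick cross-check (a minimal sketch using only NumPy rather than the statistics module; the histogram-based mode estimate is an assumption, since the exact mode of a continuous sample is not meaningful):

import numpy as np

x = np.random.normal(1, 100000, 100)

# NumPy equivalents of the statistics-module calls above
print("Mean:", np.mean(x))
print("Median:", np.median(x))

# For a continuous sample, estimate the mode from the most populated histogram bin
counts, edges = np.histogram(x, bins=10)
peak = np.argmax(counts)
print("Modal bin:", edges[peak], "to", edges[peak + 1])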
2. Plotting a Normal Distribution as a Scatter Plot.

import numpy as np
import statistics
import matplotlib.pyplot as plt

x = np.random.normal(1, 100000, 500)

plt.title("Scatter of a Normal Distribution")
plt.xlabel("Randomly Generated Numbers")
plt.ylabel("Frequency")

x_mean = "Mean: {0}".format(statistics.mean(x))
x_mode = "Mode: {0}".format(statistics.mode(x))
x_median = "Median: {0}".format(statistics.median(x))

plt.grid()
# Scatter the bin centres against their frequencies so the plot matches the
# axis labels (plt.scatter(x, x) would only draw a straight diagonal line).
counts, edges = np.histogram(x, bins=20)
centers = (edges[:-1] + edges[1:]) / 2
plt.scatter(centers, counts)
plt.text(0, 0.1, x_mean, fontsize=14, transform=plt.gcf().transFigure)
plt.text(0, 0.05, x_mode, fontsize=14, transform=plt.gcf().transFigure)
plt.text(0, 0, x_median, fontsize=14, transform=plt.gcf().transFigure)
plt.subplots_adjust(bottom=0.25)
plt.show()
Output:
3. Plotting a Normal Distribution as a Histogram.

import numpy as np
import statistics
import matplotlib.pyplot as plt

x = np.random.normal(1, 100000, 1000)

print(x)
plt.title("Histogram of a Normal Distribution")
plt.xlabel("Randomly Generated Numbers")
plt.ylabel("Frequency")

x_mean = "Mean: {0}".format(statistics.mean(x))
x_mode = "Mode: {0}".format(statistics.mode(x))
x_median = "Median: {0}".format(statistics.median(x))

plt.grid()
plt.hist(x)
plt.text(0, 0.1, x_mean, fontsize=14, transform=plt.gcf().transFigure)
plt.text(0, 0.05, x_mode, fontsize=14, transform=plt.gcf().transFigure)
plt.text(0, 0, x_median, fontsize=14, transform=plt.gcf().transFigure)
plt.subplots_adjust(bottom=0.25)
plt.show()
Output:
4. Plotting a Histogram of a Uniform Distribution.

import numpy as np
import statistics
import matplotlib.pyplot as plt

x = np.random.uniform(1, 100000, 1000)

plt.title("Histogram of a Uniform Distribution")
plt.xlabel("Randomly Generated Numbers")
plt.ylabel("Frequency")

x_mean = "Mean: {0}".format(statistics.mean(x))
x_mode = "Mode: {0}".format(statistics.mode(x))
x_median = "Median: {0}".format(statistics.median(x))

plt.grid()
plt.hist(x)
plt.text(0, 0.1, x_mean, fontsize=14, transform=plt.gcf().transFigure)
plt.text(0, 0.05, x_mode, fontsize=14, transform=plt.gcf().transFigure)
plt.text(0, 0, x_median, fontsize=14, transform=plt.gcf().transFigure)
plt.subplots_adjust(bottom=0.25)
plt.show()
Output:
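A small optional tweak (a sketch; density is a standard plt.hist parameter): passing density=True rescales the bars so the histogram integrates to 1, which makes the uniform and normal shapes directly comparable.

import numpy as np
import matplotlib.pyplot as plt

x = np.random.uniform(1, 100000, 1000)

# density=True normalizes bar heights so the total area under the histogram is 1
plt.hist(x, bins=20, density=True)
plt.xlabel("Randomly Generated Numbers")
plt.ylabel("Probability Density")
plt.show()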
5. Predicting the House Prices of a neighborhood using Linear Regression.

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

x = np.array([[2600], [3000], [3200], [3600], [4000]])
# data = np.array([[2600, 550000], [3000, 565000], [3200, 610000], [3600, 680000], [4000, 725000]])
y = np.array([[550000], [565000], [610000], [680000], [725000]])

# print(data)
reg = linear_model.LinearRegression()
reg.fit(x, y)

user = input("Enter the Area: ")
print(reg.predict([[int(user)]]))
print(reg.coef_)
print(reg.intercept_)

plt.title("Housing Prices in Monroe, NJ")
plt.xlabel("Area (sq. feet)")
plt.ylabel("Price ($)")

plt.grid()
plt.plot(x, y)
plt.show()
Output:
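An optional extension (a self-contained sketch using the same data as above): overlaying the fitted regression line on the raw points makes the quality of the fit visible, which plt.plot(x, y) alone does not.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

x = np.array([[2600], [3000], [3200], [3600], [4000]])
y = np.array([550000, 565000, 610000, 680000, 725000])

reg = LinearRegression().fit(x, y)

# Observed prices as points, fitted model as a straight line
plt.scatter(x, y, label="Observed prices")
plt.plot(x, reg.predict(x), color="red", label="Fitted line")
plt.xlabel("Area (sq. feet)")
plt.ylabel("Price ($)")
plt.legend()
plt.show()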
6. Reading a CSV file and plotting subplots for the various variables.

import pandas as pd
from matplotlib import pyplot as plt

df = pd.read_csv("HR_comma_sep.csv")

print(df.head())

plt.subplot(3, 3, 1)
plt.scatter(df.left, df.satisfaction_level)

plt.subplot(3, 3, 2)
plt.scatter(df.left, df.last_evaluation)

plt.subplot(3, 3, 3)
plt.scatter(df.left, df.number_project)

plt.subplot(3, 3, 4)
plt.scatter(df.left, df.average_monthly_hours)

plt.subplot(3, 3, 5)
plt.scatter(df.left, df.time_spend_company)

plt.subplot(3, 3, 6)
plt.scatter(df.left, df.Work_accident)

plt.subplot(3, 3, 7)
plt.scatter(df.left, df.promotion_last_5years)
plt.show()
Output:
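The seven near-identical subplot calls can also be written as a loop (a sketch; the column names are copied from the code above and assume the same CSV):

import pandas as pd
from matplotlib import pyplot as plt

df = pd.read_csv("HR_comma_sep.csv")

# Columns plotted against the 'left' flag, as in the subplots above
columns = ["satisfaction_level", "last_evaluation", "number_project",
           "average_monthly_hours", "time_spend_company", "Work_accident",
           "promotion_last_5years"]

for i, col in enumerate(columns, start=1):
    plt.subplot(3, 3, i)
    plt.scatter(df.left, df[col])
    plt.title(col, fontsize=8)

plt.tight_layout()
plt.show()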
7. Predicting the average Canadian Per Capita Income using Linear Regression.

import pandas as pd
from sklearn import linear_model

df = pd.read_csv("canada_per_capita_income.csv")

year = df.drop("income", axis="columns")
income = df.drop("year", axis="columns")

reg = linear_model.LinearRegression()
reg.fit(year, income)
print(reg.predict([[2022]]))

Output:
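A quick visual check of the extrapolation (a sketch; the 'year' and 'income' column names are taken from the code above):

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

df = pd.read_csv("canada_per_capita_income.csv")
X = df[["year"]]
y = df["income"]

reg = LinearRegression().fit(X, y)

# Historical data as points, the fitted trend as a line
plt.scatter(X, y, label="Historical income")
plt.plot(X, reg.predict(X), color="red", label="Linear fit")
plt.xlabel("Year")
plt.ylabel("Per Capita Income")
plt.legend()
plt.show()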
8. Using a multivariate Linear Regression model for predicting House Prices.

import pandas as pd
from sklearn import linear_model

df = pd.read_csv("homeprices.csv")

print(df)

# Fill the missing bedroom counts with the column median
df.bedrooms = df.bedrooms.fillna(df.bedrooms.median())

reg = linear_model.LinearRegression()
reg.fit(df.drop("price", axis="columns"), df.price)

cf = reg.coef_
icept = reg.intercept_

print("Coefficients:\n", cf)
print("Intercept:\n", icept)

# predict() returns an array, so take its first element before converting
pred = int(reg.predict([[3000, 3, 40]])[0])

print("Prediction for a 40 year old house with 3000 sq ft Area and 3 Bedrooms:\n$", pred)

Output:
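As a sanity check (a sketch meant to be appended to the program above, since it reuses cf and icept): a linear regression prediction is just the dot product of the coefficients with the feature values, plus the intercept.

import numpy as np

# Should reproduce reg.predict([[3000, 3, 40]]) from the program above
features = np.array([3000, 3, 40])
manual_prediction = np.dot(cf, features) + icept
print("Manual prediction:", manual_prediction)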
9. Predicting Salaries for possible candidates for a position using Linear Regression.

import pandas as pd
from sklearn import linear_model

df = pd.read_csv("hiring.csv")

'''list_of_columns = list(df.columns)
print("Columns:\n", list_of_columns)'''

# Fill missing scores and experience with the respective column medians
df.test_score = df.test_score.fillna(df.test_score.median())
df.experience = df.experience.fillna(df.experience.median())

print(df)

reg = linear_model.LinearRegression()
reg.fit(df.drop("salary", axis="columns"), df.salary)

cf = reg.coef_
icept = reg.intercept_

print("Coefficients:\n", cf)
print("Intercept:\n", icept)

print("Predicted Salary for Candidate 1:\n", reg.predict([[2, 9, 6]]))
print("Predicted Salary for Candidate 2:\n", reg.predict([[12, 10, 10]]))
Output:
10. Training and Testing a simple Logistic Regression Model.

import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

age = np.array([[10], [50], [20], [25], [30], [35], [39], [50]])
# Labels are kept 1-D, as scikit-learn expects for the target vector
ins_bgt = np.array([0, 1, 0, 0, 1, 1, 1, 1])

plt.scatter(age, ins_bgt)
plt.show()

X_train, X_test, y_train, y_test = train_test_split(age, ins_bgt, train_size=0.8)

model = LogisticRegression()
model.fit(X_train, y_train)

print("X_train:\n", X_train)
print("y_train:\n", y_train)
print("X_test:\n", X_test)
print("y_test:\n", y_test)

y_predicted = model.predict(X_test)
print("Prediction Probability:\n", model.predict_proba(X_test))
print("Score:\n", model.score(X_test, y_test))

print("y_predicted:\n", y_predicted)
print("X_test:\n", X_test)

print("Coefficients:\n", model.coef_)
print("Intercept:\n", model.intercept_)
Output:
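To query the trained model for a new customer (a short sketch appended to the program above; the age of 45 is purely illustrative):

# Will the model predict that a 45-year-old buys insurance?
new_age = [[45]]
print("Prediction:", model.predict(new_age))
print("P(no), P(yes):", model.predict_proba(new_age))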
11. Saving the contents of a model in a pickle file.

import numpy as np
import os
import cv2
import random
import pickle

DATADIR = r"C:/Documents/LabWork"
CATEGORIES = ["Me", "Anyone Other than me"]

IMG_SIZE = 256

training_data = []

def create_training_data():
    for category in CATEGORIES:
        path = os.path.join(DATADIR, category)
        class_num = CATEGORIES.index(category)
        for img in os.listdir(path):
            img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_GRAYSCALE)
            # Resize so every sample matches the (256, 256) shape used below
            img_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
            training_data.append([img_array, class_num])

create_training_data()

print(len(training_data))

random.shuffle(training_data)
for sample in training_data:
    print(sample[1])

X = []
y = []

for features, label in training_data:
    X.append(features)
    y.append(label)

X = np.array(X).reshape(-1, IMG_SIZE, IMG_SIZE, 1)
print(type(X))

pickle_out = open("X.pickle", "wb")
pickle.dump(X, pickle_out)
pickle_out.close()

pickle_out = open("y.pickle", "wb")
pickle.dump(y, pickle_out)
pickle_out.close()
Output:
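To confirm the dump worked, the arrays can be read back (a minimal sketch using the file names from the program above):

import pickle

# Load the feature array and labels saved above
with open("X.pickle", "rb") as f:
    X = pickle.load(f)
with open("y.pickle", "rb") as f:
    y = pickle.load(f)

print("X shape:", X.shape)
print("Number of labels:", len(y))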
12. Performing K-Means Clustering on a Dataset.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# The Data: two blobs of 50 points each
X = -2 * np.random.rand(100, 2)
X1 = 1 + 2 * np.random.rand(50, 2)
X[50:100, :] = X1

# Displaying the Data
plt.scatter(X[:, 0], X[:, 1], s=50, c='b')
plt.show()

# Using K-Means, taking "K" as 2
Kmean = KMeans(n_clusters=2)
Kmean.fit(X)

# Finding the Centroids and displaying them in separate colours
centroids = Kmean.cluster_centers_
print(centroids)
plt.scatter(X[:, 0], X[:, 1], s=50, c='b')
plt.scatter(centroids[0][0], centroids[0][1], s=200, c='g', marker='s')
plt.scatter(centroids[1][0], centroids[1][1], s=200, c='r', marker='s')
plt.show()

# Testing
sample_test = np.array([-3.0, -3.0])
second_test = sample_test.reshape(1, -1)
test_prediction = Kmean.predict(second_test)
print(test_prediction)
Output:

The Unlabelled Dataset


The Calculated Cluster Means

The Test Predictions
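Here K = 2 was chosen by eye; for an unfamiliar dataset the elbow method is a common way to pick K (a sketch on the same kind of synthetic data; inertia_ is the within-cluster sum of squares that scikit-learn exposes):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

X = -2 * np.random.rand(100, 2)
X[50:100, :] = 1 + 2 * np.random.rand(50, 2)

# Inertia for K = 1..8; the bend ("elbow") in the curve suggests a good K
ks = range(1, 9)
inertias = [KMeans(n_clusters=k).fit(X).inertia_ for k in ks]

plt.plot(ks, inertias, marker="o")
plt.xlabel("Number of clusters K")
plt.ylabel("Inertia")
plt.show()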


13. Plotting an ROC curve for a prediction model.

import pandas as pd
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = pd.read_csv("default.csv")
X = data[['student', 'balance', 'income']]
y = data['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

model = LogisticRegression()
model.fit(X_train, y_train)

print("X_train:\n", X_train)
print("y_train:\n", y_train)
print("X_test:\n", X_test)
print("y_test:\n", y_test)

y_predicted = model.predict(X_test)
print("Prediction Probability:\n", model.predict_proba(X_test))
print("Score:\n", model.score(X_test, y_test))

print("y_predicted:\n", y_predicted)
print("X_test:\n", X_test)

print("Coefficients:\n", model.coef_)
print("Intercept:\n", model.intercept_)

# Probability of the positive class, used to sweep the ROC thresholds
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

plt.plot(fpr, tpr, label="AUC=" + str(auc))
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.legend(loc=4)
plt.show()
Output:

The ROC Curve


The Rest of the Outputs
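The ROC curve also suggests an operating threshold. One common heuristic is Youden's J statistic, the threshold that maximises TPR − FPR (a sketch appended to the program above, reusing metrics, y_test and y_pred_proba):

import numpy as np

# roc_curve returns one threshold per (FPR, TPR) point
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)

# Youden's J: the point on the curve farthest above the diagonal
best = np.argmax(tpr - fpr)
print("Best threshold:", thresholds[best])
print("TPR:", tpr[best], "FPR:", fpr[best])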
14. Calculating the F1 Score for a given dataset and results.

import numpy as np
from sklearn.metrics import f1_score

# 160 actual positives followed by 240 actual negatives
actual = np.repeat([1, 0], repeats=[160, 240])
# Predictions: 120 TP, 40 FN, 70 FP, 170 TN
pred = np.repeat([1, 0, 1, 0], repeats=[120, 40, 70, 170])

print("F1 Score: ", f1_score(actual, pred))

Output:
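The result can be verified by hand from the counts encoded above (TP = 120, FN = 40, FP = 70, TN = 170): precision = 120/190 ≈ 0.632, recall = 120/160 = 0.75, and F1 = 2PR/(P + R) = 240/350 ≈ 0.686. A minimal sketch of the same arithmetic:

# Confusion-matrix counts from the arrays above
TP, FN, FP = 120, 40, 70

precision = TP / (TP + FP)   # 120 / 190
recall = TP / (TP + FN)      # 120 / 160
f1 = 2 * precision * recall / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)  # ~0.6857, matching f1_score above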
15. Classifying Objects in the Iris Dataset.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics

data = pd.read_csv("iris_data.csv")

print("Data:\n", data.head(5))
print("\nData Summary:\n", data.describe())
print("\nSpecies:\n", data.groupby('species').size())

train, test = train_test_split(data, test_size=0.4, stratify=data['species'], random_state=42)

n_bins = 10
fig, axs = plt.subplots(2, 2)
axs[0, 0].hist(train['sepal_length'], bins=n_bins)
axs[0, 0].set_title('Sepal Length')
axs[0, 1].hist(train['sepal_width'], bins=n_bins)
axs[0, 1].set_title('Sepal Width')
axs[1, 0].hist(train['petal_length'], bins=n_bins)
axs[1, 0].set_title('Petal Length')
axs[1, 1].hist(train['petal_width'], bins=n_bins)
axs[1, 1].set_title('Petal Width')
fig.tight_layout(pad=1.0)
plt.show()

fn = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
cn = ['setosa', 'versicolor', 'virginica']

sns.violinplot(x="species", y="petal_length", data=train, order=cn, palette='colorblind')
plt.show()

# numeric_only restricts the correlation to the four measurement columns
corrmat = train.corr(numeric_only=True)
sns.heatmap(corrmat, annot=True, square=True)
plt.show()

X_train = train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y_train = train.species
X_test = test[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y_test = test.species

mod_dt = DecisionTreeClassifier(max_depth=3, random_state=1)
mod_dt.fit(X_train, y_train)
prediction = mod_dt.predict(X_test)
print("The Accuracy of the Decision Tree:", metrics.accuracy_score(prediction, y_test))

plt.figure(figsize=(10, 8))
plot_tree(mod_dt, feature_names=fn, class_names=cn, filled=True)
plt.show()
Output:

Histogram Plots of the Variables


Violin Plots of the Species vs. Their Petal Lengths
Heat Map of the Correlation Matrix
The Decision Tree (Along with the Gini Index)
The Rest of the Outputs
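The original import list also pulled in several other classifiers. A short comparison sketch (assuming the same X_train, y_train, X_test and y_test as above; the hyperparameters are illustrative defaults) shows how they stack up against the decision tree:

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Fit each model on the same split and report test accuracy
models = {
    "Gaussian Naive Bayes": GaussianNB(),
    "K-Nearest Neighbours": KNeighborsClassifier(n_neighbors=5),
    "Support Vector Machine": SVC(),
    "Logistic Regression": LogisticRegression(max_iter=200),
}

for name, clf in models.items():
    clf.fit(X_train, y_train)
    acc = metrics.accuracy_score(y_test, clf.predict(X_test))
    print(name, ":", round(acc, 3))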
