Professional Documents
Culture Documents
ipynb - Colaboratory
ANANYAA GUPTA
20BCT0177
ML MTT
24/11/21
Q3> Breast cancer dataset
# numerical analysis
import numpy as np
# storing and processing in dataframes
import pandas as pd
# simple plotting
import matplotlib.pyplot as plt
# advanced plotting
import seaborn as sns
# splitting dataset into train and test
from sklearn.model_selection import train_test_split
# scaling features
from sklearn.preprocessing import StandardScaler
# selecting important features
from sklearn.feature_selection import RFECV
# k nearest neighbors model
from sklearn.neighbors import KNeighborsClassifier
# accuracy
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    roc_curve,
    auc,
    classification_report,
)
# Apply seaborn's 'whitegrid' theme to the figures drawn below.
sns.set_style(style='whitegrid')
# Two hex colours reused as the palette for the class plots.
pal = [
    '#0e2433',
    '#ff007f',
]
# Load the dataset from '/content/data.csv' into a dataframe.
df = pd.read_csv(filepath_or_buffer='/content/data.csv')
# Preview the top rows of the table.
df.head()
# (rows, columns) of the loaded table.
df.shape
(569, 33)
https://colab.research.google.com/drive/11xNT5fA-UIp1kMaXUzHSB6oeLSRp2XOE#scrollTo=PHeACsYqmGP7&printMode=true 1/4
11/24/21, 5:00 PM Untitled5.ipynb - Colaboratory
# Show the column labels of the dataframe (notebook cell display).
df.columns
dtype='object')
# Print how many rows fall into each diagnosis class.
print(df['diagnosis'].value_counts())
# Bar chart of the class distribution.  The column is named through the
# x=/data= keywords: seaborn deprecated passing the plotted variable
# positionally, and newer releases treat the first positional argument
# as `data` rather than as the variable to count.
sns.countplot(x='diagnosis', data=df, palette=pal, alpha=0.8)
plt.show()
B 357
M 212
# Overlaid histograms of 'radius_mean' for the two diagnosis classes.
fig, ax = plt.subplots()
malignant_radius = df.loc[df["diagnosis"] == "M", 'radius_mean']
benign_radius = df.loc[df["diagnosis"] == "B", 'radius_mean']
# Both histograms share 20 bins over the same 0-30 range so the bars line up.
m = ax.hist(
    malignant_radius,
    bins=20,
    range=(0, 30),
    label="Malignant",
    alpha=0.7,
    color='#232121',
)
b = ax.hist(
    benign_radius,
    bins=20,
    range=(0, 30),
    label="Benign",
    alpha=0.7,
    color='#df2378',
)
# Axis labels, title and legend on the single axes created above.
ax.set_xlabel("Radius")
ax.set_ylabel("Count")
ax.set_title("Mean Radius")
ax.legend()
plt.show()
https://colab.research.google.com/drive/11xNT5fA-UIp1kMaXUzHSB6oeLSRp2XOE#scrollTo=PHeACsYqmGP7&printMode=true 2/4
11/24/21, 5:00 PM Untitled5.ipynb - Colaboratory
# Report the smallest and largest 'radius_mean' for each diagnosis class.
benign_radius = df.loc[df['diagnosis'] == 'B', 'radius_mean']
malignant_radius = df.loc[df['diagnosis'] == 'M', 'radius_mean']
print('Min radius of benign cancer :', benign_radius.min())
print('Max radius of benign cancer :', benign_radius.max())
print('Min radius of malignant cancer :', malignant_radius.min())
# This value is the maximum, so the label says "Max".
print('Max radius of malignant cancer :', malignant_radius.max())
# IPython/Colab help lookup for sns.boxplot; the trailing "?" is notebook-only syntax, not plain Python.
sns.boxplot?
# Remove the 'Unnamed: 32' and 'id' columns from the dataframe in place,
# then print the resulting (rows, columns) shape.
df.drop(columns=['Unnamed: 32', 'id'], inplace=True)
print(df.shape)
(569, 31)
# Encode the target as integers: 'M' -> 1, every other value -> 0.
df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)
# Separate the target column from the remaining feature columns.
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])
# Hold out 28% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.28,
    random_state=42,
)
# k-nearest-neighbours classifier with default settings, fitted on the
# training split.
model = KNeighborsClassifier()
model.fit(X_train, y_train)
# Predicted labels for the held-out rows.
pred = model.predict(X_test)
# Header line printed ahead of the evaluation metrics.
print("accuracy score")
https://colab.research.google.com/drive/11xNT5fA-UIp1kMaXUzHSB6oeLSRp2XOE#scrollTo=PHeACsYqmGP7&printMode=true 3/4
11/24/21, 5:00 PM Untitled5.ipynb - Colaboratory
# scikit-learn metrics take (y_true, y_pred) in that order, so the true
# labels go first.  With this order the confusion matrix has true classes
# as rows and predicted classes as columns, and the precision/recall
# columns of the report are attributed to the right quantity.
print(accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
accuracy score
0.96875
[[98 4]
[ 1 57]]
https://colab.research.google.com/drive/11xNT5fA-UIp1kMaXUzHSB6oeLSRp2XOE#scrollTo=PHeACsYqmGP7&printMode=true 4/4