You are on page 1 of 4

11/24/21, 5:00 PM Untitled5.ipynb - Colaboratory

ANANYAA GUPTA
20BCT0177
ML MTT
24/11/21
Q3> Breast cancer dataset

# numerical analysis

import numpy as np

# storing and processing in dataframes

import pandas as pd

# simple plotting

import matplotlib.pyplot as plt

# advanced plotting

import seaborn as sns

# splitting dataset into train and test

from sklearn.model_selection import train_test_split

# scaling features

from sklearn.preprocessing import StandardScaler

# selecting important features

from sklearn.feature_selection import RFECV

# k nearest neighbors model

from sklearn.neighbors import KNeighborsClassifier

# accuracy

from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report

# white-background-with-grid seaborn theme for the plots in this notebook

sns.set_style(style='whitegrid')

# two-colour palette used by the class-distribution plot

pal = ['#0e2433', '#ff007f']

# load the CSV file into a DataFrame

df = pd.read_csv(filepath_or_buffer='/content/data.csv')

# preview the first five rows

df.head(n=5)

id diagnosis radius_mean texture_mean perimeter_mean area_mean smooth

0 842302 M 17.99 10.38 122.80 1001.0

1 842517 M 20.57 17.77 132.90 1326.0

2 84300903 M 19.69 21.25 130.00 1203.0

3 84348301 M 11.42 20.38 77.58 386.1

4 84358402 M 20.29 14.34 135.10 1297.0

# number of rows and columns, as a (rows, columns) tuple

df.shape

(569, 33)

https://colab.research.google.com/drive/11xNT5fA-UIp1kMaXUzHSB6oeLSRp2XOE#scrollTo=PHeACsYqmGP7&printMode=true 1/4
11/24/21, 5:00 PM Untitled5.ipynb - Colaboratory

# labels of every column in the frame

df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',

'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',

'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',

'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',

'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',

'fractal_dimension_se', 'radius_worst', 'texture_worst',

'perimeter_worst', 'area_worst', 'smoothness_worst',

'compactness_worst', 'concavity_worst', 'concave points_worst',

'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],

dtype='object')

# number of samples in each diagnosis class ('B' / 'M')

print(df['diagnosis'].value_counts())

# Bar chart of the class distribution. The column is named by keyword:
# seaborn deprecated positional data vectors in 0.11 and made every argument
# after `data` keyword-only in 0.12, so a positionally passed Series is no
# longer interpreted as the x variable.

sns.countplot(x='diagnosis', data=df, palette=pal, alpha=0.8)

plt.show()

B 357

M 212

Name: diagnosis, dtype: int64

# Overlaid histograms of radius_mean, one per diagnosis class, on one set of axes.
fig, ax = plt.subplots()

# malignant samples
m = ax.hist(
    df.loc[df["diagnosis"] == "M", "radius_mean"],
    bins=20,
    range=(0, 30),
    label="Malignant",
    alpha=0.7,
    color='#232121',
)

# benign samples, drawn with the same binning so the two are comparable
b = ax.hist(
    df.loc[df["diagnosis"] == "B", "radius_mean"],
    bins=20,
    range=(0, 30),
    label="Benign",
    alpha=0.7,
    color='#df2378',
)

# axis labels, title and legend for the current figure
plt.xlabel("Radius")
plt.ylabel("Count")
plt.title("Mean Radius")
plt.legend()

plt.show()

https://colab.research.google.com/drive/11xNT5fA-UIp1kMaXUzHSB6oeLSRp2XOE#scrollTo=PHeACsYqmGP7&printMode=true 2/4
11/24/21, 5:00 PM Untitled5.ipynb - Colaboratory

# Smallest and largest radius_mean within each diagnosis class.
print('Min radius of benign cancer :', df[df['diagnosis']=='B']['radius_mean'].min())
print('Max radius of benign cancer :', df[df['diagnosis']=='B']['radius_mean'].max())
print('Min radius of malignant cancer :', df[df['diagnosis']=='M']['radius_mean'].min())
# This line prints .max(), so its label must say "Max", not "Min".
print('Max radius of malignant cancer :', df[df['diagnosis']=='M']['radius_mean'].max())

Min radius of benign cancer : 6.981

Max radius of benign cancer : 17.85

Min radius of malignant cancer : 10.95

Min radius of malignant cancer : 28.11

# Show the documentation for seaborn's boxplot. help() works in plain Python as
# well as in notebooks, whereas a trailing `?` is IPython-only syntax.
help(sns.boxplot)

# Discard the 'Unnamed: 32' and 'id' columns in place, then report the new shape
df.drop(columns=['Unnamed: 32', 'id'], inplace=True)
print(df.shape)

(569, 31)

# Encode the target as integers: 'M' (malignant) -> 1, any other value -> 0.
# Written with ordinary spaces (U+00B7 middle dots in this statement are a
# SyntaxError) and as a vectorised comparison instead of a per-row lambda.
df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)

# separate the target column from the predictor columns
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

# hold out 28% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.28,
    random_state=42,
)

# create a k-nearest-neighbours classifier with its default settings and train
# it on the training split (fit() returns the estimator, so the calls chain)

model = KNeighborsClassifier().fit(X_train, y_train)

# predicted class for every row of the held-out split

pred = model.predict(X_test)

# heading printed before the evaluation metrics

print("accuracy score")

https://colab.research.google.com/drive/11xNT5fA-UIp1kMaXUzHSB6oeLSRp2XOE#scrollTo=PHeACsYqmGP7&printMode=true 3/4
11/24/21, 5:00 PM Untitled5.ipynb - Colaboratory

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but the confusion matrix is indexed [true, predicted] and the
# report's precision, recall and support are computed relative to y_true, so
# the ground-truth labels must be the first argument.
print(accuracy_score(y_test, pred))

print(confusion_matrix(y_test, pred))

print(classification_report(y_test, pred))

accuracy score

0.96875

[[98 4]

[ 1 57]]

precision recall f1-score support

0 0.99 0.96 0.98 102

1 0.93 0.98 0.96 58

accuracy 0.97 160

macro avg 0.96 0.97 0.97 160

weighted avg 0.97 0.97 0.97 160

check 0s completed at 4:53 PM

https://colab.research.google.com/drive/11xNT5fA-UIp1kMaXUzHSB6oeLSRp2XOE#scrollTo=PHeACsYqmGP7&printMode=true 4/4

You might also like