You are on page 1 of 6

FACULTY INITIATIVE – SLOT-3

BARATH P
II YEAR CSE

CANCER DISEASE CLASSIFICATION


Program and Output:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv('/content/data.csv')
# Encode the target as integers: malignant ('M') -> 1, benign ('B') -> 0.
df['diagnosis'] = df['diagnosis'].replace({'M': 1, 'B': 0})
df.head()

print("Cancer data set dimensions : {}".format(df.shape))

# Per-column count of missing values.
df.isnull().sum()

df.describe()

# Select features and target by column NAME rather than by position, so the
# label can never end up inside X (and a feature can never end up in Y)
# whatever the column order of the CSV is.
# NOTE(review): dropna(axis=1, how='all') only removes columns that are
# entirely empty; it is a no-op otherwise. Confirm against the actual file
# whether such a column exists.
feature_frame = df.drop(columns=['id', 'diagnosis']).dropna(axis=1, how='all')
X = feature_frame.values
Y = df['diagnosis'].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0
)

# Bar chart of how many samples fall in each diagnosis class.
sns.countplot(x='diagnosis', data=df)
plt.show()

# Look the counts up by label (0 = benign, 1 = malignant, as encoded when the
# data was loaded) instead of unpacking value_counts() positionally:
# value_counts() orders by frequency, so positional unpacking silently swaps
# the two numbers if malignant is the majority and raises if a class is absent.
class_counts = df['diagnosis'].value_counts()
B = class_counts.get(0, 0)
M = class_counts.get(1, 0)
print('Number of Benign: ',B)
print('Number of Malignant : ',M)

# Pairwise correlation of the columns, computed once and reused for the plot.
corr = df.corr()
#Visualize the correlation
f, ax = plt.subplots(figsize=(20, 20))
# Draw on the axes created above rather than relying on the implicit
# "current axes".
sns.heatmap(corr, annot=True, fmt='.2f', ax=ax)
#for data scaling
from sklearn.preprocessing import StandardScaler
#for splitting dataset
from sklearn.model_selection import train_test_split
#for fitting SVM model
from sklearn.svm import SVC
#for displaying evaluation metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

#Scale the data (Feature Scaling)
sc = StandardScaler()
# Learn the per-feature mean and standard deviation from the training set only.
X_train = sc.fit_transform(X_train)
# Apply the SAME training statistics to the test set. Refitting the scaler
# here would standardize the test set with its own statistics, so train and
# test features would no longer be on a common scale.
X_test = sc.transform(X_test)

#Create a function for models


def models(X_train, Y_train, random_state=0):
    """Fit three baseline classifiers and print their training accuracy.

    Args:
        X_train: Training feature matrix handed to each estimator's ``fit``.
        Y_train: Training labels aligned row-for-row with ``X_train``.
        random_state: Seed forwarded to every estimator so repeated runs
            produce the same fitted models. Pass ``None`` for unseeded
            behaviour.

    Returns:
        A tuple ``(log, tree, forest)`` holding the fitted logistic
        regression, decision tree and random forest, in that order.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier

    #Logistic Regression
    log = LogisticRegression(random_state=random_state)
    log.fit(X_train, Y_train)

    #Decision Tree
    tree = DecisionTreeClassifier(random_state=random_state)
    tree.fit(X_train, Y_train)

    #Random Forest
    forest = RandomForestClassifier(random_state=random_state)
    forest.fit(X_train, Y_train)

    #Print the model accuracy of training data
    # NOTE: these scores are measured on the same rows the models were fitted
    # on, so they describe fit quality, not generalization.
    print('[0]Logistic Regression Training Accuracy : ',log.score(X_train, Y_train))
    print('[1]Decision Tree Training Accuracy : ',tree.score(X_train, Y_train))
    print('[2]Random Forest Training Accuracy : ',forest.score(X_train, Y_train))

    return log, tree, forest

#Getting all the models
model = models(X_train, Y_train)

#test model accuracy on test data using confusion matrix
from sklearn.metrics import confusion_matrix
for clf in model:
    print('Model :',clf)
    cm = confusion_matrix(Y_test, clf.predict(X_test))
    # scikit-learn puts true labels on the rows and predicted labels on the
    # columns, in sorted label order. With labels 0 (benign) and 1 (malignant)
    # the flattened 2x2 matrix therefore reads TN, FP, FN, TP: cm[0][0] is the
    # true-NEGATIVE count and cm[1][1] the true-POSITIVE count.
    TN, FP, FN, TP = cm.ravel()
    print(cm)
    print('Testing Accuracy =',(TP + TN)/(TP + FP + FN + TN))
    print()
# Run the fitted logistic-regression model (model[0]) over every row of the
# data file. Note this re-reads the full CSV, so it includes the rows the
# model was trained on; it is a demonstration, not a held-out evaluation.
test_df = pd.read_csv('/content/data.csv')
# Keep only the feature columns.
# NOTE(review): dropna(axis=1, how='all') only removes columns that are
# entirely empty; confirm against the actual file whether one exists.
test = test_df.drop(['id', 'diagnosis'], axis=1).dropna(axis=1, how='all')
test.head()
# The models were fitted on standardized features, so the raw values must go
# through the same fitted scaler before prediction; feeding unscaled
# features gives meaningless predictions.
scaled_features = sc.transform(test.values)
y_pred = pd.DataFrame(model[0].predict(scaled_features), columns=['diagnosis'])
# Pair each sample id with its predicted diagnosis (1 = malignant, 0 = benign).
final_df = pd.DataFrame({'Id': test_df['id'], 'diagnosis': y_pred['diagnosis']})
final_df

You might also like