You are on page 1of 13

9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [1]: import pandas as pd


import numpy as np

# import plotting libraries


import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
%matplotlib inline

import seaborn as sns


sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

# import libraries for metrics and reporting


from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.html 1/13
9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [2]: # 4. Relevant Information:


# Samples arrive periodically as Dr. Wolberg reports his clinical cases.
# The database therefore reflects this chronological grouping of the data.
# This grouping information appears immediately below, having been removed
# from the data itself:
# Group 1: 367 instances (January 1989)
# Group 2: 70 instances (October 1989)
# Group 3: 31 instances (February 1990)
# Group 4: 17 instances (April 1990)
# Group 5: 48 instances (August 1990)
# Group 6: 49 instances (Updated January 1991)
# Group 7: 31 instances (June 1991)
# Group 8: 86 instances (November 1991)
# -----------------------------------------
# Total: 699 points (as of the donated datbase on 15 July 1992)

# Note that the results summarized above in Past Usage refer to a dataset
# of size 369, while Group 1 has only 367 instances. This is because it
# originally contained 369 instances; 2 were removed. The following
# statements summarizes changes to the original Group 1's set of data:

# ##### Group 1 : 367 points: 200B 167M (January 1989)


# ##### Revised Jan 10, 1991: Replaced zero bare nuclei in 1080185 & 11878
05
# ##### Revised Nov 22,1991: Removed 765878,4,5,9,7,10,10,10,3,8,1 no reco
rd
# ##### : Removed 484201,2,7,8,8,4,3,10,3,4,1 zero epithel
ial
# ##### : Changed 0 to 1 in field 6 of sample 1219406
# ##### : Changed 0 to 1 in field 8 of following sample:
# ##### : 1182404,2,3,1,1,1,2,0,1,1,1

# 5. Number of Instances: 699 (as of 15 July 1992)


# 6. Number of Attributes: 10 plus the class attribute
# 7. Attribute Information: (class attribute has been moved to last column)
# # Attribute Domain
# -- -----------------------------------------
# 1. Sample code number id number
# 2. Clump Thickness 1 - 10
# 3. Uniformity of Cell Size 1 - 10
# 4. Uniformity of Cell Shape 1 - 10
# 5. Marginal Adhesion 1 - 10
# 6. Single Epithelial Cell Size 1 - 10
# 7. Bare Nuclei 1 - 10
# 8. Bland Chromatin 1 - 10
# 9. Normal Nucleoli 1 - 10
# 10. Mitoses 1 - 10
# 11. Class: (2 for benign, 4 for malignant)

# 8. Missing attribute values: 16


# There are 16 instances in Groups 1 to 6 that contain a single missing
# (i.e., unavailable) attribute value, now denoted by "?".
# 9. Class distribution:

# Benign: 458 (65.5%)


# Malignant: 241 (34.5%)

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.html 2/13
9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [4]: location = r"D:\KOMAL\SIMPLILEARN\MY COURSES\IN PROGRESS\DATA SCIENCE WITH PYT


HON\Live class downloads\Aug 11 Sat - Sep 15 Sat - Attending\datasets\breastca
ncer-rf.csv"

In [5]: names = ['id_number', 'diagnosis', 'radius_mean',


'texture_mean', 'perimeter_mean', 'area_mean',
'smoothness_mean', 'compactness_mean',
'concavity_mean','concave_points_mean',
'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se',
'area_se', 'smoothness_se', 'compactness_se',
'concavity_se', 'concave_points_se',
'symmetry_se', 'fractal_dimension_se',
'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst',
'smoothness_worst', 'compactness_worst',
'concavity_worst', 'concave_points_worst',
'symmetry_worst', 'fractal_dimension_worst']

dx = ['Benign', 'Malignant']

In [6]: # load the training data from breast cancer data set
df_training = pd.read_csv(location, names=names)

In [7]: # examine the structure of data


df_training.head()

Out[7]:
id_number diagnosis radius_mean texture_mean perimeter_mean area_mean smo

0 842302 M 17.99 10.38 122.80 1001.0 0.118

1 842517 M 20.57 17.77 132.90 1326.0 0.084

2 84300903 M 19.69 21.25 130.00 1203.0 0.109

3 84348301 M 11.42 20.38 77.58 386.1 0.142

4 84358402 M 20.29 14.34 135.10 1297.0 0.100

5 rows × 32 columns

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.html 3/13
9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [8]: # Examine any missing


df_training.isnull().sum()

Out[8]: id_number 0
diagnosis 0
radius_mean 0
texture_mean 0
perimeter_mean 0
area_mean 0
smoothness_mean 0
compactness_mean 0
concavity_mean 0
concave_points_mean 0
symmetry_mean 0
fractal_dimension_mean 0
radius_se 0
texture_se 0
perimeter_se 0
area_se 0
smoothness_se 0
compactness_se 0
concavity_se 0
concave_points_se 0
symmetry_se 0
fractal_dimension_se 0
radius_worst 0
texture_worst 0
perimeter_worst 0
area_worst 0
smoothness_worst 0
compactness_worst 0
concavity_worst 0
concave_points_worst 0
symmetry_worst 0
fractal_dimension_worst 0
dtype: int64

In [9]: # Cleaning
# We do some minor cleanage like setting the id_number to be the
# data frame index, along with converting the diagnosis to the standard
# binary 1, 0 representation using the map() function.

# Setting 'id_number' as our index


df_training.set_index(['id_number'], inplace = True)

In [10]: # Converted to binary to help later on with models and plots


df_training['diagnosis'] = df_training['diagnosis'].map({'M':1, 'B':0})

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.html 4/13
9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [11]: # all the features


names_index = names[2:]
names_index

Out[11]: ['radius_mean',
'texture_mean',
'perimeter_mean',
'area_mean',
'smoothness_mean',
'compactness_mean',
'concavity_mean',
'concave_points_mean',
'symmetry_mean',
'fractal_dimension_mean',
'radius_se',
'texture_se',
'perimeter_se',
'area_se',
'smoothness_se',
'compactness_se',
'concavity_se',
'concave_points_se',
'symmetry_se',
'fractal_dimension_se',
'radius_worst',
'texture_worst',
'perimeter_worst',
'area_worst',
'smoothness_worst',
'compactness_worst',
'concavity_worst',
'concave_points_worst',
'symmetry_worst',
'fractal_dimension_worst']

In [12]: df_training.head()

Out[12]:
diagnosis radius_mean texture_mean perimeter_mean area_mean smooth

id_number

842302 1 17.99 10.38 122.80 1001.0 0.11840

842517 1 20.57 17.77 132.90 1326.0 0.08474

84300903 1 19.69 21.25 130.00 1203.0 0.10960

84348301 1 11.42 20.38 77.58 386.1 0.14250

84358402 1 20.29 14.34 135.10 1297.0 0.10030

5 rows × 31 columns

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.html 5/13
9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [13]: # features available


df_training.dtypes

Out[13]: diagnosis int64


radius_mean float64
texture_mean float64
perimeter_mean float64
area_mean float64
smoothness_mean float64
compactness_mean float64
concavity_mean float64
concave_points_mean float64
symmetry_mean float64
fractal_dimension_mean float64
radius_se float64
texture_se float64
perimeter_se float64
area_se float64
smoothness_se float64
compactness_se float64
concavity_se float64
concave_points_se float64
symmetry_se float64
fractal_dimension_se float64
radius_worst float64
texture_worst float64
perimeter_worst float64
area_worst float64
smoothness_worst float64
compactness_worst float64
concavity_worst float64
concave_points_worst float64
symmetry_worst float64
fractal_dimension_worst float64
dtype: object

In [14]: df_training.shape

Out[14]: (569, 31)

In [15]: # check for class imbalance


# examine the number of unique class labels
# data set does not suffer from class imbalance.
print(df_training.diagnosis.unique())
print(df_training.diagnosis.value_counts())

[1 0]
0 357
1 212
Name: diagnosis, dtype: int64

In [16]: print(df_training.diagnosis.value_counts(normalize=True))

0 0.627417
1 0.372583
Name: diagnosis, dtype: float64

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.html 6/13
9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [17]: df_training.describe()

Out[17]:
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothne

count 569.000000 569.000000 569.000000 569.000000 569.000000 569.00000

mean 0.372583 14.127292 19.289649 91.969033 654.889104 0.096360

std 0.483918 3.524049 4.301036 24.298981 351.914129 0.014064

min 0.000000 6.981000 9.710000 43.790000 143.500000 0.052630

25% 0.000000 11.700000 16.170000 75.170000 420.300000 0.086370

50% 0.000000 13.370000 18.840000 86.240000 551.100000 0.095870

75% 1.000000 15.780000 21.800000 104.100000 782.700000 0.105300

max 1.000000 28.110000 39.280000 188.500000 2501.000000 0.163400

8 rows × 31 columns

In [18]: # data varies in distribution, this will be important when considering


# classification models.

# pre-processing
# Random Forest does not require any pre-processing

In [19]: # X_df = df_training.iloc[:, df_training.columns != 'diagnosis']


# y_df = df_training.iloc[:, df_training.columns == 'diagnosis']
X_df = df_training[names_index]
y_df = df_training['diagnosis']

In [20]: print(type(X_df))
print(type(y_df))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>

In [21]: # Split into train and test sets.


X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size = 0.
20, random_state =42)

In [22]: # instantiate the RF classifier


# Set the random state for reproducibility
clf = RandomForestClassifier(random_state=42)

In [23]: X_train = X_train.values


y_train = y_train.values

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.html 7/13
9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [24]: # train the algorithm utilizing the training and target class
clf.fit(X_train, y_train)

Out[24]: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',


max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=42, verbose=0, warm_start=False)

In [25]: # Variable Importance


# Get numerical feature importances
importances = list(clf.feature_importances_)
importances

Out[25]: [0.07070875852652238,
0.006575694819293099,
0.05866098072621878,
0.01399705469211196,
0.01012210099909593,
0.0019237134838078498,
0.0021572692743547835,
0.10995070971888617,
0.008025453949977576,
0.003333374263403106,
0.06202480367181799,
0.003832344234070842,
0.009663008594770445,
0.06283545902452922,
0.002782000712192183,
0.004408529077759847,
0.01433132209795599,
0.007234021783473726,
0.002804912141556622,
0.008586381993643893,
0.083409041645263,
0.015484722256977418,
0.03184810894288064,
0.09713728084231674,
0.00405296245964799,
0.03585605229950856,
0.0203828559132144,
0.2373544725706544,
0.00322830207046923,
0.007288307213625225]

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.html 8/13
9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [26]: # List of tuples with variable and importance


feature_importances = [(feature, round(importance, 2)) for feature, importance
in zip(names_index, importances)]
feature_importances

Out[26]: [('radius_mean', 0.07),


('texture_mean', 0.01),
('perimeter_mean', 0.06),
('area_mean', 0.01),
('smoothness_mean', 0.01),
('compactness_mean', 0.0),
('concavity_mean', 0.0),
('concave_points_mean', 0.11),
('symmetry_mean', 0.01),
('fractal_dimension_mean', 0.0),
('radius_se', 0.06),
('texture_se', 0.0),
('perimeter_se', 0.01),
('area_se', 0.06),
('smoothness_se', 0.0),
('compactness_se', 0.0),
('concavity_se', 0.01),
('concave_points_se', 0.01),
('symmetry_se', 0.0),
('fractal_dimension_se', 0.01),
('radius_worst', 0.08),
('texture_worst', 0.02),
('perimeter_worst', 0.03),
('area_worst', 0.1),
('smoothness_worst', 0.0),
('compactness_worst', 0.04),
('concavity_worst', 0.02),
('concave_points_worst', 0.24),
('symmetry_worst', 0.0),
('fractal_dimension_worst', 0.01)]

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.html 9/13
9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [27]: # Sort the feature importances by most important first


feature_importances = sorted(feature_importances, key = lambda x: x[1], revers
e = True)
feature_importances

Out[27]: [('concave_points_worst', 0.24),


('concave_points_mean', 0.11),
('area_worst', 0.1),
('radius_worst', 0.08),
('radius_mean', 0.07),
('perimeter_mean', 0.06),
('radius_se', 0.06),
('area_se', 0.06),
('compactness_worst', 0.04),
('perimeter_worst', 0.03),
('texture_worst', 0.02),
('concavity_worst', 0.02),
('texture_mean', 0.01),
('area_mean', 0.01),
('smoothness_mean', 0.01),
('symmetry_mean', 0.01),
('perimeter_se', 0.01),
('concavity_se', 0.01),
('concave_points_se', 0.01),
('fractal_dimension_se', 0.01),
('fractal_dimension_worst', 0.01),
('compactness_mean', 0.0),
('concavity_mean', 0.0),
('fractal_dimension_mean', 0.0),
('texture_se', 0.0),
('smoothness_se', 0.0),
('compactness_se', 0.0),
('symmetry_se', 0.0),
('smoothness_worst', 0.0),
('symmetry_worst', 0.0)]

In [28]: np.arange(len(names_index))

Out[28]: array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,


17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [29]: # plot the graph - feature importance wise


df_feature_importance = pd.DataFrame(feature_importances, columns=['feature',
'importance'])
# df_feature_importance

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.… 10/13
9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [30]: ax = df_feature_importance.plot(kind='bar',
x='feature',
y='importance',
figsize=(10,8),
title= 'Feature importances for Random Forest Model',
grid=True,
legend=True,
fontsize = 12,
color='orange',
);
# Set the x-axis label
ax.set_xlabel("Feature")

# Set the y-axis label


ax.set_ylabel("Importance");

In [31]: # predict the test set


y_pred = clf.predict(X_test)

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.h… 11/13
9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [32]: # Confusion Matrix


conf_mat = confusion_matrix(y_test, y_pred)

In [33]: sns.heatmap(conf_mat, annot=True, fmt='d', cbar=False)


plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.title('Actual vs. Predicted Confusion Matrix')

Out[33]: Text(0.5,1,'Actual vs. Predicted Confusion Matrix')

In [34]: # comparing actual response values (y_test) with predicted response values (y_
pred)
print("model accuracy:", metrics.accuracy_score(y_test, y_pred)* 100)

model accuracy: 95.6140350877193

In [35]: # ROC Curve Metrics

# We grab the second array from the output which corresponds to


# to the predicted probabilites of positive classes
# Ordered wrt fit.classes_ in our case [0, 1] where 1 is our positive class

predictions_prob = clf.predict_proba(X_test)[:, 1]

predictions_prob

Out[35]: array([0. , 1. , 1. , 0. , 0. , 1. , 1. , 0.8, 0.7, 0. , 0.1, 1. , 0. ,


0.9, 0. , 0.9, 0. , 0. , 0. , 1. , 0.2, 0. , 1. , 0. , 0. , 0.3,
0. , 0. , 0. , 1. , 0. , 0. , 0.3, 0. , 0. , 0. , 0.9, 0.1, 1. ,
0.2, 0. , 1. , 0. , 0. , 0.4, 0. , 0. , 0.1, 0. , 0. , 1. , 1. ,
0.2, 0.2, 0. , 0. , 0. , 1. , 0.8, 0. , 0. , 1. , 1. , 0. , 0. ,
0. , 1. , 1. , 0. , 0. , 0.7, 1. , 0.1, 1. , 0. , 0.1, 0. , 0.4,
0. , 0.1, 1. , 0. , 0.1, 0.9, 0.9, 1. , 0.9, 1. , 0. , 0. , 0. ,
0.3, 0.3, 0.1, 0. , 0. , 1. , 0.9, 0. , 1. , 0.9, 0. , 1. , 1. ,
0. , 0.1, 0. , 1. , 0.6, 0. , 0.9, 0. , 0.3, 1. ])

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.… 12/13
9/21/2018 ML-RANDOM-FOREST-12-breast-cancer

In [36]: fpr2, tpr2, _ = metrics.roc_curve(y_test,


predictions_prob,
pos_label = 1)

In [37]: auc_rf = metrics.auc(fpr2, tpr2)


auc_rf

Out[37]: 0.9909924664264658

In [38]: # Classification Report


print(classification_report(y_test, y_pred))

precision recall f1-score support

0 0.96 0.97 0.97 71


1 0.95 0.93 0.94 43

avg / total 0.96 0.96 0.96 114

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_breastCancer.… 13/13

You might also like