Haberman

Haberman Cancer Survival Data Visualisation
In [1]: # all header files required

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from mpl_toolkits.mplot3d import axes3d
import warnings
warnings.filterwarnings("ignore")
In [2]: haberman_cancer=pd.read_csv("D:\haberman (2).csv")
In [3]: haberman_cancer=haberman_cancer.replace({
"status":1
},"survived")
haberman_cancer=haberman_cancer.replace({
"status":2
},"dead")
#to make it more convinient to understand
In [4]: haberman_cancer.columns
Out[4]: Index(['age', 'year', 'nodes', 'status'], dtype='object')
In [5]: haberman_cancer.shape
Out[5]: (305, 4)
In [6]: haberman_cancer["status"].value_counts()
Out[6]: survived 225

dead 80
Name: status, dtype: int64
In [8]: haberman_cancer["age"].value_counts()
Out[8]: 52 14
54 13
50 12
43 11
47 11
53 11
57 11
38 10
49 10
41 10
55 10
65 10
45 9
42 9
61 9
63 8
59 8
56 7
34 7
46 7
48 7
62 7
70 7
58 7
44 7
67 6
51 6
39 6
37 6
60 6
66 5
64 5
69 4
72 4
40 3
30 3
68 2
73 2
74 2
36 2
35 2
33 2
31 2
77 1
71 1
75 1
76 1
78 1
Name: age, dtype: int64
In [9]: haberman_cancer["year"].value_counts()
Out[9]: 58 35
64 31
63 30
66 28
65 28
60 28
59 27
61 26
67 25
62 23
68 13
69 11
Name: year, dtype: int64
In [10]: haberman_cancer["nodes"].value_counts()
Out[10]: 0 136
1 41
3 20
2 19
4 13
6 7
7 7
8 7
5 6
9 6
13 5
14 4
11 4
10 3
15 3
19 3
22 3
23 3
12 2
20 2
46 1
16 1
17 1
18 1
21 1
24 1
25 1
28 1
30 1
35 1
52 1
Name: nodes, dtype: int64
In [11]: haberman_cancer_alive = haberman_cancer.loc[haberman_cancer["status"]=="survived"]

haberman_cancer_dead = haberman_cancer.loc[haberman_cancer["status"]=="dead"]
In [12]: haberman_cancer_alive["age"].value_counts()
Out[12]: 50 10
52 10
54 9
38 9
57 8
49 8
55 8
47 8
41 7
63 7
42 7
59 7
58 7
43 7
45 6
61 6
65 6
37 6
64 5
34 5
70 5
56 5
53 5
39 5
51 4
48 4
62 4
44 4
67 4
60 4
40 3
30 3
46 3
66 3
69 3
72 3
68 2
73 2
36 2
35 2
33 2
31 2
76 1
71 1
74 1
75 1
77 1
In [13]: haberman_cancer_alive["year"].value_counts()
Out[13]: 60 24
58 24
64 23
61 23
66 22
63 22
67 21
59 18
62 16
65 15
68 10
69 7
In [14]: haberman_cancer_alive["nodes"].value_counts()
Out[14]: 0 117
1 33
2 15
3 13
4 10
7 5
8 5
6 4
14 3
10 2
22 2
5 2
9 2
46 1
11 1
30 1
13 1
15 1
16 1
18 1
19 1
20 1
25 1
28 1
12 1
In [15]: haberman_cancer_dead["year"].value_counts()
Out[15]: 65 13
58 11
59 9
64 8
63 8
62 7
66 6
69 4
67 4
60 4
68 3
61 3
In [16]: haberman_cancer_dead["age"].value_counts()
Out[16]: 53 6
65 4
52 4
46 4
54 4
43 4
44 3
45 3
41 3
61 3
47 3
48 3
57 3
62 3
42 2
49 2
50 2
51 2
34 2
55 2
56 2
60 2
66 2
67 2
70 2
74 1
59 1
63 1
69 1
72 1
39 1
38 1
78 1
In [17]: haberman_cancer_dead["nodes"].value_counts()
Out[17]: 0 19
1 8
3 7
9 4
2 4
5 4
13 4
23 3
4 3
6 3
11 3
19 2
15 2
7 2
8 2
52 1
10 1
35 1
14 1
17 1
20 1
21 1
22 1
24 1
12 1
In [18]: # we will now visualise the data, first univariate and then multivariate
# Our main focus is to understand the data in front of us through visualisation
OBJECTIVE
To analyse the data through visualisation and, if possible, build a model to predict the survival of patients having breast
cancer using simple linear methods.
UNIVARIATE ANALYSIS
1 dimensional univariate
Scatter Plot
In [19]: plt.scatter(haberman_cancer_alive["age"],np.zeros_like(haberman_cancer_alive["age"]),label=
"age_alive")
plt.scatter(haberman_cancer_dead["age"],np.zeros_like(haberman_cancer_dead["age"]),label="ag
e_dead")
plt.legend()
#not a good as most points got mixed
#very hard to make a sense
# makes no sense to do this for other features
Out[19]: <matplotlib.legend.Legend at 0x17be9d8d1c8>
2 dimensional Univariate
Histogram
In [20]: sns.FacetGrid(haberman_cancer,hue="status",size=10)\
.map(sns.distplot,"age")\
.add_legend()
plt.show()
In [21]: # women in their early 30s, late 70s and 80s have a very low probability of dying from breast cancer
# women aged 40 to 55 have a higher probability of death than of survival
# the two classes overlap completely
.map(sns.distplot,"year")\
.add_legend()
plt.show()
In [23]: # total deaths keep decreasing as the years increase

# both classes overlap completely
.map(sns.distplot,"nodes")\
.add_legend()
plt.show()
In [25]: # with zero nodes the probability of survival is much higher than that of death
# as the number of nodes increases, the probability of death surpasses that of survival
# the classes overlap partially
Cumulative Distribution Function
In [26]: counts,bins_edges=np.histogram(haberman_cancer_alive["age"],bins=10,density=True)
pdf=counts/sum(counts)
cdf=np.cumsum(pdf)
plt.plot(bins_edges[1:],cdf,label="cdf_alive")
plt.plot(bins_edges[1:],pdf,label="pdf_alive")
counts,bins_edges=np.histogram(haberman_cancer_dead["age"],bins=10,density=True)
cdf=np.cumsum(pdf)
plt.xlabel("age")
plt.plot(bins_edges[1:],cdf,label="cdf_dead")
plt.plot(bins_edges[1:],pdf,label="pdf_dead")
plt.legend()
Out[26]: <matplotlib.legend.Legend at 0x17beab7db48>
In [27]: # survival probability is much more than death till 40 years of age
In [28]: counts,bins_edges=np.histogram(haberman_cancer_alive["year"],bins=10,density=True)
cdf=np.cumsum(pdf)
plt.plot(bins_edges[:-1],cdf,label="cdf_alive")
plt.plot(bins_edges[:-1],pdf,label="pdf_alive")
counts,bins_edges=np.histogram(haberman_cancer_dead["year"],bins=10,density=True)
cdf=np.cumsum(pdf)
plt.plot(bins_edges[:-1],cdf,label="cdf_dead")
plt.plot(bins_edges[:-1],pdf,label="pdf_dead")
plt.xlabel("year")
plt.legend()
Out[28]: <matplotlib.legend.Legend at 0x17beac1eb48>
In [29]: # more than 20 percent of the dead patients and 20 percent of the survived patients have operational year 1958
In [30]: counts,bins_edges=np.histogram(haberman_cancer_alive["nodes"],bins=10,density=True)
cdf=np.cumsum(pdf)
plt.grid()
plt.plot(bins_edges[:-1],cdf,label="cdf_alive")
plt.plot(bins_edges[:-1],pdf,label="pdf_alive")
counts,bins_edges=np.histogram(haberman_cancer_dead["nodes"],bins=10,density=True)
cdf=np.cumsum(pdf)
plt.plot(bins_edges[:-1],cdf,label="cdf_dead")
plt.xlabel("nodes")
plt.plot(bins_edges[:-1],pdf,label="pdf_dead")
plt.legend()
Out[30]: <matplotlib.legend.Legend at 0x17beaca4e88>
In [31]: # more than 80 percent of the survivals occur at just zero nodes

# more than 20 percent of the deaths occurred in women having 10 nodes
# 90 percent of the survived women have fewer than 5 nodes
Box Plot
In [32]: sns.boxplot(x="status",y="age",data=haberman_cancer)
plt.grid()
plt.title("Age")
plt.show()
In [33]: # more than 50% of the total cases come from people aged between 45 and 60
# 50% of the total deaths occurred in women in the 45 to 60 age group
# all cases occurred in women who had reached the age of 30
In [34]: sns.boxplot(x="status",y="year",data=haberman_cancer)
plt.grid()
plt.title("Operational Year")
plt.show()
In [35]: # 50 percent of the deaths due to breast cancer occurred within just 5 years
# 25 percent of the deaths occurred in the 2 years between 58 and 60
In [36]: sns.boxplot(x="status",y="nodes",data=haberman_cancer)
plt.grid()
plt.title("Axil Nodes")
plt.show()
In [37]: # nodes has outliers, so the mean cannot be used as a measure of central tendency for it
# 75% of the survivors have between 0 and 3 nodes, and 50% of the dead patients have more than 3 nodes
Violin Plots
In [38]: #combination of both box and pdf plots
In [39]: sns.violinplot(x="status",y="age", data=haberman_cancer)
Out[39]: <matplotlib.axes._subplots.AxesSubplot at 0x17be9d9ec48>
In [40]: sns.violinplot(x="status",y="year", data=haberman_cancer)
Out[40]: <matplotlib.axes._subplots.AxesSubplot at 0x17bea104648>
In [41]: sns.violinplot(x="status",y="nodes", data=haberman_cancer)
Out[41]: <matplotlib.axes._subplots.AxesSubplot at 0x17beaa86fc8>
In [42]: # it shows that a patient with more than 50 nodes has an almost 100 percent probability of dying
Bivariate analysis
Scatter Plot
.map(sns.scatterplot,"nodes","age")\
.add_legend()
plt.show()
.map(sns.scatterplot,"year","age")\
.add_legend()
plt.show()
.map(sns.scatterplot,"nodes","year")\
.add_legend()
plt.show()
In [46]: # In all of these plots the two classes overlap each other by 80 to 90 percent
# it is very difficult to reach a conclusion through scatter plots
Pair Plots
In [47]: sns.set_style("whitegrid")
sns.pairplot(haberman_cancer,hue="status",size=3)
#helps us to visualise our scatter plots together
Out[47]: <seaborn.axisgrid.PairGrid at 0x17be9ff8048>
Contour Plot
In [48]: sns.jointplot(x="age",y="year",data=haberman_cancer_alive,kind="kde")
# Joint KDE (contour) plot of age against operation year for the survived group.
# NOTE(review): plt.grid() acts on the current axes only; confirm it targets the joint panel.
plt.grid()
In [49]: #most density of survived patients have age 50 to 60 and operational year 60 to 62.5
In [50]: sns.jointplot(x="age",y="nodes",data=haberman_cancer_alive,kind="kde")
# Joint KDE (contour) plot of age against node count for the survived group.
# NOTE(review): plt.grid() acts on the current axes only; confirm it targets the joint panel.
plt.grid()
In [51]: #most density of survived patients have age 50 to 70 and nodes 0 to 1
In [52]: sns.jointplot(x="nodes",y="year",data=haberman_cancer_alive,kind="kde")
# Joint KDE (contour) plot of node count against operation year for the survived group.
# NOTE(review): plt.grid() acts on the current axes only; confirm it targets the joint panel.
plt.grid()
In [53]: # most density of survived patients having nodes 0 to 1 and operational year 59 to 67
In [54]: sns.jointplot(x="age",y="year",data=haberman_cancer_dead,kind="kde")
# Joint KDE (contour) plot of age against operation year for the dead group.
# NOTE(review): plt.grid() acts on the current axes only; confirm it targets the joint panel.
plt.grid()
In [55]: #most density of dead patients have age 48 to 55 and operational year 62.5 to 65
In [56]: sns.jointplot(x="nodes",y="year",data=haberman_cancer_dead,kind="kde")
# Joint KDE (contour) plot of node count against operation year for the dead group.
# NOTE(review): plt.grid() acts on the current axes only; confirm it targets the joint panel.
plt.grid()
In [57]: #most density of dead patients have nodes 0 to 10 and operational year 62.5 to 65
In [58]: sns.jointplot(x="age",y="nodes",data=haberman_cancer_dead,kind="kde")
# Joint KDE (contour) plot of age against node count for the dead group.
# NOTE(review): plt.grid() acts on the current axes only; confirm it targets the joint panel.
plt.grid()
In [59]: #most density of dead patients have age 40 to 60 and nodes 0 to 6
Conclusion
It is very difficult to predict the survival and death of patients from the three features (i.e. year, age, nodes) using simple linear or
if-else methods, as both classes overlap completely. More features are needed to reach a solution, and non-linear methods
need to be used. Usefulness of data -> nodes, age, year

Haberman

Uploaded by

Document Information

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Haberman

Uploaded by

Copyright:

Available Formats

Haberman Cancer Survival Data Visualisation

In [1]: # all header files required

In [2]: haberman_cancer=pd.read_csv("D:\haberman (2).csv")

Out[4]: Index(['age', 'year', 'nodes', 'status'], dtype='object')

Out[6]: survived 225

In [11]: haberman_cancer_alive = haberman_cancer.loc[haberman_cancer["status"]=="survived"]

Out[19]: <matplotlib.legend.Legend at 0x17be9d8d1c8>

In [23]: # total deaths keeps on decreasing as years increase

Cumalative Distribution function

Out[26]: <matplotlib.legend.Legend at 0x17beab7db48>

Out[28]: <matplotlib.legend.Legend at 0x17beac1eb48>

Out[30]: <matplotlib.legend.Legend at 0x17beaca4e88>

In [31]: # more 80 percentile of survival occurs at just zero node

In [38]: #combination of both box and pdf plots

In [39]: sns.violinplot(x="status",y="age", data=haberman_cancer)

Out[39]: <matplotlib.axes._subplots.AxesSubplot at 0x17be9d9ec48>

In [40]: sns.violinplot(x="status",y="year", data=haberman_cancer)

Out[40]: <matplotlib.axes._subplots.AxesSubplot at 0x17bea104648>

In [41]: sns.violinplot(x="status",y="nodes", data=haberman_cancer)

Out[41]: <matplotlib.axes._subplots.AxesSubplot at 0x17beaa86fc8>

Out[47]: <seaborn.axisgrid.PairGrid at 0x17be9ff8048>

In [51]: #most density of survived patients have age 50 to 70 and nodes 0 to 1

In [59]: #most density of dead patients have age 40 to 60 and nodes 0 to 6

You might also like