You are on page 1of 1

Haberman Cancer Survival Data Visualisation

In [1]: # all header files required


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from mpl_toolkits.mplot3d import axes3d
import warnings
warnings.filterwarnings("ignore")

In [2]: haberman_cancer=pd.read_csv("D:\haberman (2).csv")

In [3]: haberman_cancer=haberman_cancer.replace({
"status":1
},"survived")
haberman_cancer=haberman_cancer.replace({
"status":2
},"dead")
#to make it more convinient to understand

In [4]: haberman_cancer.columns

Out[4]: Index(['age', 'year', 'nodes', 'status'], dtype='object')

In [5]: haberman_cancer.shape

Out[5]: (305, 4)

In [6]: haberman_cancer["status"].value_counts()

Out[6]: survived 225


dead 80
Name: status, dtype: int64

In [8]: haberman_cancer["age"].value_counts()

Out[8]: 52 14
54 13
50 12
43 11
47 11
53 11
57 11
38 10
49 10
41 10
55 10
65 10
45 9
42 9
61 9
63 8
59 8
56 7
34 7
46 7
48 7
62 7
70 7
58 7
44 7
67 6
51 6
39 6
37 6
60 6
66 5
64 5
69 4
72 4
40 3
30 3
68 2
73 2
74 2
36 2
35 2
33 2
31 2
77 1
71 1
75 1
76 1
78 1
Name: age, dtype: int64

In [9]: haberman_cancer["year"].value_counts()

Out[9]: 58 35
64 31
63 30
66 28
65 28
60 28
59 27
61 26
67 25
62 23
68 13
69 11
Name: year, dtype: int64

In [10]: haberman_cancer["nodes"].value_counts()

Out[10]: 0 136
1 41
3 20
2 19
4 13
6 7
7 7
8 7
5 6
9 6
13 5
14 4
11 4
10 3
15 3
19 3
22 3
23 3
12 2
20 2
46 1
16 1
17 1
18 1
21 1
24 1
25 1
28 1
30 1
35 1
52 1
Name: nodes, dtype: int64

In [11]: haberman_cancer_alive = haberman_cancer.loc[haberman_cancer["status"]=="survived"]


haberman_cancer_dead = haberman_cancer.loc[haberman_cancer["status"]=="dead"]

In [12]: haberman_cancer_alive["age"].value_counts()

Out[12]: 50 10
52 10
54 9
38 9
57 8
49 8
55 8
47 8
41 7
63 7
42 7
59 7
58 7
43 7
45 6
61 6
65 6
37 6
64 5
34 5
70 5
56 5
53 5
39 5
51 4
48 4
62 4
44 4
67 4
60 4
40 3
30 3
46 3
66 3
69 3
72 3
68 2
73 2
36 2
35 2
33 2
31 2
76 1
71 1
74 1
75 1
77 1
Name: age, dtype: int64

In [13]: haberman_cancer_alive["year"].value_counts()

Out[13]: 60 24
58 24
64 23
61 23
66 22
63 22
67 21
59 18
62 16
65 15
68 10
69 7
Name: year, dtype: int64

In [14]: haberman_cancer_alive["nodes"].value_counts()

Out[14]: 0 117
1 33
2 15
3 13
4 10
7 5
8 5
6 4
14 3
10 2
22 2
5 2
9 2
46 1
11 1
30 1
13 1
15 1
16 1
18 1
19 1
20 1
25 1
28 1
12 1
Name: nodes, dtype: int64

In [15]: haberman_cancer_dead["year"].value_counts()

Out[15]: 65 13
58 11
59 9
64 8
63 8
62 7
66 6
69 4
67 4
60 4
68 3
61 3
Name: year, dtype: int64

In [16]: haberman_cancer_dead["age"].value_counts()

Out[16]: 53 6
65 4
52 4
46 4
54 4
43 4
44 3
45 3
41 3
61 3
47 3
48 3
57 3
62 3
42 2
49 2
50 2
51 2
34 2
55 2
56 2
60 2
66 2
67 2
70 2
74 1
59 1
63 1
69 1
72 1
39 1
38 1
78 1
Name: age, dtype: int64

In [17]: haberman_cancer_dead["nodes"].value_counts()

Out[17]: 0 19
1 8
3 7
9 4
2 4
5 4
13 4
23 3
4 3
6 3
11 3
19 2
15 2
7 2
8 2
52 1
10 1
35 1
14 1
17 1
20 1
21 1
22 1
24 1
12 1
Name: nodes, dtype: int64

In [18]: # We will now visualise the data, first with univariate and then with multivariate plots.
# Our main focus is to understand the data in front of us through visualisation.

OBJECTIVE
To analyse the data through visualisation and, if possible, generate a model to predict the survival of patients having breast
cancer using simple linear methods.

UNIVARIATE ANALYSIS

1 dimensional univariate
Scatter Plot

In [19]: plt.scatter(haberman_cancer_alive["age"],np.zeros_like(haberman_cancer_alive["age"]),label=
"age_alive")
plt.scatter(haberman_cancer_dead["age"],np.zeros_like(haberman_cancer_dead["age"]),label="ag
e_dead")

plt.legend()
#not a good as most points got mixed
#very hard to make a sense
# makes no sense to do this for other features

Out[19]: <matplotlib.legend.Legend at 0x17be9d8d1c8>

2 dimensional Univariate
Histogram

In [20]: sns.FacetGrid(haberman_cancer,hue="status",size=10)\
.map(sns.distplot,"age")\
.add_legend()
plt.show()

In [21]: # Women in their early 30s, late 70s and 80s have a very low probability of dying from breast cancer.
# Women aged 40 to 55 have a higher probability of death than of survival.
# The two classes overlap completely.

In [22]: sns.FacetGrid(haberman_cancer,hue="status",size=10)\
.map(sns.distplot,"year")\
.add_legend()
plt.show()

In [23]: # Total deaths keep decreasing as the operation year increases.


# The two classes overlap completely.

In [24]: sns.FacetGrid(haberman_cancer,hue="status",size=10)\
.map(sns.distplot,"nodes")\
.add_legend()
plt.show()

In [25]: # With zero nodes the probability of survival is much higher than that of death.
# As the number of nodes increases, the probability of death surpasses that of survival.
# The two classes overlap partially.

Cumulative Distribution Function

In [26]: counts,bins_edges=np.histogram(haberman_cancer_alive["age"],bins=10,density=True)
pdf=counts/sum(counts)
cdf=np.cumsum(pdf)
plt.plot(bins_edges[1:],cdf,label="cdf_alive")
plt.plot(bins_edges[1:],pdf,label="pdf_alive")
counts,bins_edges=np.histogram(haberman_cancer_dead["age"],bins=10,density=True)
pdf=counts/sum(counts)
cdf=np.cumsum(pdf)
plt.xlabel("age")
plt.plot(bins_edges[1:],cdf,label="cdf_dead")
plt.plot(bins_edges[1:],pdf,label="pdf_dead")
plt.legend()

Out[26]: <matplotlib.legend.Legend at 0x17beab7db48>

In [27]: # Up to 40 years of age the probability of survival is much higher than that of death.

In [28]: counts,bins_edges=np.histogram(haberman_cancer_alive["year"],bins=10,density=True)
pdf=counts/sum(counts)
cdf=np.cumsum(pdf)
plt.plot(bins_edges[:-1],cdf,label="cdf_alive")
plt.plot(bins_edges[:-1],pdf,label="pdf_alive")
counts,bins_edges=np.histogram(haberman_cancer_dead["year"],bins=10,density=True)
pdf=counts/sum(counts)
cdf=np.cumsum(pdf)
plt.plot(bins_edges[:-1],cdf,label="cdf_dead")
plt.plot(bins_edges[:-1],pdf,label="pdf_dead")
plt.xlabel("year")
plt.legend()

Out[28]: <matplotlib.legend.Legend at 0x17beac1eb48>

In [29]: # About 25% of the patients who died (20 of 80) and about 19% of those who survived (42 of 225)
# had operation year 1958 or 1959, i.e. fall in the first bin of the plot.

In [30]: counts,bins_edges=np.histogram(haberman_cancer_alive["nodes"],bins=10,density=True)
pdf=counts/sum(counts)
cdf=np.cumsum(pdf)
plt.grid()
plt.plot(bins_edges[:-1],cdf,label="cdf_alive")
plt.plot(bins_edges[:-1],pdf,label="pdf_alive")
counts,bins_edges=np.histogram(haberman_cancer_dead["nodes"],bins=10,density=True)
pdf=counts/sum(counts)
cdf=np.cumsum(pdf)
plt.plot(bins_edges[:-1],cdf,label="cdf_dead")
plt.xlabel("nodes")
plt.plot(bins_edges[:-1],pdf,label="pdf_dead")

plt.legend()

Out[30]: <matplotlib.legend.Legend at 0x17beaca4e88>

In [31]: # About 52% of the survivors (117 of 225) have zero nodes, and more than 80% of them have at most 4 nodes.


# More than 20% of the deaths (24 of 80, i.e. 30%) occurred in women having 10 or more nodes.
# About 84% of the women who survived (188 of 225) have fewer than 5 nodes.

Box Plot

In [32]: sns.boxplot(x="status",y="age",data=haberman_cancer)
plt.grid()
plt.title("Age")
plt.show()

In [33]: # More than 50% of all cases come from people aged between 45 and 60.
# 50% of all deaths occurred in women in the 45 to 60 age group.
# All cases occurred in women aged 30 or older.

In [34]: sns.boxplot(x="status",y="year",data=haberman_cancer)
plt.grid()
plt.title("Operational Year")
plt.show()

In [35]: # 50 percent of the deaths due to breast cancer occurred within just 5 years.
# 25 percent of the deaths occurred in the 2 years between 58 and 60.

In [36]: sns.boxplot(x="status",y="nodes",data=haberman_cancer)
plt.grid()
plt.title("Axil Nodes")
plt.show()

In [37]: # The nodes feature has outliers, so the mean cannot be used as a measure of central tendency for it.
# 75% of the survivors have between 0 and 3 nodes, and 50% of the patients who died have more than 3 nodes.

Violin Plots

In [38]: #combination of both box and pdf plots

In [39]: sns.violinplot(x="status",y="age", data=haberman_cancer)

Out[39]: <matplotlib.axes._subplots.AxesSubplot at 0x17be9d9ec48>

In [40]: sns.violinplot(x="status",y="year", data=haberman_cancer)

Out[40]: <matplotlib.axes._subplots.AxesSubplot at 0x17bea104648>

In [41]: sns.violinplot(x="status",y="nodes", data=haberman_cancer)

Out[41]: <matplotlib.axes._subplots.AxesSubplot at 0x17beaa86fc8>

In [42]: # It suggests an almost 100 percent probability of death when a patient has more than 50 nodes
# (note: only one patient in the data has more than 50 nodes).

Bivariate analysis
Scatter Plot

In [43]: sns.FacetGrid(haberman_cancer,hue="status",size=10)\
.map(sns.scatterplot,"nodes","age")\
.add_legend()
plt.show()

In [44]: sns.FacetGrid(haberman_cancer,hue="status",size=10)\
.map(sns.scatterplot,"year","age")\
.add_legend()
plt.show()

In [45]: sns.FacetGrid(haberman_cancer,hue="status",size=10)\
.map(sns.scatterplot,"nodes","year")\
.add_legend()
plt.show()

In [46]: # In all of these plots the two classes overlap each other by 80 to 90 percent,
# so it is very difficult to reach a conclusion from the scatter plots.

Pair Plots

In [47]: sns.set_style("whitegrid")
sns.pairplot(haberman_cancer,hue="status",size=3)
#helps us to visualise our scatter plots together

Out[47]: <seaborn.axisgrid.PairGrid at 0x17be9ff8048>

Contour Plot

In [48]: sns.jointplot(x="age",y="year",data=haberman_cancer_alive,kind="kde")
plt.grid()  # NOTE(review): acts on the current axes, which may not be the joint panel of the grid — confirm

In [49]: #most density of survived patients have age 50 to 60 and operational year 60 to 62.5

In [50]: sns.jointplot(x="age",y="nodes",data=haberman_cancer_alive,kind="kde")
plt.grid()  # NOTE(review): acts on the current axes, which may not be the joint panel of the grid — confirm

In [51]: #most density of survived patients have age 50 to 70 and nodes 0 to 1

In [52]: sns.jointplot(x="nodes",y="year",data=haberman_cancer_alive,kind="kde")
plt.grid()  # NOTE(review): acts on the current axes, which may not be the joint panel of the grid — confirm

In [53]: # most density of survived patients having nodes 0 to 1 and operational year 59 to 67

In [54]: sns.jointplot(x="age",y="year",data=haberman_cancer_dead,kind="kde")
plt.grid()  # NOTE(review): acts on the current axes, which may not be the joint panel of the grid — confirm

In [55]: #most density of dead patients have age 48 to 55 and operational year 62.5 to 65

In [56]: sns.jointplot(x="nodes",y="year",data=haberman_cancer_dead,kind="kde")
plt.grid()  # NOTE(review): acts on the current axes, which may not be the joint panel of the grid — confirm

In [57]: #most density of dead patients have nodes 0 to 10 and operational year 62.5 to 65

In [58]: sns.jointplot(x="age",y="nodes",data=haberman_cancer_dead,kind="kde")
plt.grid()  # NOTE(review): acts on the current axes, which may not be the joint panel of the grid — confirm

In [59]: #most density of dead patients have age 40 to 60 and nodes 0 to 6

Conclusion
It is very difficult to predict the survival or death of patients from the three features (i.e. year, age, nodes) using simple linear or
if-else methods, because the two classes overlap almost completely. More features are needed to reach a solution, and non-linear
methods need to be used. Usefulness of the features (in order): nodes, age, year.

You might also like