Professional Documents
Culture Documents
In [3]: haberman_cancer=haberman_cancer.replace({"status":{1:"survived",2:"dead"}})
# recode the status column in a single pass (1 -> "survived", 2 -> "dead")
# to make the tables and plots more convenient to understand
In [4]: haberman_cancer.columns  # column labels of the frame
In [5]: haberman_cancer.shape  # (number of rows, number of columns)
Out[5]: (305, 4)
In [6]: haberman_cancer["status"].value_counts()  # rows per status label (class balance)
In [8]: haberman_cancer["age"].value_counts()  # frequency of each distinct age, most frequent first
Out[8]: 52 14
54 13
50 12
43 11
47 11
53 11
57 11
38 10
49 10
41 10
55 10
65 10
45 9
42 9
61 9
63 8
59 8
56 7
34 7
46 7
48 7
62 7
70 7
58 7
44 7
67 6
51 6
39 6
37 6
60 6
66 5
64 5
69 4
72 4
40 3
30 3
68 2
73 2
74 2
36 2
35 2
33 2
31 2
77 1
71 1
75 1
76 1
78 1
Name: age, dtype: int64
In [9]: haberman_cancer["year"].value_counts()  # frequency of each distinct year value, most frequent first
Out[9]: 58 35
64 31
63 30
66 28
65 28
60 28
59 27
61 26
67 25
62 23
68 13
69 11
Name: year, dtype: int64
In [10]: haberman_cancer["nodes"].value_counts()  # frequency of each distinct nodes value, most frequent first
Out[10]: 0 136
1 41
3 20
2 19
4 13
6 7
7 7
8 7
5 6
9 6
13 5
14 4
11 4
10 3
15 3
19 3
22 3
23 3
12 2
20 2
46 1
16 1
17 1
18 1
21 1
24 1
25 1
28 1
30 1
35 1
52 1
Name: nodes, dtype: int64
In [12]: haberman_cancer_alive["age"].value_counts()  # age frequencies in haberman_cancer_alive (built in a cell not shown here; presumably the "survived" rows - confirm)
Out[12]: 50 10
52 10
54 9
38 9
57 8
49 8
55 8
47 8
41 7
63 7
42 7
59 7
58 7
43 7
45 6
61 6
65 6
37 6
64 5
34 5
70 5
56 5
53 5
39 5
51 4
48 4
62 4
44 4
67 4
60 4
40 3
30 3
46 3
66 3
69 3
72 3
68 2
73 2
36 2
35 2
33 2
31 2
76 1
71 1
74 1
75 1
77 1
Name: age, dtype: int64
In [13]: haberman_cancer_alive["year"].value_counts()  # year frequencies in haberman_cancer_alive (presumably the "survived" rows - confirm)
Out[13]: 60 24
58 24
64 23
61 23
66 22
63 22
67 21
59 18
62 16
65 15
68 10
69 7
Name: year, dtype: int64
In [14]: haberman_cancer_alive["nodes"].value_counts()  # nodes frequencies in haberman_cancer_alive (presumably the "survived" rows - confirm)
Out[14]: 0 117
1 33
2 15
3 13
4 10
7 5
8 5
6 4
14 3
10 2
22 2
5 2
9 2
46 1
11 1
30 1
13 1
15 1
16 1
18 1
19 1
20 1
25 1
28 1
12 1
Name: nodes, dtype: int64
In [15]: haberman_cancer_dead["year"].value_counts()  # year frequencies in haberman_cancer_dead (built in a cell not shown here; presumably the "dead" rows - confirm)
Out[15]: 65 13
58 11
59 9
64 8
63 8
62 7
66 6
69 4
67 4
60 4
68 3
61 3
Name: year, dtype: int64
In [16]: haberman_cancer_dead["age"].value_counts()  # age frequencies in haberman_cancer_dead (presumably the "dead" rows - confirm)
Out[16]: 53 6
65 4
52 4
46 4
54 4
43 4
44 3
45 3
41 3
61 3
47 3
48 3
57 3
62 3
42 2
49 2
50 2
51 2
34 2
55 2
56 2
60 2
66 2
67 2
70 2
74 1
59 1
63 1
69 1
72 1
39 1
38 1
78 1
Name: age, dtype: int64
In [17]: haberman_cancer_dead["nodes"].value_counts()  # nodes frequencies in haberman_cancer_dead (presumably the "dead" rows - confirm)
Out[17]: 0 19
1 8
3 7
9 4
2 4
5 4
13 4
23 3
4 3
6 3
11 3
19 2
15 2
7 2
8 2
52 1
10 1
35 1
14 1
17 1
20 1
21 1
22 1
24 1
12 1
Name: nodes, dtype: int64
In [18]: # we will now visualise the data, first univariate and then multivariate
# Our main focus is to understand the data in front of us through visualisation
OBJECTIVE
To analyse the data through visualisation and, if possible, build a model that predicts the survival of patients with breast
cancer using simple linear methods
UNIVARIATE ANALYSIS
1 dimensional univariate
Scatter Plot
In [19]: plt.scatter(haberman_cancer_alive["age"],np.zeros_like(haberman_cancer_alive["age"]),label="age_alive")
plt.scatter(haberman_cancer_dead["age"],np.zeros_like(haberman_cancer_dead["age"]),label="age_dead")
plt.xlabel("age")  # the only meaningful axis: every point is drawn at y = 0
plt.legend()
# 1-D scatter of age, one colour per group.
# Not a good view, as most points of the two groups get mixed,
# so it is very hard to make sense of it.
# It makes no sense to repeat this for the other features.
2 dimensional Univariate
Histogram
In [20]: sns.FacetGrid(haberman_cancer,hue="status",height=10)\
.map(sns.distplot,"age")\
.add_legend()
# One overlaid "age" distribution per status value.
# `height` is the current name of FacetGrid's figure-size argument; the old `size` spelling is deprecated.
# NOTE(review): sns.distplot is itself deprecated in newer seaborn; consider histplot/kdeplot when upgrading.
plt.show()
In [21]: # women in their early 30s, late 70s and 80s have a very low probability of dying from breast cancer
# women aged 40 to 55 have a higher probability of death than of survival
# the two distributions overlap completely
In [22]: sns.FacetGrid(haberman_cancer,hue="status",height=10)\
.map(sns.distplot,"year")\
.add_legend()
# One overlaid "year" distribution per status value.
# `height` is the current name of FacetGrid's figure-size argument; the old `size` spelling is deprecated.
# NOTE(review): sns.distplot is itself deprecated in newer seaborn; consider histplot/kdeplot when upgrading.
plt.show()
In [24]: sns.FacetGrid(haberman_cancer,hue="status",height=10)\
.map(sns.distplot,"nodes")\
.add_legend()
# One overlaid "nodes" distribution per status value.
# `height` is the current name of FacetGrid's figure-size argument; the old `size` spelling is deprecated.
# NOTE(review): sns.distplot is itself deprecated in newer seaborn; consider histplot/kdeplot when upgrading.
plt.show()
In [25]: # with zero nodes the probability of survival is much higher than that of death
# as the number of nodes increases, the probability of death surpasses that of survival
# the two distributions overlap partially
In [26]: age_groups=(("alive",haberman_cancer_alive),("dead",haberman_cancer_dead))
for group_name,group_frame in age_groups:
    # 10-bin histogram of age, rescaled so that the bin heights sum to 1
    counts,bins_edges=np.histogram(group_frame["age"],bins=10,density=True)
    pdf=counts/sum(counts)
    cdf=np.cumsum(pdf)
    # both curves are drawn against the right edge of each bin
    plt.plot(bins_edges[1:],cdf,label="cdf_"+group_name)
    plt.plot(bins_edges[1:],pdf,label="pdf_"+group_name)
plt.xlabel("age")
plt.legend()
In [27]: # survival probability is much more than death till 40 years of age
In [28]: year_groups=(("alive",haberman_cancer_alive),("dead",haberman_cancer_dead))
for group_name,group_frame in year_groups:
    # 10-bin histogram of year, rescaled so that the bin heights sum to 1
    counts,bins_edges=np.histogram(group_frame["year"],bins=10,density=True)
    pdf=counts/sum(counts)
    cdf=np.cumsum(pdf)
    # cdf[i] is the share of values up to the RIGHT edge of bin i, so the curves are
    # drawn against bins_edges[1:]; left edges would shift the CDF one bin to the left
    plt.plot(bins_edges[1:],cdf,label="cdf_"+group_name)
    plt.plot(bins_edges[1:],pdf,label="pdf_"+group_name)
plt.xlabel("year")
plt.legend()
In [29]: # more than 20 percent of dead patients and 20 percent of survived patients have operational year 1958
In [30]: nodes_groups=(("alive",haberman_cancer_alive),("dead",haberman_cancer_dead))
plt.grid(True)  # request the grid explicitly; a bare plt.grid() only toggles the current state
for group_name,group_frame in nodes_groups:
    # 10-bin histogram of nodes, rescaled so that the bin heights sum to 1
    counts,bins_edges=np.histogram(group_frame["nodes"],bins=10,density=True)
    pdf=counts/sum(counts)
    cdf=np.cumsum(pdf)
    # cdf[i] is the share of values up to the RIGHT edge of bin i, so the curves are
    # drawn against bins_edges[1:]; left edges would shift the CDF one bin to the left
    plt.plot(bins_edges[1:],cdf,label="cdf_"+group_name)
    plt.plot(bins_edges[1:],pdf,label="pdf_"+group_name)
plt.xlabel("nodes")
plt.legend()
Box Plot
In [32]: sns.boxplot(x="status",y="age",data=haberman_cancer)
# Box plot of age for each status value.
plt.grid(True)  # request the grid explicitly; a bare plt.grid() toggles, so the result depended on the active style / run order
plt.title("Age")
plt.show()
In [33]: # more than 50% of the total cases came from people aged between 45 and 60.
# 50% of the total deaths occurred among women in the 45 to 60 age group
# all cases occurred in women who had reached the age of 30
In [34]: sns.boxplot(x="status",y="year",data=haberman_cancer)
# Box plot of year for each status value.
plt.grid(True)  # request the grid explicitly; a bare plt.grid() toggles, so the result depended on the active style / run order
plt.title("Operational Year")
plt.show()
In [35]: # 50 percent of the deaths due to breast cancer occurred within just 5 years
# 25 percent of the deaths occurred in the 2 years between 58 and 60
In [36]: sns.boxplot(x="status",y="nodes",data=haberman_cancer)
# Box plot of nodes for each status value.
plt.grid(True)  # request the grid explicitly; a bare plt.grid() toggles, so the result depended on the active style / run order
plt.title("Axil Nodes")
plt.show()
In [37]: # nodes has outliers, so the mean cannot be used as a measure of central tendency for it
# 75% of survivors have between 0 and 3 nodes, and 50% of dead persons have more than 3 nodes
Violin Plots
In [42]: # it shows that a patient with more than 50 nodes has an almost 100 percent probability of dying
Bivariate analysis
Scatter Plot
In [43]: sns.FacetGrid(haberman_cancer,hue="status",height=10)\
.map(sns.scatterplot,"nodes","age")\
.add_legend()
# Scatter of nodes (x) against age (y), coloured by status.
# `height` is the current name of FacetGrid's figure-size argument; the old `size` spelling is deprecated.
plt.show()
In [44]: sns.FacetGrid(haberman_cancer,hue="status",height=10)\
.map(sns.scatterplot,"year","age")\
.add_legend()
# Scatter of year (x) against age (y), coloured by status.
# `height` is the current name of FacetGrid's figure-size argument; the old `size` spelling is deprecated.
plt.show()
In [45]: sns.FacetGrid(haberman_cancer,hue="status",height=10)\
.map(sns.scatterplot,"nodes","year")\
.add_legend()
# Scatter of nodes (x) against year (y), coloured by status.
# `height` is the current name of FacetGrid's figure-size argument; the old `size` spelling is deprecated.
plt.show()
In [46]: # In all of these plots the two classes overlap each other by 80 to 90 percent
# it is very difficult to reach a conclusion through scatter plots
Pair Plots
In [47]: sns.set_style("whitegrid")
# `height` is the current name of pairplot's per-facet size argument; the old `size` spelling is deprecated.
sns.pairplot(haberman_cancer,hue="status",height=3)
# helps us to visualise all pairwise scatter plots together
Contour Plot
In [48]: sns.jointplot(x="age",y="year",data=haberman_cancer_alive,kind="kde")
# Joint KDE of age (x) and year (y) for haberman_cancer_alive.
# The "whitegrid" style set earlier already enables the grid, so a bare plt.grid() would toggle it off;
# pass True to state the intent explicitly.
plt.grid(True)
In [49]: #most density of survived patients have age 50 to 60 and operational year 60 to 62.5
In [50]: sns.jointplot(x="age",y="nodes",data=haberman_cancer_alive,kind="kde")
# Joint KDE of age (x) and nodes (y) for haberman_cancer_alive.
# The "whitegrid" style set earlier already enables the grid, so a bare plt.grid() would toggle it off;
# pass True to state the intent explicitly.
plt.grid(True)
In [52]: sns.jointplot(x="nodes",y="year",data=haberman_cancer_alive,kind="kde")
# Joint KDE of nodes (x) and year (y) for haberman_cancer_alive.
# The "whitegrid" style set earlier already enables the grid, so a bare plt.grid() would toggle it off;
# pass True to state the intent explicitly.
plt.grid(True)
In [53]: # most density of survived patients having nodes 0 to 1 and operational year 59 to 67
In [54]: sns.jointplot(x="age",y="year",data=haberman_cancer_dead,kind="kde")
# Joint KDE of age (x) and year (y) for haberman_cancer_dead.
# The "whitegrid" style set earlier already enables the grid, so a bare plt.grid() would toggle it off;
# pass True to state the intent explicitly.
plt.grid(True)
In [55]: #most density of dead patients have age 48 to 55 and operational year 62.5 to 65
In [56]: sns.jointplot(x="nodes",y="year",data=haberman_cancer_dead,kind="kde")
# Joint KDE of nodes (x) and year (y) for haberman_cancer_dead.
# The "whitegrid" style set earlier already enables the grid, so a bare plt.grid() would toggle it off;
# pass True to state the intent explicitly.
plt.grid(True)
In [57]: #most density of dead patients have nodes 0 to 10 and operational year 62.5 to 65
In [58]: sns.jointplot(x="age",y="nodes",data=haberman_cancer_dead,kind="kde")
# Joint KDE of age (x) and nodes (y) for haberman_cancer_dead.
# The "whitegrid" style set earlier already enables the grid, so a bare plt.grid() would toggle it off;
# pass True to state the intent explicitly.
plt.grid(True)
Conclusion
It is very difficult to predict the survival or death of patients from the three features (i.e. year, age, nodes) using simple linear or
if-else methods, as both classes overlap completely. More features are needed to reach a solution, and non-linear
methods need to be used. Usefulness of data -> nodes, age, year