
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
print(os.listdir("../input"))

['pulsar_stars.csv']

Import Data
df=pd.read_csv('../input/pulsar_stars.csv')

Exploratory Data Analysis


# look at the first five rows
df.head()

Mean of the integrated profile ... target_class


0 140.562500 ... 0
1 102.507812 ... 0
2 103.015625 ... 0
3 136.750000 ... 0
4 88.726562 ... 0

[5 rows x 9 columns]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
Mean of the integrated profile                  17898 non-null float64
Standard deviation of the integrated profile    17898 non-null float64
Excess kurtosis of the integrated profile       17898 non-null float64
Skewness of the integrated profile              17898 non-null float64
Mean of the DM-SNR curve                        17898 non-null float64
Standard deviation of the DM-SNR curve          17898 non-null float64
Excess kurtosis of the DM-SNR curve             17898 non-null float64
Skewness of the DM-SNR curve                    17898 non-null float64
target_class                                    17898 non-null int64
dtypes: float64(8), int64(1)
memory usage: 1.2 MB

# summary statistics


df.describe()

Mean of the integrated profile ... target_class


count 17898.000000 ... 17898.000000
mean 111.079968 ... 0.091574
std 25.652935 ... 0.288432
min 5.812500 ... 0.000000
25% 100.929688 ... 0.000000
50% 115.078125 ... 0.000000
75% 127.085938 ... 0.000000
max 192.617188 ... 1.000000

[8 rows x 9 columns]

print(df.target_class.value_counts())  # counts per class
sns.countplot(df.target_class)
plt.show()
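Note the strong class imbalance: from the describe() output above, the mean of target_class is about 0.0916, so only ~9% of the samples are pulsars. A minimal sketch to print the class proportions directly:

# proportions of each class (normalize=True returns fractions instead of counts)
print(df.target_class.value_counts(normalize=True))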

Logistic Regression
# set x and y values
y = df.target_class.values
x_df = df.drop(['target_class'], axis=1)
# min-max normalization: scale every feature to [0, 1]
x = (x_df - np.min(x_df)) / (np.max(x_df) - np.min(x_df))
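The same min-max scaling is available in scikit-learn; a minimal sketch using MinMaxScaler (equivalent to the manual formula above with the default feature_range=(0, 1)):

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()  # rescales each column to [0, 1]
x_scaled = pd.DataFrame(scaler.fit_transform(x_df), columns=x_df.columns)

In a stricter setup the scaler would be fit on the training split only, to avoid leaking test-set statistics into the scaling.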

# train/test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

from sklearn.linear_model import LogisticRegression


lr=LogisticRegression()
lr.fit(x_train,y_train)
print('lr accuracy :', lr.score(x_test,y_test))

# confusion matrix
y_pred = lr.predict(x_test)
y_true = y_test
from sklearn.metrics import confusion_matrix

cm_lr = confusion_matrix(y_true,y_pred)

lr accuracy : 0.9743016759776536

/opt/conda/lib/python3.6/site-packages/sklearn/linear_model/
logistic.py:433: FutureWarning: Default solver will be changed to
'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
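As the warning says, passing a solver explicitly silences it. Also, with only ~9% positives, overall accuracy can hide poor recall on the pulsar class, so a per-class report is worth printing; a minimal sketch:

from sklearn.metrics import classification_report
lr = LogisticRegression(solver='lbfgs')  # explicit solver, per the FutureWarning
lr.fit(x_train, y_train)
print(classification_report(y_test, lr.predict(x_test)))  # precision/recall for each class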

KNN
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train,y_train)
print('knn accuracy :',knn.score(x_test,y_test))
# confusion matrix
y_pred = knn.predict(x_test)
y_true = y_test
from sklearn.metrics import confusion_matrix
cm_knn = confusion_matrix(y_true, y_pred)

knn accuracy : 0.978584729981378

Find best k value


score_list = []
for each in range(1, 15):
    knn2 = KNeighborsClassifier(n_neighbors=each)
    knn2.fit(x_train, y_train)
    score_list.append(knn2.score(x_test, y_test))

plt.plot(range(1,15),score_list)
plt.xlabel('k values')
plt.ylabel('accuracy')
plt.show()
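The best k can also be read off programmatically instead of from the plot; a minimal sketch reusing score_list:

best_k = score_list.index(max(score_list)) + 1  # +1 because k starts at 1
print('best k :', best_k, 'accuracy :', max(score_list))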

SVM
from sklearn.svm import SVC
svm=SVC(random_state=1)
svm.fit(x_train,y_train)
print('svm accuracy :', svm.score(x_test,y_test))

# confusion matrix
y_pred = svm.predict(x_test)
y_true = y_test
from sklearn.metrics import confusion_matrix

cm_svm = confusion_matrix(y_true,y_pred)

/opt/conda/lib/python3.6/site-packages/sklearn/svm/base.py:196:
FutureWarning: The default value of gamma will change from 'auto' to
'scale' in version 0.22 to account better for unscaled features. Set
gamma explicitly to 'auto' or 'scale' to avoid this warning.
"avoid this warning.", FutureWarning)

svm accuracy : 0.9731843575418995
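Per the warning, setting gamma explicitly keeps the result stable across scikit-learn versions; a minimal sketch:

svm = SVC(gamma='scale', random_state=1)  # explicit gamma, per the FutureWarning
svm.fit(x_train, y_train)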

Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()
nb.fit(x_train,y_train)
print('nb accuracy : ', nb.score(x_test,y_test))

# confusion matrix
y_pred = nb.predict(x_test)
y_true = y_test
from sklearn.metrics import confusion_matrix

cm_nb = confusion_matrix(y_true,y_pred)

nb accuracy : 0.9437616387337058

Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()
dt.fit(x_train,y_train)
print('dt accuracy : ', dt.score(x_test,y_test))

# confusion matrix
y_pred = dt.predict(x_test)
y_true = y_test
from sklearn.metrics import confusion_matrix

cm_dt = confusion_matrix(y_true,y_pred)


Random Forest
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
print('rf accuracy : ', rf.score(x_test,y_test))

# confusion matrix
y_pred = rf.predict(x_test)
y_true = y_test
from sklearn.metrics import confusion_matrix

cm_rf = confusion_matrix(y_true,y_pred)

/opt/conda/lib/python3.6/site-packages/sklearn/ensemble/forest.py:246:
FutureWarning: The default value of n_estimators will change from 10
in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)

rf accuracy : 0.9774674115456239
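Likewise, fixing n_estimators avoids the default change (10 to 100 in version 0.22) mentioned in the warning; a minimal sketch:

rf = RandomForestClassifier(n_estimators=100, random_state=1)  # explicit values, per the warning
rf.fit(x_train, y_train)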
Visualizing the Confusion Matrices
plt.figure(figsize=(20,15))

plt.suptitle("Confusion Matrixes",fontsize=20)

plt.subplot(2,3,1)
plt.title("Logistic Regression Confusion Matrix")
sns.heatmap(cm_lr,cbar=False,annot=True,cmap="Greens",fmt="d")

plt.subplot(2,3,2)
plt.title("K Nearest Neighbors Confusion Matrix")
sns.heatmap(cm_knn,cbar=False,annot=True,cmap="Greens",fmt="d")

plt.subplot(2,3,3)
plt.title("Support Vector Machine Confusion Matrix")
sns.heatmap(cm_svm,cbar=False,annot=True,cmap="Greens",fmt="d")

plt.subplot(2,3,4)
plt.title("Naive Bayes Confusion Matrix")
sns.heatmap(cm_nb,cbar=False,annot=True,cmap="Greens",fmt="d")

plt.subplot(2,3,5)
plt.title("Decision Tree Classifier Confusion Matrix")
sns.heatmap(cm_dt,cbar=False,annot=True,cmap="Greens",fmt="d")

plt.subplot(2,3,6)
plt.title("Random Forest Confusion Matrix")
sns.heatmap(cm_rf,cbar=False,annot=True,cmap="Greens",fmt="d")

plt.show()
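Before concluding, the six test accuracies can be collected in one place for a direct comparison; a minimal sketch using the models fitted above:

models = {'LR': lr, 'KNN': knn, 'SVM': svm, 'NB': nb, 'DT': dt, 'RF': rf}
for name, model in models.items():
    print(name, 'accuracy :', model.score(x_test, y_test))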
CONCLUSION: On this dataset, the KNN algorithm achieves the highest test accuracy (about 0.979), slightly ahead of Random Forest. This can also be seen in the confusion matrices above.
