Professional Documents
Culture Documents
Program 6:
Build a Decision Tree Model to help the user decide whether they
should go watch a new comedy show or not.
import pandas as pd
from sklearn import tree
import pydotplus
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import matplotlib.image as pltimg
df = pd.read_csv("shows.csv")
df.head()
0 36 10 9 UK NO
1 42 12 4 USA NO
2 23 4 6 N NO
3 52 4 4 USA NO
4 43 21 8 USA YES
d = {'UK': 0, 'USA': 1, 'N': 2}
df['Nationality'] = df['Nationality'].map(d)
d = {'YES': 1, 'NO': 0}
df['Go'] = df['Go'].map(d)
df.head()
0 36 10 9 0 0
1 42 12 4 1 0
2 23 4 6 2 0
3 52 4 4 1 0
4 43 21 8 1 1
df.columns
X = df[['Age', 'Experience', 'Rank', 'Nationality']]
Y = df['Go']
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size= 0.25, random_state = 65)
dtree = DecisionTreeClassifier()
dtree = dtree.fit(X_train, Y_train)
Y_pred_dt = dtree.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(Y_test, Y_pred_dt))
print(confusion_matrix(Y_test, Y_pred_dt))
accuracy 1.00 4
[[2 0]
[0 2]]
print("0 means don't GO, 1 means GO")
print(dtree.predict([[21, 10, 6, 1]]))
[0]
from IPython.display import Image
from six import StringIO
from sklearn.tree import export_graphviz
import pydot
features = ['Age', 'Experience', 'Rank', 'Nationality']
dot_data = StringIO()
export_graphviz(dtree, out_file=dot_data, feature_names = features,
filled = True, rounded = True)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph[0].create_png())
Program 7:
Implement Random Forest Regression and Random Forest
Classification models to predict the salary of an employee based
on their level in a company. ¶
In [1]:
1 import numpy as np
2 import matplotlib.pyplot as plt
3 import pandas as pd
In [2]:
1 df = pd.read_csv("Position_Salaries.csv")
2 df
Out[2]:
3 Manager 4 80000
6 Partner 7 200000
8 C-level 9 500000
9 CEO 10 1000000
In [3]:
1 X = df.iloc[:,1:2].values
2 Y = df.iloc[:,2].values
Out[4]:
max_samples=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
In [5]:
1 Y_pred = regressor.predict([[6.5]])
In [6]:
1 import numpy as np
2 import matplotlib.pyplot as plt
3 import pandas as pd
In [9]:
1 df = pd.read_csv('Social_Network_Ads.csv')
2 df
Out[9]:
In [10]:
In [11]:
In [12]:
Out[13]:
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10,
warm_start=False)
In [14]:
1 y_prediction = model.predict(X_test)
In [15]:
'c' argument looks like a single numeric RGB or RGBA sequence, which should
be avoided as value-mapping will have precedence in case its length matches
with 'x' & 'y'. Please use a 2-D array with a single row if you really want
to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should
be avoided as value-mapping will have precedence in case its length matches
with 'x' & 'y'. Please use a 2-D array with a single row if you really want
to specify the same RGB or RGBA value for all points.
In [17]:
'c' argument looks like a single numeric RGB or RGBA sequence, which should
be avoided as value-mapping will have precedence in case its length matches
with 'x' & 'y'. Please use a 2-D array with a single row if you really want
to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should
be avoided as value-mapping will have precedence in case its length matches
with 'x' & 'y'. Please use a 2-D array with a single row if you really want
to specify the same RGB or RGBA value for all points.
Program 8:
Implement K-Means clustering to group Mall customers based on
similarity
In [1]:
1 import numpy as np
2 import pandas as pd
3 import matplotlib.pyplot as plt
4 %matplotlib inline
In [2]:
1 df = pd.read_csv("Mall_Customers.csv")
2 df
Out[2]:
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
In [3]:
1 df = df.drop('CustomerID', axis=1)
In [4]:
[1 1 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 1 1 0 1 1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0
0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 1 1 0 0 1 1 0 0 1 0 1 0 0 0
1 1 0 1 0 0 1 1 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 1 1 1 0 0 1 1 1 1
0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1 1 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 1 1 0
0 1 1 1 0 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0
1 0 1 0 0 0 0 1 0 0 0 0 1 1 1]
In [5]:
1 df['Genre'] = encoded_sex
2 df.head()
Out[5]:
0 1 19 15 39
1 1 21 15 81
2 0 20 16 6
3 0 23 16 77
4 0 31 17 40
In [6]:
In [7]:
1 pca_reducer = PCA(n_components = 2)
2 reduced_data = pca_reducer.fit_transform(df)
In [8]:
1 import numpy as np
2 import matplotlib.pyplot as plt
3 import matplotlib.image as mpimg
4 import pandas as pd
In [2]:
1 df = pd.read_csv('iris.csv')
2 df.head()
Out[2]:
In [3]:
1 X = df.iloc[:,:4].values
2 y = df['species'].values
In [4]:
In [5]:
Out[6]:
GaussianNB(priors=None, var_smoothing=1e-09)
In [7]:
1 y_pred = nvclassifier.predict(X_test)
2 print(y_pred)
'Iris-versicolor']
In [8]:
1 y_compare = np.vstack((y_test,y_pred)).T
2 y_compare[:5,:]
Out[8]:
array([['Iris-virginica', 'Iris-virginica'],
['Iris-virginica', 'Iris-virginica'],
['Iris-setosa', 'Iris-setosa'],
['Iris-setosa', 'Iris-setosa'],
In [9]:
[[11 0 0]
[ 0 8 1]
[ 0 1 9]]
In [10]:
1 a = cm.shape
2 corrPred = 0
3 falsePred = 0
4
5 for row in range(a[0]):
6 for c in range(a[1]):
7 if row == c:
8 corrPred += cm[row,c]
9 else:
10 falsePred += cm[row,c]
11 print('Correct predictions: ', corrPred)
12 print('False predictions', falsePred)
13 print ('\n\nAccuracy of the Naive Bayes Clasification is: ', corrPred/(cm.sum()))
Correct predictions: 28
False predictions 2
In [11]:
Accuracy : 0.9333333333333333
Program 10:
Build XG-Boost Model to predict the presence of diabetes in
patients.
In [1]:
1 import pandas as pd
2 import numpy as np
3 import matplotlib.pyplot as plt
4 from numpy import loadtxt
5 from xgboost import XGBClassifier
6 from sklearn.model_selection import train_test_split
7 from sklearn.metrics import accuracy_score
In [2]:
1 df = pd.read_csv('diabetes.csv')
2 df.info()
<class 'pandas.core.frame.DataFrame'>
In [3]:
1 X = df.iloc[:,0:8]
2 Y = df.iloc[:,8]
3 X_train,X_test,Y_train,Y_test = train_test_split(X,
4 Y, test_size = 0.33, random_state = 42)
In [4]:
1 model = XGBClassifier()
2 eval_set = [(X_test,Y_test)]
3 model.fit(X_train, Y_train, early_stopping_rounds = 10,
4 eval_metric = 'logloss', eval_set = eval_set,verbose = True)
[0] validation_0-logloss:0.60697
[1] validation_0-logloss:0.56261
[2] validation_0-logloss:0.53835
[3] validation_0-logloss:0.52276
[4] validation_0-logloss:0.51074
[5] validation_0-logloss:0.50890
[6] validation_0-logloss:0.50851
[7] validation_0-logloss:0.51019
[8] validation_0-logloss:0.51279
[9] validation_0-logloss:0.52036
[10] validation_0-logloss:0.52229
[11] validation_0-logloss:0.52651
[12] validation_0-logloss:0.52992
[13] validation_0-logloss:0.53040
[14] validation_0-logloss:0.54070
[15] validation_0-logloss:0.54597
[16] validation_0-logloss:0.54907
C:\Users\Binay Senapati\anaconda3\lib\site-packages\xgboost\sklearn.py:1224:
UserWarning: The use of label encoder in XGBClassifier is deprecated and wil
l be removed in a future release. To remove this warning, do the following:
1) Pass option use_label_encoder=False when constructing XGBClassifier objec
t; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2,
..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
Out[4]:
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=na
n,
In [5]:
1 pred = model.predict(X_test)
In [6]:
Accuracy: 73.23%
In [7]:
In [8]:
1 plot_importance(model)
2 pyplot.show()