You are on page 1of 19

CHANUKYA GOWDA K R19CS076 VI sem 'B' sec

Program 6:
Build a Decision Tree Model to help the user decide whether they
should go watch a new comedy show or not.

import pandas as pd
from sklearn import tree
import pydotplus
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import matplotlib.image as pltimg

df = pd.read_csv("shows.csv")
df.head()

Age Experience Rank Nationality Go

0 36 10 9 UK NO

1 42 12 4 USA NO

2 23 4 6 N NO

3 52 4 4 USA NO

4 43 21 8 USA YES

d = {'UK': 0, 'USA': 1, 'N': 2}

df['Nationality'] = df['Nationality'].map(d)

d = {'YES': 1, 'NO': 0}

df['Go'] = df['Go'].map(d)

df.head()

Age Experience Rank Nationality Go

0 36 10 9 0 0

1 42 12 4 1 0

2 23 4 6 2 0

3 52 4 4 1 0

4 43 21 8 1 1
df.columns

Index(['Age', 'Experience', 'Rank', 'Nationality', 'Go'], dtype='object')

X = df[['Age', 'Experience', 'Rank', 'Nationality']]

Y = df['Go']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 

                                                  test_size= 0.25, random_state = 65)

dtree = DecisionTreeClassifier()

dtree = dtree.fit(X_train, Y_train)

Y_pred_dt = dtree.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(Y_test, Y_pred_dt))

print(confusion_matrix(Y_test, Y_pred_dt))

precision recall f1-score support

0 1.00 1.00 1.00 2

1 1.00 1.00 1.00 2

accuracy 1.00 4

macro avg 1.00 1.00 1.00 4

weighted avg 1.00 1.00 1.00 4

[[2 0]

[0 2]]

print("0 means don't GO, 1 means GO")

0 means don't GO, 1 means GO

print(dtree.predict([[21, 10, 6, 1]]))

[0]

/usr/local/lib/python3.7/dist-packages/sklearn/base.py:451: UserWarning: X does not h


"X does not have valid feature names, but"

from IPython.display import Image  

from six import StringIO  

from sklearn.tree import export_graphviz

import pydot 

features = ['Age', 'Experience', 'Rank', 'Nationality']

dot_data = StringIO()  

export_graphviz(dtree, out_file=dot_data, feature_names = features, 

                filled = True, rounded = True)

graph = pydot.graph_from_dot_data(dot_data.getvalue())  
Image(graph[0].create_png())  

Program 7:
Implement Random Forest Regression and Random Forest
Classification models to predict the salary of an employee based
on their level in a company. ¶
In [1]:

1 import numpy as np
2 import matplotlib.pyplot as plt
3 import pandas as pd

In [2]:

1 df = pd.read_csv("Position_Salaries.csv")
2 df

Out[2]:

Position Level Salary

0 Business Analyst 1 45000

1 Junior Consultant 2 50000

2 Senior Consultant 3 60000

3 Manager 4 80000

4 Country Manager 5 110000

5 Region Manager 6 150000

6 Partner 7 200000

7 Senior Partner 8 300000

8 C-level 9 500000

9 CEO 10 1000000

In [3]:

1 X = df.iloc[:,1:2].values
2 Y = df.iloc[:,2].values

Random Forest Regression


In [4]:

1 from sklearn.ensemble import RandomForestRegressor


2 regressor = RandomForestRegressor(n_estimators = 10, random_state = 62)
3 regressor.fit(X, Y)

Out[4]:

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',

max_depth=None, max_features='auto', max_leaf_nodes=No


ne,

max_samples=None, min_impurity_decrease=0.0,

min_impurity_split=None, min_samples_leaf=1,

min_samples_split=2, min_weight_fraction_leaf=0.0,

n_estimators=10, n_jobs=None, oob_score=False,


random_state=62, verbose=0, warm_start=False)

In [5]:

1 Y_pred = regressor.predict([[6.5]])

In [6]:

1 plt.scatter(X, Y, color = "blue")


2 plt.plot(X, regressor.predict(X), color = "red")
3 plt.title("Finding Salary (Random Forest)")
4 plt.xlabel("Designation")
5 plt.ylabel("Salary")
6 plt.show()
In [7]:

1 X_hr = np.arange(min(X), max(X), 0.1)


2 X_hr = X_hr.reshape((len(X_hr), 1))
3 plt.scatter(X, Y, color = "blue")
4 plt.plot(X_hr, regressor.predict(X_hr), color = "red")
5 plt.title("Finding Salary (Random Forest)")
6 plt.xlabel("Designation")
7 plt.ylabel("Salary")
8 plt.show()

Random Forest Classification


In [8]:

1 import numpy as np
2 import matplotlib.pyplot as plt
3 import pandas as pd
In [9]:

1 df = pd.read_csv('Social_Network_Ads.csv')
2 df

Out[9]:

User ID Gender Age EstimatedSalary Purchased

0 15624510 Male 19 19000 0

1 15810944 Male 35 20000 0

2 15668575 Female 26 43000 0

3 15603246 Female 27 57000 0

4 15804002 Male 19 76000 0

... ... ... ... ... ...

395 15691863 Female 46 41000 1

396 15706071 Male 51 23000 1

397 15654296 Female 50 20000 1

398 15755018 Male 36 33000 0

399 15594041 Female 49 36000 1

400 rows × 5 columns

In [10]:

1 X = df.iloc[:, [2, 3]].values


2 y = df.iloc[:, 4].values

In [11]:

1 from sklearn.model_selection import train_test_split


2 X_train, X_test, y_train, y_test = train_test_split(X, y,
3 test_size = 0.25, random_state = 0)

In [12]:

1 from sklearn.preprocessing import StandardScaler


2 sc = StandardScaler()
3 X_train = sc.fit_transform(X_train)
4 X_test = sc.transform(X_test)
In [13]:

1 from sklearn.ensemble import RandomForestClassifier


2 model = RandomForestClassifier(n_estimators = 10,
3 criterion = 'entropy', random_state = 0)
4 model.fit(X_train, y_train)

Out[13]:

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,

criterion='entropy', max_depth=None, max_features='au


to',

max_leaf_nodes=None, max_samples=None,

min_impurity_decrease=0.0, min_impurity_split=None,

min_samples_leaf=1, min_samples_split=2,

min_weight_fraction_leaf=0.0, n_estimators=10,

n_jobs=None, oob_score=False, random_state=0, verbose


=0,

warm_start=False)

In [14]:

1 y_prediction = model.predict(X_test)

In [15]:

1 from sklearn.metrics import confusion_matrix


2 conf_mat = confusion_matrix(y_test, y_prediction)
In [16]:

1 from matplotlib.colors import ListedColormap


2 X_set, y_set = X_train, y_train
3 X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1,
4 stop = X_set[:, 0].max() + 1, step = 0.01),
5 np.arange(start = X_set[:, 1].min() - 1,
6 stop = X_set[:, 1].max() + 1, step = 0.01))
7
8 plt.contourf(X1,X2,model.predict(np.array([X1.ravel(),
9 X2.ravel()]).T).reshape(X1.shape),
10 alpha = 0.75, cmap = ListedColormap(('red', 'green')))
11
12 plt.xlim(X1.min(), X1.max())
13 plt.ylim(X2.min(), X2.max())
14
15 for i, j in enumerate(np.unique(y_set)):
16 plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
17 c = ListedColormap(('red', 'green'))(i), label = j)
18
19 plt.title('Random Forest Classification (Training set)')
20 plt.xlabel('Age')
21 plt.ylabel('Salary')
22 plt.legend()
23 plt.show()

'c' argument looks like a single numeric RGB or RGBA sequence, which should
be avoided as value-mapping will have precedence in case its length matches
with 'x' & 'y'. Please use a 2-D array with a single row if you really want
to specify the same RGB or RGBA value for all points.

'c' argument looks like a single numeric RGB or RGBA sequence, which should
be avoided as value-mapping will have precedence in case its length matches
with 'x' & 'y'. Please use a 2-D array with a single row if you really want
to specify the same RGB or RGBA value for all points.

In [17]:

1 from matplotlib.colors import ListedColormap


2 X_set, y_set = X_test, y_test
3 X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1,
4 stop = X_set[:, 0].max() + 1, step = 0.01),
5 np.arange(start = X_set[:, 1].min() - 1,
6 stop = X_set[:, 1].max() + 1, step = 0.01))
7
8 plt.contourf(X1, X2, model.predict(np.array([X1.ravel(),
9 X2.ravel()]).T).reshape(X1.shape),
10 alpha = 0.75,cmap = ListedColormap(('red', 'green')))
11
12 plt.xlim(X1.min(), X1.max())
13 plt.ylim(X2.min(), X2.max())
14
15 for i, j in enumerate(np.unique(y_set)):
16 plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
17 c = ListedColormap(('red', 'green'))(i), label = j)
18
19 plt.title('Random Forest Classification (Test set)')
20 plt.xlabel('Age')
21 plt.ylabel('Estimated Salary')
22 plt.legend()
23 plt.show()

'c' argument looks like a single numeric RGB or RGBA sequence, which should
be avoided as value-mapping will have precedence in case its length matches
with 'x' & 'y'. Please use a 2-D array with a single row if you really want
to specify the same RGB or RGBA value for all points.

'c' argument looks like a single numeric RGB or RGBA sequence, which should
be avoided as value-mapping will have precedence in case its length matches
with 'x' & 'y'. Please use a 2-D array with a single row if you really want
to specify the same RGB or RGBA value for all points.

Program 8:
Implement K-Means clustering to group Mall customers based on
similarity
In [1]:

1 import numpy as np
2 import pandas as pd
3 import matplotlib.pyplot as plt
4 %matplotlib inline

In [2]:

1 df = pd.read_csv("Mall_Customers.csv")
2 df

Out[2]:

CustomerID Genre Age Annual Income (k$) Spending Score (1-100)

0 1 Male 19 15 39

1 2 Male 21 15 81

2 3 Female 20 16 6

3 4 Female 23 16 77

4 5 Female 31 17 40

... ... ... ... ... ...

195 196 Female 35 120 79

196 197 Female 45 126 28

197 198 Male 32 126 74

198 199 Male 32 137 18

199 200 Male 30 137 83

200 rows × 5 columns

In [3]:

1 df = df.drop('CustomerID', axis=1)
In [4]:

1 from sklearn.preprocessing import LabelEncoder


2 encode = LabelEncoder()
3 encoded_sex = encode.fit_transform(df.iloc[:, 0])
4 print(encoded_sex)

[1 1 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 1 1 0 1 1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0

0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 1 1 0 0 1 1 0 0 1 0 1 0 0 0

1 1 0 1 0 0 1 1 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 1 1 1 0 0 1 1 1 1

0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1 1 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 1 1 0

0 1 1 1 0 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0

1 0 1 0 0 0 0 1 0 0 0 0 1 1 1]

In [5]:

1 df['Genre'] = encoded_sex
2 df.head()

Out[5]:

Genre Age Annual Income (k$) Spending Score (1-100)

0 1 19 15 39

1 1 21 15 81

2 0 20 16 6

3 0 23 16 77

4 0 31 17 40

In [6]:

1 from sklearn.decomposition import PCA

In [7]:

1 pca_reducer = PCA(n_components = 2)
2 reduced_data = pca_reducer.fit_transform(df)

In [8]:

1 from sklearn.cluster import KMeans


2 km = KMeans(n_clusters = 5)
3 cluster = km.fit(reduced_data)
In [9]:

1 plt.scatter(reduced_data[:, 0], reduced_data[:, 1],


2 label = 'Datapoints')
3 plt.scatter(cluster.cluster_centers_[:, 0],
4 cluster.cluster_centers_[:, 1], label = 'Clusters')
5 plt.title("Sklearn version of KMeans")
6 plt.legend()
7 plt.show()
Program 9:
Implement Naïve Bayes Classification models to predict the class
of flowers in Iris Dataset.
In [1]:

1 import numpy as np
2 import matplotlib.pyplot as plt
3 import matplotlib.image as mpimg
4 import pandas as pd

In [2]:

1 df = pd.read_csv('iris.csv')
2 df.head()

Out[2]:

sepal_length sepal_width petal_length petal_width species

0 5.1 3.5 1.4 0.2 Iris-setosa

1 4.9 3.0 1.4 0.2 Iris-setosa

2 4.7 3.2 1.3 0.2 Iris-setosa

3 4.6 3.1 1.5 0.2 Iris-setosa

4 5.0 3.6 1.4 0.2 Iris-setosa

In [3]:

1 X = df.iloc[:,:4].values
2 y = df['species'].values

In [4]:

1 from sklearn.model_selection import train_test_split


2 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_stat

In [5]:

1 from sklearn.preprocessing import StandardScaler


2 sc = StandardScaler()
3 X_train = sc.fit_transform(X_train)
4 X_test = sc.transform(X_test)
In [6]:

1 from sklearn.naive_bayes import GaussianNB


2 nvclassifier = GaussianNB()
3 nvclassifier.fit(X_train, y_train)

Out[6]:

GaussianNB(priors=None, var_smoothing=1e-09)

In [7]:

1 y_pred = nvclassifier.predict(X_test)
2 print(y_pred)

['Iris-virginica' 'Iris-virginica' 'Iris-setosa' 'Iris-setosa'

'Iris-setosa' 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor'

'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'

'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-virginica'

'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa'

'Iris-virginica' 'Iris-setosa' 'Iris-virginica' 'Iris-virginica'

'Iris-versicolor' 'Iris-virginica' 'Iris-setosa' 'Iris-virginica'

'Iris-versicolor']

In [8]:

1 y_compare = np.vstack((y_test,y_pred)).T
2 y_compare[:5,:]

Out[8]:

array([['Iris-virginica', 'Iris-virginica'],

['Iris-virginica', 'Iris-virginica'],

['Iris-setosa', 'Iris-setosa'],

['Iris-setosa', 'Iris-setosa'],

['Iris-setosa', 'Iris-setosa']], dtype=object)

In [9]:

1 from sklearn.metrics import confusion_matrix


2 cm = confusion_matrix(y_test, y_pred)
3 print(cm)

[[11 0 0]

[ 0 8 1]

[ 0 1 9]]

In [10]:

1 a = cm.shape
2 corrPred = 0
3 falsePred = 0
4
5 for row in range(a[0]):
6 for c in range(a[1]):
7 if row == c:
8 corrPred += cm[row,c]
9 else:
10 falsePred += cm[row,c]
11 print('Correct predictions: ', corrPred)
12 print('False predictions', falsePred)
13 print ('\n\nAccuracy of the Naive Bayes Clasification is: ', corrPred/(cm.sum()))

Correct predictions: 28

False predictions 2

Accuracy of the Naive Bayes Clasification is: 0.9333333333333333

In [11]:

1 from sklearn.metrics import accuracy_score


2 print ("Accuracy : ", accuracy_score(y_test, y_pred))

Accuracy : 0.9333333333333333

Program 10:
Build XG-Boost Model to predict the presence of diabetes in
patients.
In [1]:

1 import pandas as pd
2 import numpy as np
3 import matplotlib.pyplot as plt
4 from numpy import loadtxt
5 from xgboost import XGBClassifier
6 from sklearn.model_selection import train_test_split
7 from sklearn.metrics import accuracy_score

In [2]:

1 df = pd.read_csv('diabetes.csv')
2 df.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 768 entries, 0 to 767

Data columns (total 9 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 Pregnancies 768 non-null int64

1 Glucose 768 non-null int64

2 BloodPressure 768 non-null int64

3 SkinThickness 768 non-null int64

4 Insulin 768 non-null int64

5 BMI 768 non-null float64

6 DiabetesPedigreeFunction 768 non-null float64

7 Age 768 non-null int64

8 Outcome 768 non-null int64

dtypes: float64(2), int64(7)

memory usage: 54.1 KB

In [3]:

1 X = df.iloc[:,0:8]
2 Y = df.iloc[:,8]
3 X_train,X_test,Y_train,Y_test = train_test_split(X,
4 Y, test_size = 0.33, random_state = 42)
In [4]:

1 model = XGBClassifier()
2 eval_set = [(X_test,Y_test)]
3 model.fit(X_train, Y_train, early_stopping_rounds = 10,
4 eval_metric = 'logloss', eval_set = eval_set,verbose = True)

[0] validation_0-logloss:0.60697

[1] validation_0-logloss:0.56261

[2] validation_0-logloss:0.53835

[3] validation_0-logloss:0.52276

[4] validation_0-logloss:0.51074

[5] validation_0-logloss:0.50890

[6] validation_0-logloss:0.50851

[7] validation_0-logloss:0.51019

[8] validation_0-logloss:0.51279

[9] validation_0-logloss:0.52036

[10] validation_0-logloss:0.52229

[11] validation_0-logloss:0.52651

[12] validation_0-logloss:0.52992

[13] validation_0-logloss:0.53040

[14] validation_0-logloss:0.54070

[15] validation_0-logloss:0.54597

[16] validation_0-logloss:0.54907

C:\Users\Binay Senapati\anaconda3\lib\site-packages\xgboost\sklearn.py:1224:
UserWarning: The use of label encoder in XGBClassifier is deprecated and wil
l be removed in a future release. To remove this warning, do the following:
1) Pass option use_label_encoder=False when constructing XGBClassifier objec
t; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2,
..., [num_class - 1].

warnings.warn(label_encoder_deprecation_msg, UserWarning)

Out[4]:

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,


colsample_bynode=1, colsample_bytree=1, enable_categorical=Fal
se,

gamma=0, gpu_id=-1, importance_type=None,

interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=na
n,

monotone_constraints='()', n_estimators=100, n_jobs=8,


num_parallel_tree=1, objective='binary:logistic',

predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,

scale_pos_weight=1, subsample=1, tree_method='exact',

use_label_encoder=True, validate_parameters=1, verbosity=None)

In [5]:

1 pred = model.predict(X_test)
In [6]:

1 accur = accuracy_score(Y_test, pred)


2 print("Accuracy: %.2f%%" %(accur*100.0))

Accuracy: 73.23%

In [7]:

1 from xgboost import plot_importance


2 from matplotlib import pyplot

In [8]:

1 plot_importance(model)
2 pyplot.show()

You might also like