R19C076 - Chanukya Gowda K - Mlda - Assignment-2

CHANUKYA GOWDA K R19CS076 VI sem 'B' sec
Program 6:
Build a Decision Tree Model to help the user decide whether they
should go watch a new comedy show or not.
import pandas as pd
from sklearn import tree
import pydotplus
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import matplotlib.image as pltimg
df = pd.read_csv("shows.csv")
df.head()
Age Experience Rank Nationality Go
0 36 10 9 UK NO
1 42 12 4 USA NO
2 23 4 6 N NO
3 52 4 4 USA NO
4 43 21 8 USA YES
d = {'UK': 0, 'USA': 1, 'N': 2}
df['Nationality'] = df['Nationality'].map(d)
d = {'YES': 1, 'NO': 0}
df['Go'] = df['Go'].map(d)
df.head()
Age Experience Rank Nationality Go
0 36 10 9 0 0
1 42 12 4 1 0
2 23 4 6 2 0
3 52 4 4 1 0
4 43 21 8 1 1
df.columns
Index(['Age', 'Experience', 'Rank', 'Nationality', 'Go'], dtype='object')
X = df[['Age', 'Experience', 'Rank', 'Nationality']]
Y = df['Go']
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size= 0.25, random_state = 65)
dtree = DecisionTreeClassifier()
dtree = dtree.fit(X_train, Y_train)
Y_pred_dt = dtree.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(Y_test, Y_pred_dt))
print(confusion_matrix(Y_test, Y_pred_dt))
precision recall f1-score support
0 1.00 1.00 1.00 2
1 1.00 1.00 1.00 2
accuracy 1.00 4
macro avg 1.00 1.00 1.00 4
weighted avg 1.00 1.00 1.00 4
[[2 0]
[0 2]]
print("0 means don't GO, 1 means GO")
0 means don't GO, 1 means GO
print(dtree.predict([[21, 10, 6, 1]]))
[0]
/usr/local/lib/python3.7/dist-packages/sklearn/base.py:451: UserWarning: X does not h

"X does not have valid feature names, but"
from IPython.display import Image
from six import StringIO
from sklearn.tree import export_graphviz
import pydot
features = ['Age', 'Experience', 'Rank', 'Nationality']
dot_data = StringIO()
export_graphviz(dtree, out_file=dot_data, feature_names = features,
filled = True, rounded = True)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph[0].create_png())
Program 7:
Implement Random Forest Regression and Random Forest
Classification models to predict the salary of an employee based
on their level in a company. ¶
In [1]:
1 import numpy as np
2 import matplotlib.pyplot as plt
3 import pandas as pd
In [2]:
1 df = pd.read_csv("Position_Salaries.csv")
2 df
Out[2]:
Position Level Salary
0 Business Analyst 1 45000
1 Junior Consultant 2 50000
2 Senior Consultant 3 60000
3 Manager 4 80000
4 Country Manager 5 110000
5 Region Manager 6 150000
6 Partner 7 200000
7 Senior Partner 8 300000
8 C-level 9 500000
9 CEO 10 1000000
In [3]:
1 X = df.iloc[:,1:2].values
2 Y = df.iloc[:,2].values
Random Forest Regression

In [4]:
1 from sklearn.ensemble import RandomForestRegressor

2 regressor = RandomForestRegressor(n_estimators = 10, random_state = 62)
3 regressor.fit(X, Y)
Out[4]:
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
max_depth=None, max_features='auto', max_leaf_nodes=No

ne,
max_samples=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=10, n_jobs=None, oob_score=False,

random_state=62, verbose=0, warm_start=False)
In [5]:
1 Y_pred = regressor.predict([[6.5]])
In [6]:
1 plt.scatter(X, Y, color = "blue")

2 plt.plot(X, regressor.predict(X), color = "red")
3 plt.title("Finding Salary (Random Forest)")
4 plt.xlabel("Designation")
5 plt.ylabel("Salary")
6 plt.show()
In [7]:
1 X_hr = np.arange(min(X), max(X), 0.1)

2 X_hr = X_hr.reshape((len(X_hr), 1))
3 plt.scatter(X, Y, color = "blue")
4 plt.plot(X_hr, regressor.predict(X_hr), color = "red")
5 plt.title("Finding Salary (Random Forest)")
6 plt.xlabel("Designation")
7 plt.ylabel("Salary")
8 plt.show()
Random Forest Classification

In [8]:
In [9]:
1 df = pd.read_csv('Social_Network_Ads.csv')
2 df
Out[9]:
User ID Gender Age EstimatedSalary Purchased
0 15624510 Male 19 19000 0
1 15810944 Male 35 20000 0
2 15668575 Female 26 43000 0
3 15603246 Female 27 57000 0
4 15804002 Male 19 76000 0
... ... ... ... ... ...
395 15691863 Female 46 41000 1
396 15706071 Male 51 23000 1
397 15654296 Female 50 20000 1
398 15755018 Male 36 33000 0
399 15594041 Female 49 36000 1
400 rows × 5 columns
In [10]:
1 X = df.iloc[:, [2, 3]].values

2 y = df.iloc[:, 4].values
In [11]:
1 from sklearn.model_selection import train_test_split

2 X_train, X_test, y_train, y_test = train_test_split(X, y,
3 test_size = 0.25, random_state = 0)
In [12]:
1 from sklearn.preprocessing import StandardScaler

2 sc = StandardScaler()
3 X_train = sc.fit_transform(X_train)
4 X_test = sc.transform(X_test)
In [13]:
1 from sklearn.ensemble import RandomForestClassifier

2 model = RandomForestClassifier(n_estimators = 10,
3 criterion = 'entropy', random_state = 0)
4 model.fit(X_train, y_train)
Out[13]:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='entropy', max_depth=None, max_features='au

to',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10,
n_jobs=None, oob_score=False, random_state=0, verbose

=0,
warm_start=False)
In [14]:
1 y_prediction = model.predict(X_test)
In [15]:
1 from sklearn.metrics import confusion_matrix

2 conf_mat = confusion_matrix(y_test, y_prediction)
In [16]:
1 from matplotlib.colors import ListedColormap

2 X_set, y_set = X_train, y_train
3 X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1,
4 stop = X_set[:, 0].max() + 1, step = 0.01),
5 np.arange(start = X_set[:, 1].min() - 1,
6 stop = X_set[:, 1].max() + 1, step = 0.01))
7
8 plt.contourf(X1,X2,model.predict(np.array([X1.ravel(),
9 X2.ravel()]).T).reshape(X1.shape),
10 alpha = 0.75, cmap = ListedColormap(('red', 'green')))
11
12 plt.xlim(X1.min(), X1.max())
13 plt.ylim(X2.min(), X2.max())
14
15 for i, j in enumerate(np.unique(y_set)):
16 plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
17 c = ListedColormap(('red', 'green'))(i), label = j)
18
19 plt.title('Random Forest Classification (Training set)')
20 plt.xlabel('Age')
21 plt.ylabel('Salary')
22 plt.legend()
23 plt.show()
'c' argument looks like a single numeric RGB or RGBA sequence, which should
be avoided as value-mapping will have precedence in case its length matches
with 'x' & 'y'. Please use a 2-D array with a single row if you really want
to specify the same RGB or RGBA value for all points.
In [17]:
1 from matplotlib.colors import ListedColormap

2 X_set, y_set = X_test, y_test
3 X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1,
4 stop = X_set[:, 0].max() + 1, step = 0.01),
5 np.arange(start = X_set[:, 1].min() - 1,
6 stop = X_set[:, 1].max() + 1, step = 0.01))
7
8 plt.contourf(X1, X2, model.predict(np.array([X1.ravel(),
9 X2.ravel()]).T).reshape(X1.shape),
10 alpha = 0.75,cmap = ListedColormap(('red', 'green')))
11
12 plt.xlim(X1.min(), X1.max())
13 plt.ylim(X2.min(), X2.max())
14
15 for i, j in enumerate(np.unique(y_set)):
16 plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
17 c = ListedColormap(('red', 'green'))(i), label = j)
18
19 plt.title('Random Forest Classification (Test set)')
20 plt.xlabel('Age')
21 plt.ylabel('Estimated Salary')
22 plt.legend()
23 plt.show()
Program 8:
Implement K-Means clustering to group Mall customers based on
similarity
In [1]:
4 %matplotlib inline
In [2]:
1 df = pd.read_csv("Mall_Customers.csv")
2 df
Out[2]:
CustomerID Genre Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
... ... ... ... ... ...
195 196 Female 35 120 79
196 197 Female 45 126 28
197 198 Male 32 126 74
198 199 Male 32 137 18
199 200 Male 30 137 83
200 rows × 5 columns
In [3]:
1 df = df.drop('CustomerID', axis=1)
In [4]:
1 from sklearn.preprocessing import LabelEncoder

2 encode = LabelEncoder()
3 encoded_sex = encode.fit_transform(df.iloc[:, 0])
4 print(encoded_sex)
[1 1 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 1 1 0 1 1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0
0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 1 1 0 0 1 1 0 0 1 0 1 0 0 0
1 1 0 1 0 0 1 1 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 1 1 1 0 0 1 1 1 1
0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1 1 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 1 1 0
0 1 1 1 0 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0
1 0 1 0 0 0 0 1 0 0 0 0 1 1 1]
In [5]:
1 df['Genre'] = encoded_sex
2 df.head()
Out[5]:
Genre Age Annual Income (k$) Spending Score (1-100)
0 1 19 15 39
1 1 21 15 81
2 0 20 16 6
3 0 23 16 77
4 0 31 17 40
In [6]:
1 from sklearn.decomposition import PCA
In [7]:
1 pca_reducer = PCA(n_components = 2)
2 reduced_data = pca_reducer.fit_transform(df)
In [8]:
1 from sklearn.cluster import KMeans

2 km = KMeans(n_clusters = 5)
3 cluster = km.fit(reduced_data)
In [9]:
1 plt.scatter(reduced_data[:, 0], reduced_data[:, 1],

2 label = 'Datapoints')
3 plt.scatter(cluster.cluster_centers_[:, 0],
4 cluster.cluster_centers_[:, 1], label = 'Clusters')
5 plt.title("Sklearn version of KMeans")
6 plt.legend()
7 plt.show()
Program 9:
Implement Naïve Bayes Classification models to predict the class
of flowers in Iris Dataset.
In [1]:
3 import matplotlib.image as mpimg
In [2]:
1 df = pd.read_csv('iris.csv')
2 df.head()
Out[2]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
In [3]:
1 X = df.iloc[:,:4].values
2 y = df['species'].values
In [4]:

2 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_stat
In [5]:
1 from sklearn.preprocessing import StandardScaler

2 sc = StandardScaler()
3 X_train = sc.fit_transform(X_train)
4 X_test = sc.transform(X_test)
In [6]:
1 from sklearn.naive_bayes import GaussianNB

2 nvclassifier = GaussianNB()
3 nvclassifier.fit(X_train, y_train)
Out[6]:
GaussianNB(priors=None, var_smoothing=1e-09)
In [7]:
1 y_pred = nvclassifier.predict(X_test)
2 print(y_pred)
['Iris-virginica' 'Iris-virginica' 'Iris-setosa' 'Iris-setosa'
'Iris-setosa' 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor'
'Iris-versicolor' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'
'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-virginica'
'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa'
'Iris-virginica' 'Iris-setosa' 'Iris-virginica' 'Iris-virginica'
'Iris-versicolor' 'Iris-virginica' 'Iris-setosa' 'Iris-virginica'
'Iris-versicolor']
In [8]:
1 y_compare = np.vstack((y_test,y_pred)).T
2 y_compare[:5,:]
Out[8]:
array([['Iris-virginica', 'Iris-virginica'],
['Iris-virginica', 'Iris-virginica'],
['Iris-setosa', 'Iris-setosa'],
['Iris-setosa', 'Iris-setosa'],
['Iris-setosa', 'Iris-setosa']], dtype=object)
In [9]:
1 from sklearn.metrics import confusion_matrix

2 cm = confusion_matrix(y_test, y_pred)
3 print(cm)
[[11 0 0]
[ 0 8 1]
[ 0 1 9]]
In [10]:
1 a = cm.shape
2 corrPred = 0
3 falsePred = 0
4
5 for row in range(a[0]):
6 for c in range(a[1]):
7 if row == c:
8 corrPred += cm[row,c]
9 else:
10 falsePred += cm[row,c]
11 print('Correct predictions: ', corrPred)
12 print('False predictions', falsePred)
13 print ('\n\nAccuracy of the Naive Bayes Clasification is: ', corrPred/(cm.sum()))
Correct predictions: 28
False predictions 2
Accuracy of the Naive Bayes Clasification is: 0.9333333333333333
In [11]:
1 from sklearn.metrics import accuracy_score

2 print ("Accuracy : ", accuracy_score(y_test, y_pred))
Accuracy : 0.9333333333333333
Program 10:
Build XG-Boost Model to predict the presence of diabetes in
patients.
In [1]:
4 from numpy import loadtxt
5 from xgboost import XGBClassifier
7 from sklearn.metrics import accuracy_score
In [2]:
1 df = pd.read_csv('diabetes.csv')
2 df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Pregnancies 768 non-null int64
1 Glucose 768 non-null int64
2 BloodPressure 768 non-null int64
3 SkinThickness 768 non-null int64
4 Insulin 768 non-null int64
5 BMI 768 non-null float64
6 DiabetesPedigreeFunction 768 non-null float64
7 Age 768 non-null int64
8 Outcome 768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [3]:
1 X = df.iloc[:,0:8]
2 Y = df.iloc[:,8]
3 X_train,X_test,Y_train,Y_test = train_test_split(X,
4 Y, test_size = 0.33, random_state = 42)
In [4]:
1 model = XGBClassifier()
2 eval_set = [(X_test,Y_test)]
3 model.fit(X_train, Y_train, early_stopping_rounds = 10,
4 eval_metric = 'logloss', eval_set = eval_set,verbose = True)
[0] validation_0-logloss:0.60697
C:\Users\Binay Senapati\anaconda3\lib\site-packages\xgboost\sklearn.py:1224:
UserWarning: The use of label encoder in XGBClassifier is deprecated and wil
l be removed in a future release. To remove this warning, do the following:
1) Pass option use_label_encoder=False when constructing XGBClassifier objec
t; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2,
..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
Out[4]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,

colsample_bynode=1, colsample_bytree=1, enable_categorical=Fal
se,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=na
n,
monotone_constraints='()', n_estimators=100, n_jobs=8,

num_parallel_tree=1, objective='binary:logistic',
predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
use_label_encoder=True, validate_parameters=1, verbosity=None)
In [5]:
1 pred = model.predict(X_test)
In [6]:
1 accur = accuracy_score(Y_test, pred)

2 print("Accuracy: %.2f%%" %(accur*100.0))
Accuracy: 73.23%
In [7]:
1 from xgboost import plot_importance

2 from matplotlib import pyplot
In [8]:
1 plot_importance(model)
2 pyplot.show()

R19C076 - Chanukya Gowda K - Mlda - Assignment-2

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

R19C076 - Chanukya Gowda K - Mlda - Assignment-2

Uploaded by

Copyright:

Available Formats

CHANUKYA GOWDA K R19CS076 VI sem 'B' sec

Age Experience Rank Nationality Go

Age Experience Rank Nationality Go

Index(['Age', 'Experience', 'Rank', 'Nationality', 'Go'], dtype='object')

precision recall f1-score support

0 1.00 1.00 1.00 2

1 1.00 1.00 1.00 2

macro avg 1.00 1.00 1.00 4

weighted avg 1.00 1.00 1.00 4

0 means don't GO, 1 means GO

/usr/local/lib/python3.7/dist-packages/sklearn/base.py:451: UserWarning: X does not h

Position Level Salary

0 Business Analyst 1 45000

1 Junior Consultant 2 50000

2 Senior Consultant 3 60000

4 Country Manager 5 110000

5 Region Manager 6 150000

7 Senior Partner 8 300000

Random Forest Regression

1 from sklearn.ensemble import RandomForestRegressor

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',

max_depth=None, max_features='auto', max_leaf_nodes=No

n_estimators=10, n_jobs=None, oob_score=False,

1 plt.scatter(X, Y, color = "blue")

1 X_hr = np.arange(min(X), max(X), 0.1)

Random Forest Classification

User ID Gender Age EstimatedSalary Purchased

0 15624510 Male 19 19000 0

1 15810944 Male 35 20000 0

2 15668575 Female 26 43000 0

3 15603246 Female 27 57000 0

4 15804002 Male 19 76000 0

... ... ... ... ... ...

395 15691863 Female 46 41000 1

396 15706071 Male 51 23000 1

397 15654296 Female 50 20000 1

398 15755018 Male 36 33000 0

399 15594041 Female 49 36000 1

400 rows × 5 columns

1 X = df.iloc[:, [2, 3]].values

1 from sklearn.model_selection import train_test_split

1 from sklearn.preprocessing import StandardScaler

1 from sklearn.ensemble import RandomForestClassifier

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,

criterion='entropy', max_depth=None, max_features='au

n_jobs=None, oob_score=False, random_state=0, verbose

1 from sklearn.metrics import confusion_matrix

1 from matplotlib.colors import ListedColormap

1 from matplotlib.colors import ListedColormap

CustomerID Genre Age Annual Income (k$) Spending Score (1-100)

... ... ... ... ... ...

195 196 Female 35 120 79

196 197 Female 45 126 28

197 198 Male 32 126 74

198 199 Male 32 137 18

199 200 Male 30 137 83

200 rows × 5 columns

1 from sklearn.preprocessing import LabelEncoder

Genre Age Annual Income (k$) Spending Score (1-100)

1 from sklearn.decomposition import PCA

1 from sklearn.cluster import KMeans

1 plt.scatter(reduced_data[:, 0], reduced_data[:, 1],

sepal_length sepal_width petal_length petal_width species