
Practice_test

April 7, 2024

[87]: import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

[88]: df_train=pd.read_csv('/content/drive/MyDrive/bank.csv')
df_train.head()

[88]: age job marital education default balance housing loan contact \
0 59 admin. married secondary no 2343 yes no unknown
1 56 admin. married secondary no 45 no no unknown
2 41 technician married secondary no 1270 yes no unknown
3 55 services married secondary no 2476 yes no unknown
4 54 admin. married tertiary no 184 no no unknown

day month duration campaign pdays previous poutcome deposit
0 5 may 1042 1 -1 0 unknown yes
1 5 may 1467 1 -1 0 unknown yes
2 5 may 1389 1 -1 0 unknown yes
3 5 may 579 1 -1 0 unknown yes
4 5 may 673 2 -1 0 unknown yes

[89]: # 'duration' is only known after a call has ended, so it would leak the outcome
df_train=df_train.drop('duration',axis=1)

[90]: df_train['deposit'].value_counts()

[90]: deposit
no 5873
yes 5289
Name: count, dtype: int64

[91]: df_train.isnull().sum()

[91]: age 0
job 0
marital 0
education 0
default 0
balance 0
housing 0
loan 0
contact 0
day 0
month 0
campaign 0
pdays 0
previous 0
poutcome 0
deposit 0
dtype: int64

[92]: df_train.dtypes

[92]: age int64
job object
marital object
education object
default object
balance int64
housing object
loan object
contact object
day int64
month object
campaign int64
pdays int64
previous int64
poutcome object
deposit object
dtype: object

[93]: pip install scikit-learn

Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.2.2)
Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.25.2)
Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.11.4)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (3.4.0)

[94]: from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()

df_train_ready = df_train.copy()

num_cols = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']

df_train_ready[num_cols] = std_scaler.fit_transform(df_train_ready[num_cols])

df_train_ready.head()

[94]: age job marital education default balance housing loan \
0 1.491505 admin. married secondary no 0.252525 yes no
1 1.239676 admin. married secondary no -0.459974 no no
2 -0.019470 technician married secondary no -0.080160 yes no
3 1.155733 services married secondary no 0.293762 yes no
4 1.071790 admin. married tertiary no -0.416876 no no

contact day month campaign pdays previous poutcome deposit
0 unknown -1.265746 may -0.554168 -0.481184 -0.36326 unknown yes
1 unknown -1.265746 may -0.554168 -0.481184 -0.36326 unknown yes
2 unknown -1.265746 may -0.554168 -0.481184 -0.36326 unknown yes
3 unknown -1.265746 may -0.554168 -0.481184 -0.36326 unknown yes
4 unknown -1.265746 may -0.186785 -0.481184 -0.36326 unknown yes
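
Note that the scaler above is fit on the full dataset before the train/test split that happens later, so the test rows influence the scaling statistics. A minimal sketch of a leak-free variant, assuming the same split parameters used later in the notebook:

[ ]: from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

num_cols = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
train_df, test_df = train_test_split(df_train, test_size=0.2, random_state=1)
train_df, test_df = train_df.copy(), test_df.copy()

scaler = StandardScaler().fit(train_df[num_cols])  # statistics come from the training rows only
train_df[num_cols] = scaler.transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])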

[95]: df_train_ready.dtypes

[95]: age float64
job object
marital object
education object
default object
balance float64
housing object
loan object
contact object
day float64
month object
campaign float64
pdays float64
previous float64
poutcome object
deposit object
dtype: object

[96]: from sklearn.preprocessing import OneHotEncoder

encoder=OneHotEncoder(sparse_output=False)

new_cols=['job','marital','education','default','housing','loan','contact','month','poutcome']

df_encoded=pd.DataFrame(encoder.fit_transform(df_train_ready[new_cols]))
df_encoded.columns=encoder.get_feature_names_out(new_cols)

df_train_ready=df_train_ready.drop(new_cols,axis=1)
df_train_ready=pd.concat([df_encoded,df_train_ready],axis=1)

df_train_ready['deposit']=df_train_ready['deposit'].apply(lambda x: 1 if x=='yes' else 0)

df_train_ready.head()


[96]: job_admin. job_blue-collar job_entrepreneur job_housemaid \
0 1.0 0.0 0.0 0.0
1 1.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 1.0 0.0 0.0 0.0

job_management job_retired job_self-employed job_services job_student \
0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 1.0 0.0
4 0.0 0.0 0.0 0.0 0.0

job_technician … poutcome_other poutcome_success poutcome_unknown \
0 0.0 … 0.0 0.0 1.0
1 0.0 … 0.0 0.0 1.0
2 1.0 … 0.0 0.0 1.0
3 0.0 … 0.0 0.0 1.0
4 0.0 … 0.0 0.0 1.0

age balance day campaign pdays previous deposit
0 1.491505 0.252525 -1.265746 -0.554168 -0.481184 -0.36326 1
1 1.239676 -0.459974 -1.265746 -0.554168 -0.481184 -0.36326 1
2 -0.019470 -0.080160 -1.265746 -0.554168 -0.481184 -0.36326 1
3 1.155733 0.293762 -1.265746 -0.554168 -0.481184 -0.36326 1
4 1.071790 -0.416876 -1.265746 -0.186785 -0.481184 -0.36326 1

[5 rows x 51 columns]

[97]: from sklearn.model_selection import train_test_split

features=df_train_ready.drop('deposit',axis=1)

target=df_train_ready['deposit']

X_train,X_test,y_train,y_test=train_test_split(features,target,shuffle=True,test_size=0.2,random_state=1)

[98]: from sklearn import metrics

def evaluate(model, X_test, y_test):
    # predicted labels for the threshold-based metrics
    y_pred = model.predict(X_test)

    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)

    # predicted probabilities of the positive class for ROC/AUC
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob)
    auc = metrics.roc_auc_score(y_test, y_pred_prob)

    cm = metrics.confusion_matrix(y_test, y_pred)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'auc': auc, 'cm': cm}
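
`fpr` and `tpr` are computed inside `evaluate` but never used. They are exactly what an ROC plot needs, and matplotlib is already imported in the first cell. A minimal sketch of a hypothetical helper (not part of the original notebook) that recomputes and plots the curve for a fitted model:

[ ]: def plot_roc(model, X_test, y_test, label):
    # recompute the ROC curve from the positive-class probabilities
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob)
    plt.plot(fpr, tpr, label=label)
    plt.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance line
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend()
    plt.show()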

[99]: from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

dt = tree.DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
dt_score = evaluate(dt, X_test, y_test)

rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)
rfc_score = evaluate(rfc, X_test, y_test)

nb = GaussianNB()
nb.fit(X_train, y_train)
nb_score = evaluate(nb, X_test, y_test)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_score = evaluate(knn, X_test, y_test)

print("Decision Tree Score:", dt_score)
print("Random Forest Score:", rfc_score)
print("Naive Bayes Score:", nb_score)
print("KNN Score:", knn_score)

Decision Tree Score: {'acc': 0.6336766681594268, 'prec': 0.6215953307392996,
'rec': 0.598314606741573, 'f1': 0.6097328244274809, 'auc': 0.6322045136712157,
'cm': array([[776, 389],
[429, 639]])}
Random Forest Score: {'acc': 0.7205553067622034, 'prec': 0.7488789237668162,
'rec': 0.6254681647940075, 'f1': 0.6816326530612244, 'auc': 0.781940492838887,
'cm': array([[941, 224],
[400, 668]])}
Naive Bayes Score: {'acc': 0.6815942678011644, 'prec': 0.7560975609756098,
'rec': 0.4934456928838951, 'f1': 0.5971671388101983, 'auc': 0.7421999324878237,
'cm': array([[995, 170],
[541, 527]])}
KNN Score: {'acc': 0.6869682042095835, 'prec': 0.6981740064446831, 'rec':
0.6086142322097379, 'f1': 0.6503251625812906, 'auc': 0.7323909758724342, 'cm':
array([[884, 281],
[418, 650]])}

[100]: from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

param_grid_rtc = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 50, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

param_grid_dt = {
    'max_depth': [None, 10, 50, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rtc_grids = RandomForestClassifier(random_state=0)
dt_grids = tree.DecisionTreeClassifier(random_state=0)
knn_grids = KNeighborsClassifier()

grid_search_dt = GridSearchCV(estimator=dt_grids, param_grid=param_grid_dt, scoring='recall', cv=5, n_jobs=-1)
grid_search_rtc = GridSearchCV(estimator=rtc_grids, param_grid=param_grid_rtc, scoring='recall', cv=5, n_jobs=-1)
grid_search_knn = GridSearchCV(estimator=knn_grids, param_grid=param_grid_knn, scoring='recall', cv=5, n_jobs=-1)

grid_search_dt.fit(X_train, y_train)
grid_search_dt.best_params_

[100]: {'max_depth': None,
'max_features': 'log2',
'min_samples_leaf': 1,
'min_samples_split': 2}

[69]: grid_search_rtc.fit(X_train,y_train)
grid_search_rtc.best_params_

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-69-dff0ec5eadd1> in <cell line: 1>()
----> 1 grid_search_rtc.fit(X_train,y_train)
      2 grid_search_rtc.best_params_

/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    872                     return results
    873
--> 874             self._run_search(evaluate_candidates)
    875
    876             # multimetric is determined here because in the case of a callable

/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
   1386     def _run_search(self, evaluate_candidates):
   1387         """Search all candidates in param_grid"""
-> 1388         evaluate_candidates(ParameterGrid(self.param_grid))
   1389
   1390

/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
    819                 )
    820
--> 821                 out = parallel(
    822                     delayed(_fit_and_score)(
    823                         clone(base_estimator),

/usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py in __call__(self, iterable)
     61             for delayed_func, args, kwargs in iterable
     62         )
---> 63         return super().__call__(iterable_with_config)
     64
     65

/usr/local/lib/python3.10/dist-packages/joblib/parallel.py in __call__(self, iterable)
   1950             next(output)
   1951
-> 1952         return output if self.return_generator else list(output)
   1953
   1954     def __repr__(self):

/usr/local/lib/python3.10/dist-packages/joblib/parallel.py in _get_outputs(self, iterator, pre_dispatch)
   1593
   1594         with self._backend.retrieval_context():
-> 1595             yield from self._retrieve()
   1596
   1597     except GeneratorExit:

/usr/local/lib/python3.10/dist-packages/joblib/parallel.py in _retrieve(self)
   1705                        (self._jobs[0].get_status(
   1706                            timeout=self.timeout) == TASK_PENDING)):
-> 1707                     time.sleep(0.01)
   1708                 continue
   1709

KeyboardInterrupt:
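
The full random-forest grid (3 × 4 × 3 × 3 × 2 = 216 candidates, each cross-validated 5 times) was too slow to finish, hence the interrupt above. One possible workaround, sketched here rather than taken from the original notebook, is to sample the same grid with RandomizedSearchCV, which caps the number of fits:

[ ]: from sklearn.model_selection import RandomizedSearchCV

# sample 20 of the 216 parameter combinations instead of trying them all
rand_search_rtc = RandomizedSearchCV(estimator=rtc_grids, param_distributions=param_grid_rtc,
                                     n_iter=20, scoring='recall', cv=5, n_jobs=-1, random_state=0)
rand_search_rtc.fit(X_train, y_train)
print(rand_search_rtc.best_params_)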

[101]: grid_search_knn.fit(X_train,y_train)
grid_search_knn.best_params_

[101]: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}

[102]: best_grid_dt = grid_search_dt.best_estimator_

eval_dt = evaluate(best_grid_dt, X_test, y_test)

print(eval_dt)

{'acc': 0.6309896999552171, 'prec': 0.6168582375478927, 'rec':
0.602996254681648, 'f1': 0.6098484848484849, 'auc': 0.6298243075983347, 'cm':
array([[765, 400],
[424, 644]])}

[103]: df_train['deposit_pred']=best_grid_dt.predict(features)
# predict() returns the 0/1 encoding used in training, where 1 means 'yes'
df_train['deposit_pred']=df_train['deposit_pred'].apply(lambda x: 'yes' if x == 1 else 'no')

df_train.head(10)

[103]: age job marital education default balance housing loan \
0 59 admin. married secondary no 2343 yes no
1 56 admin. married secondary no 45 no no
2 41 technician married secondary no 1270 yes no
3 55 services married secondary no 2476 yes no
4 54 admin. married tertiary no 184 no no
5 42 management single tertiary no 0 yes yes
6 56 management married tertiary no 830 yes yes
7 60 retired divorced secondary no 545 yes no
8 37 technician married secondary no 1 yes no
9 28 services single secondary no 5090 yes no

contact day month campaign pdays previous poutcome deposit deposit_pred
0 unknown 5 may 1 -1 0 unknown yes yes
1 unknown 5 may 1 -1 0 unknown yes yes
2 unknown 5 may 1 -1 0 unknown yes yes
3 unknown 5 may 1 -1 0 unknown yes yes
4 unknown 5 may 2 -1 0 unknown yes no
5 unknown 5 may 2 -1 0 unknown yes yes
6 unknown 6 may 1 -1 0 unknown yes no
7 unknown 6 may 1 -1 0 unknown yes yes
8 unknown 6 may 1 -1 0 unknown yes yes
9 unknown 6 may 3 -1 0 unknown yes yes

[104]: from sklearn import svm
from sklearn import metrics

svmm = svm.SVC(kernel='linear')
model_svm = svmm.fit(X_train, y_train)
model_svm_pred = model_svm.predict(X_test)
svm_acc = metrics.accuracy_score(y_test, model_svm_pred)
print(svm_acc)

0.6703985669502911
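
Only accuracy is reported for the SVM. To get the same metrics as the other models, the shared `evaluate` helper can be reused; a minimal sketch, noting that `probability=True` is required because `evaluate` calls `predict_proba`, and that it makes training noticeably slower:

[ ]: svm_prob = svm.SVC(kernel='linear', probability=True)  # enables predict_proba
svm_prob.fit(X_train, y_train)
svm_score = evaluate(svm_prob, X_test, y_test)
print("SVM Score:", svm_score)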

[105]: df_trains=pd.read_csv('/content/drive/MyDrive/bank.csv')
df_trains.dtypes

[105]: age int64
job object
marital object
education object
default object
balance int64
housing object
loan object
contact object
day int64
month object
duration int64
campaign int64
pdays int64
previous int64
poutcome object
deposit object
dtype: object

[106]: from sklearn.feature_selection import VarianceThreshold

new_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

selector = VarianceThreshold(threshold=0)
selector.fit(df_trains[new_cols])
selected_indices = selector.get_support(indices=True)

# Get the names of selected features
selected_features = [new_cols[i] for i in selected_indices]
print(selected_features)

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
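
A threshold of 0 only removes constant columns, so all seven features survive. To pick a more meaningful cutoff, the variances computed by `fit` (stored in `variances_`) can be inspected directly; a minimal sketch:

[ ]: # per-feature variances, smallest first, to guide a threshold choice
variances = pd.Series(selector.variances_, index=new_cols).sort_values()
print(variances)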

[ ]: import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.preprocessing import OrdinalEncoder

# Select categorical features
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

# chi2 requires non-negative numeric input, so ordinal-encode the categorical columns first
df_categorical = pd.DataFrame(OrdinalEncoder().fit_transform(df_train[categorical_cols]),
                              columns=categorical_cols)

# Perform the chi-square test of each encoded column against the target
chi2_values, p_values = chi2(df_categorical, df_train['deposit'])

# Create a DataFrame to store the chi-square test results
chi2_results = pd.DataFrame({'Feature': categorical_cols, 'Chi2': chi2_values, 'P-value': p_values})

# Keep features with a chi-square score above 5
chi2_results_filtered = chi2_results[chi2_results['Chi2'] > 5]

# Sort the features by chi-square score
chi2_results_filtered = chi2_results_filtered.sort_values(by='Chi2', ascending=False)

# Print the selected features
print("Selected features:")
print(chi2_results_filtered)

[116]: from sklearn.feature_selection import chi2

# Define the categorical columns
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

# Convert categorical columns to one-hot encoding
df_encoded1 = pd.DataFrame(encoder.fit_transform(df_train[categorical_cols]))
df_encoded1.columns = encoder.get_feature_names_out(categorical_cols)

# Perform the chi-square test for each one-hot column
chi2_values, p_values = chi2(df_encoded1, df_trains['deposit'])

# Combine chi2_values and p_values into a DataFrame for better visualization
chi2_results = pd.DataFrame({'Feature': df_encoded1.columns, 'Chi2': chi2_values, 'P-value': p_values})

chi2_results_filtered = chi2_results[chi2_results['Chi2'] > 50]

# Sort the features by chi-square score
chi2_results_filtered = chi2_results_filtered.sort_values(by='Chi2', ascending=False)

# Print the selected features
print("Selected features:")
print(chi2_results_filtered)
print(chi2_results.shape)
print(chi2_results_filtered.shape)

Selected features:
Feature Chi2 P-value
42 poutcome_success 829.110985 2.528999e-182
27 contact_unknown 578.380144 8.441765e-128
22 housing_yes 244.474430 4.160223e-55
36 month_may 242.407391 1.174350e-54
21 housing_no 219.532302 1.143934e-49
35 month_mar 199.685434 2.446118e-45
38 month_oct 192.761120 7.937076e-44
39 month_sep 174.615519 7.263867e-40
25 contact_cellular 155.505749 1.085609e-35
43 poutcome_unknown 150.637570 1.257778e-34
24 loan_yes 118.635404 1.258623e-27
5 job_retired 111.940341 3.682256e-26
8 job_student 107.918053 2.801241e-25
1 job_blue-collar 93.735234 3.606829e-22
28 month_apr 84.743198 3.397460e-20
30 month_dec 83.583952 6.106680e-20
14 marital_single 68.453963 1.298745e-16
17 education_tertiary 66.874790 2.893111e-16
(44, 3)
(18, 3)
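
The same ranking can drive an actual transformer. A minimal sketch, assuming the top 18 columns (the number that cleared the cutoff above) are wanted, using SelectKBest with the same chi2 score function:

[ ]: from sklearn.feature_selection import SelectKBest

# keep the 18 highest-scoring one-hot columns
skb = SelectKBest(chi2, k=18)
X_selected = skb.fit_transform(df_encoded1, df_trains['deposit'])
print(df_encoded1.columns[skb.get_support()].tolist())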
