
Practice_test

April 7, 2024

[87]: import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

[88]: df_train=pd.read_csv('/content/drive/MyDrive/bank.csv')
df_train.head()

[88]: age job marital education default balance housing loan contact \
0 59 admin. married secondary no 2343 yes no unknown
1 56 admin. married secondary no 45 no no unknown
2 41 technician married secondary no 1270 yes no unknown
3 55 services married secondary no 2476 yes no unknown
4 54 admin. married tertiary no 184 no no unknown

day month duration campaign pdays previous poutcome deposit
0 5 may 1042 1 -1 0 unknown yes
1 5 may 1467 1 -1 0 unknown yes
2 5 may 1389 1 -1 0 unknown yes
3 5 may 579 1 -1 0 unknown yes
4 5 may 673 2 -1 0 unknown yes

[89]: # 'duration' is only known after a call has ended, so it would leak the outcome
df_train=df_train.drop('duration',axis=1)

[90]: df_train['deposit'].value_counts()

[90]: deposit
no 5873
yes 5289
Name: count, dtype: int64

[91]: df_train.isnull().sum()

[91]: age 0
job 0
marital 0
education 0
default 0
balance 0
housing 0
loan 0
contact 0
day 0
month 0
campaign 0
pdays 0
previous 0
poutcome 0
deposit 0
dtype: int64

[92]: df_train.dtypes

[92]: age int64
job object
marital object
education object
default object
balance int64
housing object
loan object
contact object
day int64
month object
campaign int64
pdays int64
previous int64
poutcome object
deposit object
dtype: object

[93]: pip install scikit-learn

Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.2.2)
Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.25.2)
Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.11.4)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (3.4.0)

[94]: from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()

df_train_ready = df_train.copy()

num_cols = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']

df_train_ready[num_cols] = std_scaler.fit_transform(df_train_ready[num_cols])

df_train_ready.head()

[94]: age job marital education default balance housing loan \
0 1.491505 admin. married secondary no 0.252525 yes no
1 1.239676 admin. married secondary no -0.459974 no no
2 -0.019470 technician married secondary no -0.080160 yes no
3 1.155733 services married secondary no 0.293762 yes no
4 1.071790 admin. married tertiary no -0.416876 no no

contact day month campaign pdays previous poutcome deposit
0 unknown -1.265746 may -0.554168 -0.481184 -0.36326 unknown yes
1 unknown -1.265746 may -0.554168 -0.481184 -0.36326 unknown yes
2 unknown -1.265746 may -0.554168 -0.481184 -0.36326 unknown yes
3 unknown -1.265746 may -0.554168 -0.481184 -0.36326 unknown yes
4 unknown -1.265746 may -0.186785 -0.481184 -0.36326 unknown yes
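
Note that the scaler above is fit on the full dataset before the train/test split that happens later, so the test rows influence the scaling statistics. A minimal sketch of a leak-free variant, assuming the same split parameters used later in the notebook:

[ ]: from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

num_cols = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
train_df, test_df = train_test_split(df_train, test_size=0.2, random_state=1)
train_df, test_df = train_df.copy(), test_df.copy()

scaler = StandardScaler().fit(train_df[num_cols])  # statistics come from the training rows only
train_df[num_cols] = scaler.transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])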

[95]: df_train_ready.dtypes

[95]: age float64
job object
marital object
education object
default object
balance float64
housing object
loan object
contact object
day float64
month object
campaign float64
pdays float64
previous float64
poutcome object
deposit object
dtype: object

[96]: from sklearn.preprocessing import OneHotEncoder

encoder=OneHotEncoder(sparse_output=False)

new_cols=['job','marital','education','default','housing','loan','contact','month','poutcome']

df_encoded=pd.DataFrame(encoder.fit_transform(df_train_ready[new_cols]))
df_encoded.columns=encoder.get_feature_names_out(new_cols)

df_train_ready=df_train_ready.drop(new_cols,axis=1)
df_train_ready=pd.concat([df_encoded,df_train_ready],axis=1)

df_train_ready['deposit']=df_train_ready['deposit'].apply(lambda x: 1 if x=='yes' else 0)

df_train_ready.head()


[96]: job_admin. job_blue-collar job_entrepreneur job_housemaid \
0 1.0 0.0 0.0 0.0
1 1.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 1.0 0.0 0.0 0.0

job_management job_retired job_self-employed job_services job_student \
0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 1.0 0.0
4 0.0 0.0 0.0 0.0 0.0

job_technician … poutcome_other poutcome_success poutcome_unknown \
0 0.0 … 0.0 0.0 1.0
1 0.0 … 0.0 0.0 1.0
2 1.0 … 0.0 0.0 1.0
3 0.0 … 0.0 0.0 1.0
4 0.0 … 0.0 0.0 1.0

age balance day campaign pdays previous deposit
0 1.491505 0.252525 -1.265746 -0.554168 -0.481184 -0.36326 1
1 1.239676 -0.459974 -1.265746 -0.554168 -0.481184 -0.36326 1
2 -0.019470 -0.080160 -1.265746 -0.554168 -0.481184 -0.36326 1
3 1.155733 0.293762 -1.265746 -0.554168 -0.481184 -0.36326 1
4 1.071790 -0.416876 -1.265746 -0.186785 -0.481184 -0.36326 1

[5 rows x 51 columns]

[97]: from sklearn.model_selection import train_test_split

features=df_train_ready.drop('deposit',axis=1)

target=df_train_ready['deposit']

X_train,X_test,y_train,y_test=train_test_split(features,target,shuffle=True,test_size=0.2,random_state=1)

[98]: from sklearn import metrics

def evaluate(model, X_test, y_test):
    # predicted labels for the threshold-based metrics
    y_pred = model.predict(X_test)

    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)

    # predicted probabilities of the positive class for ROC/AUC
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob)
    auc = metrics.roc_auc_score(y_test, y_pred_prob)

    cm = metrics.confusion_matrix(y_test, y_pred)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'auc': auc, 'cm': cm}
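
`fpr` and `tpr` are computed inside `evaluate` but never used. They are exactly what an ROC plot needs, and matplotlib is already imported in the first cell. A minimal sketch of a hypothetical helper (not part of the original notebook) that recomputes and plots the curve for a fitted model:

[ ]: def plot_roc(model, X_test, y_test, label):
    # recompute the ROC curve from the positive-class probabilities
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob)
    plt.plot(fpr, tpr, label=label)
    plt.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance line
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend()
    plt.show()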

[99]: from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

dt = tree.DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
dt_score = evaluate(dt, X_test, y_test)

rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)
rfc_score = evaluate(rfc, X_test, y_test)

nb = GaussianNB()
nb.fit(X_train, y_train)
nb_score = evaluate(nb, X_test, y_test)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_score = evaluate(knn, X_test, y_test)

print("Decision Tree Score:", dt_score)
print("Random Forest Score:", rfc_score)
print("Naive Bayes Score:", nb_score)
print("KNN Score:", knn_score)

Decision Tree Score: {'acc': 0.6336766681594268, 'prec': 0.6215953307392996,
'rec': 0.598314606741573, 'f1': 0.6097328244274809, 'auc': 0.6322045136712157,
'cm': array([[776, 389],
[429, 639]])}
Random Forest Score: {'acc': 0.7205553067622034, 'prec': 0.7488789237668162,
'rec': 0.6254681647940075, 'f1': 0.6816326530612244, 'auc': 0.781940492838887,
'cm': array([[941, 224],
[400, 668]])}
Naive Bayes Score: {'acc': 0.6815942678011644, 'prec': 0.7560975609756098,
'rec': 0.4934456928838951, 'f1': 0.5971671388101983, 'auc': 0.7421999324878237,
'cm': array([[995, 170],
[541, 527]])}
KNN Score: {'acc': 0.6869682042095835, 'prec': 0.6981740064446831, 'rec':
0.6086142322097379, 'f1': 0.6503251625812906, 'auc': 0.7323909758724342, 'cm':
array([[884, 281],
[418, 650]])}

[100]: from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

param_grid_rtc = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 50, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

param_grid_dt = {
    'max_depth': [None, 10, 50, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rtc_grids = RandomForestClassifier(random_state=0)
dt_grids = tree.DecisionTreeClassifier(random_state=0)
knn_grids = KNeighborsClassifier()

grid_search_dt = GridSearchCV(estimator=dt_grids, param_grid=param_grid_dt, scoring='recall', cv=5, n_jobs=-1)
grid_search_rtc = GridSearchCV(estimator=rtc_grids, param_grid=param_grid_rtc, scoring='recall', cv=5, n_jobs=-1)
grid_search_knn = GridSearchCV(estimator=knn_grids, param_grid=param_grid_knn, scoring='recall', cv=5, n_jobs=-1)

grid_search_dt.fit(X_train, y_train)
grid_search_dt.best_params_

[100]: {'max_depth': None,
'max_features': 'log2',
'min_samples_leaf': 1,
'min_samples_split': 2}

[69]: grid_search_rtc.fit(X_train,y_train)
grid_search_rtc.best_params_

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-69-dff0ec5eadd1> in <cell line: 1>()
----> 1 grid_search_rtc.fit(X_train,y_train)
      2 grid_search_rtc.best_params_

/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    872                     return results
    873
--> 874             self._run_search(evaluate_candidates)
    875
    876             # multimetric is determined here because in the case of a callable

/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
   1386     def _run_search(self, evaluate_candidates):
   1387         """Search all candidates in param_grid"""
-> 1388         evaluate_candidates(ParameterGrid(self.param_grid))
   1389
   1390

/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
    819                 )
    820
--> 821                 out = parallel(
    822                     delayed(_fit_and_score)(
    823                         clone(base_estimator),

/usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py in __call__(self, iterable)
     61             for delayed_func, args, kwargs in iterable
     62         )
---> 63         return super().__call__(iterable_with_config)
     64
     65

/usr/local/lib/python3.10/dist-packages/joblib/parallel.py in __call__(self, iterable)
   1950             next(output)
   1951
-> 1952         return output if self.return_generator else list(output)
   1953
   1954     def __repr__(self):

/usr/local/lib/python3.10/dist-packages/joblib/parallel.py in _get_outputs(self, iterator, pre_dispatch)
   1593
   1594         with self._backend.retrieval_context():
-> 1595             yield from self._retrieve()
   1596
   1597     except GeneratorExit:

/usr/local/lib/python3.10/dist-packages/joblib/parallel.py in _retrieve(self)
   1705                        (self._jobs[0].get_status(
   1706                            timeout=self.timeout) == TASK_PENDING)):
-> 1707                     time.sleep(0.01)
   1708                 continue
   1709

KeyboardInterrupt:
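
The full random-forest grid (3 × 4 × 3 × 3 × 2 = 216 candidates, each cross-validated 5 times) was too slow to finish, hence the interrupt above. One possible workaround, sketched here rather than taken from the original notebook, is to sample the same grid with RandomizedSearchCV, which caps the number of fits:

[ ]: from sklearn.model_selection import RandomizedSearchCV

# sample 20 of the 216 parameter combinations instead of trying them all
rand_search_rtc = RandomizedSearchCV(estimator=rtc_grids, param_distributions=param_grid_rtc,
                                     n_iter=20, scoring='recall', cv=5, n_jobs=-1, random_state=0)
rand_search_rtc.fit(X_train, y_train)
print(rand_search_rtc.best_params_)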

[101]: grid_search_knn.fit(X_train,y_train)
grid_search_knn.best_params_

[101]: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}

[102]: best_grid_dt = grid_search_dt.best_estimator_

eval_dt = evaluate(best_grid_dt, X_test, y_test)

print(eval_dt)

{'acc': 0.6309896999552171, 'prec': 0.6168582375478927, 'rec':
0.602996254681648, 'f1': 0.6098484848484849, 'auc': 0.6298243075983347, 'cm':
array([[765, 400],
[424, 644]])}

[103]: df_train['deposit_pred']=best_grid_dt.predict(features)
# predict() returns the 0/1 encoding used in training, where 1 means 'yes'
df_train['deposit_pred']=df_train['deposit_pred'].apply(lambda x: 'yes' if x == 1 else 'no')

df_train.head(10)

[103]: age job marital education default balance housing loan \
0 59 admin. married secondary no 2343 yes no
1 56 admin. married secondary no 45 no no
2 41 technician married secondary no 1270 yes no
3 55 services married secondary no 2476 yes no
4 54 admin. married tertiary no 184 no no
5 42 management single tertiary no 0 yes yes
6 56 management married tertiary no 830 yes yes
7 60 retired divorced secondary no 545 yes no
8 37 technician married secondary no 1 yes no
9 28 services single secondary no 5090 yes no

contact day month campaign pdays previous poutcome deposit deposit_pred
0 unknown 5 may 1 -1 0 unknown yes yes
1 unknown 5 may 1 -1 0 unknown yes yes
2 unknown 5 may 1 -1 0 unknown yes yes
3 unknown 5 may 1 -1 0 unknown yes yes
4 unknown 5 may 2 -1 0 unknown yes no
5 unknown 5 may 2 -1 0 unknown yes yes
6 unknown 6 may 1 -1 0 unknown yes no
7 unknown 6 may 1 -1 0 unknown yes yes
8 unknown 6 may 1 -1 0 unknown yes yes
9 unknown 6 may 3 -1 0 unknown yes yes

[104]: from sklearn import svm
from sklearn import metrics

svmm = svm.SVC(kernel='linear')
model_svm = svmm.fit(X_train, y_train)
model_svm_pred = model_svm.predict(X_test)
svm_acc = metrics.accuracy_score(y_test, model_svm_pred)
print(svm_acc)

0.6703985669502911
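
Only accuracy is reported for the SVM. To get the same metrics as the other models, the shared `evaluate` helper can be reused; a minimal sketch, noting that `probability=True` is required because `evaluate` calls `predict_proba`, and that it makes training noticeably slower:

[ ]: svm_prob = svm.SVC(kernel='linear', probability=True)  # enables predict_proba
svm_prob.fit(X_train, y_train)
svm_score = evaluate(svm_prob, X_test, y_test)
print("SVM Score:", svm_score)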

[105]: df_trains=pd.read_csv('/content/drive/MyDrive/bank.csv')
df_trains.dtypes

[105]: age int64
job object
marital object
education object
default object
balance int64
housing object
loan object
contact object
day int64
month object
duration int64
campaign int64
pdays int64
previous int64
poutcome object
deposit object
dtype: object

[106]: from sklearn.feature_selection import VarianceThreshold

new_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

selector = VarianceThreshold(threshold=0)
selector.fit(df_trains[new_cols])
selected_indices = selector.get_support(indices=True)

# Get the names of selected features
selected_features = [new_cols[i] for i in selected_indices]
print(selected_features)

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
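
A threshold of 0 only removes constant columns, so all seven features survive. To pick a more meaningful cutoff, the variances computed by `fit` (stored in `variances_`) can be inspected directly; a minimal sketch:

[ ]: # per-feature variances, smallest first, to guide a threshold choice
variances = pd.Series(selector.variances_, index=new_cols).sort_values()
print(variances)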

[ ]: import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.preprocessing import OrdinalEncoder

# Select categorical features
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

# chi2 requires non-negative numeric input, so ordinal-encode the categorical columns first
df_categorical = pd.DataFrame(OrdinalEncoder().fit_transform(df_train[categorical_cols]),
                              columns=categorical_cols)

# Perform the chi-square test of each encoded column against the target
chi2_values, p_values = chi2(df_categorical, df_train['deposit'])

# Create a DataFrame to store the chi-square test results
chi2_results = pd.DataFrame({'Feature': categorical_cols, 'Chi2': chi2_values, 'P-value': p_values})

# Keep features with a chi-square score above 5
chi2_results_filtered = chi2_results[chi2_results['Chi2'] > 5]

# Sort the features by chi-square score
chi2_results_filtered = chi2_results_filtered.sort_values(by='Chi2', ascending=False)

# Print the selected features
print("Selected features:")
print(chi2_results_filtered)

[116]: from sklearn.feature_selection import chi2

# Define the categorical columns
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

# Convert categorical columns to one-hot encoding
df_encoded1 = pd.DataFrame(encoder.fit_transform(df_train[categorical_cols]))
df_encoded1.columns = encoder.get_feature_names_out(categorical_cols)

# Perform the chi-square test for each one-hot column
chi2_values, p_values = chi2(df_encoded1, df_trains['deposit'])

# Combine chi2_values and p_values into a DataFrame for better visualization
chi2_results = pd.DataFrame({'Feature': df_encoded1.columns, 'Chi2': chi2_values, 'P-value': p_values})

chi2_results_filtered = chi2_results[chi2_results['Chi2'] > 50]

# Sort the features by chi-square score
chi2_results_filtered = chi2_results_filtered.sort_values(by='Chi2', ascending=False)

# Print the selected features
print("Selected features:")
print(chi2_results_filtered)
print(chi2_results.shape)
print(chi2_results_filtered.shape)

Selected features:
Feature Chi2 P-value
42 poutcome_success 829.110985 2.528999e-182
27 contact_unknown 578.380144 8.441765e-128
22 housing_yes 244.474430 4.160223e-55
36 month_may 242.407391 1.174350e-54
21 housing_no 219.532302 1.143934e-49
35 month_mar 199.685434 2.446118e-45
38 month_oct 192.761120 7.937076e-44
39 month_sep 174.615519 7.263867e-40
25 contact_cellular 155.505749 1.085609e-35
43 poutcome_unknown 150.637570 1.257778e-34
24 loan_yes 118.635404 1.258623e-27
5 job_retired 111.940341 3.682256e-26
8 job_student 107.918053 2.801241e-25
1 job_blue-collar 93.735234 3.606829e-22
28 month_apr 84.743198 3.397460e-20
30 month_dec 83.583952 6.106680e-20
14 marital_single 68.453963 1.298745e-16
17 education_tertiary 66.874790 2.893111e-16
(44, 3)
(18, 3)
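
The same ranking can drive an actual transformer. A minimal sketch, assuming the top 18 columns (the number that cleared the cutoff above) are wanted, using SelectKBest with the same chi2 score function:

[ ]: from sklearn.feature_selection import SelectKBest

# keep the 18 highest-scoring one-hot columns
skb = SelectKBest(chi2, k=18)
X_selected = skb.fit_transform(df_encoded1, df_trains['deposit'])
print(df_encoded1.columns[skb.get_support()].tolist())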
