[88]: import pandas as pd

df_train=pd.read_csv('/content/drive/MyDrive/bank.csv')
df_train.head()
[88]: age job marital education default balance housing loan contact \
0 59 admin. married secondary no 2343 yes no unknown
1 56 admin. married secondary no 45 no no unknown
2 41 technician married secondary no 1270 yes no unknown
3 55 services married secondary no 2476 yes no unknown
4 54 admin. married tertiary no 184 no no unknown
[89]: # 'duration' is only known after a call has taken place, so it is dropped to avoid target leakage
df_train=df_train.drop('duration',axis=1)
[90]: df_train['deposit'].value_counts()
[90]: deposit
no 5873
yes 5289
Name: count, dtype: int64
[91]: df_train.isnull().sum()
[91]: age 0
job 0
marital 0
education 0
default 0
balance 0
housing 0
loan 0
contact 0
day 0
month 0
campaign 0
pdays 0
previous 0
poutcome 0
deposit 0
dtype: int64
[92]: df_train.dtypes
[94]: from sklearn.preprocessing import StandardScaler
stdscalar=StandardScaler()
df_train_ready=df_train.copy()
new_cols=['age','balance','day','campaign','pdays','previous']
df_train_ready[new_cols]=stdscalar.fit_transform(df_train_ready[new_cols])
df_train_ready.head()
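Note that the scaler is fit on the full dataset before the train/test split further below, so test-set statistics leak into the scaling. A leak-free variant, not what this notebook does, splits first and fits the scaler on the training rows only (a sketch; the names num_cols, train_df, and test_df are illustrative):

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hypothetical leak-free variant: split first, then fit the scaler on training rows only
num_cols=['age','balance','day','campaign','pdays','previous']
train_df,test_df=train_test_split(df_train,test_size=0.2,random_state=1)
train_df,test_df=train_df.copy(),test_df.copy()
scaler=StandardScaler()
train_df[num_cols]=scaler.fit_transform(train_df[num_cols])  # statistics from train only
test_df[num_cols]=scaler.transform(test_df[num_cols])        # reuse the same statistics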
[95]: df_train_ready.dtypes
[96]: from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder(sparse_output=False)
new_cols=['job','marital','education','default','housing','loan','contact','month','poutcome']
df_encoded=pd.DataFrame(encoder.fit_transform(df_train_ready[new_cols]))
df_encoded.columns=encoder.get_feature_names_out(new_cols)
df_train_ready=df_train_ready.drop(new_cols,axis=1)
df_train_ready=pd.concat([df_encoded,df_train_ready],axis=1)
df_train_ready['deposit']=df_train_ready['deposit'].apply(lambda x: 1 if x=='yes' else 0)
df_train_ready.head()
(head() output truncated in the export)
[5 rows x 51 columns]
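A side note on the encoding: two-level columns such as housing and loan come out as a pair of perfectly redundant dummies each (housing_yes and housing_no both appear in the chi-square ranking later). OneHotEncoder's drop option avoids this redundancy (illustrative, not what the notebook does):

# Illustrative: keep a single dummy for two-level columns
encoder=OneHotEncoder(sparse_output=False,drop='if_binary')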
from sklearn.model_selection import train_test_split
from sklearn import metrics

features=df_train_ready.drop('deposit',axis=1)
target=df_train_ready['deposit']
X_train,X_test,y_train,y_test=train_test_split(features,target,shuffle=True,test_size=0.2,random_state=1)

def evaluate(model,X_test,y_test):
    # Score a fitted classifier on the held-out set
    y_pred=model.predict(X_test)
    acc=metrics.accuracy_score(y_test,y_pred)
    prec=metrics.precision_score(y_test,y_pred)
    rec=metrics.recall_score(y_test,y_pred)
    f1=metrics.f1_score(y_test,y_pred)
    y_pred_prob=model.predict_proba(X_test)[:,1]
    fpr,tpr,_=metrics.roc_curve(y_test,y_pred_prob)
    auc=metrics.roc_auc_score(y_test,y_pred_prob)
    cm=metrics.confusion_matrix(y_test,y_pred)
    return {'acc':acc,'prec':prec,'rec':rec,'f1':f1,'auc':auc,'cm':cm}
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

dt=tree.DecisionTreeClassifier(random_state=0)
dt.fit(X_train,y_train)
dt_score=evaluate(dt,X_test,y_test)

rfc=RandomForestClassifier(random_state=0)
rfc.fit(X_train,y_train)
rfc_score=evaluate(rfc,X_test,y_test)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

nb=GaussianNB()
nb.fit(X_train,y_train)
nb_score=evaluate(nb,X_test,y_test)

knn=KNeighborsClassifier()
knn.fit(X_train,y_train)
knn_score=evaluate(knn,X_test,y_test)
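A compact way to compare the four baselines is to stack the dicts returned by evaluate into a single table (a sketch; this summary is not part of the original notebook):

# Illustrative only: collect the metric dicts produced above into one table
import pandas as pd

scores=pd.DataFrame(
    [dt_score,rfc_score,nb_score,knn_score],
    index=['decision_tree','random_forest','gaussian_nb','knn'],
)
print(scores[['acc','prec','rec','f1','auc']])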
param_grid_rtc = {
'n_estimators': [100, 300, 500],
'max_depth': [None, 10, 50, 100],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'max_features': ['sqrt', 'log2']
}
param_grid_knn = {
'n_neighbors': [3, 5, 7, 9],
'weights': ['uniform', 'distance'],
'metric': ['euclidean', 'manhattan']
}
param_grid_dt = {
'max_depth': [None, 10, 50, 100],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'max_features': ['sqrt', 'log2']
}
from sklearn.model_selection import GridSearchCV

rtc_grids = RandomForestClassifier(random_state=0)
dt_grids = tree.DecisionTreeClassifier(random_state=0)
knn_grids = KNeighborsClassifier()

grid_search_dt = GridSearchCV(estimator=dt_grids,param_grid=param_grid_dt,scoring='recall',cv=5,n_jobs=-1)
grid_search_rtc = GridSearchCV(estimator=rtc_grids,param_grid=param_grid_rtc,scoring='recall',cv=5,n_jobs=-1)
grid_search_knn = GridSearchCV(estimator=knn_grids,param_grid=param_grid_knn,scoring='recall',cv=5,n_jobs=-1)
grid_search_dt.fit(X_train,y_train)
grid_search_dt.best_params_
[69]: grid_search_rtc.fit(X_train,y_train)
grid_search_rtc.best_params_
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-69-dff0ec5eadd1> in <cell line: 1>()
----> 1 grid_search_rtc.fit(X_train,y_train)
      2 grid_search_rtc.best_params_
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
--> 821                 out = parallel(
    822                     delayed(_fit_and_score)(
    823                         clone(base_estimator),
/usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py in __call__(self, iterable)
/usr/local/lib/python3.10/dist-packages/joblib/parallel.py in __call__(self, iterable)
-> 1952         return output if self.return_generator else list(output)
/usr/local/lib/python3.10/dist-packages/joblib/parallel.py in _get_outputs(self, iterator, pre_dispatch)
-> 1595                 yield from self._retrieve()
/usr/local/lib/python3.10/dist-packages/joblib/parallel.py in _retrieve(self)
-> 1707                     time.sleep(0.01)
KeyboardInterrupt:
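The random-forest grid has 3 × 4 × 3 × 3 × 2 = 216 candidates, i.e. 1,080 fits at cv=5, which is why the search was interrupted. A cheaper alternative, not in the original notebook, is to sample the grid with RandomizedSearchCV (a sketch reusing param_grid_rtc from above; n_iter=20 is an arbitrary budget):

from sklearn.model_selection import RandomizedSearchCV

# Illustrative: evaluate a random sample of 20 parameter settings instead of all 216
random_search_rtc = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_distributions=param_grid_rtc,
    n_iter=20,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    random_state=0,
)
random_search_rtc.fit(X_train,y_train)
print(random_search_rtc.best_params_)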
[101]: grid_search_knn.fit(X_train,y_train)
grid_search_knn.best_params_

best_grid_dt=grid_search_dt.best_estimator_
eval_dt=evaluate(best_grid_dt,X_test,y_test)
print(eval_dt)
[103]: df_train['deposit_pred']=best_grid_dt.predict(features)
# deposit was encoded as 1 for 'yes', so decode predictions the same way
df_train['deposit_pred']=df_train['deposit_pred'].apply(lambda x: 'yes' if x==1 else 'no')
df_train.head(10)
(head(10) output truncated in the export)
from sklearn import svm

svmm=svm.SVC(kernel='linear')
model_svm=svmm.fit(X_train,y_train)
model_svm_pred=model_svm.predict(X_test)
svm_acc=metrics.accuracy_score(y_test,model_svm_pred)
print(svm_acc)
0.6703985669502911
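SVC does not expose predict_proba unless probability estimates are enabled, which is presumably why the SVM is scored with accuracy alone here. A sketch of how it could be run through evaluate as well (probability=True triggers an internal cross-validation and is noticeably slower; this is not in the original notebook):

# Illustrative: enable Platt scaling so predict_proba (needed by evaluate) exists
svm_prob=svm.SVC(kernel='linear',probability=True,random_state=0)
svm_prob.fit(X_train,y_train)
svm_score=evaluate(svm_prob,X_test,y_test)
print(svm_score)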
[105]: df_trains=pd.read_csv('/content/drive/MyDrive/bank.csv')
df_trains.dtypes

from sklearn.feature_selection import VarianceThreshold

# VarianceThreshold expects numeric input, so apply it to the numeric columns
num_cols=['age','balance','day','campaign','pdays','previous']
selector = VarianceThreshold(threshold=0)
selector.fit(df_trains[num_cols])
selected_indices = selector.get_support(indices=True)

# Get the names of selected features
selected_features = [num_cols[i] for i in selected_indices]
print(selected_features)
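With threshold=0 the selector only drops constant columns, so none of the numeric columns is removed. A small illustration of a stricter cut (the 0.01 threshold is arbitrary, chosen only for the example):

# Illustrative: drop near-constant columns instead of only constant ones
strict=VarianceThreshold(threshold=0.01)
strict.fit(df_trains[num_cols])
print([c for c,keep in zip(num_cols,strict.get_support()) if keep])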
[ ]: import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns of the raw frame for the chi-square test
cat_cols=['job','marital','education','default','housing','loan','contact','month','poutcome']
encoder1=OneHotEncoder(sparse_output=False)
df_encoded1=pd.DataFrame(encoder1.fit_transform(df_trains[cat_cols]),columns=encoder1.get_feature_names_out(cat_cols))

# Perform chi-square test for each categorical column
chi2_values, p_values = chi2(df_encoded1, df_trains['deposit'])
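The ranked table below is presumably built by collecting the scores into a DataFrame and keeping the significant features; a sketch of that step (the names results and selected are illustrative, and the exact p-value cutoff used in the original run is not visible in the export):

# Illustrative reconstruction: rank encoded features by chi-square score, filter by p-value
results=pd.DataFrame({'Feature':df_encoded1.columns,'Chi2':chi2_values,'P-value':p_values})
results=results.sort_values('Chi2',ascending=False)
selected=results[results['P-value']<0.05]
print('Selected features:')
print(selected)
print(results.shape)
print(selected.shape)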
Selected features:
Feature Chi2 P-value
42 poutcome_success 829.110985 2.528999e-182
27 contact_unknown 578.380144 8.441765e-128
22 housing_yes 244.474430 4.160223e-55
36 month_may 242.407391 1.174350e-54
21 housing_no 219.532302 1.143934e-49
35 month_mar 199.685434 2.446118e-45
38 month_oct 192.761120 7.937076e-44
39 month_sep 174.615519 7.263867e-40
25 contact_cellular 155.505749 1.085609e-35
43 poutcome_unknown 150.637570 1.257778e-34
24 loan_yes 118.635404 1.258623e-27
5 job_retired 111.940341 3.682256e-26
8 job_student 107.918053 2.801241e-25
1 job_blue-collar 93.735234 3.606829e-22
28 month_apr 84.743198 3.397460e-20
30 month_dec 83.583952 6.106680e-20
14 marital_single 68.453963 1.298745e-16
17 education_tertiary 66.874790 2.893111e-16
(44, 3)
(18, 3)