
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Importing dataset and examining it
dataset = pd.read_csv("/content/sample_data/ML_CA2/LimitPrediction.csv")
pd.set_option('display.max_columns', None)  # to make sure you can see all the columns in the output window
print(dataset.head())
print(dataset.shape)
print(dataset.info())
print(dataset.describe())

# Converting Categorical features into Numerical features
dataset['Gender'] = dataset['Gender'].map({'Female': 1, ' Male': 0})  # ' Male' keeps the leading space as it appears in the raw file
dataset['Student'] = dataset['Student'].map({'No': 1, 'Yes': 0})
dataset['Married'] = dataset['Married'].map({'No': 1, 'Yes': 0})
print(dataset.info())

categorical_features = ['Ethnicity']
final_data = pd.get_dummies(dataset, columns = categorical_features)
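# get_dummies one-hot encodes 'Ethnicity', adding one 0/1 indicator column per
# category (e.g. Ethnicity_Caucasian below); the indicators are redundant as a
# set, which is why one of them is dropped when the feature matrix is built.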
print(final_data.info())
print(final_data.head(2))

# Plotting Correlation Heatmap
corrs = final_data.corr()
figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True)
figure.show()
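# Reading the heatmap: cells near +/-1 indicate strongly related features; for any
# pair that tracks each other almost perfectly, one of the two is a candidate for
# removal, which is presumably what motivates dropping 'Rating' in the next step.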

# Dividing dataset into label and feature sets
# Dropping the label ('Limit'), the near-duplicate 'Rating', and one Ethnicity dummy
# (keeping every dummy would leave a redundant, perfectly collinear column)
X = final_data.drop(['Rating', 'Ethnicity_Caucasian', 'Limit'], axis=1)  # Features
Y = final_data['Limit'] # Labels
print(type(X))
print(type(Y))
print(X.shape)
print(Y.shape)

# Normalizing numerical features so that each feature has mean 0 and variance 1
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)
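# Note on the scaler (general sklearn behaviour): fit_transform learns each
# feature's mean and standard deviation here, so unseen rows could later be
# scaled consistently with feature_scaler.transform().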
# Implementing Linear Regression with L1 Regularization
# (penalty='elasticnet' with l1_ratio=1 is equivalent to a pure L1 penalty)
# Tuning the SGDRegressor parameters 'eta0' (learning rate) and 'max_iter', along with the regularization parameter alpha, using Grid Search
sgdr = SGDRegressor(random_state = 1, penalty = 'elasticnet')
grid_param = {'eta0': [.0001, .001, .01, .1, 1], 'max_iter': [40000, 50000, 80000, 100000],
              'alpha': [.001, .01, .1, 1, 10, 100], 'l1_ratio': [1]}

gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=5)
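# Scope of this search (simple arithmetic on the grid above): 5 x 4 x 6 x 1 = 120
# parameter combinations, each cross-validated on 5 folds, i.e. 600 model fits.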


gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_  # Mean cross-validated score of the best_estimator
print("Best result: ", best_result)

best_model = gd_sr.best_estimator_
print("Intercept: ", best_model.intercept_)

print(pd.DataFrame(zip(X.columns, best_model.coef_),
                   columns=['Features', 'Coefficients']).sort_values(by=['Coefficients'], ascending=False))
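# With a pure L1 penalty, SGD's truncated updates can drive weakly informative
# coefficients to (near) zero, so this ranking doubles as a rough feature-selection
# readout.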
# Implementing Linear Regression with L2 Regularization
# (penalty='elasticnet' with l1_ratio=0 is equivalent to a pure L2 penalty)
# Tuning the SGDRegressor parameters 'eta0' (learning rate) and 'max_iter', along with the regularization parameter alpha, using Grid Search
sgdr = SGDRegressor(random_state = 1, penalty = 'elasticnet')
grid_param = {'eta0': [.0001, .001, .01, .1, 1], 'max_iter': [40000, 50000, 80000, 100000],
              'alpha': [.001, .01, .1, 1, 10, 100], 'l1_ratio': [0]}

gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_  # Mean cross-validated score of the best_estimator
print("Best result: ", best_result)

best_model = gd_sr.best_estimator_
print("Intercept: ", best_model.intercept_)

print(pd.DataFrame(zip(X.columns, best_model.coef_),
                   columns=['Features', 'Coefficients']).sort_values(by=['Coefficients'], ascending=False))
# Implementing Linear Regression with Elastic-Net Regularization
# Tuning the SGDRegressor parameters 'eta0' (learning rate) and 'max_iter', along with the regularization parameters alpha and l1_ratio, using Grid Search
sgdr = SGDRegressor(random_state = 1, penalty = 'elasticnet')
grid_param = {'eta0': [.0001, .001, .01, .1, 1], 'max_iter': [40000, 50000, 80000, 100000],
              'alpha': [.001, .01, .1, 1, 10, 100], 'l1_ratio': [0.25, 0.5, 0.75]}
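# l1_ratio blends the two penalties: values strictly between 0 and 1 mix
# lasso-style sparsity (L1) with ridge-style shrinkage (L2), which is the
# defining idea of elastic net.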

gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_  # Mean cross-validated score of the best_estimator
print("Best result: ", best_result)

best_model = gd_sr.best_estimator_
print("Intercept: ", best_model.intercept_)

print(pd.DataFrame(zip(X.columns, best_model.coef_),
                   columns=['Features', 'Coefficients']).sort_values(by=['Coefficients'], ascending=False))
# Implementing Random Forest Regression
# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
rfr = RandomForestRegressor(criterion='squared_error', max_features='sqrt', random_state=1)
grid_param = {'n_estimators': [375,400,450,475]}
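# n_estimators is the number of trees averaged by the forest; more trees generally
# stabilise predictions at the cost of training time, so the grid probes where the
# cross-validated r2 stops improving.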

gd_sr = GridSearchCV(estimator=rfr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_  # Mean cross-validated score of the best_estimator
print("Best result: ", best_result)

# Building random forest using the tuned parameter found by the grid search above
rfr = RandomForestRegressor(n_estimators=best_parameters['n_estimators'],
                            criterion='squared_error', max_features='sqrt', random_state=1)
rfr.fit(X_scaled,Y)
featimp = pd.Series(rfr.feature_importances_,
                    index=list(X)).sort_values(ascending=False)
print(featimp)
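# Impurity-based importances are normalised to sum to 1.0 across features; the
# reduced feature set defined next is meant to keep the top-ranked columns.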

# Selecting features with higher significance and redefining the feature set
X_ = dataset[['Balance', 'Income', 'Age', 'Customer Id', 'Education', 'Cards']]

feature_scaler = StandardScaler()
X_scaled_ = feature_scaler.fit_transform(X_)
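# The scaler is re-fit here so its mean/variance statistics match the reduced
# six-column feature set rather than the full one.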

# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
rfr = RandomForestRegressor(criterion='squared_error', max_features='sqrt', random_state=1)
grid_param = {'n_estimators': [475,500,550,600]}

gd_sr = GridSearchCV(estimator=rfr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled_, Y)

print('****Using Significant variables***')

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_  # Mean cross-validated score of the best_estimator
print("Best result: ", best_result)
####################################################################################
# Implementing Support Vector Regression
# Tuning the SVR parameters 'kernel', 'C', and 'epsilon', and implementing cross-validation using Grid Search
svr = SVR()
grid_param = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 200, 300, 500, 1000],
              'epsilon': [100, 1000, 10000]}  # parameter name must be lowercase 'epsilon', or GridSearchCV raises an invalid-parameter error
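# C trades off training error against model flatness (larger C fits the data more
# tightly), while epsilon sets the half-width of the tube inside which errors are
# ignored; the large epsilon candidates here presumably reflect the scale of 'Limit'.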

gd_sr = GridSearchCV(estimator=svr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Best parameters: ", best_parameters)

best_result = gd_sr.best_score_  # Mean cross-validated score of the best_estimator
print("Best result: ", best_result)
