You are on page 1 of 14

Supervised Learning

Example Log Transformation


Import Libraries
import seaborn as sns
import numpy as np
import pandas as pd

Load Data Set and make a copy


# Load the seaborn "tips" sample data set and work on an independent copy.
# A plain assignment would only create a second name for the same object, so
# the in-place log transformation applied to tips1 later would also overwrite
# the values in `tips`.
tips = sns.load_dataset('tips')
tips1 = tips.copy()
tips1

Create Box plot to check outliers


# Box plot of total_bill for each day
sns.boxplot(x='day', y='total_bill', data=tips1)

Create dist plot


# sns.distplot is deprecated in current seaborn releases; histplot with a KDE
# overlay on a density scale draws the equivalent histogram-plus-curve figure.
sns.histplot(tips1['total_bill'], kde=True, stat='density')

Apply log Transformation to address outliers


# Overwrite total_bill with its base-10 logarithm
bill_log10 = np.log10(tips1['total_bill'])
tips1['total_bill'] = bill_log10

Create box plot and check outlier again


# Box plot of the (now log-scaled) total_bill for each day
sns.boxplot(x='day', y='total_bill', data=tips1)

Create dist plot


# sns.distplot is deprecated in current seaborn releases; histplot with a KDE
# overlay on a density scale draws the equivalent histogram-plus-curve figure.
sns.histplot(tips1['total_bill'], kde=True, stat='density')

Save the result in .xlsx


# Write the transformed frame to an Excel workbook
excel_path = 'C:\\Noble\\Training\\DS Temporary Files\\tips.xlsx'
tips1.to_excel(excel_path)
Simple Linear regression –
Import the Libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

Load the Data Set

# Switch the working directory to the folder that holds the data sets
data_dir = 'C:\\Noble\\Training\\Top Mentor\\Training\\Data Set\\'
os.chdir(data_dir)


os.getcwd()

# Read the salary data from the working directory and echo it
df1 = pd.read_csv('Salary_Data.csv')
print(df1)

create the graph to check the trend


# Line plot of Salary against YearsExperience
years = df1["YearsExperience"]
salary = df1["Salary"]
plt.plot(years, salary)
plt.show()

Split the data into x and y - Independent and Dependent variable


# x: every column except the last, as a NumPy array
x = df1.iloc[:, :-1].to_numpy()
print(x)
# y: the column at position 1, as a NumPy array
y = df1.iloc[:, 1].to_numpy()
print(y)

Split the Data – Train Test split


from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. Fixing random_state makes the split,
# and every number derived from it, repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

Model fitting
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split
reg = LinearRegression()
reg.fit(X=x_train, y=y_train)

Prediction
# Predict the target for the held-out test rows
y_pred = reg.predict(X=x_test)
print(y_pred)

y = mx + c (Coefficient and Intercept values)


m = slope (coefficient), c = intercept
from sklearn.metrics import r2_score
# Fitted coefficient(s) and intercept of the regression
slope, intercept = reg.coef_, reg.intercept_
print('Coefficient', slope)
print('Intercept', intercept)
Accuracy of the model (R² score)
r2_score(y_test,y_pred)

Final Result in Data Frame


# Build one frame with the input, the actual salary and the model's prediction.
# The prediction is computed for every row of x: y_pred only holds the test
# rows, so concatenating it here would place those few predictions next to
# unrelated leading rows and leave the rest empty.
x_final = pd.DataFrame(x, columns=['Experience'])
y_final = pd.DataFrame(y, columns=['Salary'])
y_pred_final = pd.DataFrame(reg.predict(x), columns=['Salary Prediction'])
result = pd.concat([x_final, y_final, y_pred_final], axis=1)
print(result)
# The path must be a single-line string literal.
result.to_excel("C:\\Noble\\Training\\DS Temporary Files\\Simple Regression.xlsx")
Create a Graph with predicted numbers
# Training points with the fitted regression line drawn in red
train_fit = reg.predict(x_train)
plt.scatter(x_train, y_train)
plt.plot(x_train, train_fit, 'red')

predicted graph on test data


# Test points, with the regression line (evaluated on x_train) drawn in red
line_y = reg.predict(x_train)
plt.scatter(x_test, y_test)
plt.plot(x_train, line_y, 'red')

Prediction for new set of data


# Predict for four new single-feature inputs
new_inputs = [[12], [9.6], [8.5], [2.5]]
y_pred = reg.predict(new_inputs)
print(y_pred)

Linear Regression Prediction with Data Frame

Import Libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

Change directory

os.chdir('C:\\Noble\\Training\\Top Mentor\\Training\\Data Set\\')


os.getcwd()
Load Data Set

df1= pd.read_csv('Salary_Data.csv')
print (df1)

Plot Graph
plt.plot(df1["YearsExperience"], df1["Salary"])
plt.show()

X and Y as Data Frame

# x keeps every column except the last, still as a DataFrame
last_col = df1.shape[1] - 1
x = df1.iloc[:, :last_col]
print(x)
# y is the column at position 1, as a Series
y = df1.iloc[:, 1]
print(y)

Train Test Split

from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing. Fixing random_state makes the split,
# and every number derived from it, repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

Linear Regression

from sklearn.linear_model import LinearRegression


reg = LinearRegression()
reg.fit(x_train, y_train)
Prediction
y_pred= reg.predict(x_test)
print (y_pred)

Coefficient and Intercept


print ('Coefficient', reg.coef_)
print ('Intercept', reg.intercept_)

Accuracy
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

Export data to excel

# Predict for every row and reuse x's index, so the prediction column lines up
# with x and y when the three pieces are joined column-wise.
y_pred_final = pd.DataFrame(
    reg.predict(x), columns=['Salary Prediction'], index=x.index)


result = pd.concat([x, y, y_pred_final], axis=1)
print(result)
# The path must be a single-line string literal.
result.to_excel("C:\\Noble\\Training\\DS Temporary Files\\Simple Regression.xlsx")
Multiple Linear regression –
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score

Load Data Set


os.chdir('C:\\Noble\\Training\\Top Mentor\\Training\\Data Set\\')
df1=pd.read_csv('50_Startups.csv')
df1

Split x and y
# Features: every column except the last, as a NumPy array
x = df1.iloc[:, :-1].to_numpy()
print(x)

# Target: the column at position 4, as a NumPy array
y = df1.iloc[:, 4].to_numpy()
print(y)
Label Encoding
from sklearn.preprocessing import LabelEncoder
# Replace the values in column 3 with the encoder's integer codes
Label = LabelEncoder()
state_codes = Label.fit_transform(x[:, 3])
x[:, 3] = state_codes
print(x)

One Hot Encoding


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 3 and pass the remaining columns through unchanged
onehot_step = ('encoder', OneHotEncoder(), [3])
ct = ColumnTransformer(transformers=[onehot_step], remainder='passthrough')
encoded = ct.fit_transform(x)
x = np.array(encoded)
print(x)

Print X as Data Frame


print (pd.DataFrame(x))

Split the data as train , test split


from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed
split = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split

Create the Model


from sklearn.linear_model import LinearRegression
reg= LinearRegression()
reg.fit(x_train,y_train)

Predictions
y_pred= reg.predict(x_test)
print (y_pred)

Print Result
# Predictions and actual test values side by side
pred_frame = pd.DataFrame(y_pred)
actual_frame = pd.DataFrame(y_test)
result = pd.concat([pred_frame, actual_frame], axis=1)
print(result)
Print Y and Prediction in one data frame - Concat
# Label each piece, then join them column-wise into one frame
feature_names = ['CF', 'FR', 'New Y', 'R&D', 'Admin', 'Mark']
x_te = pd.DataFrame(x_test, columns=feature_names)
y_te = pd.DataFrame(y_test, columns=['Actual'])
y_pre = pd.DataFrame(y_pred, columns=['Prediction'])
result = pd.concat([x_te, y_te, y_pre], axis=1)
print(result)

Accuracy
r2_score(y_test, y_pred)
Regression Coefficient
reg.coef_

Regression Intercept
reg.intercept_

Ordinary Least Square Method


import statsmodels.api as sm
# Cast x to float64 before handing it to OLS
x = x.astype('float64')
ols_model = sm.OLS(endog=y, exog=x)
reg_ols = ols_model.fit()
print(reg_ols.summary())

Tune the model by removing columns whose P value is greater than 0.05
Print the Data Frame
pd.DataFrame(x)

Create the OLS model after removing the variable with the highest P
value – remove column 4 (Admin)

import statsmodels.api as sm
# Refit OLS on columns 0, 1, 2, 3 and 5 (column 4 left out)
kept_columns = [0, 1, 2, 3, 5]
x_opt = x[:, kept_columns]
reg_ols = sm.OLS(endog=y, exog=x_opt).fit()
print(reg_ols.summary())

Create the OLS model after removing the variable with the highest P
value – remove the last column (column 5, Mark)

import statsmodels.api as sm
# Refit OLS on columns 0 to 3 only (columns 4 and 5 left out)
kept_columns = [0, 1, 2, 3]
x_opt = x[:, kept_columns]
reg_ols = sm.OLS(endog=y, exog=x_opt).fit()
print(reg_ols.summary())

All the variables with P Value > 0.05 removed; create the model again with
the new data set

Train test Split


from sklearn.model_selection import train_test_split
# 80/20 split of the reduced feature set with a fixed seed
split_opt = train_test_split(x_opt, y, test_size=0.2, random_state=42)
xopt_train, xopt_test, y_train, y_test = split_opt

Create Model

from sklearn.linear_model import LinearRegression


reg= LinearRegression()
reg.fit(xopt_train,y_train)

Prediction
yopt_pred= reg.predict(xopt_test)
print (yopt_pred)

Print Result
result = pd.concat([pd.DataFrame(yopt_pred),pd.DataFrame(y_test)], axis =1)
print (result)

Print Original Data Frame with Predicted Value

# Label each piece, then join features, actual values and the reduced-model
# predictions column-wise into one frame
feature_names = ['CF', 'FR', 'New Y', 'R&D', 'Admin', 'Mark']
yopt_pre = pd.DataFrame(yopt_pred, columns=['Prediction'])
y_te = pd.DataFrame(y_test, columns=['Actual'])
x_te = pd.DataFrame(x_test, columns=feature_names)
result = pd.concat([x_te, y_te, yopt_pre], axis=1)
print(result)

Check Accuracy
r2_score(y_test, yopt_pred)

Prediction for All 50 records


yfull_pred= reg.predict(x_opt)
print (yfull_pred)

Accuracy
r2_score(y, yfull_pred)

Create the Model with only the R&D Spend column


# Keep only column 3; slicing (rather than indexing) keeps the array 2-D
single_column = slice(3, 4)
x_opt = x[:, single_column]
x_opt

Train Test Split


from sklearn.model_selection import train_test_split
xopt_train,xopt_test,y_train,y_test =train_test_split (x_opt,y,test_size =
0.2,random_state= 42)

Print Shape
print (xopt_train.shape)

Create Model with one column


from sklearn.linear_model import LinearRegression
# Separate model fitted on the single-column training split
freg = LinearRegression()
freg.fit(X=xopt_train, y=y_train)
Prediction and Check accuracy
# Predict for every row of x_opt and score against the full target
yone_pred = freg.predict(X=x_opt)
r2_score(y_true=y, y_pred=yone_pred)

Print the result as Graph


import seaborn as sns
# Predicted values on the x axis, actual values on the y axis;
# blue points, red fitted line, no confidence band
point_style = {"color": "b"}
line_style = {"color": "r"}
sns.regplot(x=yone_pred, y=y, scatter_kws=point_style,
            line_kws=line_style, ci=None)

Prediction for New Data Set

Load new Data Set


df_Predict=pd.read_csv('50_Startups_Predictions.csv')
df_Predict

Count Number of Records


df_Predict.count()

Create Array
x_Predict = df_Predict.values
print (x_Predict)

Label Encoding
# Reuse the encoder that was fitted on the training data so every value in
# column 3 gets the same integer code it had during training. Fitting a fresh
# encoder on the new file can assign different codes when its set of values
# differs from the training set.
Label_Predict = Label
x_Predict[:, 3] = Label_Predict.transform(x_Predict[:, 3])
print(x_Predict)

One Hot Encoding


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Apply the transformer already fitted on the training features instead of
# fitting a new one. The one-hot columns then keep the training layout even
# when the new file does not contain every category, so the column positions
# relied on afterwards (x_Predict[:, 3:4]) match what the model was trained on.
x_Predict = np.array(ct.transform(x_Predict))
print(x_Predict)

Print X Values
print (pd.DataFrame(x_Predict))

Generate Predicted Values


# Take column 3 as a 2-D slice and run it through the single-column model
single_column = slice(3, 4)
xone_Predict = x_Predict[:, single_column]
yone_Predict = freg.predict(X=xone_Predict)
print(yone_Predict)

Display the result as Data Frame – with X

# Wrap the predictions and the encoded inputs in labelled frames, then join
# them column-wise
feature_names = ['CF', 'FR', 'New Y', 'R&D', 'Admin', 'Mark']
yone_Predict = pd.DataFrame(yone_Predict, columns=['Prediction'])
x_Predict = pd.DataFrame(x_Predict, columns=feature_names)
result = pd.concat([x_Predict, yone_Predict], axis=1)
print(result)

Display the result with Actual Input Data Set


# Join the prediction column onto the data set exactly as it was loaded
prediction_column = pd.DataFrame(yone_Predict, columns=['Prediction'])
yone_Predict = prediction_column
result = pd.concat([df_Predict, yone_Predict], axis=1)
print(result)

You might also like