
practical-5 - Jupyter Notebook

In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
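
ColumnTransformer is imported here but never used later in the notebook. As a minimal sketch (not part of the original run), it could bundle the scaling and encoding steps that follow into a single transformer; the column lists mirror the numerical/categorical split made below:

from sklearn.preprocessing import OrdinalEncoder

# Hypothetical: scale numeric columns and ordinally encode categorical ones in one pass
preprocess = ColumnTransformer([
    ('num', StandardScaler(), ['carat', 'depth', 'table', 'x', 'y', 'z']),
    ('cat', OrdinalEncoder(), ['cut', 'color', 'clarity']),
])
# X_processed = preprocess.fit_transform(df)  # would return a NumPy array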

In [2]:

df=pd.read_csv('diamonds.csv')

In [3]:

df.head()

Out[3]:

Unnamed: 0 carat cut color clarity depth table price x y z

0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43

1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31

2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31

3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63

4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75

In [4]:

nulls=df.isnull().sum()
print(nulls)

Unnamed: 0 0
carat 0
cut 0
color 0
clarity 0
depth 0
table 0
price 0
x 0
y 0
z 0
dtype: int64
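
Every count is zero, so no imputation is needed. Had any column shown missing values, one conventional follow-up (a sketch, not part of this notebook) would be:

# Hypothetical: fill numeric nulls with the column median, then drop any remaining
df = df.fillna(df.median(numeric_only=True)).dropna()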


In [5]:

to_drop=['Unnamed: 0']
df.drop(columns=to_drop,inplace=True)

In [6]:

df.head()

Out[6]:

carat cut color clarity depth table price x y z

0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43

1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31

2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31

3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63

4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75

In [7]:

df.shape

Out[7]:

(53940, 10)

In [8]:

df.dtypes

Out[8]:

carat float64
cut object
color object
clarity object
depth float64
table float64
price int64
x float64
y float64
z float64
dtype: object


In [9]:

df.describe()

Out[9]:

carat depth table price x y

count 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000

mean 0.797940 61.749405 57.457184 3932.799722 5.731157 5.734526

std 0.474011 1.432621 2.234491 3989.439738 1.121761 1.142135

min 0.200000 43.000000 43.000000 326.000000 0.000000 0.000000

25% 0.400000 61.000000 56.000000 950.000000 4.710000 4.720000

50% 0.700000 61.800000 57.000000 2401.000000 5.700000 5.710000

75% 1.040000 62.500000 59.000000 5324.250000 6.540000 6.540000

max 5.010000 79.000000 95.000000 18823.000000 10.740000 58.900000


In [10]:

cols = ['cut','color','clarity']
df_new = df.drop(columns=cols, inplace=False)
# method 2: df_new = df[['carat','depth', ...all numerical columns]]
# dropping the categorical columns so z-scores can be computed; z-scores are useful for finding outliers
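
As the method-2 comment hints, the numeric columns can also be selected directly rather than listed by hand; an equivalent sketch:

# Equivalent: keep every numeric column without naming each one
df_new = df.select_dtypes(include='number')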

In [11]:

df_new.dtypes

Out[11]:

carat float64
depth float64
table float64
price int64
x float64
y float64
z float64
dtype: object


In [12]:

z_score = (df_new - df_new.mean()) / df_new.std()
z_score  # each feature rescaled to mean 0 and std 1

Out[12]:

carat depth table price x y z

0 -1.198157 -0.174090 -1.099662 -0.904087 -1.587823 -1.536181 -1.571115

1 -1.240350 -1.360726 1.585514 -0.904087 -1.641310 -1.658759 -1.741159

2 -1.198157 -3.384987 3.375631 -0.903836 -1.498677 -1.457382 -1.741159

3 -1.071577 0.454129 0.242926 -0.902081 -1.364959 -1.317293 -1.287708

4 -1.029384 1.082348 0.242926 -0.901831 -1.240155 -1.212227 -1.117663

... ... ... ... ... ... ... ...

53935 -0.164426 -0.662705 -0.204603 -0.294728 0.016798 0.022304 -0.054887

53936 -0.164426 0.942744 -1.099662 -0.294728 -0.036690 0.013548 0.100987

53937 -0.206619 0.733338 1.137985 -0.294728 -0.063434 -0.047740 0.030135

53938 0.130926 -0.523100 0.242926 -0.294728 0.373380 0.337503 0.285201

53939 -0.101136 0.314525 -1.099662 -0.294728 0.088114 0.118615 0.143498

53940 rows × 7 columns

In [13]:

outliers = df_new[(z_score > 3).any(axis=1)]
outliers
# after z-scoring, every feature has mean 0 and std 1; any row with a z-score greater than 3 in some feature is flagged as an outlier

Out[13]:

carat depth table price x y z

2 0.23 56.9 65.0 327 4.05 4.07 2.31

91 0.86 55.1 69.0 2757 6.45 6.33 3.52

97 0.96 66.3 62.0 2759 6.27 5.95 4.07

204 0.98 67.9 60.0 2777 6.05 5.97 4.08

227 0.84 55.1 67.0 2782 6.39 6.20 3.47

... ... ... ... ... ... ... ...

53697 0.70 64.5 65.0 2717 5.52 5.45 3.54

53727 0.78 66.9 57.0 2721 5.70 5.66 3.60

53785 0.89 64.3 65.0 2728 6.00 5.95 3.84

53800 0.90 68.7 62.0 2732 5.83 5.79 3.99

53863 1.00 66.8 56.0 2743 6.22 6.12 4.13

2077 rows × 7 columns
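
Note that the filter z_score > 3 only flags unusually high values. A common variant (a sketch, not what this notebook does) also catches unusually low ones by testing the absolute z-score:

# Flag rows extreme in either direction (|z| > 3 in any feature)
outliers_both = df_new[(z_score.abs() > 3).any(axis=1)]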


In [14]:

z_score.std()

Out[14]:

carat 1.0
depth 1.0
table 1.0
price 1.0
x 1.0
y 1.0
z 1.0
dtype: float64

In [15]:

df.drop(index = outliers.index,inplace=True)

In [16]:

num_cols=df.columns[(df.dtypes)=='float64'].tolist()
cat_cols=df.columns[(df.dtypes)=='object'].tolist()
# separate features into numerical and categorical types

In [17]:

std_scaler = StandardScaler()  # create a StandardScaler object

df[num_cols] = std_scaler.fit_transform(df[num_cols])  # standardize the numeric columns in place
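
One subtlety: StandardScaler divides by the population standard deviation (ddof=0), whereas the manual z-score earlier used pandas' default sample standard deviation (ddof=1), so the two standardizations differ very slightly. A check sketch, assuming df_raw holds a copy of df[num_cols] taken before scaling:

# Hypothetical check that StandardScaler matches (x - mean) / std with ddof=0
# manual = (df_raw - df_raw.mean()) / df_raw.std(ddof=0)
# np.allclose(manual.values, std_scaler.fit_transform(df_raw))  # -> True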

In [18]:

df.head()

Out[18]:

carat cut color clarity depth table price x y z

0 -1.248109 Ideal E SI2 -0.171754 -1.126114 326 -1.613627 -1.599720 -1.623059

1 -1.295150 Premium E SI1 -1.455756 1.705217 326 -1.670243 -1.732761 -1.806163

3 -1.106987 Premium I VS2 0.508013 0.289551 334 -1.377728 -1.362149 -1.317887

4 -1.059946 Good J SI2 1.187779 0.289551 335 -1.245624 -1.248114 -1.134783

5 -1.224589 Very Good J VVS2 0.810131 -0.182337 336 -1.623063 -1.618726 -1.546766

In [19]:

df[['cut','color','clarity']] = df[['cut','color','clarity']].apply(LabelEncoder().fit_transform)
# assigns a numerical value to each categorical level
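
LabelEncoder is intended for target labels and assigns integers in alphabetical order, which imposes an arbitrary ordering on cut, color, and clarity. Since these grades do have a natural order, an alternative sketch (the cut ordering below is assumed from the standard diamond grading scale) uses OrdinalEncoder with explicit categories:

from sklearn.preprocessing import OrdinalEncoder

# Hypothetical: encode 'cut' with its grading order made explicit
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
enc = OrdinalEncoder(categories=[cut_order])
# df[['cut']] = enc.fit_transform(df[['cut']])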


In [20]:

df.head()

Out[20]:

carat cut color clarity depth table price x y z

0 -1.248109 2 1 3 -0.171754 -1.126114 326 -1.613627 -1.599720 -1.623059

1 -1.295150 3 1 2 -1.455756 1.705217 326 -1.670243 -1.732761 -1.806163

3 -1.106987 3 5 5 0.508013 0.289551 334 -1.377728 -1.362149 -1.317887

4 -1.059946 1 6 3 1.187779 0.289551 335 -1.245624 -1.248114 -1.134783

5 -1.224589 4 6 7 0.810131 -0.182337 336 -1.623063 -1.618726 -1.546766

In [21]:

from sklearn.model_selection import train_test_split

In [22]:

# separate the data into input and output columns, then split into train and test sets
y=df['price']
x=df.drop('price',axis=1)

x_train,x_test,y_train,y_test= train_test_split(x,y, test_size=0.2, random_state=1)

In [23]:

print(len(x_train))

41490
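
This is consistent with the earlier steps: 53940 rows minus the 2077 outlier rows leaves 51863, and an 80% training share of 51863 is 41490 (the remaining 10373 rows form the test set).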

In [24]:

from sklearn import datasets,linear_model, metrics


#obj of linear regression
reg=linear_model.LinearRegression()
#training model using fit
reg.fit(x_train,y_train)
print('Coefficients ', reg.coef_)  # coef_ stores the learned coefficients, i.e. theta_1, theta_2, ...

Coefficients  [ 4847.84477929    39.73811533  -242.60007725   258.69401187
  -185.36160706  -178.87300891 -2317.7757166    675.34322907
   175.75354744]
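
The nine coefficients correspond, in order, to the nine input columns of x. A small sketch to pair them up for readability:

# Pair each feature name with its learned coefficient
for name, coef in zip(x.columns, reg.coef_):
    print(name, round(coef, 2))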

In [25]:

# y-intercept

print(reg.intercept_)

3109.709922835895


In [26]:

y_pred=reg.predict(x_test)

In [27]:

#coefficient of determination
r_squared=reg.score(x_test,y_test)
print(r_squared)

0.8903048480241464

In [28]:

x_train = x_train.values[:,0].reshape(-1,1)  # keep only the carat column for a 2-D plot
x_test = x_test.values[:,0].reshape(-1,1)

plt.scatter(x_train, y_train, color='red')    # training points
plt.scatter(x_test, y_test, color='green')    # test points
plt.plot(x_test, y_pred, color='blue', linewidth=3)

Out[28]:

[<matplotlib.lines.Line2D at 0x262092e0f40>]
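
Because y_pred came from the nine-feature model, plotting it against the carat column alone traces a jagged path rather than a straight regression line. For a clean line, a carat-only model would need to be refit; a sketch (not part of the original run):

# Hypothetical: refit on carat alone so the plotted line is meaningful
reg1 = linear_model.LinearRegression().fit(x_train, y_train)
plt.plot(x_test, reg1.predict(x_test), color='blue', linewidth=3)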


In [29]:

print('mean absolute error:' , metrics.mean_absolute_error(y_test,y_pred))


print('mean squared error:' , metrics.mean_squared_error(y_test,y_pred))
print('root mean squared error:' , np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

#r_squared=reg.score(x_test,y_test)
#print(r_squared)

mean absolute error: 745.7533805818331


mean squared error: 1286516.9599848085
root mean squared error: 1134.2473098865205
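
As a quick sanity check, the RMSE is just the square root of the MSE: sqrt(1286516.96) ≈ 1134.25, matching the printed value.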

Support Vector Regression


In [30]:

from sklearn.svm import SVR

In [*]:

svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)   # radial basis function kernel
svr_lin = SVR(kernel='linear', C=1e3)           # linear kernel
svr_poly = SVR(kernel='poly', C=1e3, degree=2)  # degree-2 polynomial kernel
y_rbf = svr_rbf.fit(x_train, y_train).predict(x_test)
y_lin = svr_lin.fit(x_train, y_train).predict(x_test)
y_poly = svr_poly.fit(x_train, y_train).predict(x_test)
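
The In [*] marker means this cell was still running when the notebook was exported: kernel SVR with C=1e3 on roughly 41,000 training rows is very slow, since training cost grows much faster than linearly in the number of samples. A common workaround sketch is to fit on a random subsample (size and seed here are illustrative):

# Hypothetical speed-up: train the kernel SVRs on a 2000-row subsample
idx = np.random.RandomState(1).choice(len(x_train), size=2000, replace=False)
y_rbf = svr_rbf.fit(x_train[idx], y_train.iloc[idx]).predict(x_test)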

In [*]:

lw = 2
plt.scatter(x_train, y_train, color='darkorange', label='data')

plt.plot(x_test, y_rbf, color='navy', lw=lw, label='RBF model')

plt.xlabel('data')
plt.ylabel('target')
plt.title('support vector regression')
plt.legend()  # draw the legend (must be called with parentheses)
plt.show()
