
practical-5 - Jupyter Notebook

In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
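
ColumnTransformer is imported here but never used later in the notebook. As a minimal sketch (not part of the original run), it could bundle the scaling and encoding steps that follow into a single transformer; the column lists mirror the numerical/categorical split made below:

from sklearn.preprocessing import OrdinalEncoder

# Hypothetical: scale numeric columns and ordinally encode categorical ones in one pass
preprocess = ColumnTransformer([
    ('num', StandardScaler(), ['carat', 'depth', 'table', 'x', 'y', 'z']),
    ('cat', OrdinalEncoder(), ['cut', 'color', 'clarity']),
])
# X_processed = preprocess.fit_transform(df)  # would return a NumPy array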

In [2]:

df=pd.read_csv('diamonds.csv')

In [3]:

df.head()

Out[3]:

Unnamed: 0 carat cut color clarity depth table price x y z

0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43

1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31

2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31

3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63

4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75

In [4]:

nulls=df.isnull().sum()
print(nulls)

Unnamed: 0 0
carat 0
cut 0
color 0
clarity 0
depth 0
table 0
price 0
x 0
y 0
z 0
dtype: int64
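
Every count is zero, so no imputation is needed. Had any column shown missing values, one conventional follow-up (a sketch, not part of this notebook) would be:

# Hypothetical: fill numeric nulls with the column median, then drop any remaining
df = df.fillna(df.median(numeric_only=True)).dropna()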


In [5]:

to_drop=['Unnamed: 0']
df.drop(columns=to_drop,inplace=True)

In [6]:

df.head()

Out[6]:

carat cut color clarity depth table price x y z

0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43

1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31

2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31

3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63

4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75

In [7]:

df.shape

Out[7]:

(53940, 10)

In [8]:

df.dtypes

Out[8]:

carat float64
cut object
color object
clarity object
depth float64
table float64
price int64
x float64
y float64
z float64
dtype: object


In [9]:

df.describe()

Out[9]:

carat depth table price x y

count 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000 53940.000000

mean 0.797940 61.749405 57.457184 3932.799722 5.731157 5.734526

std 0.474011 1.432621 2.234491 3989.439738 1.121761 1.142135

min 0.200000 43.000000 43.000000 326.000000 0.000000 0.000000

25% 0.400000 61.000000 56.000000 950.000000 4.710000 4.720000

50% 0.700000 61.800000 57.000000 2401.000000 5.700000 5.710000

75% 1.040000 62.500000 59.000000 5324.250000 6.540000 6.540000

max 5.010000 79.000000 95.000000 18823.000000 10.740000 58.900000


In [10]:

cols = ['cut','color','clarity']
df_new = df.drop(columns=cols, inplace=False)
# method 2: df_new = df[['carat','depth', ...all numerical columns]]
# dropping the categorical columns so z-scores can be computed; z-scores are useful for finding outliers
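
As the method-2 comment hints, the numeric columns can also be selected directly rather than listed by hand; an equivalent sketch:

# Equivalent: keep every numeric column without naming each one
df_new = df.select_dtypes(include='number')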

In [11]:

df_new.dtypes

Out[11]:

carat float64
depth float64
table float64
price int64
x float64
y float64
z float64
dtype: object


In [12]:

z_score = (df_new - df_new.mean()) / df_new.std()
z_score  # each feature rescaled to mean 0 and std 1

Out[12]:

carat depth table price x y z

0 -1.198157 -0.174090 -1.099662 -0.904087 -1.587823 -1.536181 -1.571115

1 -1.240350 -1.360726 1.585514 -0.904087 -1.641310 -1.658759 -1.741159

2 -1.198157 -3.384987 3.375631 -0.903836 -1.498677 -1.457382 -1.741159

3 -1.071577 0.454129 0.242926 -0.902081 -1.364959 -1.317293 -1.287708

4 -1.029384 1.082348 0.242926 -0.901831 -1.240155 -1.212227 -1.117663

... ... ... ... ... ... ... ...

53935 -0.164426 -0.662705 -0.204603 -0.294728 0.016798 0.022304 -0.054887

53936 -0.164426 0.942744 -1.099662 -0.294728 -0.036690 0.013548 0.100987

53937 -0.206619 0.733338 1.137985 -0.294728 -0.063434 -0.047740 0.030135

53938 0.130926 -0.523100 0.242926 -0.294728 0.373380 0.337503 0.285201

53939 -0.101136 0.314525 -1.099662 -0.294728 0.088114 0.118615 0.143498

53940 rows × 7 columns

In [13]:

outliers = df_new[(z_score > 3).any(axis=1)]
outliers
# after z-scoring, every feature has mean 0 and std 1; any row with a z-score greater than 3 in some feature is flagged as an outlier

Out[13]:

carat depth table price x y z

2 0.23 56.9 65.0 327 4.05 4.07 2.31

91 0.86 55.1 69.0 2757 6.45 6.33 3.52

97 0.96 66.3 62.0 2759 6.27 5.95 4.07

204 0.98 67.9 60.0 2777 6.05 5.97 4.08

227 0.84 55.1 67.0 2782 6.39 6.20 3.47

... ... ... ... ... ... ... ...

53697 0.70 64.5 65.0 2717 5.52 5.45 3.54

53727 0.78 66.9 57.0 2721 5.70 5.66 3.60

53785 0.89 64.3 65.0 2728 6.00 5.95 3.84

53800 0.90 68.7 62.0 2732 5.83 5.79 3.99

53863 1.00 66.8 56.0 2743 6.22 6.12 4.13

2077 rows × 7 columns
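
Note that the filter z_score > 3 only flags unusually high values. A common variant (a sketch, not what this notebook does) also catches unusually low ones by testing the absolute z-score:

# Flag rows extreme in either direction (|z| > 3 in any feature)
outliers_both = df_new[(z_score.abs() > 3).any(axis=1)]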


In [14]:

z_score.std()

Out[14]:

carat 1.0
depth 1.0
table 1.0
price 1.0
x 1.0
y 1.0
z 1.0
dtype: float64

In [15]:

df.drop(index = outliers.index,inplace=True)

In [16]:

num_cols=df.columns[(df.dtypes)=='float64'].tolist()
cat_cols=df.columns[(df.dtypes)=='object'].tolist()
# separate features into numerical and categorical types

In [17]:

std_scaler = StandardScaler()  # create a StandardScaler object

df[num_cols] = std_scaler.fit_transform(df[num_cols])  # standardize the numeric columns in place
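
One subtlety: StandardScaler divides by the population standard deviation (ddof=0), whereas the manual z-score earlier used pandas' default sample standard deviation (ddof=1), so the two standardizations differ very slightly. A check sketch, assuming df_raw holds a copy of df[num_cols] taken before scaling:

# Hypothetical check that StandardScaler matches (x - mean) / std with ddof=0
# manual = (df_raw - df_raw.mean()) / df_raw.std(ddof=0)
# np.allclose(manual.values, std_scaler.fit_transform(df_raw))  # -> True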

In [18]:

df.head()

Out[18]:

carat cut color clarity depth table price x y z

0 -1.248109 Ideal E SI2 -0.171754 -1.126114 326 -1.613627 -1.599720 -1.623059

1 -1.295150 Premium E SI1 -1.455756 1.705217 326 -1.670243 -1.732761 -1.806163

3 -1.106987 Premium I VS2 0.508013 0.289551 334 -1.377728 -1.362149 -1.317887

4 -1.059946 Good J SI2 1.187779 0.289551 335 -1.245624 -1.248114 -1.134783

5 -1.224589 Very Good J VVS2 0.810131 -0.182337 336 -1.623063 -1.618726 -1.546766

In [19]:

df[['cut','color','clarity']] = df[['cut','color','clarity']].apply(LabelEncoder().fit_transform)
# assigns a numerical value to each categorical level
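
LabelEncoder is intended for target labels and assigns integers in alphabetical order, which imposes an arbitrary ordering on cut, color, and clarity. Since these grades do have a natural order, an alternative sketch (the cut ordering below is assumed from the standard diamond grading scale) uses OrdinalEncoder with explicit categories:

from sklearn.preprocessing import OrdinalEncoder

# Hypothetical: encode 'cut' with its grading order made explicit
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
enc = OrdinalEncoder(categories=[cut_order])
# df[['cut']] = enc.fit_transform(df[['cut']])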


In [20]:

df.head()

Out[20]:

carat cut color clarity depth table price x y z

0 -1.248109 2 1 3 -0.171754 -1.126114 326 -1.613627 -1.599720 -1.623059

1 -1.295150 3 1 2 -1.455756 1.705217 326 -1.670243 -1.732761 -1.806163

3 -1.106987 3 5 5 0.508013 0.289551 334 -1.377728 -1.362149 -1.317887

4 -1.059946 1 6 3 1.187779 0.289551 335 -1.245624 -1.248114 -1.134783

5 -1.224589 4 6 7 0.810131 -0.182337 336 -1.623063 -1.618726 -1.546766

In [21]:

from sklearn.model_selection import train_test_split

In [22]:

# separate the data into input and output columns, then split into train and test sets
y=df['price']
x=df.drop('price',axis=1)

x_train,x_test,y_train,y_test= train_test_split(x,y, test_size=0.2, random_state=1)

In [23]:

print(len(x_train))

41490
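
This is consistent with the earlier steps: 53940 rows minus the 2077 outlier rows leaves 51863, and an 80% training share of 51863 is 41490 (the remaining 10373 rows form the test set).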

In [24]:

from sklearn import datasets,linear_model, metrics


#obj of linear regression
reg=linear_model.LinearRegression()
#training model using fit
reg.fit(x_train,y_train)
print('Coefficients ', reg.coef_)  # coef_ stores the learned coefficients, i.e. theta_1, theta_2, ...

Coefficients  [ 4847.84477929    39.73811533  -242.60007725   258.69401187
  -185.36160706  -178.87300891 -2317.7757166    675.34322907
   175.75354744]
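
The nine coefficients correspond, in order, to the nine input columns of x. A small sketch to pair them up for readability:

# Pair each feature name with its learned coefficient
for name, coef in zip(x.columns, reg.coef_):
    print(name, round(coef, 2))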

In [25]:

# y-intercept

print(reg.intercept_)

3109.709922835895


In [26]:

y_pred=reg.predict(x_test)

In [27]:

#coefficient of determination
r_squared=reg.score(x_test,y_test)
print(r_squared)

0.8903048480241464

In [28]:

x_train = x_train.values[:,0].reshape(-1,1)  # keep only the carat column for a 2-D plot
x_test = x_test.values[:,0].reshape(-1,1)

plt.scatter(x_train, y_train, color='red')    # training points
plt.scatter(x_test, y_test, color='green')    # test points
plt.plot(x_test, y_pred, color='blue', linewidth=3)

Out[28]:

[<matplotlib.lines.Line2D at 0x262092e0f40>]
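
Because y_pred came from the nine-feature model, plotting it against the carat column alone traces a jagged path rather than a straight regression line. For a clean line, a carat-only model would need to be refit; a sketch (not part of the original run):

# Hypothetical: refit on carat alone so the plotted line is meaningful
reg1 = linear_model.LinearRegression().fit(x_train, y_train)
plt.plot(x_test, reg1.predict(x_test), color='blue', linewidth=3)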


In [29]:

print('mean absolute error:' , metrics.mean_absolute_error(y_test,y_pred))


print('mean squared error:' , metrics.mean_squared_error(y_test,y_pred))
print('root mean squared error:' , np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

#r_squared=reg.score(x_test,y_test)
#print(r_squared)

mean absolute error: 745.7533805818331


mean squared error: 1286516.9599848085
root mean squared error: 1134.2473098865205
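
As a quick sanity check, the RMSE is just the square root of the MSE: sqrt(1286516.96) ≈ 1134.25, matching the printed value.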

Support Vector Regression


In [30]:

from sklearn.svm import SVR

In [*]:

svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)   # radial basis function kernel
svr_lin = SVR(kernel='linear', C=1e3)           # linear kernel
svr_poly = SVR(kernel='poly', C=1e3, degree=2)  # degree-2 polynomial kernel
y_rbf = svr_rbf.fit(x_train, y_train).predict(x_test)
y_lin = svr_lin.fit(x_train, y_train).predict(x_test)
y_poly = svr_poly.fit(x_train, y_train).predict(x_test)
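
The In [*] marker means this cell was still running when the notebook was exported: kernel SVR with C=1e3 on roughly 41,000 training rows is very slow, since training cost grows much faster than linearly in the number of samples. A common workaround sketch is to fit on a random subsample (size and seed here are illustrative):

# Hypothetical speed-up: train the kernel SVRs on a 2000-row subsample
idx = np.random.RandomState(1).choice(len(x_train), size=2000, replace=False)
y_rbf = svr_rbf.fit(x_train[idx], y_train.iloc[idx]).predict(x_test)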

In [*]:

lw = 2
plt.scatter(x_train, y_train, color='darkorange', label='data')

plt.plot(x_test, y_rbf, color='navy', lw=lw, label='RBF model')

plt.xlabel('data')
plt.ylabel('target')
plt.title('support vector regression')
plt.legend()  # draw the legend (must be called with parentheses)
plt.show()
