Professional Documents
Culture Documents
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
In [ ]:
In [2]:
# Load the raw diamonds dataset from the working directory.
df = pd.read_csv("diamonds.csv")
In [3]:
# Quick look at the first five rows to sanity-check the load.
df.head()
Out[3]:
In [4]:
# Count missing values per column; all zeros (see output) means no
# imputation is needed.
nulls = df.isna().sum()  # isna() is an alias of isnull()
print(nulls)
Unnamed: 0 0
carat 0
cut 0
color 0
clarity 0
depth 0
table 0
price 0
x 0
y 0
z 0
dtype: int64
In [ ]:
localhost:8888/notebooks/Documents/21dce026/practical-5.ipynb 1/8
8/31/23, 11:04 AM practical-5 - Jupyter Notebook
In [5]:
# 'Unnamed: 0' is just the CSV's exported row index — drop it.
# Rebinding df (instead of inplace=True) keeps the mutation explicit
# and avoids the hidden-state pitfalls of in-place edits across cells.
to_drop = ["Unnamed: 0"]
df = df.drop(columns=to_drop)
In [6]:
# Confirm the exported index column is gone.
df.head()
Out[6]:
In [7]:
# (rows, columns) of the full dataset.
df.shape
Out[7]:
(53940, 10)
In [8]:
# Column dtypes: cut/color/clarity are strings ('object'), the rest numeric.
df.dtypes
Out[8]:
carat float64
cut object
color object
clarity object
depth float64
table float64
price int64
x float64
y float64
z float64
dtype: object
localhost:8888/notebooks/Documents/21dce026/practical-5.ipynb 2/8
8/31/23, 11:04 AM practical-5 - Jupyter Notebook
In [9]:
# Summary statistics (count/mean/std/min/quartiles/max) for numeric columns.
df.describe()
Out[9]:
In [10]:
# Keep only the numeric columns: the z-scores computed below (used for
# outlier detection) are defined for numeric data only.
# Alternative: df_new = df[["carat", "depth", "table", "price", "x", "y", "z"]]
cols = ["cut", "color", "clarity"]
df_new = df.drop(columns=cols)  # drop is not in-place by default
In [11]:
# Verify only numeric dtypes remain.
df_new.dtypes
Out[11]:
carat float64
depth float64
table float64
price int64
x float64
y float64
z float64
dtype: object
localhost:8888/notebooks/Documents/21dce026/practical-5.ipynb 3/8
8/31/23, 11:04 AM practical-5 - Jupyter Notebook
In [12]:
# Standardize every numeric column: z = (value - column mean) / column std.
z_score = df_new.sub(df_new.mean()).div(df_new.std())
z_score  # each column now has mean ~0 and std 1
Out[12]:
In [13]:
# Flag any row whose z-score exceeds 3 in at least one feature: after
# standardization each column has std 1 and mean 0, so z > 3 means
# "more than 3 standard deviations above the mean".
# NOTE(review): only the positive tail (z > 3) is caught here; extreme
# low values (z < -3) are not treated as outliers — confirm this is intended.
high_z = (z_score > 3).any(axis=1)
outliers = df_new[high_z]
outliers
Out[13]:
localhost:8888/notebooks/Documents/21dce026/practical-5.ipynb 4/8
8/31/23, 11:04 AM practical-5 - Jupyter Notebook
In [14]:
# Sanity check: every standardized column has std exactly 1.
z_score.std()
Out[14]:
carat 1.0
depth 1.0
table 1.0
price 1.0
x 1.0
y 1.0
z 1.0
dtype: float64
In [15]:
# Remove the outlier rows (found on the numeric copy) from the working
# frame. Rebinding df instead of inplace=True keeps the mutation explicit
# and avoids in-place hidden-state bugs on re-run.
df = df.drop(index=outliers.index)
In [16]:
# Separate feature names into numerical and categorical groups.
# NOTE: 'price' is int64, so it lands in neither list here.
num_cols = df.select_dtypes(include="float64").columns.tolist()
cat_cols = df.select_dtypes(include="object").columns.tolist()
In [17]:
In [18]:
# Inspect the frame after outlier removal (note the gaps in the index).
df.head()
Out[18]:
Very
5 -1.224589 J VVS2 0.810131 -0.182337 336 -1.623063 -1.618726 -1.546
Good
In [19]:
# Assign integer codes to the categorical columns.
# NOTE(review): the source line was truncated in this printout at
# ".fit_tra"; restored to .fit_transform, the only sensible completion.
# DataFrame.apply calls fit_transform once per column, so each column gets
# its own integer mapping. LabelEncoder is intended for targets; for
# ordered features like cut/clarity, OrdinalEncoder with an explicit
# category order would preserve the real ranking.
cat_features = ["cut", "color", "clarity"]
df[cat_features] = df[cat_features].apply(LabelEncoder().fit_transform)
localhost:8888/notebooks/Documents/21dce026/practical-5.ipynb 5/8
8/31/23, 11:04 AM practical-5 - Jupyter Notebook
In [20]:
# Confirm cut/color/clarity are now integer-coded.
df.head()
Out[20]:
In [21]:
In [22]:
# Separate the data into target (price) and features (everything else);
# the train/test split happens in the next cell.
y = df["price"]
x = df.drop(columns="price")
In [23]:
# NOTE(review): the cell that created the train/test split was lost from
# this printout — x_train is used here but never defined above. The 80/20
# split below reproduces the printed training size (41490 rows out of
# ~51.9k remaining); the original random_state is unknown — confirm.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print(len(x_train))
41490
In [24]:
In [25]:
# NOTE(review): the cell fitting the regression was lost from this
# printout — reconstruct it so `reg` exists before its intercept is shown.
reg = LinearRegression().fit(x_train, y_train)
# y-intercept of the fitted ordinary-least-squares model
print(reg.intercept_)
3109.709922835895
localhost:8888/notebooks/Documents/21dce026/practical-5.ipynb 6/8
8/31/23, 11:04 AM practical-5 - Jupyter Notebook
In [26]:
# Predict prices for the held-out test set.
y_pred=reg.predict(x_test)
In [27]:
# Coefficient of determination: R^2 on the held-out set, i.e. the fraction
# of price variance explained by the model (~0.89 per the output below).
r_squared=reg.score(x_test,y_test)
print(r_squared)
0.8903048480241464
In [28]:
# NOTE(review): this cell destructively overwrites x_train/x_test with
# only their first column (carat) so a 2-D scatter can be drawn; every
# later cell (the SVR fits below) silently gets the single-feature
# version. Re-running this cell without re-splitting will also fail,
# since an ndarray has no .values attribute.
x_train= x_train.values[:,0].reshape(-1,1)
x_test= x_test.values[:,0].reshape(-1,1)
plt.scatter(x_train,y_train,color='red')  # training points
plt.scatter(x_test,y_test,color='green')  # test points
# NOTE(review): y_pred came from the full multi-feature model, so this
# "line" over carat alone is jagged rather than a straight fit line.
plt.plot(x_test,y_pred,color='blue',linewidth=3)
Out[28]:
[<matplotlib.lines.Line2D at 0x262092e0f40>]
localhost:8888/notebooks/Documents/21dce026/practical-5.ipynb 7/8
8/31/23, 11:04 AM practical-5 - Jupyter Notebook
In [29]:
#r_squared=reg.score(x_test,y_test)
#print(r_squared)
In [*]:
# Fit three SVR models with different kernels on the (single-feature)
# split and predict on the test set.
# NOTE(review): SVR was never imported in this printout — the import
# (from sklearn.svm import SVR) has been added to the import cell.
# NOTE(review): C=1e3 on ~40k unscaled rows makes these fits extremely
# slow (the cell shows In [*], i.e. still running); consider subsampling
# or standardizing first.
svr_rbf = SVR(kernel="rbf", C=1e3, gamma=0.1)
svr_lin = SVR(kernel="linear", C=1e3)
svr_poly = SVR(kernel="poly", C=1e3, degree=2)
y_rbf = svr_rbf.fit(x_train, y_train).predict(x_test)
y_lin = svr_lin.fit(x_train, y_train).predict(x_test)
y_poly = svr_poly.fit(x_train, y_train).predict(x_test)
In [*]:
lw=2  # line width, presumably for the kernel-fit curves — see note below
# NOTE(review): the rest of this cell (plots of y_rbf/y_lin/y_poly using
# lw) appears to have been cut off in this printout; only the
# training-data scatter remains.
plt.scatter(x_train,y_train,color='darkorange',label='data')
In [ ]:
localhost:8888/notebooks/Documents/21dce026/practical-5.ipynb 8/8