Professional Documents
Culture Documents
Tonk Rajasthan
Faculty of Mathematics and Computing
Information Technology
Submitted by:
Name: SHELLY SHARMA
Smart Card ID: BTBTI20249
Roll no. : 2016820
LAB-1
LOGIC:
import pandas as pd

# Load the diabetes dataset once; the same frame is reused by the null checks
# that follow, so a second import/read of the same file is unnecessary.
df = pd.read_csv('diabetes.csv')
print(df)

# Element-wise mask with the same shape as df: True wherever a cell is missing.
# Printed explicitly so the result is visible outside a notebook as well.
null_mask = df.isnull()
print(null_mask)
Steps to check for null values in a single column and in a single row:
# Null mask for one column: True where "Insulin" is missing.
bool_series = df["Insulin"].isnull()
print(bool_series)
# Null mask for one row (positional index 1): one flag per column.
bool_series = df.iloc[1].isnull()
print(bool_series)
LAB-2
LOGIC:
# Toy price table: four consecutive days for each of two fruits. The None
# entries in 'price' become NaN and are the gaps the imputation steps fill.
df = pd.DataFrame({
    'date': [
        '2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
        '2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
    ],
    'fruit': [
        'apple', 'apple', 'apple', 'apple',
        'mango', 'mango', 'mango', 'mango',
    ],
    'price': [0.80, None, None, 1.20, None, 2.10, 2.00, 1.80],
})
# Parse the ISO date strings so .day_name() / .dt accessors are available.
df['date'] = pd.to_datetime(df['date'])
Steps to calculate the mean of the price column:
# Mean of the 'price' column (missing values are skipped by default).
df['price'].mean()
Steps to calculate median of the specified value:
# Fill every missing price with the overall median of the column.
# The result is assigned back instead of calling fillna(inplace=True) on the
# df['price'] selection: that form is chained assignment, which warns on
# recent pandas and does not update df under copy-on-write.
df['price'] = df['price'].fillna(value=df['price'].median())
df['price'].median()
# Fill any remaining gap with the mean price of the same fruit.
# transform('mean') returns one value per row, aligned on df's index.
df['price'] = df['price'].fillna(
    df.groupby('fruit')['price'].transform('mean'))
Steps to group the data into similar categories:
# Fill gaps with the median price of the same fruit. Assigned back rather
# than using inplace=True on a column selection (chained assignment).
df['price'] = df['price'].fillna(
    df.groupby('fruit')['price'].transform('median'))
# Forward-fill over the whole column, ignoring fruit boundaries.
# Series.ffill() replaces the deprecated fillna(method='ffill').
df['price'] = df['price'].ffill()
df
# Forward-fill within each fruit, so values never carry across groups.
df['price'] = df.groupby('fruit')['price'].ffill()
df
Steps to backward fill the values but with a limit
df
# Back-fill within each fruit, then forward-fill whatever is still missing.
# The groupby bfill() returns a plain Series, so the trailing ffill() runs
# over the whole column rather than per fruit.
df['price'] = df.groupby('fruit')['price'].bfill().ffill()
df
Steps for interpolation:
# Linear interpolation over the whole column. Assigned back instead of using
# inplace=True on a column selection (chained assignment).
df['price'] = df['price'].interpolate(method='linear')
df
# Linear interpolation within each fruit. transform() keeps the original row
# index, so the result aligns with df; groupby.apply() on pandas >= 2.0
# prepends the group key to the index and no longer aligns.
df['price'] = df.groupby('fruit')['price'].transform(
    lambda s: s.interpolate(method='linear'))
df
# Same per-fruit interpolation, then back-fill leading gaps. The bfill() is
# applied to the combined Series, i.e. over the whole column.
df['price'] = df.groupby('fruit')['price'].transform(
    lambda s: s.interpolate(method='linear')).bfill()
df
# Flag weekdays: True unless the date falls on a Saturday or Sunday.
df['weekday'] = ~df['date'].dt.day_name().isin(['Saturday', 'Sunday'])
df
# Per-row mean price of the row's fruit.
mean_price = df.groupby('fruit')['price'].transform('mean')
df
# Fill gaps with the fruit mean on weekdays and 1.25x the fruit mean on
# weekends: where() keeps mean_price where 'weekday' is True, else `other`.
df['price'] = df['price'].fillna(
    mean_price.where(cond=df['weekday'], other=mean_price * 1.25))
df
LAB-3
AIM: To learn mapping
import numpy as np

# Small categorical example: nominal colour, ordinal size, numeric price and
# a string class label.
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1'],
])
df
df.columns = ['color', 'size', 'price', 'classlabel']
df
# Ordinal feature: encode sizes as integers that preserve their order.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df['size'] = df['size'].map(size_mapping)
df
np.unique(df['classlabel'])
# Class labels have no order: number the sorted unique labels 0, 1, ...
# This definition was missing from the listing, which left class_mapping
# undefined at the point it is used.
class_mapping = {
    label: idx for idx, label in enumerate(np.unique(df['classlabel']))
}
class_mapping
df['classlabel'] = df['classlabel'].map(class_mapping)
class_mapping
print(df)
LAB-4
import sys
from io import StringIO

import pandas as pd

if sys.version_info[0] >= 3:
    # Python 3 has no `unicode` builtin; alias it to str for code that
    # still refers to the Python 2 name.
    unicode = str

# In-memory CSV with two missing cells (row 2 column C, row 3 column D).
# The header starts at column 0 so the first column is named 'A', not ' A'.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
0.0,11.0,12.0,'''
# read_csv expects a path or file-like object, so wrap the string in StringIO.
df = pd.read_csv(StringIO(csv_data))
df
df.isnull()
# Number of missing values per column.
df.isnull().sum()
df.values
# Drop rows containing any NaN.
df.dropna()
# Drop columns containing any NaN.
df.dropna(axis=1)
# Drop only rows where every value is NaN.
df.dropna(how='all')
# Keep only rows with at least 4 non-NaN values.
df.dropna(thresh=4)
# Drop rows where NaN appears in column 'C' specifically.
df.dropna(subset=['C'])
import numpy as np
from sklearn.impute import SimpleImputer

# Mean imputation: fit() learns one mean per column from the non-missing
# training values; transform() substitutes that mean for every NaN.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit([[1, 2], [np.nan, 3], [7, 6]])
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imputer.transform(X))
LAB-5
LOGIC:
import numpy as np
from sklearn import preprocessing

# NOTE(review): the listing never defines X_train. The 3x3 matrix below is a
# placeholder so the section runs on its own; replace it with the real
# training data and confirm against the original lab sheet.
X_train = np.array([[1., -1., 2.],
                    [2., 0., 0.],
                    [0., 1., -1.]])

# fit() learns the per-column mean and scale used by transform().
scaler = preprocessing.StandardScaler().fit(X_train)
scaler
scaler.mean_
scaler.scale_
X_scaled = scaler.transform(X_train)
X_scaled
# After standardisation each column has mean ~0 and standard deviation 1.
X_scaled.mean(axis=0)
X_scaled.std(axis=0)
LAB-6 & LAB-7
LOGIC:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

df = pd.read_csv('diabetes.csv')
print(df)
# Quick look at the value ranges; x and y are reassigned just below.
x = df.DiabetesPedigreeFunction.max()
y = df.Glucose.max()
x = df.DiabetesPedigreeFunction.min()
x
df.to_numpy()
# Predictor and target as numpy arrays.
x = df.DiabetesPedigreeFunction.to_numpy()
y = df.Glucose.to_numpy()
# scikit-learn estimators expect a 2-D feature matrix: one column, n rows.
x = x.reshape(-1, 1)
regression_model = LinearRegression()
regression_model.fit(x, y)
y_predicted = regression_model.predict(x)
y_predicted
# Observations in red, fitted line in green.
plt.scatter(x, y, color='red')
plt.plot(x, y_predicted, color='green')
plt.xlabel('x')
plt.ylabel('Y')
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max of x, then print the rescaled values.
# The transformed array is only printed; x itself is left unchanged.
scaler = MinMaxScaler()
print(scaler.fit(x))
print(scaler.data_max_)
print(scaler.transform(x))
plt.scatter(x, y, color='red')
plt.plot(x, y_predicted, color='green')
plt.xlabel('x')
plt.ylabel('Y')
# Target as an (n, 1) column vector. to_numpy is a method and must be called;
# without the parentheses y would be the bound method, which has no .reshape.
y = df.Glucose.to_numpy()
y = y.reshape(-1, 1)
regression_model = LinearRegression()
regression_model.fit(x, y)
# y_predicted is not recomputed here; the plot uses the existing values.
plt.scatter(x, y, color='red')
plt.plot(x, y_predicted, color='green')
plt.xlabel('x')
plt.ylabel('Y')
# Rescale the target with its own min/max scaler; y is replaced by the
# scaled values from here on.
scaler = MinMaxScaler()
print(scaler.fit(y))
print(scaler.data_max_)
y = scaler.transform(y)
y
# Residuals of the existing predictions. y_predicted is reshaped to y's shape
# first: subtracting a 1-D (n,) array from an (n, 1) column would broadcast
# to an n-by-n matrix instead of n residuals.
error = y - np.reshape(y_predicted, y.shape)
error
se = np.sum(error**2)

# Number of observations; assumes x holds a single feature column.
n = np.size(x)
print(n)
x_mean = np.mean(x)
y_mean = np.mean(y)
x_mean, y_mean
# Closed-form simple linear regression:
#   Sxy = sum(x*y) - n*mean(x)*mean(y),  Sxx = sum(x*x) - n*mean(x)^2
#   slope b1 = Sxy / Sxx,  intercept b0 = mean(y) - b1*mean(x)
Sxy = np.sum(x*y) - n*x_mean*y_mean
Sxx = np.sum(x*x) - n*x_mean*x_mean
Sxx, Sxy
b1 = Sxy/Sxx
b0 = y_mean - b1*x_mean
print("Slope :", b1)
print("Intercept is", b0)
plt.scatter(x, y)
y_pred = b0 + b1*x
y_pred
plt.scatter(x, y, color='red')
plt.plot(x, y_pred, color='green')
plt.xlabel('x')
plt.ylabel('Y')
# Goodness of fit for the manual line: squared error, MSE, RMSE and R^2.
error = y - y_pred
se = np.sum(error**2)
mse = se/n
mse
rmse = np.sqrt(mse)
SSt = np.sum((y - y_mean)**2)
R2 = 1 - (se/SSt)
# Refit with scikit-learn on the current x and y.
regression_model = LinearRegression()
regression_model.fit(x, y)
y_predicted = regression_model.predict(x)
y_predicted
# The plotted line is y_pred, the manually computed fit.
plt.scatter(x, y, color='red')
plt.plot(x, y_pred, color='green')
plt.xlabel('x')
plt.ylabel('Y')
# Multiple regression with two predictors. The conversion result is kept;
# a bare x.to_numpy() would compute the array and throw it away.
x = df[['DiabetesPedigreeFunction', 'Age']].to_numpy()
regression_model = LinearRegression()
regression_model.fit(x, y)
y_predicted = regression_model.predict(x)
y_predicted
LAB-8 & LAB-9 & LAB-10
LOGIC:
import numpy as np
import statsmodels.api as sm
# Closed-form simple linear regression on a five-point toy sample.
x = np.array([1, 2, 3, 4, 5])
y = np.array([7, 14, 15, 18, 19])
n = x.size
x_mean, y_mean = x.mean(), y.mean()
x_mean, y_mean
# Cross-deviation and deviation sums about the means.
Sxy = (x * y).sum() - n * x_mean * y_mean
Sxx = (x * x).sum() - n * x_mean * x_mean
# Least-squares slope and intercept.
b1 = Sxy / Sxx
b0 = y_mean - b1 * x_mean
print('slope b1 is', b1)
print('intercept b0 is', b0)
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Fitted values of the manual line; needed by both the plot and the
# residuals below, so they are computed first.
y_pred = b0 + b1 * x
plt.scatter(x, y)
plt.scatter(x, y, color='red')
plt.plot(x, y_pred, color='green')
plt.xlabel('X')
plt.ylabel('Y')
# Goodness of fit: squared error, MSE, RMSE and R^2.
error = y - y_pred
se = np.sum(error**2)
mse = se / n
rmse = np.sqrt(mse)
SSt = np.sum((y - y_mean)**2)
R2 = 1 - (se / SSt)
# scikit-learn expects a 2-D feature matrix: one column, n rows.
x = x.reshape(-1, 1)
regression_model = LinearRegression()
import pandas as pd
df = pd.read_csv(r'diabetes.csv')
print(df)
# NOTE(review): df is loaded but the statistics below still use the existing
# x, y, n, x_mean and y_mean. Presumably x and y were meant to be taken from
# df columns here; confirm against the original lab sheet.
# Flatten both arrays before multiplying: x is an (n, 1) column by this point
# while y is 1-D, so x*y would broadcast to an n-by-n matrix and inflate Sxy.
x_flat = np.ravel(x)
y_flat = np.ravel(y)
Sxy = np.sum(x_flat * y_flat) - n*x_mean*y_mean
Sxx = np.sum(x_flat * x_flat) - n*x_mean*x_mean
b1 = Sxy/Sxx
b0 = y_mean - b1*x_mean
print('intercept b0 is', b0)
plt.scatter(x_flat, y_flat)
y_pred = b1*x_flat + b0
plt.scatter(x_flat, y_flat, color='red')
plt.plot(x_flat, y_pred, color='green')
plt.xlabel('X')
plt.ylabel('Y')
Steps to calculate the error metrics (squared error, MSE, RMSE and R²):
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Manual metrics for the hand-computed line. Both arrays are flattened so a
# column-shaped y_pred cannot broadcast against a 1-D y into an n-by-n matrix.
error = np.ravel(y) - np.ravel(y_pred)
se = np.sum(error**2)
mse = se / n
SSt = np.sum((np.ravel(y) - y_mean)**2)
R2 = 1 - (se / SSt)

# Same metrics from scikit-learn. The model must be fitted before coef_ and
# intercept_ exist, and y_predicted has to come from that fit.
# Assumes x is already a 2-D (n, 1) feature matrix.
regression_model.fit(x, y)
y_predicted = regression_model.predict(x)
mse = mean_squared_error(y, y_predicted)
rmse = np.sqrt(mean_squared_error(y, y_predicted))
r2 = r2_score(y, y_predicted)
print('Slope:', regression_model.coef_)
print('Intercept:', regression_model.intercept_)
# Rescale the feature with its own min/max scaler and print the result.
scaler = MinMaxScaler()
print(scaler.fit(x))
print(scaler.transform(x))