You are on page 1of 2

#import the libraries

import numpy as py

import matplotlib.pyplot as plt

import pandas as pd

#Import the data set from Desktop

dataset = pd.read_csv('DataSet.csv')

X=dataset.iloc[:,:-1].values

XX=dataset.iloc[:,:-1].values

XXX=dataset.iloc[:,:-1].values

y=dataset.iloc[:,3].values

#Missing Value Handling by MEAN

from sklearn.preprocessing import Imputer

imputer =Imputer(missing_values="NaN", strategy='mean', axis=0)

imputer=imputer.fit(X[:,1:3])

X[:,1:3]= imputer.transform(X[:,1:3])

#Missing Value Handling by MEDIAN

from sklearn.preprocessing import Imputer

imputer =Imputer(missing_values="NaN", strategy='median', axis=0)

imputer=imputer.fit(XX[:,1:3])

XX[:,1:3]= imputer.transform(XX[:,1:3])

#Missing Value Handling by most_frequent

from sklearn.preprocessing import Imputer

imputer =Imputer(missing_values="NaN", strategy='most_frequent', axis=0)

imputer=imputer.fit(XXX[:,1:3])

XXX[:,1:3]= imputer.transform(XXX[:,1:3])
#Concept of Dummy Variable, Handling the conflict of them

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder_X = LabelEncoder()

X[:,0]=labelencoder_X.fit_transform(X[:,0])

onehotencoder=OneHotEncoder(categorical_features =[0])

X=onehotencoder.fit_transform(X).toarray()

#Training and Testing Data (divide the data into two part)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =train_test_split(X,y,test_size=0.2,


random_state=0)

#Standard and fit the data for better predication

from sklearn.preprocessing import StandardScaler

sc_X=StandardScaler()

X_test=sc_X.fit_transform(X_test)

X_train=sc_X.fit_transform(X_train)

You might also like