You are on page 1 of 7

Name: Fedrick Samuel W

Reg No: 19MIS1112


Course: Machine learning (SWE4012)
Slot: L11 + L12
Faculty: Dr. M. Premalatha
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plot
import sklearn
# Global plot defaults: 8x5 inch figures with 14pt bold text.
plot.rcParams['figure.figsize']=[8,5]
plot.rcParams['font.size']='14'
plot.rcParams['font.weight']='bold'
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later dropped the old names, so apply whichever spelling this install
# provides. If neither exists the current style is left untouched.
for _style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style in plot.style.available:
        plot.style.use(_style)
        break

# Read the tab-separated dataset from the working directory.
file = pd.read_csv('Data.csv', sep="\t")

# Second name bound to the same DataFrame object (an alias, not a copy).
file2 = file
n_rows_cols = file.shape
print("no of rows and cols in dataset: ", n_rows_cols)

# Bare expression so the frame is displayed as the cell result.
file

no of rows and cols in dataset: (10, 3)

Country Salary Purchased


0 France 44.0 No
1 Spain 27.0 Yes
2 Germany 30.0 No
3 Spain 38.0 No
4 Germany 40.0 Yes
5 France 35.0 Yes
6 Spain NaN No
7 France 48.0 Yes
8 Germany 50.0 No
9 France 35.0 Yes

# One-hot encode the 'Purchased' label: one indicator column per distinct value.
# The dtype is pinned because pandas >= 2.0 returns booleans by default, which
# would change how the later numeric steps (describe, correlation, the integer
# assignment into 'Yes') treat these columns.
col = pd.get_dummies(file['Purchased'], dtype='uint8')

# Replace the text label with its indicator columns. concat aligns on the
# shared index, so no extra reindex is needed, and drop() returns a new frame
# instead of mutating one in place.
file = pd.concat([file.drop(columns='Purchased'), col], axis=1)

# Bare expression so the encoded frame is displayed as the cell result.
file

Country Salary No Yes


0 France 44.0 1 0
1 Spain 27.0 0 1
2 Germany 30.0 1 0
3 Spain 38.0 1 0
4 Germany 40.0 0 1
5 France 35.0 0 1
6 Spain NaN 1 0
7 France 48.0 0 1
8 Germany 50.0 1 0
9 France 35.0 0 1

# lmplot opens its own figure, so labels and titles must be applied to the
# FacetGrid it returns; pyplot calls issued beforehand end up on a different
# (empty) figure, and a title set afterwards lands on the previous plot.
# The indicator column is on the x axis and Salary on the y axis.
yes_grid = sb.lmplot(x='Yes', y="Salary", data=file, aspect=2, height=6)
yes_grid.set(xlabel="yes", ylabel="salary", title="Purchased (yes) vs Salary")

no_grid = sb.lmplot(x='No', y="Salary", data=file, aspect=2, height=6)
no_grid.set(xlabel="no", ylabel="salary", title="Purchased (No) vs Salary")

<seaborn.axisgrid.FacetGrid at 0x275dd2ea790>
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
file.describe()

Salary No Yes
count 9.000000 10.000000 10.000000
mean 38.555556 0.500000 0.500000
std 7.779960 0.527046 0.527046
min 27.000000 0.000000 0.000000
25% 35.000000 0.000000 0.000000
50% 38.000000 0.500000 0.500000
75% 44.000000 1.000000 1.000000
max 50.000000 1.000000 1.000000

# Visualise where values are missing before any rows are removed.
# `file` is the frame being cleaned; no `df` is defined anywhere in this script.
plot.figure(figsize=(12,4))
sb.heatmap(file.isnull(),cbar=False,cmap='viridis',yticklabels=False)
plot.title('Missing value in the dataset');

# Drop every row that contains at least one missing value.
file=file.dropna(how="any")
# A bare expression in the middle of a cell is discarded, so print the
# per-column missing counts to actually confirm the clean-up.
print(file.isnull().sum())
file

Country Salary No Yes


0 France 44.0 1 0
1 Spain 27.0 0 1
2 Germany 30.0 1 0
3 Spain 38.0 1 0
4 Germany 40.0 0 1
5 France 35.0 0 1
7 France 48.0 0 1
8 Germany 50.0 1 0
9 France 35.0 0 1

# Draw the missing-value map again on a wide 12x4 figure, using the current
# contents of `file`.
plot.figure(figsize=(12, 4))
missing_mask = file.isnull()
sb.heatmap(missing_mask, cbar=False, cmap='viridis', yticklabels=False)
plot.title('Missing value in the dataset');

# correlation plot
# Only numeric/boolean columns can be correlated. The text 'Country' column
# must be excluded explicitly because pandas >= 2.0 raises on it instead of
# silently skipping it.
corr = file.select_dtypes(include=['number', 'bool']).corr()
sb.heatmap(corr, cmap = 'Wistia', annot= True);
import warnings

# Silences every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# (rows, columns) of the frame at this point in the script.
file.shape

(9, 4)

# Print the column names, non-null counts, dtypes and memory usage.
file.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 0 to 9
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Country 9 non-null object
1 Salary 9 non-null float64
2 No 9 non-null uint8
3 Yes 9 non-null uint8
dtypes: float64(1), object(1), uint8(2)
memory usage: 234.0+ bytes

# find categorical variables
# A column is treated as categorical when pandas stores it with object dtype.
categorical = [name for name, dtype in file.dtypes.items() if dtype == 'O']

n_categorical = len(categorical)
print('There are {} categorical variables\n'.format(n_categorical))


print('The categorical variables are :\n\n', categorical)

There are 1 categorical variables

The categorical variables are :

['Country']

# check missing values in categorical variables
# Uses `file`, the frame `categorical` was derived from; no `df` is defined
# anywhere in this script.
file[categorical].isnull().sum()

Country 0
dtype: int64

# view frequency counts of values in categorical variables
# Uses `file`, the frame `categorical` was derived from; no `df` is defined
# anywhere in this script.
# NOTE(review): if the same label appears twice in the counts, the raw values
# probably differ by stray whitespace - consider normalising with .str.strip().
for var in categorical:
    print(file[var].value_counts())

France 4
Spain 3
Germany 2
Germany 1
Name: Country, dtype: int64

# view frequency distribution of categorical variables
# Uses `file`, the frame `categorical` was derived from; no `df` is defined
# anywhere in this script.
for var in categorical:
    # np.float was removed in NumPy 1.24; the builtin float is its exact
    # replacement. Counts are divided by the total number of rows.
    print(file[var].value_counts() / float(len(file)))

France 0.4
Spain 0.3
Germany 0.2
Germany 0.1
Name: Country, dtype: float64

# find numerical variables
# Every column that is not stored with object dtype is treated as numerical.
numerical = [name for name, dtype in file.dtypes.items() if dtype != 'O']

n_numerical = len(numerical)
print('There are {} numerical variables\n'.format(n_numerical))

print('The numerical variables are :', numerical)

There are 3 numerical variables

The numerical variables are : ['Salary', 'No', 'Yes']


# Preview the first rows of the numerical columns.
# Uses `file`, the frame `numerical` was derived from; no `df` is defined
# anywhere in this script.
file[numerical].head()

Salary No Yes
0 44.0 1 0
1 27.0 0 1
2 30.0 1 0
3 38.0 1 0
4 40.0 0 1

# Count missing values per numerical column.
# Uses `file`, the frame `numerical` was derived from; no `df` is defined
# anywhere in this script.
file[numerical].isnull().sum()

Salary 1
No 0
Yes 0
dtype: int64

# NOTE(review): confusion_matrix is not used in the visible part of the script.
from sklearn.metrics import confusion_matrix

# Set 'Yes' to 1 on every row whose salary is positive.
# NOTE(review): this overwrites the encoded purchase label for those rows -
# confirm that this is intended.
positive_salary = file['Salary'] > 0
file.loc[positive_salary, 'Yes'] = 1


# Histogram of salaries over 200 bins; `b` holds the value returned by hist().
salary_values = file['Salary']
b = plot.hist(salary_values, bins=200)

You might also like