You are on page 1 of 6

import pandas as pd

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from scipy.stats import zscore
from scipy.stats import zscore, skew, shapiro, probplot
from scipy.stats import zscore, skew, shapiro, probplot
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset from a CSV file given by a relative path.
data = pd.read_csv("Test_Data.csv")

# Notebook display of the loaded frame (its printed output follows).
data

age sex bmi health_gradient smoker region


children
0 40.000000 male 29.900000 35760.40000 no southwest
2.0
1 47.000000 male 32.300000 49034.63000 no southwest
1.0
2 54.000000 female 28.880000 45038.93760 no northeast
2.0
3 NaN male 30.568094 0.00000 no northeast
3.0
4 59.130049 male 33.132854 64912.13924 yes northeast
4.0
.. ... ... ... ... ... ...
...
487 51.000000 male 27.740000 39244.88760 no northeast
5.0
488 33.000000 male 42.400000 59326.08000 no southwest
5.0
489 47.769999 male 29.064615 40353.79402 no northeast
5.0
490 41.530738 female 24.260852 24444.53324 no southeast
5.0
491 36.000000 male 33.400000 40160.16000 yes southwest
5.0

[492 rows x 7 columns]

# Count the missing (NaN) values in each column.
data.isnull().sum()

age 1
sex 0
bmi 1
health_gradient 0
smoker 0
region 0
children 1
dtype: int64

# Preview the first rows of every column, one column at a time.
# The two print calls must be indented to belong to the loop body; without
# the indentation the cell raises IndentationError.
for column in data.columns:
    print(f"\nColumn: {column}")
    print(data[column].head())

Column: age
0 40.000000
1 47.000000
2 54.000000
3 NaN
4 59.130049
Name: age, dtype: float64

Column: sex
0 male
1 male
2 female
3 male
4 male
Name: sex, dtype: object

Column: bmi
0 29.900000
1 32.300000
2 28.880000
3 30.568094
4 33.132854
Name: bmi, dtype: float64

Column: health_gradient
0 35760.40000
1 49034.63000
2 45038.93760
3 0.00000
4 64912.13924
Name: health_gradient, dtype: float64

Column: smoker
0 no
1 no
2 no
3 no
4 yes
Name: smoker, dtype: object

Column: region
0 southwest
1 southwest
2 northeast
3 northeast
4 northeast
Name: region, dtype: object

Column: children
0 2.0
1 1.0
2 2.0
3 3.0
4 4.0
Name: children, dtype: float64

# Fill missing values in the non-numeric columns with each column's mode.
handle_missing_values_categorical = SimpleImputer(strategy='most_frequent')
data_categorical = data.select_dtypes(exclude='number')
# fit_transform learns the most frequent value of every selected column and
# replaces that column's missing entries with it. The assignment is wrapped
# in parentheses so that it can legally span several lines.
data[data_categorical.columns] = (
    handle_missing_values_categorical.fit_transform(data_categorical)
)

# Fill missing values in the numeric columns with each column's mean.
handle_missing_values_numeric_mean = SimpleImputer(strategy='mean')
data_numeric = data.select_dtypes(include='number')
# fit_transform computes the mean of every numeric column and writes it into
# that column's missing entries. The assignment is wrapped in parentheses so
# that it can legally span several lines.
data[data_numeric.columns] = (
    handle_missing_values_numeric_mean.fit_transform(data_numeric)
)

# Median imputation for the numeric columns.
# NOTE: the mean imputation that precedes this step has already filled every
# numeric NaN, so as written this pass finds nothing left to replace. To
# actually use the median, run this step instead of the mean step, not after.
handle_missing_values_numeric_median = SimpleImputer(strategy='median')
data_numeric = data.select_dtypes(include='number')
# fit_transform computes the median of every numeric column and substitutes
# it for any missing entries in that column.
data[data_numeric.columns] = (
    handle_missing_values_numeric_median.fit_transform(data_numeric)
)

print(data)

age sex bmi health_gradient smoker region


children
0 40.000000 male 29.900000 35760.40000 no southwest
2.0
1 47.000000 male 32.300000 49034.63000 no southwest
1.0
2 54.000000 female 28.880000 45038.93760 no northeast
2.0
3 38.844276 male 30.568094 0.00000 no northeast
3.0
4 59.130049 male 33.132854 64912.13924 yes northeast
4.0
.. ... ... ... ... ... ...
...
487 51.000000 male 27.740000 39244.88760 no northeast
5.0
488 33.000000 male 42.400000 59326.08000 no southwest
5.0
489 47.769999 male 29.064615 40353.79402 no northeast
5.0
490 41.530738 female 24.260852 24444.53324 no southeast
5.0
491 36.000000 male 33.400000 40160.16000 yes southwest
5.0

[492 rows x 7 columns]

# Standardise every numeric column: column-wise z-scores (axis=0).
numeric_data = data.select_dtypes(include='number')
z_scores = zscore(numeric_data, axis=0)

# Flag every value whose z-score lies outside the [-3, 3] band.
outliers = np.abs(z_scores) > 3

# Replace the flagged values with NaN; all other values are kept unchanged.
data_no_outliers = numeric_data.mask(outliers, np.nan)

# Preview the first rows of every numeric column after outlier masking.
# The two print calls must be indented to belong to the loop body; without
# the indentation the cell raises IndentationError.
for column in data_no_outliers.columns:
    print(f"\nColumn: {column}")
    print(data_no_outliers[column].head())

Column: age
0 40.000000
1 47.000000
2 54.000000
3 38.844276
4 59.130049
Name: age, dtype: float64

Column: bmi
0 29.900000
1 32.300000
2 28.880000
3 30.568094
4 33.132854
Name: bmi, dtype: float64
Column: health_gradient
0 35760.40000
1 49034.63000
2 45038.93760
3 0.00000
4 64912.13924
Name: health_gradient, dtype: float64

Column: children
0 NaN
1 NaN
2 NaN
3 NaN
4 4.0
Name: children, dtype: float64

# Sample skewness of the 'age' column after outliers were masked to NaN.
# NOTE(review): the message says "before transformation", but the transform
# visible later in this section is applied to 'health_gradient', not 'age' —
# confirm which column was meant to be measured.
skew_before = data_no_outliers['age'].skew()
print(f"\nSkewness before transformation: {skew_before}")

Skewness before transformation: 0.0453252970458881

# Notebook display of the current frame (its printed output follows).
data

age sex bmi health_gradient smoker region


children
0 40.000000 male 29.900000 35760.40000 no southwest
2.0
1 47.000000 male 32.300000 49034.63000 no southwest
1.0
2 54.000000 female 28.880000 45038.93760 no northeast
2.0
3 38.844276 male 30.568094 0.00000 no northeast
3.0
4 59.130049 male 33.132854 64912.13924 yes northeast
4.0
.. ... ... ... ... ... ...
...
487 51.000000 male 27.740000 39244.88760 no northeast
5.0
488 33.000000 male 42.400000 59326.08000 no southwest
5.0
489 47.769999 male 29.064615 40353.79402 no northeast
5.0
490 41.530738 female 24.260852 24444.53324 no southeast
5.0
491 36.000000 male 33.400000 40160.16000 yes southwest
5.0

[492 rows x 7 columns]


# Square-root transform of 'health_gradient', overwriting the column in `data`.
# NOTE(review): this operates on `data`, not on `data_no_outliers`, so the
# outlier masking done earlier has no effect on the transformed column —
# confirm that this is intended.
data['health_gradient'] = np.sqrt(data['health_gradient'])

# Notebook display of the transformed frame (its printed output follows).
data

age sex bmi health_gradient smoker region


children
0 40.000000 male 29.900000 189.104204 no southwest
2.0
1 47.000000 male 32.300000 221.437644 no southwest
1.0
2 54.000000 female 28.880000 212.223791 no northeast
2.0
3 38.844276 male 30.568094 0.000000 no northeast
3.0
4 59.130049 male 33.132854 254.778608 yes northeast
4.0
.. ... ... ... ... ... ...
...
487 51.000000 male 27.740000 198.103225 no northeast
5.0
488 33.000000 male 42.400000 243.569456 no southwest
5.0
489 47.769999 male 29.064615 200.882538 no northeast
5.0
490 41.530738 female 24.260852 156.347476 no southeast
5.0
491 36.000000 male 33.400000 200.400000 yes southwest
5.0

[492 rows x 7 columns]

You might also like