You are on page 1 of 21

Naan Mudhalvan - Project

Project Title: Applied Data Science

Project Title: A-Reliable-Energy-Consumption-Analysis-System-For-Energy-Efficient-Appliances

Team ID : NM2023TMID01842

Team Size : 4

Team Leader : SANJAY K

Team member : DHAYANITHI P

Team member : MATHIVANAN N

Team member : HEYRAM T

College Name: Adhiparasakthi College of Engineering.

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The file is semicolon-delimited (the header printed by df.head() is a single
# 'Date;Time;Global_active_power;...' string and df.shape is (930143, 1)).
# With the default comma separator every row lands in one column, which is why
# every df['<column>'] lookup further down raises KeyError.
df = pd.read_csv(
    'household_power_consumption.txt',
    sep=';',
    # NOTE(review): the UCI copy of this dataset marks missing readings with
    # '?' — confirm for this file; harmless if no '?' is present.
    na_values='?',
    low_memory=False,
)

df.head()
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity

df.shape

(930143, 1)

Univariate Analysis

Histogram

sns.histplot(df['Date'])
----------------------------------------------------------------------
-----
KeyError Traceback (most recent call
last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in
get_loc(self, key, method, tolerance)
3801 try:
-> 3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:

4 frames
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Date'

The above exception was the direct cause of the following exception:

KeyError Traceback (most recent call


last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in
get_loc(self, key, method, tolerance)
3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
-> 3804 raise KeyError(key) from err
3805 except TypeError:
3806 # If we have a listlike key,
_check_indexing_error will raise

KeyError: 'Date'
# Pass the column by keyword: countplot's first positional parameter is
# `data`, not `x`, so a bare Series is not interpreted as the variable to count.
sns.countplot(x=df['Date'])

----------------------------------------------------------------------
-----
NameError Traceback (most recent call
last)
<ipython-input-160-f96959050710> in <cell line: 1>()
----> 1 sns.countplot(data['Date'])

NameError: name 'data' is not defined

SEARCH STACK OVERFLOW


x = df['Date'].value_counts()
plt.pie(x.values,
        labels=x.index,
        autopct='%1.1f%%')
plt.show()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in
get_loc(self, key, method, tolerance)
3801 try:
-> 3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:

4 frames
pandas/_libs/hashtable_class_helper.pxi in
pandas libs hashtable PyObjectHashTable get item()

Bivariate Analysis

# `data` is never defined in this notebook (NameError in the cell output);
# the loaded frame is named `df`.
sns.scatterplot(x=df['Voltage'],
                y=df['Date'])

---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-166-f93113986f18> in <cell line: 1>()
----> 1 sns.scatterplot(x=data['Voltage'],
2 y=data['Date'])

NameError: name 'data' is not defined

SEARCH STACK OVERFLOW

import matplotlib.pyplot as plt
plt.figure(figsize=(15, 5))
# `data` is never defined in this notebook (NameError in the cell output);
# the loaded frame is named `df`.
sns.barplot(x=df['Date'], y=df['Voltage'])
# Rotation is an angle in degrees; pass a number rather than the string '90'.
plt.xticks(rotation=90)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-163-46fdf18b99e8> in <cell line: 3>()
1 import matplotlib.pyplot as plt
2 plt.figure(figsize=(15, 5))
----> 3 sns.barplot(x=data['Date'], y=data['Voltage'])
4 plt.xticks(rotation='90')

NameError: name 'data' is not defined

SEARCH STACK OVERFLOW


<Figure size 1500x500 with 0 Axes>

sns.FacetGrid(df,hue="Global_reactive_power").map(plt.scatter,"Global_reactive_power","c

Multivariate Analysis

# NOTE(review): this cell runs on scikit-learn's bundled iris data, not on the
# household power frame `df` used by the rest of the notebook — confirm intent.
from sklearn import datasets, decomposition
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Reduce the iris features to two principal components and plot them,
# coloured by the class label in `y`.
pca = decomposition.PCA(n_components=2)
X = pca.fit_transform(X)
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y)

<Axes: >
# `data` is never defined in this notebook (NameError in the cell output);
# the loaded frame is named `df`. numeric_only=True keeps the text columns
# (Date, Time) out of the correlation matrix.
sns.heatmap(df.corr(numeric_only=True), annot=True)

---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-165-b699050ce883> in <cell line: 1>()
----> 1 sns.heatmap(data.corr(), annot=True)

NameError: name 'data' is not defined

SEARCH STACK OVERFLOW

sns.pairplot(df,hue="Voltage")

Descriptive Statistics

df.describe()

Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_inten

count

unique

top

freq

df['Date'].value_counts()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in
get_loc(self, key, method, tolerance)
3801 try:
-> 3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:

4 frames
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Date'

The above exception was the direct cause of the following exception:

KeyError Traceback (most recent call last)


df['Voltage'].value_counts().to_frame()
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in
get_loc(self, key, method, tolerance)
3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
-> 3804 raise KeyError(key) from err
3805 except TypeError:
3806 # If we have a listlike key, _check_indexing_error will
raise

KeyError: 'Date'

SEARCH STACK OVERFLOW

df['Global_active_power'].value_counts()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in
get_loc(self, key, method, tolerance)
3801 try:
-> 3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:

4 frames
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Global_active_power'
'Voltage'

The above exception was the direct cause of the following exception:

KeyError Traceback (most recent call last)


/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in
get_loc(self, key, method, tolerance)
3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
-> 3804 raise KeyError(key) from err
3805 except TypeError:
3806 # If we have a listlike key, _check_indexing_error will
raise

KeyError: 'Global_active_power'

SEARCH STACK OVERFLOW

Handling missing values

df.isnull()
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_inte

df.isnull().sum()

Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;Sub_me
0
dtype: int64

df.isnull().sum().sum()

filling null values

df2=df.fillna(value=0)
df

Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_inte

...

930138

930139

930140

930141

930142

930143 rows × 1 columns

df2.isnull().sum().sum()
0

# Fill null values with the previous valid value (forward fill).
# fillna(method='pad') is deprecated in recent pandas releases; ffill() is the
# direct equivalent.
df4 = df.ffill()
df4

Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_inte

...

930138

930139

930140

930141

930142

930143 rows × 1 columns

df4.isnull().sum()

Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;Sub_me
0
dtype: int64

df5=df.fillna({'Global_reactive_power':'4.216','Global_reactive_power':'0.418','Date':'1
df5
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_inte

...

930138

Finding the outliers

import pandas as pd

import numpy as np

import plotly.express as px

Histogram

fig = px.histogram(df, x='Global_active_power')

fig.show()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-141-b195d9dee47c> in <cell line: 1>()
----> 1 fig = px.histogram(df, x='Global_active_power')
2
3 fig.show()

3 frames
/usr/local/lib/python3.10/dist-packages/plotly/express/_core.py in
process_args_into_dataframe(args, wide_mode, var_name, value_name)
1206 if argument == "index":
1207 err_msg += "\n To use the index, pass it in
directly as `df.index`."
-> 1208 raise ValueError(err_msg)
1209 elif length and len(df_input[argument]) != length:
1210 raise ValueError(

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected
one of
['Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;
but received: Global_active_power

SEARCH STACK OVERFLOW

**Box plot

fig = px.box(df, y='Global_active_power')

fig.show()

*scatterplot*

fig = px.scatter(x=df['Global_active_power'], y=df['Global_reactive_power'])

fig.show()

def find_outliers_IQR(df):
    """Select the values of ``df`` that fall outside the 1.5 * IQR fences.

    Args:
        df: pandas Series or DataFrame of numeric values.

    Returns:
        ``df`` indexed by the boolean outlier mask: values below
        ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
    """
    lower_quartile = df.quantile(0.25)
    upper_quartile = df.quantile(0.75)
    spread = upper_quartile - lower_quartile

    too_low = df < (lower_quartile - 1.5 * spread)
    too_high = df > (upper_quartile + 1.5 * spread)

    return df[too_low | too_high]

outliers = find_outliers_IQR(df["Global_active_power"])

print("number of outliers: "+ str(len(outliers)))

print("max outlier value: "+ str(outliers.max()))

print("min outlier value: "+ str(outliers.min()))

outliers

outliers = find_outliers_IQR(df[["Global_active_power","Global_reactive_power"]])

outliers

**Drop the outliers

def drop_outliers_IQR(df):
    """Return ``df`` with the IQR outliers removed.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile.

    Args:
        df: pandas Series or DataFrame of numeric values.

    Returns:
        The non-outlier values after ``dropna().reset_index()``; the original
        index labels are kept in a new ``index`` column.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    IQR = q3 - q1

    not_outliers = df[~((df < (q1 - 1.5 * IQR)) | (df > (q3 + 1.5 * IQR)))]

    # Use the local result: the original read the global `outliers` here, so it
    # returned the outliers themselves (or raised NameError) instead of the
    # filtered data.
    outliers_dropped = not_outliers.dropna().reset_index()

    return outliers_dropped

**Cap the outliers

upper_limit = df['Global_active_power'].mean() + 3*df['Global_active_power'].std()

print(upper_limit)

lower_limit = df['Global_active_power'].mean() - 3*df['Global_active_power'].std()

print(lower_limit)

# Cap values above upper_limit / below lower_limit at the respective limit and
# leave everything in between unchanged.
# (The original expression never closed the outer np.where( call.)
df['Global_active_power'] = np.where(
    df['Global_active_power'] > upper_limit,
    upper_limit,
    np.where(
        df['Global_active_power'] < lower_limit,
        lower_limit,
        df['Global_active_power'],
    ),
)

df.describe()[['Global_active_power']]
# Output of the preceding df.describe()[['Global_active_power']] cell. The text
# extraction had interleaved it with the function below; it is kept here as
# comments so the function is valid Python again.
#
# ---------------------------------------------------------------------------
# KeyError Traceback (most recent call last)
# <ipython-input-137-db5887173d2e> in <cell line: 1>()
# ----> 1 df.describe()[['Global_active_power']]
#
# 2 frames
# /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in
# _raise_if_missing(self, key, indexer, axis_name)
# 6128 if use_interval_msg:
# 6129 key = list(key)
# -> 6130 raise KeyError(f"None of [{key}] are in the
# [{axis_name}]")
# 6131
# 6132 not_found = list(ensure_index(key)[missing_mask.nonzero()
# [0]].unique())
#
# KeyError: "None of [Index(['Global_active_power'], dtype='object')] are in the
# [columns]"
#
# SEARCH STACK OVERFLOW

# **Replace outliers using imputation as if they were missing values

def impute_outliers_IQR(df):
    """Replace IQR outliers with the mean, as if they were missing values.

    Args:
        df: pandas Series (or DataFrame) of numeric values.

    Returns:
        A numpy array shaped like ``df`` in which every value above the
        largest non-outlier or below the smallest non-outlier is replaced by
        ``df.mean()``.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    IQR = q3 - q1

    # Largest / smallest values that still lie inside the 1.5 * IQR fences.
    upper = df[~(df > (q3 + 1.5 * IQR))].max()
    lower = df[~(df < (q1 - 1.5 * IQR))].min()

    # The mean is taken over all values, outliers included, as in the original.
    mean = df.mean()
    return np.where(df > upper, mean, np.where(df < lower, mean, df))

df['Global_active_power'] = impute_outliers_IQR(df['Global_active_power'])

df.describe()['Global_active_power']
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in
get_loc(self, key, method, tolerance)
3801 try:
-> 3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:

4 frames
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
cat egorical column
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
# The file is semicolon-delimited; without sep=';' the whole row is read into
# a single column and every column lookup raises KeyError.
# NOTE(review): na_values='?' assumes missing readings are marked with '?', as
# in the UCI copy of this dataset — confirm for this file.
df = pd.read_csv('household_power_consumption.txt', sep=';', na_values='?', low_memory=False)
df5
KeyError: 'Global_active_power'

The above exception was the direct cause of the following exception:
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_inte
KeyError Traceback (most recent call last)
0
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in
get_loc(self,
1 key, method, tolerance)
3802 return self._engine.get_loc(casted_key)
3803
2 except KeyError as err:
-> 3804 raise KeyError(key) from err
3
3805 except TypeError:
3806 # If we have a listlike key, _check_indexing_error will
4
raise
...
KeyError: 'Global_active_power'
930138
SEARCH STACK OVERFLOW
930139

930140

930141

930142

930143 rows × 1 columns

df_numeric = df[['Global_active_power','Global_reactive_power','Date','Time']];
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-135-24092e91a73d> in <cell line: 1>()
----> 1 df_numeric =
df[['Global_active_power','Global_reactive_power','Date','Time']];

2 frames
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in
_raise_if_missing(self, key, indexer, axis_name)
6128 if use_interval_msg:
6129 key = list(key)
-> 6130
df_numeric.head(); raise KeyError(f"None of [{key}] are in the
[{axis_name}]")
6131
---------------------------------------------------------------------------
6132
NameError not_found = list(ensure_index(key)[missing_mask.nonzero()
Traceback (most recent call last)
[0]].unique())
<ipython-input-134-2f4b77347dae> in <cell line: 1>()
----> 1 df_numeric.head();
KeyError: "None of [Index(['Global_active_power', 'Global_reactive_power',
'Date', 'Time'],
NameError: dtype='object')]
name 'df_numeric' are
is not in the [columns]"
defined

SEARCH
SEARCH STACK
STACK OVERFLOW
OVERFLOW

print(df['Global_active_power'].unique())
print(df['Global_reactive_power'].unique())
print(df['Date'].unique())
print(df['Time'].unique())
print(df['Voltage'].unique())

**Encode the labels using label encoding

from sklearn.preprocessing import LabelEncoder

species_encoder = LabelEncoder()

species_encoder.fit(df_categorical['species'])

species_values = species_encoder.transform(df_categorical['species'])

print("Before Encoding:", list(df_categorical['species'][-10:]))
print("After Encoding:", species_values[-10:])
print("The inverse from the encoding result:", species_encoder.inverse_transform(species

island_encoder = LabelEncoder()
island_values =island_encoder.fit_transform(df_categorical['island'])

print("Before Encoding:", list(df_categorical['island'][:5]))
print("After Encoding:",island_values[:5])
print("The inverse from the encoding result:", island_encoder.inverse_transform(island_v
**splitting into dependent and independent variables

X = df.iloc[:, :-1].values
print(X)

Y = df.iloc[:, -1].values
print(Y)

**split into dependent and independent

x=df['Date']
y=df.drop(columns=['Voltage'],axis=1)

x.head()

y.head()

from sklearn.preprocessing import scale

# NOTE(review): 'Time' presumably holds clock-time text in this dataset, while
# scale() expects numeric input — confirm this cell runs as intended.
# NOTE(review): `x` is a single column and `y` is every column except
# 'Global_active_power'; verify which side is meant to be the target.
x=df['Time']
y=df.drop(columns=['Global_active_power'],axis=1)
x.head()

# Standardise x with sklearn.preprocessing.scale.
x=scale(x)
x

**Scale the data

from sklearn.preprocessing import scale

# The original scaled df['Date'], a text column that scale() cannot convert to
# float, and put every remaining column into `y`. Features are now the
# measurement columns; 'Date' and 'Time' are text and are left out.
# NOTE(review): assumes 'Voltage' — the column this cell sets aside — is the
# intended target; confirm.
x = df.drop(columns=['Date', 'Time', 'Voltage'])
y = df['Voltage']
x.head()

# Standardise every feature column to zero mean and unit variance.
x = scale(x)
x

x.mean()

x.std()

**perform any of the clustering algorithms


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Cluster on the numeric columns only: StandardScaler cannot convert text
# columns such as Date/Time to float, and KMeans rejects NaN, so rows with
# missing readings are left out of the fit.
numeric_df = df.select_dtypes(include='number').dropna()
scaled_df = StandardScaler().fit_transform(numeric_df)

print(scaled_df[:5])

kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "random_state": 1,
}

# Elbow method: record the within-cluster sum of squared errors for k = 1..10.
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(scaled_df)
    sse.append(kmeans.inertia_)

plt.plot(range(1, 11), sse)
plt.xticks(range(1, 11))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

# Final model with three clusters, using the same settings as the sweep above.
kmeans = KMeans(n_clusters=3, **kmeans_kwargs)
kmeans.fit(scaled_df)

kmeans.labels_

# Write the labels back only to the rows that were clustered; rows dropped for
# missing values keep NaN in the new column.
df.loc[numeric_df.index, 'cluster'] = kmeans.labels_
print(df)

**Split the data into training and testing

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

x_train

x_train.shape
x_test

x_test.shape

y_train

y_train.shape

y_test

y_test.shape

**Build the model

from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()

**Train the model

import numpy
import matplotlib.pyplot as plt
numpy.random.seed(2)

x = numpy.random.normal(3, 1, 100)
y = numpy.random.normal(150, 40, 100) / x

train_x = x[:80]
train_y = y[:80]

test_x = x[80:]
test_y = y[80:]

mymodel = numpy.poly1d(numpy.polyfit(train_x, train_y, 4))

myline = numpy.linspace(0, 6, 100)

plt.scatter(train_x, train_y)
plt.show()

**Test the model

plt.plot(myline, mymodel(myline))
plt.show()
**Performance using Evaluation Metrics.

from sklearn import metrics
from sklearn.metrics import mean_squared_error

For classification problems

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
X_actual = [1, 1, 0, 1, 0, 0, 1, 0, 0, 0]
Y_predic = [1, 0, 1, 1, 1, 0, 1, 1, 0, 0]
results = confusion_matrix(X_actual, Y_predic)
print ('Confusion Matrix :')
print(results)
print ('Accuracy Score is',accuracy_score(X_actual, Y_predic))
print ('Classification Report : ')
print (classification_report(X_actual, Y_predic))
print('AUC-ROC:',roc_auc_score(X_actual, Y_predic))
print('LOGLOSS Value is',log_loss(X_actual, Y_predic))

For regression problems

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
X_actual = [5, -1, 2, 10]
Y_predic = [3.5, -0.9, 2, 9.9]
print ('R Squared =',r2_score(X_actual, Y_predic))
print ('MAE =',mean_absolute_error(X_actual, Y_predic))
print ('MSE =',mean_squared_error(X_actual, Y_predic))

You might also like