You are on page 1of 18

Human Activity Recognition using Smartphone Data


Importing Necessary Libraries
In [1]: # Import necessary libraries
import numpy as np
import pandas as pd

import seaborn as sns


import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from collections import Counter

from sklearn.manifold import TSNE


from sklearn.decomposition import PCA

from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LogisticRegression


from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classificat

Loading the Dataset


In [3]: # Read the training data from a CSV file into a pandas DataFrame
train = pd.read_csv('dataset/train.csv')

# Read the test data from a CSV file into a pandas DataFrame
test = pd.read_csv('dataset/test.csv')

In [4]: # Display the first few rows of the training dataset


train.head()
Out[4]: tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc
mean()-X mean()-Y mean()-Z std()-X std()-Y std()-Z mad()-X
0 0.288585 -0.020294 -0.132905 -0.995279 -0.983111 -0.913526 -0.995112
1 0.278419 -0.016411 -0.123520 -0.998245 -0.975300 -0.960322 -0.998807
2 0.279653 -0.019467 -0.113462 -0.995380 -0.967187 -0.978944 -0.996520
3 0.279174 -0.026201 -0.123283 -0.996091 -0.983403 -0.990675 -0.997099
4 0.276629 -0.016570 -0.115362 -0.998139 -0.980817 -0.990482 -0.99832
5 rows × 563 columns
In [5]: # Display the first few rows of the test dataset
test.head()

Out[5]: tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc


mean()-X mean()-Y mean()-Z std()-X std()-Y std()-Z mad()-X
0 0.257178 -0.023285 -0.014654 -0.938404 -0.920091 -0.667683 -0.95250
1 0.286027 -0.013163 -0.119083 -0.975415 -0.967458 -0.944958 -0.986799
2 0.275485 -0.026050 -0.118152 -0.993819 -0.969926 -0.962748 -0.994403
3 0.270298 -0.032614 -0.117520 -0.994743 -0.973268 -0.967091 -0.995274
4 0.274833 -0.027848 -0.129527 -0.993852 -0.967445 -0.978295 -0.99411
5 rows × 563 columns
In [6]: # Display information about the training dataset, including data types an
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7352 entries, 0 to 7351
Columns: 563 entries, tBodyAcc-mean()-X to Activity
dtypes: float64(561), int64(1), object(1)
memory usage: 31.6+ MB

In [7]: # Count the occurrences of each unique value in the 'subject' column of t
train['subject'].value_counts()
Out[7]: subject
25 409
21 408
26 392
30 383
28 382
27 376
23 372
17 368
16 366
19 360
1 347
29 344
3 341
15 328
6 325
14 323
22 321
11 316
7 308
5 302
8 281
Name: count, dtype: int64

Data Preprocessing
Checking for Duplicate values
In [9]: # Print the number of duplicate rows in the training dataset
print('Number of duplicates in train :', train.duplicated().sum())

# Print the number of duplicate rows in the test dataset


print('Number of duplicates in test :', test.duplicated().sum())

Number of duplicates in train : 0


Number of duplicates in test : 0

Checking for Missing Values


In [12]: # Print the number of missing values in the training dataset
print('Number of missing values in train :', train.isna().values.sum())

# Print the number of missing values in the test dataset


print('Number of missing values in test :', test.isna().values.sum())

Number of missing values in train : 0


Number of missing values in test : 0

Checking for Activity Imbalance


In [18]: # Set up the figure size for the plot
plt.figure(figsize=(10, 6))

# Set the title of the plot


plt.title('Barplot of Activity')
# Create a countplot of the 'Activity' column from the training dataset
sns.countplot(train['Activity'], order=train['Activity'].value_counts().i

Out[18]: <Axes: title={'center': 'Barplot of Activity'}, xlabel='count', ylabel


='Activity'>

There is almost the same number of observations across all six activities, so we
can say that there is no significant class imbalance.

Exploratory Data Analysis


What are the main features from the dataset?
In [19]: train.head()

Out[19]: tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc


mean()-X mean()-Y mean()-Z std()-X std()-Y std()-Z mad()-X
0 0.288585 -0.020294 -0.132905 -0.995279 -0.983111 -0.913526 -0.995112
1 0.278419 -0.016411 -0.123520 -0.998245 -0.975300 -0.960322 -0.998807
2 0.279653 -0.019467 -0.113462 -0.995380 -0.967187 -0.978944 -0.996520
3 0.279174 -0.026201 -0.123283 -0.996091 -0.983403 -0.990675 -0.997099
4 0.276629 -0.016570 -0.115362 -0.998139 -0.980817 -0.990482 -0.99832
5 rows × 563 columns
In [23]: # Extract the part of the column names before the first '-' and before th
# count the occurrences of each unique part.
# Convert the Counter dictionary to a pandas DataFrame, setting the orien
# and renaming the column to 'count'.
# Sort the DataFrame by the 'count' column in descending order
pd.DataFrame.from_dict(Counter([col.split('-')[0].split('(')[0] for col i
orient='index').rename(columns={0:'count'}).sort_v

Out[23]: count
fBodyAcc 79
fBodyGyro 79
fBodyAccJerk 79
tGravityAcc 40
tBodyAcc 40
tBodyGyroJerk 40
tBodyGyro 40
tBodyAccJerk 40
tBodyAccMag 13
tGravityAccMag 13
tBodyAccJerkMag 13
tBodyGyroMag 13
tBodyGyroJerkMag 13
fBodyAccMag 13
fBodyBodyAccJerkMag 13
fBodyBodyGyroMag 13
fBodyBodyGyroJerkMag 13
angle 7
subject 1
Activity 1

Analyzing tBodyAccMag-mean feature


In [32]: # Create a FacetGrid with the training dataset, specifying 'Activity' as
# and setting height, and aspect parameters
facetgrid = sns.FacetGrid(train, hue='Activity', height=5, aspect=3)

# Map a kernel density estimate (KDE) plot onto the FacetGrid for the col
# setting hist=False to plot a smooth curve, and add legend
facetgrid.map(sns.distplot, 'tBodyAccMag-mean()', hist=False).add_legend(

# Annotate regions indicating static activities


plt.annotate('Static Activities', xy=(-.98,8), xytext=(-.8,16), arrowprop
plt.annotate('Static Activities', xy=(-.98,13), xytext=(-.8,16), arrowpro
plt.annotate('Static Activities', xy=(-.98,16), xytext=(-.8,16), arrowpro

# Annotate regions indicating dynamic activities


plt.annotate('Dynamic Activities', xy=(-0.2,3.25), xytext=(0.1,9), arrowp
plt.annotate('Dynamic Activities', xy=(0.1,2.18), xytext=(0.1,9), arrowpr
plt.annotate('Dynamic Activities', xy=(-0.01,2.15), xytext=(0.1,9), arrow
Out[32]: Text(0.1, 9, 'Dynamic Activities')

In [37]: # Set up the figure size for the plot


plt.figure(figsize=(10, 7))

# Create a boxplot showing the distribution of 'tBodyAccMag-mean()' colum


sns.boxplot(x='Activity', y='tBodyAccMag-mean()', data=train, showfliers=

# Rotate the x-axis labels for better readability


plt.xticks(rotation=90)

# Set the label for the y-axis


plt.ylabel('Body Acceleration Magnitude Mean')

# Set the title of the plot


plt.title('Boxplot of tBodyAccMag-mean() column across various Activities

# Add horizontal dashed lines to indicate specific values on the y-axis


plt.axhline(y=-0.8, xmin=0.05, dashes=(3, 3))
plt.axhline(y=0.0, xmin=0.35, dashes=(3, 3))

# Display the plot


plt.show()
Analyzing angle between X-axis and gravityMean feature
In [39]: # Set up the figure size for the plot
plt.figure(figsize=(10, 7))

# Create a boxplot showing the distribution of 'angle(X,gravityMean)' col


sns.boxplot(x='Activity', y='angle(X,gravityMean)', data=train, showflier

# Add a horizontal dashed line at y=0.08, spanning from 10% to 90% of the
plt.axhline(y=0.08, xmin=0.1, xmax=0.9, dashes=(3, 3))

# Set the label for the y-axis


plt.ylabel("Angle between X-axis and gravityMean")

# Set the title of the plot


plt.title('Box plot of angle(X,gravityMean) column across various activit

# Rotate the x-axis labels for better readability


plt.xticks(rotation=90)

# Display the plot


plt.show()
Analyzing angle between Y-axis and gravityMean feature
In [40]: # Set up the figure size for the plot
plt.figure(figsize=(10, 7))

# Create a boxplot showing the distribution of 'angle(Y,gravityMean)' col


sns.boxplot(x='Activity', y='angle(Y,gravityMean)', data=train, showflier

# Add a horizontal dashed line at y=-0.35, starting from 1% of the x-axis


plt.axhline(y=-0.35, xmin=0.01, dashes=(3, 3))

# Set the label for the y-axis


plt.ylabel("Angle between Y-axis and gravityMean")

# Set the title of the plot


# Title must name the Y-axis angle: this cell plots 'angle(Y,gravityMean)'
plt.title('Box plot of angle(Y,gravityMean) column across various activities')

# Rotate the x-axis labels for better readability


plt.xticks(rotation=90)

# Display the plot


plt.show()
Visualizing Data using PCA
In [41]: # Prepare the data for PCA by dropping 'subject' and 'Activity' columns f
x_for_pca = train.drop(['subject', 'Activity'], axis=1)

# Perform PCA with 2 components on the prepared data


pca = PCA(n_components=2, random_state=0).fit_transform(x_for_pca)

In [42]: # Show PCA values


pca

Out[42]: array([[-5.5202803 , -0.29027701],


[-5.53534954, -0.08253011],
[-5.47498801, 0.28738703],
...,
[ 5.85750527, -3.08184312],
[ 5.42109482, -3.42643002],
[ 5.49797027, -2.78992867]])

In [45]: # Set up the figure size for the plot


plt.figure(figsize=(12, 8))

# Create a scatter plot of the PCA-transformed data, with the first compo
# Color the points based on the 'Activity' column from the training datas
sns.scatterplot(x=pca[:, 0], y=pca[:, 1], hue=train['Activity'])
# Display the plot
plt.show()

Visualizing Data using TSNE


In [46]: # Prepare the data for t-SNE by dropping 'subject' and 'Activity' columns
x_for_tsne = train.drop(['subject', 'Activity'], axis=1)

# Perform t-SNE with 2 components on the prepared data


tsne = TSNE(n_components=2, random_state=0, n_iter=1000).fit_transform(x_

In [47]: # Show TSNE Values


tsne

Out[47]: array([[ -6.5846224, -62.56863 ],


[-26.474825 , -17.497314 ],
[-28.395891 , -20.305143 ],
...,
[ 23.191902 , -61.020435 ],
[ 23.180101 , -60.762993 ],
[ 23.685352 , -60.009796 ]], dtype=float32)

In [48]: # Set up the figure size for the plot


plt.figure(figsize=(12, 8))

# Create a scatter plot of the t-SNE-transformed data, with the first com
# Color the points based on the 'Activity' column from the training datas
sns.scatterplot(x=tsne[:,0], y=tsne[:,1], hue=train['Activity'])

# Display the plot


plt.show()
ML Models
Getting training and testing data ready
In [49]: # Separate the features from the target labels in the training dataset
X_train = train.drop(['subject', 'Activity'], axis=1) # Features (indepe
y_train = train['Activity'] # Target labels (dependent variable) for tra

# Separate the features from the target labels in the testing dataset
X_test = test.drop(['subject', 'Activity'], axis=1) # Features (independ
y_test = test['Activity'] # Target labels (dependent variable) for testi

In [50]: # Print the size (shape) of the training dataset


print('Training Data Size :', X_train.shape)

# Print the size (shape) of the testing dataset


print('Testing Data Size :', X_test.shape)

Training Data Size : (7352, 561)


Testing Data Size : (999, 561)

Logistic Regression model with hyperparameter tuning and cross validation
In [51]: # Define the parameters grid for tuning the logistic regression model
parameters = {'max_iter': [100, 200, 500]}

# Create a logistic regression classifier


lr_classifier = LogisticRegression()

# Create a RandomizedSearchCV object to find the best parameters for the


lr_classifier_rs = RandomizedSearchCV(lr_classifier, param_distributions=
# Fit the RandomizedSearchCV object on the training data to find the best
lr_classifier_rs.fit(X_train, y_train)

# Make predictions on the testing data using the tuned logistic regressio
y_pred_lr = lr_classifier_rs.predict(X_test)

In [53]: # Compute the accuracy score by comparing the predicted labels (y_pred_lr
lr_accuracy = accuracy_score(y_test, y_pred_lr)

# Print the accuracy score obtained using logistic regression


print('Accuracy Score Using Logistic Regression :', round(lr_accuracy, 2

Accuracy Score Using Logistic Regression : 0.96

In [62]: # function to plot confusion matrix


def plot_confusion_matrix(cm, labels):
    """Draw a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    cm : 2-D array
        Matrix of counts; rows are labelled 'True label' and columns
        'Predicted label'. Must expose ``.shape``, ``.max()`` and 2-D indexing.
    labels : sequence
        Tick labels applied to both axes. Its length must match the
        dimensions of ``cm``, in the same order as the rows/columns.

    Returns
    -------
    None
        The figure is created on the current matplotlib backend.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=labels, yticklabels=labels,
           ylabel='True label',
           xlabel='Predicted label')
    plt.xticks(rotation=90)

    # Cells darker than half the maximum count get light text so the
    # annotation stays readable against the Blues colormap.
    # NOTE(review): the original line was cut off after `color="w`; the
    # conditional below is reconstructed from the `thresh` variable — confirm.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, int(cm[i, j]), ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()

In [63]: # Compute the confusion matrix


# Build the label set from both true and predicted values so the matrix axes
# and the tick labels always agree, even if a class is never predicted.
labels_lr = np.unique(np.concatenate([y_test.values, y_pred_lr]))
cm = confusion_matrix(y_test.values, y_pred_lr, labels=labels_lr)

# Visualize the confusion matrix for logistic regression
plot_confusion_matrix(cm, labels_lr)
In [66]: #function to get best random search attributes
def get_best_randomsearch_results(model):
    """Print the best estimator and best parameter set of a fitted search.

    Parameters
    ----------
    model : object
        Any object exposing ``best_estimator_`` and ``best_params_``
        attributes (in this notebook, a fitted ``RandomizedSearchCV``).

    Returns
    -------
    None
        The two values are written to standard output.
    """
    print("Best estimator : ", model.best_estimator_)
    print("Best set of parameters : ", model.best_params_)

In [67]: # getting best random search attributes for logistic regression


get_best_randomsearch_results(lr_classifier_rs)

Best estimator : LogisticRegression(max_iter=200)


Best set of parameters : {'max_iter': 200}

Kernel SVM with Hyperparameter Tuning and Cross Validation
In [58]: # Define the set of hyperparameters for tuning the SVM classifier
parameters = {
'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
'C': [100, 50]
}

# Create a RandomizedSearchCV object to find the best hyperparameters for


svm_rs = RandomizedSearchCV(SVC(), param_distributions=parameters, cv=3,

# Perform randomized search to find the best hyperparameters


svm_rs.fit(X_train, y_train)

# Print out the best estimator and the best set of parameters found by Ra
get_best_randomsearch_results(svm_rs)
Out[58]: ▸ RandomizedSearchCV
▸ estimator: SVC

▸ SVC

In [59]: # Make predictions on the testing data using the tuned SVM model
y_pred_svm = svm_rs.predict(X_test)

In [60]: # Compute the accuracy score by comparing the predicted labels (y_pred_sv
svm_accuracy = accuracy_score(y_test, y_pred_svm)

# Print the accuracy score obtained using the SVM model


print('Accuracy Score Using SVM: ', round(svm_accuracy, 2))

Accuracy Score Using SVM: 0.97

In [64]: # Compute the confusion matrix


# Build the label set from both true and predicted values so the matrix axes
# and the tick labels always agree, even if a class is never predicted.
labels_svm = np.unique(np.concatenate([y_test.values, y_pred_svm]))
cm = confusion_matrix(y_test.values, y_pred_svm, labels=labels_svm)

# Visualize the confusion matrix for the SVM model
plot_confusion_matrix(cm, labels_svm)

In [68]: # Call the get_best_randomsearch_results function to print out the best e


get_best_randomsearch_results(svm_rs)

Best estimator : SVC(C=50)


Best set of parameters : {'kernel': 'rbf', 'C': 50}
Decision Tree Model with Hyperparameter Tuning and Cross Validation
In [78]: # Define the set of hyperparameters for tuning the Decision Tree classifi
parameters = {
'max_depth': np.arange(2, 10, 2),
'min_samples_split': [2, 4, 6, 8],
'min_samples_leaf': [1, 3, 5, 7, 9]
}

# Create a Decision Tree classifier


dt_classifier = DecisionTreeClassifier()

# Create a RandomizedSearchCV object to find the best hyperparameters for


dt_classifier_rs = RandomizedSearchCV(dt_classifier, param_distributions=

# Perform randomized search to find the best hyperparameters


dt_classifier_rs.fit(X_train, y_train)

# Print out the best estimator and the best set of parameters found by Ra
get_best_randomsearch_results(dt_classifier_rs)

Out[78]: ▸ RandomizedSearchCV
▸ estimator: DecisionTreeClassifier

▸ DecisionTreeClassifier

In [79]: # Make predictions on the testing data using the tuned Decision Tree mode
y_pred_dt = dt_classifier_rs.predict(X_test)

In [80]: # Compute the accuracy score by comparing the predicted labels (y_pred_dt
dt_accuracy = accuracy_score(y_test, y_pred_dt)

# Print the accuracy score obtained using the Decision Tree model
print('Accuracy Score Using Decision Tree :', round(dt_accuracy, 2))

Accuracy Score Using Decision Tree : 0.85

In [81]: # Compute the confusion matrix


# Build the label set from both true and predicted values so the matrix axes
# and the tick labels always agree, even if a class is never predicted.
labels_dt = np.unique(np.concatenate([y_test.values, y_pred_dt]))
cm = confusion_matrix(y_test.values, y_pred_dt, labels=labels_dt)

# Visualize the confusion matrix for the Decision Tree model
plot_confusion_matrix(cm, labels_dt)
In [82]: # Call the get_best_randomsearch_results function to print out the best e
get_best_randomsearch_results(dt_classifier_rs)

Best estimator : DecisionTreeClassifier(max_depth=8, min_samples_leaf=5,


min_samples_split=6)
Best set of parameters : {'min_samples_split': 6, 'min_samples_leaf': 5,
'max_depth': 8}

Random Forest Model with Hyperparameter Tuning and


Cross Validation
In [83]: # Define the set of hyperparameters for tuning the Random Forest classifi
parameters = {
'n_estimators': np.arange(20, 101, 10),
'max_depth': np.arange(2, 17, 2)
}

# Create a Random Forest classifier


rf_classifier = RandomForestClassifier()

# Create a RandomizedSearchCV object to find the best hyperparameters for


rf_classifier_rs = RandomizedSearchCV(rf_classifier, param_distributions=

# Perform randomized search to find the best hyperparameters


rf_classifier_rs.fit(X_train, y_train)

# Print out the best estimator and the best set of parameters found by Ra
get_best_randomsearch_results(rf_classifier_rs)
Out[83]: ▸ RandomizedSearchCV
▸ estimator: RandomForestClassifier

▸ RandomForestClassifier

In [84]: # Make predictions on the testing data using the tuned Random Forest mode
y_pred_rf = rf_classifier_rs.predict(X_test)

In [85]: # Compute the accuracy score by comparing the predicted labels (y_pred_rf
rf_accuracy = accuracy_score(y_test, y_pred_rf)

# Print the accuracy score obtained using the Random Forest model
print('Accuracy Score Using Random Forest :', round(rf_accuracy, 2))

Accuracy Score Using Random Forest : 0.92

In [86]: # Compute the confusion matrix


# Build the label set from both true and predicted values so the matrix axes
# and the tick labels always agree, even if a class is never predicted.
labels_rf = np.unique(np.concatenate([y_test.values, y_pred_rf]))
cm = confusion_matrix(y_test.values, y_pred_rf, labels=labels_rf)

# Visualize the confusion matrix for the Random Forest model
plot_confusion_matrix(cm, labels_rf)

In [87]: # Call the get_best_randomsearch_results function to print out the best e


get_best_randomsearch_results(rf_classifier_rs)

Best estimator : RandomForestClassifier(max_depth=12, n_estimators=70)


Best set of parameters : {'n_estimators': 70, 'max_depth': 12}
In [89]: print('--------------------------------------')
print('Accuracy Scores for all the models :')
print('--------------------------------------')
print('Logistic Regression :', round(lr_accuracy,2))
print('Support Vector Classifier :', round(svm_accuracy,2))
print('Decision Tree Classifier :', round(dt_accuracy,2))
print('Random Forest Classifier :', round(rf_accuracy,2))
print('--------------------------------------')

--------------------------------------
Accuracy Scores for all the models :
--------------------------------------
Logistic Regression : 0.96
Support Vector Classifier : 0.97
Decision Tree Classifier : 0.85
Random Forest Classifier : 0.92
--------------------------------------

Conclusion:
In summary, the Support Vector Classifier appears to be the best-performing
model based on the provided accuracy scores. However, further evaluation
metrics such as precision, recall, and F1-score could provide additional
insights into the models' performance, especially in scenarios with
imbalanced classes or different priorities for false positives and false
negatives. Additionally, it's essential to consider the computational
complexity and interpretability of each model when selecting the final model
for deployment.
In [ ]:

You might also like