Human Activity Recognition Using Smartphone Data

Human Activity Recognition using
Smartphone Data

Importing Necessary Libraries
In [1]: # Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
from sklearn.manifold import TSNE

from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classificat
Loading the Dataset

In [3]: # Read the training data from a CSV file into a pandas DataFrame
train = pd.read_csv('dataset/train.csv')
# Read the test data from a CSV file into a pandas DataFrame
test = pd.read_csv('dataset/test.csv')
In [4]: # Display the first few rows of the training dataset

train.head()
Out[4]: tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc
mean()-X mean()-Y mean()-Z std()-X std()-Y std()-Z mad()-X
0 0.288585 -0.020294 -0.132905 -0.995279 -0.983111 -0.913526 -0.995112
1 0.278419 -0.016411 -0.123520 -0.998245 -0.975300 -0.960322 -0.998807
2 0.279653 -0.019467 -0.113462 -0.995380 -0.967187 -0.978944 -0.996520
3 0.279174 -0.026201 -0.123283 -0.996091 -0.983403 -0.990675 -0.997099
4 0.276629 -0.016570 -0.115362 -0.998139 -0.980817 -0.990482 -0.99832
5 rows × 563 columns
In [5]: # Display the first few rows of the test dataset
test.head()

0 0.257178 -0.023285 -0.014654 -0.938404 -0.920091 -0.667683 -0.95250
1 0.286027 -0.013163 -0.119083 -0.975415 -0.967458 -0.944958 -0.986799
2 0.275485 -0.026050 -0.118152 -0.993819 -0.969926 -0.962748 -0.994403
3 0.270298 -0.032614 -0.117520 -0.994743 -0.973268 -0.967091 -0.995274
4 0.274833 -0.027848 -0.129527 -0.993852 -0.967445 -0.978295 -0.99411
In [6]: # Display information about the training dataset, including data types an
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7352 entries, 0 to 7351
Columns: 563 entries, tBodyAcc-mean()-X to Activity
dtypes: float64(561), int64(1), object(1)
memory usage: 31.6+ MB
In [7]: # Count the occurrences of each unique value in the 'subject' column of t
train['subject'].value_counts()
Out[7]: subject
25 409
21 408
26 392
30 383
28 382
27 376
23 372
17 368
16 366
19 360
1 347
29 344
3 341
15 328
6 325
14 323
22 321
11 316
7 308
5 302
8 281
Name: count, dtype: int64
Data Preprocessing
Checking for Duplicate values
In [9]: # Print the number of duplicate rows in the training dataset
print('Number of duplicates in train :', train.duplicated().sum())
# Print the number of duplicate rows in the test dataset

print('Number of duplicates in test :', test.duplicated().sum())
Number of duplicates in train : 0

Number of duplicates in test : 0
Checking for Missing Values

In [12]: # Print the number of missing values in the training dataset
print('Number of missing values in train :', train.isna().values.sum())
# Print the number of missing values in the test dataset

print('Number of missing values in test :', test.isna().values.sum())
Number of missing values in train : 0

Number of missing values in test : 0
Checking for Activity Imbalance

In [18]: # Set up the figure size for the plot
plt.figure(figsize=(10, 6))
# Set the title of the plot

plt.title('Barplot of Activity')
# Create a countplot of the 'Activity' column from the training dataset
sns.countplot(train['Activity'], order=train['Activity'].value_counts().i
Out[18]: <Axes: title={'center': 'Barplot of Activity'}, xlabel='count', ylabel

='Activity'>
There is almost the same number of observations across all six activities, so we
can say that there is no significant class imbalance.
Exploratory Data Analysis

What are the main features from the dataset?
In [19]: train.head()

0 0.288585 -0.020294 -0.132905 -0.995279 -0.983111 -0.913526 -0.995112
1 0.278419 -0.016411 -0.123520 -0.998245 -0.975300 -0.960322 -0.998807
2 0.279653 -0.019467 -0.113462 -0.995380 -0.967187 -0.978944 -0.996520
3 0.279174 -0.026201 -0.123283 -0.996091 -0.983403 -0.990675 -0.997099
4 0.276629 -0.016570 -0.115362 -0.998139 -0.980817 -0.990482 -0.99832
In [23]: # Extract the part of the column names before the first '-' and before th
# count the occurrences of each unique part.
# Convert the Counter dictionary to a pandas DataFrame, setting the orien
# and renaming the column to 'count'.
# Sort the DataFrame by the 'count' column in descending order
pd.DataFrame.from_dict(Counter([col.split('-')[0].split('(')[0] for col i
orient='index').rename(columns={0:'count'}).sort_v
Out[23]: count
fBodyAcc 79
fBodyGyro 79
fBodyAccJerk 79
tGravityAcc 40
tBodyAcc 40
tBodyGyroJerk 40
tBodyGyro 40
tBodyAccJerk 40
tBodyAccMag 13
tGravityAccMag 13
tBodyAccJerkMag 13
tBodyGyroMag 13
tBodyGyroJerkMag 13
fBodyAccMag 13
fBodyBodyAccJerkMag 13
fBodyBodyGyroMag 13
fBodyBodyGyroJerkMag 13
angle 7
subject 1
Activity 1
Analyzing tBodyAccMag-mean feature

In [32]: # Create a FacetGrid with the training dataset, specifying 'Activity' as
# and setting height, and aspect parameters
facetgrid = sns.FacetGrid(train, hue='Activity', height=5, aspect=3)
# Map a kernel density estimate (KDE) plot onto the FacetGrid for the col
# setting hist=False to plot a smooth curve, and add legend
facetgrid.map(sns.distplot, 'tBodyAccMag-mean()', hist=False).add_legend(
# Annotate regions indicating static activities

plt.annotate('Static Activities', xy=(-.98,8), xytext=(-.8,16), arrowprop
plt.annotate('Static Activities', xy=(-.98,13), xytext=(-.8,16), arrowpro
plt.annotate('Static Activities', xy=(-.98,16), xytext=(-.8,16), arrowpro
# Annotate regions indicating dynamic activities

plt.annotate('Dynamic Activities', xy=(-0.2,3.25), xytext=(0.1,9), arrowp
plt.annotate('Dynamic Activities', xy=(0.1,2.18), xytext=(0.1,9), arrowpr
plt.annotate('Dynamic Activities', xy=(-0.01,2.15), xytext=(0.1,9), arrow
Out[32]: Text(0.1, 9, 'Dynamic Activities')

# Create a boxplot showing the distribution of 'tBodyAccMag-mean()' colum

sns.boxplot(x='Activity', y='tBodyAccMag-mean()', data=train, showfliers=
# Rotate the x-axis labels for better readability

plt.xticks(rotation=90)
# Set the label for the y-axis

plt.ylabel('Body Acceleration Magnitude Mean')

plt.title('Boxplot of tBodyAccMag-mean() column across various Activities
# Add horizontal dashed lines to indicate specific values on the y-axis

plt.axhline(y=-0.8, xmin=0.05, dashes=(3, 3))
plt.axhline(y=0.0, xmin=0.35, dashes=(3, 3))
# Display the plot

plt.show()
Analyzing angle between X-axis and gravityMean feature
# Create a boxplot showing the distribution of 'angle(X,gravityMean)' col

sns.boxplot(x='Activity', y='angle(X,gravityMean)', data=train, showflier
# Add a horizontal dashed line at y=0.08, spanning from 10% to 90% of the
plt.axhline(y=0.08, xmin=0.1, xmax=0.9, dashes=(3, 3))

plt.ylabel("Angle between X-axis and gravityMean")

plt.title('Box plot of angle(X,gravityMean) column across various activit

# Display the plot

plt.show()
Analyzing angle between Y-axis and gravityMean feature
# Create a boxplot showing the distribution of 'angle(Y,gravityMean)' col

sns.boxplot(x='Activity', y='angle(Y,gravityMean)', data=train, showflier
# Add a horizontal dashed line at y=-0.35, starting from 1% of the x-axis

plt.axhline(y=-0.35, xmin=0.01, dashes=(3, 3))

plt.ylabel("Angle between Y-axis and gravityMean")

# Set the title of the plot; this figure shows angle(Y,gravityMean), not angle(X,...)
plt.title('Box plot of angle(Y,gravityMean) column across various activities')

# Display the plot

plt.show()
Visualizing Data using PCA
In [41]: # Prepare the data for PCA by dropping 'subject' and 'Activity' columns f
x_for_pca = train.drop(['subject', 'Activity'], axis=1)
# Perform PCA with 2 components on the prepared data

pca = PCA(n_components=2, random_state=0).fit_transform(x_for_pca)
In [42]: # Show PCA values

pca
Out[42]: array([[-5.5202803 , -0.29027701],

[-5.53534954, -0.08253011],
[-5.47498801, 0.28738703],
...,
[ 5.85750527, -3.08184312],
[ 5.42109482, -3.42643002],
[ 5.49797027, -2.78992867]])

# Create a scatter plot of the PCA-transformed data, with the first compo
# Color the points based on the 'Activity' column from the training datas
sns.scatterplot(x=pca[:, 0], y=pca[:, 1], hue=train['Activity'])
# Display the plot
plt.show()
Visualizing Data using TSNE

In [46]: # Prepare the data for t-SNE by dropping 'subject' and 'Activity' columns
x_for_tsne = train.drop(['subject', 'Activity'], axis=1)
# Perform t-SNE with 2 components on the prepared data

tsne = TSNE(n_components=2, random_state=0, n_iter=1000).fit_transform(x_
In [47]: # Show TSNE Values

tsne
Out[47]: array([[ -6.5846224, -62.56863 ],

[-26.474825 , -17.497314 ],
[-28.395891 , -20.305143 ],
...,
[ 23.191902 , -61.020435 ],
[ 23.180101 , -60.762993 ],
[ 23.685352 , -60.009796 ]], dtype=float32)

# Create a scatter plot of the t-SNE-transformed data, with the first com
# Color the points based on the 'Activity' column from the training datas
sns.scatterplot(x=tsne[:,0], y=tsne[:,1], hue=train['Activity'])
# Display the plot

plt.show()
ML Models
Getting training and testing data ready
In [49]: # Separate the features from the target labels in the training dataset
X_train = train.drop(['subject', 'Activity'], axis=1) # Features (indepe
y_train = train['Activity'] # Target labels (dependent variable) for tra
# Separate the features from the target labels in the testing dataset
X_test = test.drop(['subject', 'Activity'], axis=1) # Features (independ
y_test = test['Activity'] # Target labels (dependent variable) for testi
In [50]: # Print the size (shape) of the training dataset

print('Training Data Size :', X_train.shape)
# Print the size (shape) of the testing dataset

print('Testing Data Size :', X_test.shape)
Training Data Size : (7352, 561)

Testing Data Size : (999, 561)
Logistic Regression model with hyperparameter tuning and
cross validation
In [51]: # Define the parameters grid for tuning the logistic regression model
parameters = {'max_iter': [100, 200, 500]}
# Create a logistic regression classifier

lr_classifier = LogisticRegression()
# Create a RandomizedSearchCV object to find the best parameters for the

lr_classifier_rs = RandomizedSearchCV(lr_classifier, param_distributions=
# Fit the RandomizedSearchCV object on the training data to find the best
lr_classifier_rs.fit(X_train, y_train)
# Make predictions on the testing data using the tuned logistic regressio
y_pred_lr = lr_classifier_rs.predict(X_test)
In [53]: # Compute the accuracy score by comparing the predicted labels (y_pred_lr
lr_accuracy = accuracy_score(y_test, y_pred_lr)
# Print the accuracy score obtained using logistic regression

print('Accuracy Score Using Logistic Regression :', round(lr_accuracy, 2
Accuracy Score Using Logistic Regression : 0.96
In [62]: # function to plot confusion matrix

def plot_confusion_matrix(cm, labels):
    """Draw a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix of counts. Rows are labelled as the true class and columns
        as the predicted class.
    labels : sequence
        Tick labels applied to both axes; must have one entry per
        row/column of ``cm``.

    Returns
    -------
    None
        The figure is created as a side effect and left for the notebook
        to render.
    """
    # Accept plain nested lists as well as ndarrays.
    cm = np.asarray(cm)

    fig, ax = plt.subplots(figsize=(10, 6))
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=labels, yticklabels=labels,
           ylabel='True label',
           xlabel='Predicted label')
    # Rotate this axes' own x tick labels rather than relying on pyplot's
    # notion of the "current" axes.
    plt.setp(ax.get_xticklabels(), rotation=90)

    # Cells darker than the midpoint of the colour scale get white text,
    # lighter cells get black text, so every count stays readable.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, int(cm[i, j]), ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
In [63]: # Compute the confusion matrix

# Use the sorted union of true and predicted labels so the tick labels always
# line up with the matrix, even if the model never predicts one of the classes.
lr_labels = np.unique(np.concatenate([y_test.values, y_pred_lr]))
# Compute the confusion matrix with that explicit label order
cm = confusion_matrix(y_test.values, y_pred_lr, labels=lr_labels)
# Call the plot_confusion_matrix function to visualize the confusion matrix
plot_confusion_matrix(cm, lr_labels)
In [66]: #function to get best random search attributes
def get_best_randomsearch_results(model):
    """Print the winning estimator and hyperparameters of a fitted search.

    Parameters
    ----------
    model : object
        A fitted search object exposing ``best_estimator_`` and
        ``best_params_`` (in this notebook, a ``RandomizedSearchCV``).

    Returns
    -------
    None
        The results are written to standard output.
    """
    print("Best estimator : ", model.best_estimator_)
    print("Best set of parameters : ", model.best_params_)
In [67]: # getting best random search attributes for logistic regression

get_best_randomsearch_results(lr_classifier_rs)
Best estimator : LogisticRegression(max_iter=200)

Best set of parameters : {'max_iter': 200}
Kernel SVM with Hyperparameter Tuning and Cross
Validation
In [58]: # Define the set of hyperparameters for tuning the SVM classifier
parameters = {
'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
'C': [100, 50]
}
# Create a RandomizedSearchCV object to find the best hyperparameters for

svm_rs = RandomizedSearchCV(SVC(), param_distributions=parameters, cv=3,
# Perform randomized search to find the best hyperparameters

svm_rs.fit(X_train, y_train)
# Print out the best estimator and the best set of parameters found by Ra
get_best_randomsearch_results(svm_rs)
Out[58]: ▸ RandomizedSearchCV
▸ estimator: SVC
▸ SVC
In [59]: # Make predictions on the testing data using the tuned SVM model
y_pred_svm = svm_rs.predict(X_test)
In [60]: # Compute the accuracy score by comparing the predicted labels (y_pred_sv
svm_accuracy = accuracy_score(y_test, y_pred_svm)
# Print the accuracy score obtained using the SVM model

print('Accuracy Score Using SVM: ', round(svm_accuracy, 2))
Accuracy Score Using SVM: 0.97

# Use the sorted union of true and predicted labels so the tick labels always
# line up with the matrix, even if the model never predicts one of the classes.
svm_labels = np.unique(np.concatenate([y_test.values, y_pred_svm]))
# Compute the confusion matrix with that explicit label order
cm = confusion_matrix(y_test.values, y_pred_svm, labels=svm_labels)
# Visualize the confusion matrix
plot_confusion_matrix(cm, svm_labels)
In [68]: # Call the get_best_randomsearch_results function to print out the best e

get_best_randomsearch_results(svm_rs)
Best estimator : SVC(C=50)

Best set of parameters : {'kernel': 'rbf', 'C': 50}
Decision Tree Model with Hyperparameter Tuning and Cross
Validation
In [78]: # Define the set of hyperparameters for tuning the Decision Tree classifi
parameters = {
'max_depth': np.arange(2, 10, 2),
'min_samples_split': [2, 4, 6, 8],
'min_samples_leaf': [1, 3, 5, 7, 9]
}
# Create a Decision Tree classifier

dt_classifier = DecisionTreeClassifier()

dt_classifier_rs = RandomizedSearchCV(dt_classifier, param_distributions=

dt_classifier_rs.fit(X_train, y_train)
get_best_randomsearch_results(dt_classifier_rs)
▸ estimator: DecisionTreeClassifier
▸ DecisionTreeClassifier
In [79]: # Make predictions on the testing data using the tuned Decision Tree mode
y_pred_dt = dt_classifier_rs.predict(X_test)
In [80]: # Compute the accuracy score by comparing the predicted labels (y_pred_dt
dt_accuracy = accuracy_score(y_test, y_pred_dt)
# Print the accuracy score obtained using the Decision Tree model
print('Accuracy Score Using Decision Tree :', round(dt_accuracy, 2))
Accuracy Score Using Decision Tree : 0.85

# Use the sorted union of true and predicted labels so the tick labels always
# line up with the matrix, even if the model never predicts one of the classes.
dt_labels = np.unique(np.concatenate([y_test.values, y_pred_dt]))
# Compute the confusion matrix with that explicit label order
cm = confusion_matrix(y_test.values, y_pred_dt, labels=dt_labels)
# Visualize the confusion matrix
plot_confusion_matrix(cm, dt_labels)
get_best_randomsearch_results(dt_classifier_rs)
Best estimator : DecisionTreeClassifier(max_depth=8, min_samples_leaf=5,

min_samples_split=6)
Best set of parameters : {'min_samples_split': 6, 'min_samples_leaf': 5,
'max_depth': 8}
Random Forest Model with Hyperparameter Tuning and

Cross Validation
In [83]: # Define the set of hyperparameters for tuning the Random Forest classifi
parameters = {
'n_estimators': np.arange(20, 101, 10),
'max_depth': np.arange(2, 17, 2)
}
# Create a Random Forest classifier

rf_classifier = RandomForestClassifier()

rf_classifier_rs = RandomizedSearchCV(rf_classifier, param_distributions=

rf_classifier_rs.fit(X_train, y_train)
get_best_randomsearch_results(rf_classifier_rs)
▸ estimator: RandomForestClassifier
▸ RandomForestClassifier
In [84]: # Make predictions on the testing data using the tuned Random Forest mode
y_pred_rf = rf_classifier_rs.predict(X_test)
In [85]: # Compute the accuracy score by comparing the predicted labels (y_pred_rf
rf_accuracy = accuracy_score(y_test, y_pred_rf)
# Print the accuracy score obtained using the Random Forest model
print('Accuracy Score Using Random Forest :', round(rf_accuracy, 2))
Accuracy Score Using Random Forest : 0.92

# Use the sorted union of true and predicted labels so the tick labels always
# line up with the matrix, even if the model never predicts one of the classes.
rf_labels = np.unique(np.concatenate([y_test.values, y_pred_rf]))
# Compute the confusion matrix with that explicit label order
cm = confusion_matrix(y_test.values, y_pred_rf, labels=rf_labels)
# Visualize the confusion matrix
plot_confusion_matrix(cm, rf_labels)

get_best_randomsearch_results(rf_classifier_rs)
Best estimator : RandomForestClassifier(max_depth=12, n_estimators=70)

Best set of parameters : {'n_estimators': 70, 'max_depth': 12}
In [89]: print('--------------------------------------')
print('Accuracy Scores for all the models :')
print('--------------------------------------')
print('Logistic Regression :', round(lr_accuracy,2))
print('Support Vector Classifier :', round(svm_accuracy,2))
print('Decision Tree Classifier :', round(dt_accuracy,2))
print('Random Forest Classifier :', round(rf_accuracy,2))
print('--------------------------------------')
--------------------------------------
Accuracy Scores for all the models :
--------------------------------------
Logistic Regression : 0.96
Support Vector Classifier : 0.97
Decision Tree Classifier : 0.85
Random Forest Classifier : 0.92
--------------------------------------
Conclusion:
In summary, the Support Vector Classifier appears to be the best-performing
model based on the provided accuracy scores. However, further evaluation
metrics such as precision, recall, and F1-score could provide additional
insights into the models' performance, especially in scenarios with
imbalanced classes or different priorities for false positives and false
negatives. Additionally, it's essential to consider the computational
complexity and interpretability of each model when selecting the final model
for deployment.
In [ ]:

Human Activity Recognition Using Smartphone Data

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Human Activity Recognition Using Smartphone Data

Uploaded by

Copyright:

Available Formats

Human Activity Recognition

Smartphone Data using

import seaborn as sns

from collections import Counter

from sklearn.manifold import TSNE

from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, confusion_matrix, classificat

Loading the Dataset

In [4]: # Display the first few rows of the training dataset

Out[5]: tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc

# Print the number of duplicate rows in the test dataset

Number of duplicates in train : 0

Checking for Missing Values

# Print the number of missing values in the test dataset

Number of missing values in train : 0

Checking for Activity Imbalance

# Set the title of the plot

Out[18]: <Axes: title={'center': 'Barplot of Activity'}, xlabel='count', ylabel

Exploratory Data Analysis

Out[19]: tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc- tBodyAcc

Analyzing tBodyAccMag-mean feature

# Annotate regions indicating static activities

# Annotate regions indicating dynamic activities

In [37]: # Set up the figure size for the plot

# Create a boxplot showing the distribution of 'tBodyAccMag-mean()' colum

# Rotate the x-axis labels for better readability

# Set the label for the y-axis

# Set the title of the plot

# Add horizontal dashed lines to indicate specific values on the y-axis

# Display the plot

# Create a boxplot showing the distribution of 'angle(X,gravityMean)' col

# Set the label for the y-axis

# Set the title of the plot

# Rotate the x-axis labels for better readability

# Display the plot

# Create a boxplot showing the distribution of 'angle(Y,gravityMean)' col

# Add a horizontal dashed line at y=-0.35, starting from 1% of the x-axis

# Set the label for the y-axis

# Set the title of the plot

# Rotate the x-axis labels for better readability

# Display the plot

# Perform PCA with 2 components on the prepared data

In [42]: # Show PCA values

Out[42]: array([[-5.5202803 , -0.29027701],

In [45]: # Set up the figure size for the plot

Visualizing Data using TSNE

# Perform t-SNE with 2 components on the prepared data

In [47]: # Show TSNE Values

Out[47]: array([[ -6.5846224, -62.56863 ],

In [48]: # Set up the figure size for the plot

# Display the plot

In [50]: # Print the size (shape) of the training dataset

# Print the size (shape) of the testing dataset

Training Data Size : (7352, 561)

# Create a logistic regression classifier

# Create a RandomizedSearchCV object to find the best parameters for the

# Print the accuracy score obtained using logistic regression

Accuracy Score Using Logistic Regression : 0.96

In [62]: # function to plot confusion matrix

In [63]: # Compute the confusion matrix

# Call the plot_confusion_matrix function to visualize the confusion matr

In [67]: # getting best random search attributes for logistic regression