You are on page 1 of 9

Data Modification and Cleaning

-- first, I added a new column in the DailyActivity table

-- Adds a nullable STRING column that will hold the weekday name of ActivityDate.
ALTER TABLE `Fitabase.DailyActivity`
  ADD COLUMN WeekDay STRING;

-- next, I added the values for which day of the week is the ActivityDate

-- Fills WeekDay from ActivityDate using the '%A' format element.
-- Only rows whose WeekDay is still NULL are touched, so re-running is harmless.
UPDATE `Fitabase.DailyActivity`
SET
  WeekDay = FORMAT_DATE('%A', ActivityDate)
WHERE
  WeekDay IS NULL;

-- then, I checked for NULL values for DailyActivity and SleepDay

-- Returns every DailyActivity row that has a NULL in any of the listed columns.
-- An empty result means none of these columns contains a NULL.
SELECT *
FROM `Fitabase.DailyActivity`
WHERE
  Id IS NULL
  OR ActivityDate IS NULL
  OR WeekDay IS NULL
  OR TotalSteps IS NULL
  OR TotalDistance IS NULL
  OR TrackerDistance IS NULL
  OR LoggedActivitiesDistance IS NULL
  OR VeryActiveDistance IS NULL
  OR ModeratelyActiveDistance IS NULL
  OR LightActiveDistance IS NULL
  OR SedentaryActiveDistance IS NULL
  OR VeryActiveMinutes IS NULL
  OR FairlyActiveMinutes IS NULL
  OR LightlyActiveMinutes IS NULL
  OR SedentaryMinutes IS NULL
  OR Calories IS NULL;

-- result was "There is no data to display."

-- Returns every SleepDay row that has a NULL in any of the listed columns.
SELECT *
FROM `Fitabase.SleepDay`
WHERE
  Id IS NULL
  OR SleepDay IS NULL
  OR TotalSleepRecords IS NULL
  OR TotalMinutesAsleep IS NULL
  OR TotalTimeInBed IS NULL;

-- result was "There is no data to display."

-- since the id and date of DailyActivity and SleepDay tables are formatted in the same way, I joined the 2 tables together

-- first, I added the columns with their respective names

-- Adds the destination columns for the sleep and weight values that are copied
-- into DailyActivity by the UPDATE statements that follow.
ALTER TABLE `Fitabase.DailyActivity`
  ADD COLUMN TotalSleepRecords INT,
  ADD COLUMN TotalMinutesAsleep INT,
  ADD COLUMN TotalTimeInBed INT,
  ADD COLUMN WeightKg FLOAT64;

-- then, I transferred the data from SleepDay and WeightLog table to DailyActivity table

-- Copies the sleep metrics from SleepDay onto the DailyActivity row with the
-- same Id and date; rows without a match keep NULL in these columns.
-- BigQuery aborts an UPDATE ... FROM when one target row matches more than one
-- source row, so exact duplicate SleepDay rows are collapsed with SELECT DISTINCT
-- before joining. Conflicting (non-identical) duplicates still raise that error,
-- which is the desired signal that the source needs manual cleaning.
UPDATE Fitabase.DailyActivity AS t1
SET
  TotalSleepRecords = t2.TotalSleepRecords,
  TotalMinutesAsleep = t2.TotalMinutesAsleep,
  TotalTimeInBed = t2.TotalTimeInBed
FROM (
  SELECT DISTINCT
    Id,
    SleepDay,
    TotalSleepRecords,
    TotalMinutesAsleep,
    TotalTimeInBed
  FROM Fitabase.SleepDay
) AS t2
WHERE
  t1.Id = t2.Id
  AND t1.ActivityDate = t2.SleepDay;

-- Copies WeightKg from WeightLog onto the DailyActivity row with the same Id
-- and date; rows without a weight log keep WeightKg as NULL.
-- Exact duplicate log rows are collapsed first because BigQuery aborts an
-- UPDATE ... FROM when one target row matches more than one source row.
-- Two different weights logged for the same Id and Date still raise that error.
UPDATE Fitabase.DailyActivity AS t1
SET
  WeightKg = t2.WeightKg
FROM (
  SELECT DISTINCT
    Id,
    Date,
    WeightKg
  FROM Fitabase.WeightLog
) AS t2
WHERE
  t1.Id = t2.Id
  AND t1.ActivityDate = t2.Date;

-- lastly, I removed all rows with 0 TotalSteps because we won't be needing them, converted the Id to STRING, and
-- arranged the columns in the order I wanted them to be viewed

-- Produces the cleaned daily data set: Id exposed as STRING, columns in
-- presentation order, and rows with TotalSteps of 0 (or less) excluded.
SELECT
  CAST(Id AS STRING) AS Id,
  ActivityDate,
  WeekDay,
  TotalSteps,
  TotalDistance,
  TrackerDistance,
  LoggedActivitiesDistance,
  VeryActiveDistance,
  ModeratelyActiveDistance,
  LightActiveDistance,
  SedentaryActiveDistance,
  VeryActiveMinutes,
  FairlyActiveMinutes,
  LightlyActiveMinutes,
  SedentaryMinutes,
  Calories,
  TotalSleepRecords,
  TotalMinutesAsleep,
  TotalTimeInBed,
  WeightKg
FROM `Fitabase.DailyActivity`
WHERE TotalSteps > 0
ORDER BY
  Id ASC,
  ActivityDate;

-- saved the table to DailyData

-- I then worked with the DailyData table and obtained the number of times each user used the device

-- Counts the DailyData rows per user (one row per logged day), highest count first.
SELECT
  Id,
  COUNT(Id) AS LogIn_Count
FROM `Fitabase.DailyData`
GROUP BY Id
ORDER BY LogIn_Count DESC;

-- the data was saved to a new table LogIn

-- next, I added a new column to the DailyData table, LogIn_Count, and filled the corresponding data from LogIn table

-- Step 1: add the destination column for the per-user log-in count.
-- The statement must be terminated before the UPDATE starts; without the
-- semicolon the two statements run together and fail to parse.
ALTER TABLE `Fitabase.DailyData`
  ADD COLUMN LogIn_Count INT;

-- Step 2: copy each user's LogIn_Count from the LogIn table onto every
-- DailyData row that belongs to that user.
UPDATE `Fitabase.DailyData` AS t1
SET
  LogIn_Count = t2.LogIn_Count
FROM `Fitabase.LogIn` AS t2
WHERE t1.Id = t2.Id;

-- after that, I worked with the hourly data and combined all to one table but first I checked for NULL values

-- Returns every HourlyCalories row that has a NULL in any of the listed columns.
SELECT *
FROM `Fitabase.HourlyCalories`
WHERE
  Id IS NULL
  OR ActivityDate IS NULL
  OR ActivityTime IS NULL
  OR Calories IS NULL;

-- result was "There is no data to display."

-- Returns every HourlyIntensities row that has a NULL in any of the listed columns.
SELECT *
FROM `Fitabase.HourlyIntensities`
WHERE
  Id IS NULL
  OR ActivityDate IS NULL
  OR ActivityTime IS NULL
  OR TotalIntensity IS NULL
  OR AverageIntensity IS NULL;

-- result was "There is no data to display."

-- Returns every HourlySteps row that has a NULL in any of the listed columns.
SELECT *
FROM `Fitabase.HourlySteps`
WHERE
  Id IS NULL
  OR ActivityDate IS NULL
  OR ActivityTime IS NULL
  OR StepTotal IS NULL;

-- result was "There is no data to display."

-- next, I combined all the tables together and arranged the data in ASC order

-- Merges the three hourly tables into one row per (Id, ActivityDate, ActivityTime).
-- INNER JOINs keep only the hours that are present in all three tables.
SELECT
  CAST(cal.Id AS STRING) AS Id,
  cal.ActivityDate,
  cal.ActivityTime,
  cal.Calories,
  inten.TotalIntensity,
  inten.AverageIntensity,
  steps.StepTotal
FROM `Fitabase.HourlyCalories` AS cal
INNER JOIN `Fitabase.HourlyIntensities` AS inten
  ON cal.Id = inten.Id
  AND cal.ActivityDate = inten.ActivityDate
  AND cal.ActivityTime = inten.ActivityTime
INNER JOIN `Fitabase.HourlySteps` AS steps
  ON cal.Id = steps.Id
  AND cal.ActivityDate = steps.ActivityDate
  AND cal.ActivityTime = steps.ActivityTime
ORDER BY
  Id ASC,
  ActivityDate,
  ActivityTime;

-- the query above combined all the tables, converted Id to STRING, and arranged the data in ASC order; I then saved
-- the result to the HourlyData table

-- lastly, I updated the HourlyData table with the data that I will be needing later in the analysis, that is DaysCount

-- Counts, per user, the distinct dates that have at least one hour with
-- StepTotal above 0; users with the most such days come first.
SELECT
  Id,
  COUNT(DISTINCT ActivityDate) AS DaysCount
FROM `Fitabase.HourlyData`
WHERE StepTotal > 0
GROUP BY Id
ORDER BY DaysCount DESC;

-- the data was saved to a new table Days

-- next, I added a new column to the HourlyData table, DaysCount, and filled the corresponding data from Days table

-- Adds the destination column for the per-user count of active days.
ALTER TABLE `Fitabase.HourlyData`
  ADD COLUMN DaysCount INT;

-- Copies each user's DaysCount from the Days table onto every HourlyData row
-- of that user. Users that do not appear in Days keep DaysCount as NULL.
UPDATE `Fitabase.HourlyData` AS t1
SET
  DaysCount = t2.DaysCount
FROM `Fitabase.Days` AS t2
WHERE
  t1.Id = t2.Id;
Data Analysis

Working on DailyData table

User’s Consistency and Categorization

-- first, let's see how frequently the surveyed users used the device during the 31-day period, and let's classify the users by
-- the frequency of their usage.

-- For each LogIn_Count value, counts how many distinct users logged that many days.
SELECT
  LogIn_Count AS Days_Logged,
  COUNT(DISTINCT Id) AS User_Count
FROM `Fitabase.DailyData`
GROUP BY
  LogIn_Count
ORDER BY
  LogIn_Count DESC;

-- One row per (Id, LogIn_Count) pair, labelled by how many days were logged.
-- A LogIn_Count outside 1..31 (or NULL) matches no branch and yields a NULL label.
WITH user_types AS (
  SELECT DISTINCT
    Id,
    LogIn_Count,
    CASE
      WHEN LogIn_Count >= 21 AND LogIn_Count <= 31 THEN 'Consistent Users'
      WHEN LogIn_Count >= 11 AND LogIn_Count <= 20 THEN 'Frequent Users'
      WHEN LogIn_Count >= 1 AND LogIn_Count <= 10 THEN 'Occasional Users'
    END AS User_Type
  FROM `Fitabase.DailyData`
)
-- Number of users in each usage category, largest category first.
SELECT
  User_Type,
  COUNT(Id) AS Id_Count
FROM user_types
GROUP BY User_Type
ORDER BY Id_Count DESC;

-- result showed that majority of users have completed the 31 day period of using the device

User’s Weight
-- next, let's check weight vs total activity

-- Per user: average total active minutes (very + fairly + lightly) and average
-- weight, computed only over the days that have a weight value.
SELECT
  Id,
  ROUND(
    AVG(VeryActiveMinutes + FairlyActiveMinutes + LightlyActiveMinutes),
    2
  ) AS Avg_Total_Activity_Mins,
  ROUND(AVG(WeightKg), 2) AS Avg_Weight
FROM `Fitabase.DailyData`
WHERE WeightKg IS NOT NULL
GROUP BY Id;

User Activity
-- first, let's check which days do users usually use the device and how active they are during those days

-- Average minutes spent at each intensity level per weekday.
-- The ORDER BY maps weekday names to 1..7 so rows come out Sunday through
-- Saturday instead of alphabetically.
SELECT
  WeekDay AS Day_Of_The_Week,
  ROUND(AVG(VeryActiveMinutes), 2) AS Very_Active_Mins,
  ROUND(AVG(FairlyActiveMinutes), 2) AS Fairly_Active_Mins,
  ROUND(AVG(LightlyActiveMinutes), 2) AS Lightly_Active_Mins,
  ROUND(AVG(SedentaryMinutes), 2) AS Sedentary_Mins
FROM `Fitabase.DailyData`
GROUP BY Day_Of_The_Week
ORDER BY
  CASE Day_Of_The_Week
    WHEN 'Sunday' THEN 1
    WHEN 'Monday' THEN 2
    WHEN 'Tuesday' THEN 3
    WHEN 'Wednesday' THEN 4
    WHEN 'Thursday' THEN 5
    WHEN 'Friday' THEN 6
    WHEN 'Saturday' THEN 7
  END ASC;

-- next, let's categorize the users activity level based on the steps they do in a day

-- Classifies each user by average daily steps, then counts users per class.
-- AVG() returns a fractional value, so the bands are written as half-open
-- ranges (lower bound inclusive, upper bound exclusive). Integer-bounded
-- BETWEEN ranges would leave gaps: an average such as 4999.5 would match no
-- branch and be reported under a NULL level.
WITH user_levels AS (
  SELECT
    Id,
    CASE
      WHEN AVG(TotalSteps) < 2500 THEN 'Basal'
      WHEN AVG(TotalSteps) < 5000 THEN 'Limited'
      WHEN AVG(TotalSteps) < 7500 THEN 'Low'
      WHEN AVG(TotalSteps) < 10000 THEN 'Somewhat Active'
      WHEN AVG(TotalSteps) < 12500 THEN 'Active'
      -- Explicit final comparison (instead of ELSE) keeps a NULL average
      -- unclassified rather than labelling it 'Very Active'.
      WHEN AVG(TotalSteps) >= 12500 THEN 'Very Active'
    END AS User_Activity_Level
  FROM `Fitabase.DailyData`
  GROUP BY Id
)
SELECT
  User_Activity_Level,
  COUNT(Id) AS Total
FROM user_levels
GROUP BY User_Activity_Level
ORDER BY User_Activity_Level ASC;

-- next, let's check the relationship of Steps, Distance, and Calories

SELECT
WeekDay AS Day_Of_The_Week,
ROUND(AVG(TotalSteps),2) AS Avg_Steps,
ROUND(AVG(TotalDistance),2) AS Avg_Distance,
ROUND(AVG(Calories),2) AS Avg_Calories
FROM `Fitabase.DailyData`
GROUP BY Day_Of_The_Week
ORDER BY
CASE
WHEN Day_Of_The_Week = 'Sunday' THEN 1
WHEN Day_Of_The_Week = 'Monday' THEN 2
WHEN Day_Of_The_Week = 'Tuesday' THEN 3
WHEN Day_Of_The_Week = 'Wednesday' THEN 4
WHEN Day_Of_The_Week = 'Thursday' THEN 5
WHEN Day_Of_The_Week = 'Friday' THEN 6
WHEN Day_Of_The_Week = 'Saturday' THEN 7
END ASC

User’s Sleep Pattern


-- first, let's check the sleep patterns of user per day of the week then compare it to total activity time, steps, and calories

-- Per-weekday averages of sleep, time in bed, activity, steps and calories,
-- listed Sunday through Saturday.
-- 1440 = 24 * 60; presumably the number of minutes in a day, so the second
-- column is the time not spent asleep.
-- NOTE(review): AVG skips NULL inputs, so the sleep-based averages use only rows
-- that have sleep values while the activity, step and calorie averages use
-- every row. Confirm this mix of populations is intended.
SELECT
  WeekDay AS Day_Of_The_Week,
  ROUND(AVG(TotalMinutesAsleep), 2) AS Avg_Time_Asleep,
  ROUND(AVG(1440 - TotalMinutesAsleep), 2) AS Avg_Time_Awake,
  ROUND(AVG(TotalTimeInBed), 2) AS Avg_Time_In_Bed,
  ROUND(AVG(TotalTimeInBed - TotalMinutesAsleep), 2) AS Avg_Awake_In_Bed,
  ROUND(
    AVG(VeryActiveMinutes + FairlyActiveMinutes + LightlyActiveMinutes),
    2
  ) AS Avg_Total_Activity_Mins,
  ROUND(AVG(TotalSteps), 2) AS Avg_Steps,
  ROUND(AVG(Calories), 2) AS Avg_Calories
FROM `Fitabase.DailyData`
GROUP BY Day_Of_The_Week
ORDER BY
  CASE Day_Of_The_Week
    WHEN 'Sunday' THEN 1
    WHEN 'Monday' THEN 2
    WHEN 'Tuesday' THEN 3
    WHEN 'Wednesday' THEN 4
    WHEN 'Thursday' THEN 5
    WHEN 'Friday' THEN 6
    WHEN 'Saturday' THEN 7
  END ASC;

-- then, let's get the sleep, calories, steps and total activity of each user

-- Per user: average sleep, total active minutes, steps and calories, computed
-- only over the days that have a sleep value.
SELECT
  Id,
  ROUND(AVG(TotalMinutesAsleep), 2) AS Avg_Time_Asleep,
  ROUND(
    AVG(VeryActiveMinutes + FairlyActiveMinutes + LightlyActiveMinutes),
    2
  ) AS Avg_Total_Activity_Mins,
  ROUND(AVG(TotalSteps), 2) AS Avg_Steps,
  ROUND(AVG(Calories), 2) AS Avg_Calories
FROM `Fitabase.DailyData`
WHERE TotalMinutesAsleep IS NOT NULL
GROUP BY Id;
Working on HourlyData table

-- I want to check the hourly data of the users who have completed 31 days of data and find the time of day when those
-- users are most active

-- All hourly rows of the users whose DaysCount is exactly 31, in chronological
-- order within each user.
SELECT *
FROM `Fitabase.HourlyData`
WHERE DaysCount = 31
ORDER BY
  Id,
  ActivityDate,
  ActivityTime ASC;

-- here is the summarized version

-- Average calories and steps for each ActivityTime value, restricted to the
-- users whose DaysCount is exactly 31.
SELECT
  ActivityTime,
  ROUND(AVG(Calories), 3) AS Avg_Calories,
  ROUND(AVG(StepTotal), 3) AS Avg_Steps
FROM `Fitabase.HourlyData`
WHERE DaysCount = 31
GROUP BY ActivityTime
ORDER BY ActivityTime ASC;

-- next, I want to know the minimum and maximum time the users use the device
-- total hours per Id per ActivityDate

-- Step 1: number of hourly rows with StepTotal above 0 per user per date,
-- for dates before 2016-05-12.
WITH hours_per_user_day AS (
  SELECT
    Id,
    ActivityDate,
    COUNT(ActivityTime) AS TotalHours
  FROM Fitabase.HourlyData
  WHERE
    StepTotal > 0
    AND ActivityDate < '2016-05-12'
  GROUP BY
    Id,
    ActivityDate
),
-- Step 2: average of those per-user hour counts for each date.
avg_hours_per_day AS (
  SELECT
    ActivityDate,
    ROUND(AVG(TotalHours), 3) AS Avg_Time
  FROM hours_per_user_day
  GROUP BY ActivityDate
)
-- Step 3: smallest and largest daily average across the period.
SELECT
  MIN(Avg_Time) AS Min_Hours,
  MAX(Avg_Time) AS Max_Hours
FROM avg_hours_per_day;

-- MaxUsage vs Steps per Id from 2016-04-12 to 2016-05-12 to check how long the device battery should last

-- Per user and date: how many hourly rows had StepTotal above 0, and the
-- day's total steps over those rows.
WITH usage_per_user_day AS (
  SELECT
    Id,
    ActivityDate,
    COUNT(ActivityTime) AS CountOfTime,
    SUM(StepTotal) AS StepTotal
  FROM `Fitabase.HourlyData`
  WHERE StepTotal > 0
  GROUP BY
    Id,
    ActivityDate
)
-- Per user: the largest number of active hours seen on any single day, next to
-- the average of the daily step totals; heaviest users first.
SELECT
  Id,
  MAX(CountOfTime) AS MaxUsage,
  ROUND(AVG(StepTotal), 2) AS AvgTotalSteps
FROM usage_per_user_day
GROUP BY Id
ORDER BY MaxUsage DESC;

You might also like