# You are on page 1 of 5

#####################################################################################################

############################# RETAIL ANALYSIS WITH WALMART DATA ######################################


#####################################################################################################

# NOTE: the workspace is intentionally not cleared here. `rm(list = ls())`
# silently deletes every object in the caller's global environment whenever
# this script is sourced. Start a fresh R session to get a clean environment.

##### Calling Libraries into the current workspace #####

library(dplyr)
library(lubridate)
library(tibble)
library(plyr)

##### Importing Data #####

# Load the weekly store-level sales data. The path is relative to the current
# working directory.
walmart <- read.csv("Walmart_Store_sales.csv")

# First look at the raw import: leading rows, dimensions and column types.
# View() needs an interactive data viewer, so it is skipped in batch runs.
if (interactive()) View(walmart)
head(walmart)
dim(walmart)
str(walmart)

##### Data Preparation - Converting 'Date' column into Date Variable #####

# Dates are parsed as day-month-year ("%d-%m-%Y"). Values that do not match
# the format become NA, so report them instead of letting them pass silently
# into the aggregations below.
walmart$Date <- as.Date(walmart$Date, format = "%d-%m-%Y")
if (anyNA(walmart$Date)) {
  warning(
    "Some 'Date' values are missing or could not be parsed as %d-%m-%Y",
    call. = FALSE
  )
}
if (interactive()) View(walmart)
head(walmart)
str(walmart)

############################### ----- 1 ----- ##########################################################

# Which store has maximum sales?


# Total sales per store: sum Weekly_Sales over all rows of each Store.
store_sales <- aggregate(Weekly_Sales ~ Store, data = walmart, FUN = sum)
head(store_sales, 10)

# Method 1
# which.max() returns the row index of the largest total; index with it to get
# the store and its total. (The previous group_by(Store, Weekly_Sales) put
# every row in its own group, so it could not isolate the top store.)
top_store_row <- which.max(store_sales$Weekly_Sales)
top_store_row
store_sales[top_store_row, ]

# Method 2
# Sort stores by total sales, largest first; the first row is the answer.
# dplyr:: is spelled out because plyr is attached after dplyr in this script
# and masks arrange()/desc().
store_sales_ranked <- dplyr::arrange(store_sales, dplyr::desc(Weekly_Sales))
store_sales_ranked
store_sales_ranked[1, ]

# Answer : : Store 20 has the highest sale (sale value = 301397792)


# Store Weekly_Sales
# 1 20 301397792
############################### ----- 2 ----- ##########################################################

# Which store has the maximum standard deviation, i.e., whose sales vary the most?
# Also, find the coefficient of variation (standard deviation divided by mean).

# Per-store mean of Weekly_Sales, with the value column renamed to Mean_Sales.
store_mean <- aggregate(Weekly_Sales ~ Store, data = walmart, FUN = mean)
names(store_mean)[names(store_mean) == "Weekly_Sales"] <- "Mean_Sales"
head(store_mean, 10)

# Per-store standard deviation of Weekly_Sales, renamed to Std_Sales.
store_std <- aggregate(Weekly_Sales ~ Store, data = walmart, FUN = sd)
names(store_std)[names(store_std) == "Weekly_Sales"] <- "Std_Sales"
head(store_std, 10)

# Store with the largest standard deviation.
dplyr::arrange(store_std, dplyr::desc(Std_Sales))[1, ]

# Answer : : Store 14 has the maximum Standard Deviation of 317569.9


# Store Std_Sales
# 1 14 317569.9

# Combine both statistics by joining on Store rather than binding columns by
# position, so the result stays correct even if the two tables ever differ in
# row order or in the set of stores.
Store_std_mean <- merge(store_mean, store_std, by = "Store")
head(Store_std_mean)

# Coefficient of variation = standard deviation / mean, per store.
Cov <- Store_std_mean
Cov$coeff <- Cov$Std_Sales / Cov$Mean_Sales
head(Cov)

# Store with the largest coefficient of variation.
dplyr::arrange(Cov, dplyr::desc(coeff))[1, ]

# Answer :: Store 35 has the highest coefficient of variation = 0.2296811


# Store Mean_Sales Std_Sales coeff
# 1 35 919725 211243.5 0.2296811
############################### ----- 3 ----- ##########################################################

# Which store/s has good quarterly growth rate in Q3’2012?

# converting the dates to quarters:

# Label every row with the 2012 quarter its Date falls in. Rows outside
# Q2/Q3 2012 get "-". Bounds are real Date values rather than bare strings so
# the comparison does not rely on implicit character-to-Date coercion.
in_q2_2012 <- walmart$Date >= as.Date("2012-04-01") &
  walmart$Date <= as.Date("2012-06-30")
in_q3_2012 <- walmart$Date >= as.Date("2012-07-01") &
  walmart$Date <= as.Date("2012-09-30")

Stores_q_Flag <- walmart
Stores_q_Flag$Q_Flag <- ifelse(
  in_q2_2012, "Q2_2012",
  ifelse(in_q3_2012, "Q3_2012", "-")
)
head(Stores_q_Flag)

# First and last Date observed under each quarter label (sanity check).
aggregate(Date ~ Q_Flag, data = Stores_q_Flag, FUN = min)
aggregate(Date ~ Q_Flag, data = Stores_q_Flag, FUN = max)

# Total sales per store and quarter label, then one row per store with one
# column per label (Weekly_Sales.-, Weekly_Sales.Q2_2012, Weekly_Sales.Q3_2012).
Stores_q_Flag_sum <- aggregate(
  Weekly_Sales ~ Store + Q_Flag,
  data = Stores_q_Flag,
  FUN = sum
)
head(Stores_q_Flag_sum)

Stores_q_Flag_sum_r <- reshape(
  Stores_q_Flag_sum,
  idvar = "Store",
  timevar = "Q_Flag",
  direction = "wide"
)
head(Stores_q_Flag_sum_r)

# Growth rate of Q3 2012 over Q2 2012, in percent. The factor of 100 makes GR
# agree with the answer recorded in this script (13.33 for store 7); without it
# the column holds the fraction 0.1333. Plain `$<-` is used instead of
# transform() because the latter would mangle the "Weekly_Sales.-" column name.
Stores_q_Flag_sum_r_GR <- Stores_q_Flag_sum_r
Stores_q_Flag_sum_r_GR$GR <- with(
  Stores_q_Flag_sum_r,
  100 * (Weekly_Sales.Q3_2012 - Weekly_Sales.Q2_2012) / Weekly_Sales.Q2_2012
)
head(Stores_q_Flag_sum_r_GR)

# Store with the highest Q3 2012 growth rate.
dplyr::arrange(Stores_q_Flag_sum_r_GR, dplyr::desc(GR))[1, ]

## Answer :: Store 7 had highest growth rate of 13.33%


#   Store Weekly_Sales.- Weekly_Sales.Q2_2012 Weekly_Sales.Q3_2012       GR
# 1     7       66044628              7290859              8262787 13.33078

############################### ----- 4 ----- ##########################################################

# Some holidays have a negative impact on sales. Find out holidays which have
# higher sales than the mean sales in non-holiday season for all stores together?

# Rows for non-holiday weeks (Holiday_Flag == 0) and their overall mean
# Weekly_Sales, which is the benchmark for the holiday comparison.
non_holiday_sales <- dplyr::filter(walmart, Holiday_Flag == 0)
head(non_holiday_sales)

Average_non_holiday_sales <- mean(non_holiday_sales$Weekly_Sales)
Average_non_holiday_sales ## 1041256

# Row-level view: individual store-weeks in holiday weeks whose sales exceed
# the non-holiday mean, and how many such rows each holiday date has.
Holiday_Sales_exceeding_avg <- dplyr::filter(
  walmart,
  Weekly_Sales > Average_non_holiday_sales & Holiday_Flag == 1
)
head(Holiday_Sales_exceeding_avg)
dim(Holiday_Sales_exceeding_avg)
unique(Holiday_Sales_exceeding_avg$Date)
summary(as.factor(Holiday_Sales_exceeding_avg$Date))

# Holiday-level view, which is what the question asks for: average the sales
# of ALL stores for each holiday week and compare that single figure with the
# non-holiday mean. The row-level filter above cannot answer this, because a
# handful of large stores is enough to make every holiday date appear.
holiday_weeks <- dplyr::filter(walmart, Holiday_Flag == 1)
Holiday_mean_sales <- aggregate(Weekly_Sales ~ Date, data = holiday_weeks, FUN = mean)
Holiday_mean_sales$Above_non_holiday_mean <-
  Holiday_mean_sales$Weekly_Sales > Average_non_holiday_sales
Holiday_mean_sales
Holiday_mean_sales[Holiday_mean_sales$Above_non_holiday_mean, ]

## Answer :: Mean sales in non-holiday season for all stores together is 1041256
############################### ----- 5 ----- ##########################################################

# Provide a monthly and semester view of sales in units and give insights?

# Work on a copy so the month/year/semester helper columns do not leak into
# `walmart`, which the modelling section below uses.
walmart_s <- walmart

# Harmless when Date is already of class Date; parses day-month-year strings
# if the earlier conversion was not run.
walmart_s$Date <- as.Date(walmart_s$Date, format = "%d-%m-%Y")
if (interactive()) View(walmart_s)

# Calendar year and month of each week, as numbers.
walmart_s_month_year <- transform(
  walmart_s,
  Year_Sale = as.numeric(format(Date, "%Y")),
  Month_Sale = as.numeric(format(Date, "%m"))
)
# Semester 1 = January-June, semester 2 = July-December.
walmart_s_month_year$Semester_Sale <-
  ifelse(walmart_s_month_year$Month_Sale <= 6, 1, 2)
if (interactive()) View(walmart_s_month_year)

# Monthly view: total sales per (month, year).
Summarized_View <- aggregate(
  Weekly_Sales ~ Month_Sale + Year_Sale,
  data = walmart_s_month_year,
  FUN = sum
)
if (interactive()) View(Summarized_View)

# Semester view: total sales per (semester, year).
Semester_View <- aggregate(
  Weekly_Sales ~ Semester_Sale + Year_Sale,
  data = walmart_s_month_year,
  FUN = sum
)
Semester_View

# Months ranked from highest to lowest total sales.
Insight_data <- dplyr::arrange(Summarized_View, dplyr::desc(Weekly_Sales))
if (interactive()) View(Insight_data)

## Insights - Walmart recorded the highest sales in Dec 2010 and Dec 2011 and the lowest sales in Jan 2011 and Jan 2012.
## December is the month of highest sales and is followed by the lowest sales in the month of January. Walmart can plan its
## inventory accordingly.
############################### ***LINEAR MODEL***##########################################
library(corrplot)

# Correlation matrix of every column except the first two (presumably Store
# and Date -- confirm against the CSV layout). It is drawn twice: once with
# corrplot's defaults and once as numbers in the lower triangle.
cor_data <- cor(walmart[-(1:2)])
corrplot(cor_data)
corrplot(cor_data, method = "number", type = "lower")

# Baseline model: weekly sales explained by all five candidate predictors.
lm_walmart <- lm(
  Weekly_Sales ~ Holiday_Flag + Temperature + Fuel_Price + CPI + Unemployment,
  data = walmart
)
summary(lm_walmart)

## Recorded result: R-squared is very low -- the model explains about 2.5 % of
## the variation in weekly sales.

## Next, drop the insignificant variables (Temperature and Fuel_Price) and
## refit. lm_walmart is overwritten with the reduced model.
lm_walmart <- lm(
  Weekly_Sales ~ Holiday_Flag + CPI + Unemployment,
  data = walmart
)
summary(lm_walmart)

library(lubridate)
library(tibble)

# Day index: 1 for the earliest Date in the data, then the number of days
# elapsed since it. This replaces yday(Date - yday(Date)): subtracting a
# date's day-of-year always lands on 31 December of the previous year, so that
# expression returned the same 365/366 for every row of a given year and the
# model could not estimate an effect for it.
# NOTE(review): assumes a running day count is the intended trend variable --
# confirm against the assignment brief.
walmart$Date[1:5]
day_index <- as.integer(walmart$Date - min(walmart$Date, na.rm = TRUE)) + 1L
day_index[1:5]

# `.after = 2` places Days right after the second column. The previous
# `after = 2` is not an add_column() argument; it was taken as a new column
# named `after` filled with 2, and Days was appended at the end.
walmart <- add_column(walmart, Days = day_index, .after = 2)
head(walmart)
summary(walmart)

# Reduced model plus the day index.
lm_walmart <- lm(
  Weekly_Sales ~ Holiday_Flag + CPI + Unemployment + Days,
  data = walmart
)
summary(lm_walmart)

# Same model without Holiday_Flag.
lm_walmart <- lm(Weekly_Sales ~ CPI + Unemployment + Days, data = walmart)
summary(lm_walmart)

# Distinct dates present in the data. The previous code piped `data` (which
# resolves to the utils::data function, not a data frame) and then used an
# undefined `dates` object, so both statements failed. dplyr:: is spelled out
# so the calls do not depend on package attach order.
dates <- unique(walmart %>% dplyr::select(Date))
dates

# Rank of each distinct date: 1 for the earliest, 2 for the next, and so on.
ranked_dates <- dates %>% dplyr::mutate(day = rank(Date))

# ******************

# You might also like