You are on page 1 of 5

Applied Exercise 5

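1. Based on the characteristics of the data, we can remove the customer ID and gender columns: the ID is only a row identifier and gender is a character variable, so the clustering uses only the numeric customer attributes (age, household income, and spending score).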
2. We can choose 3 clusters to describe the customers.
3. According to the k-means elbow plot, the elbow occurs at 3 clusters.
4. I don’t think we can use 2 clusters, because each cluster may then contain too many members.
Code & Results

# Step 1: Load the dataset and libraries

# Pick the customer CSV through an interactive file dialog and read it in
customer_spend <- read.csv(file = file.choose())

# Quick look at the distribution of every column
summary(customer_spend)

customerid gender age HouseHoldIncome10k


Min. : 1.00 Length:278 Min. :18.00 Min. : 15.00
1st Qu.: 70.25 Class :character 1st Qu.:30.00 1st Qu.: 48.25
Median :139.50 Mode :character Median :36.00 Median : 74.00
Mean :139.50 Mean :38.32 Mean : 71.17
3rd Qu.:208.75 3rd Qu.:47.00 3rd Qu.: 88.00
Max. :278.00 Max. :70.00 Max. :137.00
SpendingScorePerWeek
Min. : 1.00
1st Qu.:26.00
Median :50.00
Mean :50.53
3rd Qu.:75.00
Max. :99.00

library(dendextend)
library(reader)
library(psych)
library(dplyr)
library(ggplot2)

library(tibble)
library(cluster)
library(tidyr)
library(purrr)

# Step 2: Remove the ID and gender columns

# Keep only columns 3 to 5 of the loaded data for clustering
customer_spend1 <- customer_spend[, c(3, 4, 5)]

# Confirm the remaining columns and their ranges
summary(customer_spend1)
age HouseHoldIncome10k SpendingScorePerWeek
Min. :18.00 Min. : 15.00 Min. : 1.00
1st Qu.:30.00 1st Qu.: 48.25 1st Qu.:26.00
Median :36.00 Median : 74.00 Median :50.00
Mean :38.32 Mean : 71.17 Mean :50.53
3rd Qu.:47.00 3rd Qu.: 88.00 3rd Qu.:75.00
Max. :70.00 Max. :137.00 Max. :99.00

# Step 3: Calculate Euclidean distance between customers and plot

# Pairwise Euclidean distances; the columns are used on their raw scales
# (no scale() call is applied before this point)
dist_customers <- dist(customer_spend1, method = "euclidean")

# Complete-linkage hierarchical clustering on those distances
hc_customers <- hclust(d = dist_customers, method = "complete")

# Draw the resulting cluster tree
plot(hc_customers)

# Step 4: Create cluster

# Cut the tree at height 100 to get one cluster label per customer
clust_customers <- cutree(hc_customers, h = 100)

# Attach the labels to the full (unsubsetted) customer table
segment_customers <- customer_spend %>%
  mutate(cluster = clust_customers)

# Number of customers in each cluster
segment_customers %>%
  count(cluster)

cluster n
1 1 44
2 2 120
3 3 114

# Step 5: Create dendrogram and profile the clusters

# Convert the hclust result to a dendrogram object
dend_customers <- as.dendrogram(hc_customers)

# Mean of every column within each cluster.
# funs() is deprecated in dplyr, so the function is passed directly; the
# resulting table has the same columns and values as before.
# Note: gender is a character column, so its mean is NA (with a warning),
# and the mean of customerid is not a meaningful quantity.
segment_customers %>%
  group_by(cluster) %>%
  summarise_all(mean)

cluster customerid gender age HouseHoldIncome10k SpendingScorePerWeek


* <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 22.5 NA 35.7 25.7 49.1
2 2 105. NA 40.3 61.6 50.3
3 3 221. NA 37.2 98.8 51.3

# K means
library(animation)

# Animated k-means demonstration from the animation package.
# kmeans.ani() is called with no arguments, so it does not use
# customer_spend1 -- presumably it runs on the package's own demo data
# (confirm against the animation documentation).
ani.options(interval = 1)

# Tighten the plot margins and axis-label spacing for the animation frames
par(mgp = c(1.5, 0.5, 0), mar = c(3, 3, 1, 1.5))

kmeans.ani()

# Elbow analysis: total within-cluster sum of squares for k = 1..10.
# kmeans() starts from randomly chosen centers, so the seed is fixed and each
# k is fitted from several random starts (nstart); otherwise the curve changes
# from run to run and a single poor start can distort the elbow.
set.seed(42)
tot_withinss <- map_dbl(1:10, function(k) {
  model <- kmeans(x = customer_spend1, centers = k, nstart = 25)
  model$tot.withinss
})

# Pair each candidate number of clusters with its total within-cluster
# sum of squares, ready for plotting
elbow_df <- data.frame(k = 1:10, tot_withinss = tot_withinss)

# Elbow plot: total within-cluster sum of squares against k,
# with one axis tick per candidate k
ggplot(data = elbow_df, mapping = aes(x = k, y = tot_withinss)) +
  geom_line() +
  scale_x_continuous(breaks = 1:10)

You might also like