You are on page 1 of 17

Analítica cross selling

Prof José Antonio Taquía Gutiérrez


Procedimiento
Cargamos el dataset de productos y describimos su contenido.
Los datos constan de tres columnas:
Member_number: un identificador que permite distinguir las compras
realizadas por distintos clientes.
Date: la fecha de la transacción.
itemDescription: la descripción del artículo que se compró.

# Load the grocery transactions from a comma-separated file with a header row.
# (Console prompt characters were removed so the lines run as a script.)
dat <- read.csv(file = "dat_groceries.csv", header = TRUE, sep = ",")

# Keep a data-frame copy under its own name and inspect it.
dat.data <- data.frame(dat)
dim(dat.data)   # number of rows and columns
head(dat.data)  # first rows of the data
Procesamiento de los datos
 print(typeof(dat))
 df_sorted <- dat[order(dat$Member_number),]
 #convert member number to numeric
 df_sorted$Member_number <- as.numeric(df_sorted$Member_number)
 #convert item description to categorical format
 df_sorted$itemDescription <- as.factor(df_sorted$itemDescription)
Procesamiento de los datos
 library(plyr)
 library(dplyr)
 if(sessionInfo()['basePkgs']=="dplyr" | sessionInfo()
['otherPkgs']=="dplyr"){ detach(package:dplyr, unload=TRUE)}
Procesamiento de los datos
 #group all the items that were bought together; by the same
customer on the same date
 library(plyr)
 df_itemList <- ddply(dat, c("Member_number","Date"),
function(df1)paste(df1$itemDescription,collapse = ","))

 #remove member number and date


 df_itemList$Member_number <- NULL
 df_itemList$Date <- NULL
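ddply
Esta función divide un data frame según las variables indicadas, aplica una función a cada subconjunto y combina los resultados en un nuevo data frame.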
 # Summarize a dataset by two variables
 dfx <- data.frame(
group = c(rep('A', 8), rep('B', 15), rep('C', 6)),
sex = sample(c("M", "F"), size = 29, replace = TRUE),
age = runif(n = 29, min = 18, max = 54))
 # Note the use of the '.' function to allow>
 # group and sex to be used without quoting
 > ddply(dfx, .(group, sex), summarize, mean = round(mean(age), 2),
sd = round(sd(age), 2))
  group sex  mean    sd
1     A   F 31.59  8.98
2     A   M 34.27  7.93
3     B   F 46.21  9.01
4     B   M 42.74  9.25
5     C   F 41.59  1.54
6     C   M 31.59 15.01
 colnames(df_itemList) <- c("itemList")
 #write to csv format
 write.csv(df_itemList,"ItemList100.csv",quote = FALSE, row.names = TRUE)
 #load package required
 library(arules)

 #convert csv file to basket format


 txn = read.transactions(file="ItemList100.csv", rm.duplicates= FALSE, format="b
 print(typeof(txn)) #S4
 #remove quotes from transactions
 txn@itemInfo$labels <- gsub("\"","",txn@itemInfo$labels)
 #run apriori algorithm
 basket_rules <- apriori(txn,parameter = list(minlen=2,sup = 0.001, conf
= 0.01, target="rules"))
 #basket_rules <- apriori(txn,parameter = list(minlen=2,sup = 0.00001,
conf = 0.01, target="rules"),appearance = list(lhs = "CLEMENTINES")))

 #check if tm is attched; if yes then detach


 if(sessionInfo()['basePkgs']=="tm" | sessionInfo()['otherPkgs']=="tm"){
detach(package:sentiment, unload=TRUE)
detach(package:tm, unload=TRUE)
}
 #view rules
 inspect(basket_rules)
 #convert to datframe and view; optional
 df_basket <- as(basket_rules,"data.frame")
 df_basket$confidence <- df_basket$confidence * 100
 df_basket$support <- df_basket$support * nrow(df)
# Rules for recommendations:

# Split lhs and rhs into two columns.
# reshape2 is an R package written by Hadley Wickham that makes it easy to
# transform data between wide and long formats; colsplit() comes from it.
library(reshape2)

# Replace the `rules` column by a two-column (lhs, rhs) split on "=>".
df_basket <- transform(
  df_basket,
  rules = colsplit(rules, pattern = "=>", names = c("lhs", "rhs"))
)

# Strip punctuation from both sides of each rule. This removes the curly
# brackets around the item sets, and every other punctuation character too.
df_basket$rules$lhs <- gsub("[[:punct:]]", "", df_basket$rules$lhs)
df_basket$rules$rhs <- gsub("[[:punct:]]", "", df_basket$rules$rhs)
# Convert to character
df_basket$rules$lhs <- as.character(df_basket$rules$lhs)
df_basket$rules$rhs <- as.character(df_basket$rules$rhs)

 library(stringi)
 library(dplyr)
 df_basket$rules %>%
 filter(stri_detect_fixed(lhs, "yogurt")) %>%
 select(rhs)
Visualización de resultados
 #plot the rules
 library(arulesViz)
 plot(basket_rules)

 set.seed(8000)
 plot(basket_rules, method = "grouped", control = list(k = 5))

 plot(basket_rules[1:10,], method="graph",
control=list(type="items"))
Visualización de resultados

 plot(basket_rules[1:10,], method="paracoord",
control=list(alpha=.5, reorder=TRUE))

 itemFrequencyPlot(txn, topN = 5)

 plot(basket_rules[1:10,],measure=c("support","lift"),shading=
"confidence",interactive=T)

You might also like