################################################# # C04: 關聯性分析 # # 吳漢銘 國立臺北大學統計學系 # # http://www.hmwu.idv.tw/ # ################################################# # 11/71 no.item <- 5 sum(choose(no.item, 0:no.item)) 2^no.item # 30/71 # The AdultUCI datset contains the questionnaire data of the “Adult” database (originally called the “Census Income” Database) with 48842 observations on the 15 variables. library(arules) data(AdultUCI) head(AdultUCI) data(Adult) ?Adult #see how to create transactions from AdultUCI Adult class(Adult) ?transactions # 31/71 str(Adult) inspect(Adult[1:2]) # 32/71 summary(Adult) # 33/71 # creating transactions form a list a.list <- list( c("a","b","c"), c("a","b"), c("a","b","d"), c("c","e"), c("a","b","d","e") ) names(a.list) <- paste0("Customer", c(1:5)) a.list # 34/71 alist.trans <- as(a.list, "transactions") summary(alist.trans) # analyze transactions image(alist.trans) # 35/71 a.matrix <- matrix(c( 1,1,1,0,0, 1,1,0,0,0, 1,1,0,1,0, 0,0,1,0,1), ncol = 5) dimnames(a.matrix) <- list(paste("Customer", letters[1:4]), paste0("Item", c(1:5))) a.matrix amatirx.trans <- as(a.matrix, "transactions") amatirx.trans inspect(amatirx.trans) summary(amatirx.trans) # 36/71 # creating transactions from data.frame a.df <- data.frame( age = as.factor(c(6, 8, NA, 9, 16)), grade = as.factor(c("A", "C", "F", NA, "C")), pass = c(TRUE, TRUE, FALSE, TRUE, TRUE)) # note: factors are translated to # logicals and NAs are ignored a.df adf.trans <- as(a.df, "transactions") inspect(adf.trans) #items transactionID as(adf.trans, "data.frame") # creating transactions from (IDs, items) a.df2 <- data.frame( TID = c(1, 1, 2, 2, 2, 3), item = c("a", "b", "a", "b", "c", "b")) a.df2 a.df2.s <- split(a.df2[, "item"], a.df2[,"TID"]) a.df2.s adf2.trans <- as(a.df2.s, "transactions") inspect(adf2.trans) # 37/71 data(AdultUCI) summary(AdultUCI) # remove attributes AdultUCI[["fnlwgt"]] <- NULL AdultUCI[["education-num"]] <- NULL # 38/71 # map metric attributes AdultUCI[["age"]] <- ordered(cut(AdultUCI[[ "age"]], c(15, 25, 45, 65, 100)), labels = c("Young", "Middle-aged", "Senior", "Old")) AdultUCI[["hours-per-week"]] <- ordered(cut(AdultUCI[["hours-per-week"]], c(0, 25, 40, 60, 168)), labels = c("Part-time", "Full-time", "Over-time", "Workaholic")) AdultUCI[["capital-gain"]] <- ordered(cut(AdultUCI[["capital-gain"]], c(-Inf, 0, median(AdultUCI[["capital-gain"]][AdultUCI[["capital-gain"]] > 0]), Inf)), labels = c("None", "Low", "High")) AdultUCI[["capital-loss"]] <- ordered(cut(AdultUCI[["capital-loss"]], c(-Inf, 0, median(AdultUCI[["capital-loss"]][AdultUCI[["capital-loss"]] > 0]), Inf)), labels = c("None", "Low", "High")) summary(AdultUCI[c("age", "hours-per-week", "capital-gain", "capital-loss")]) # create transactions MyAdult <- as(AdultUCI, "transactions") MyAdult # 39/71 summary(MyAdult) inspect(MyAdult[1:2]) # 40/71 library(arules) data(Groceries) ?Groceries str(Groceries) Groceries@itemInfo # 41/71 summary(Groceries) inspect(Groceries[1:4]) # 42/71 rule0 <- apriori(Groceries) # 43/71 rule1 <- apriori(Groceries, parameter=list(support=0.005, confidence=0.64)) inspect(rule1) # 44/71 str(rule1) rule1@quality # 45/71 rule2 <- apriori(Groceries, parameter=list(support=0.001, confidence=0.5)) rule2.sorted_sup <- sort(rule2, by="support") inspect(rule2.sorted_sup[1:5]) # 46/71 # Select a subset of rules using partial matching on the items # in the right-hand-side and a quality measure rule2.sub <- subset(rule2, subset = rhs %pin% "whole milk" & lift > 1.3) rule2.sub # Display the top 3 support rules inspect(head(rule2.sub, n = 3, by = "support")) # Display the first 3 rules inspect(rule2.sub[1:3]) # Get labels for the first 3 rules labels(rule2.sub[1:3]) labels(rule2.sub[1:3], itemSep = " + ", setStart = "", setEnd="", ruleSep = " ---> ") # 47/71 rule2.sorted_con <- sort(rule2, by="confidence") inspect(rule2.sorted_con[1:5]) rule2.sorted_lift <- sort(rule2, by="lift") inspect(rule2.sorted_lift[1:5]) # 48/71 rule.freq_item <- apriori(Groceries, parameter=list(support=0.001, target="frequent itemsets"), control=list(sort=-1)) rule.freq_item inspect(rule.freq_item[1:5]) # 49/71 rule.fi_eclat <- eclat(Groceries, parameter=list(minlen=1, maxlen=3, support=0.001, target="frequent itemsets"), control=list(sort=-1)) rule.fi_eclat rule.fi_eclat <- eclat(Groceries, parameter=list(minlen=3, maxlen=5, support=0.001, target="frequent itemsets"), control=list(sort=-1)) rule.fi_eclat inspect(rule.fi_eclat[1:5]) # 50/71 itemFrequencyPlot(x, type = c("relative", "absolute"), weighted = FALSE, support = NULL, topN = NULL, population = NULL, popCol = "black", popLwd = 1, lift = FALSE, horiz = FALSE, names = TRUE, cex.names = graphics::par("cex.axis"), xlab = NULL, ylab = NULL, mai = NULL, ...) # 51/71 library(arulesViz) rule.a <- apriori(Groceries, parameter=list(support=0.002, confidence=0.5)) plot(rule.a) plot(rule.a, measure=c("support", "lift"), shading="confidence", col=rainbow(100)[80:1], cex=0.3) # 52/71 plot(rule.a, interactive=TRUE) # 53/71 plot(rule.a, method="two-key plot") # 54/71 plot(rule.a[1:20], method="matrix") plot(rule.a[1:20], method="matrix", measure="lift") # 55/71 plot(rule.a[1:20], method="graph") plot(rule.a[1:20], method="paracoord") # 56/71 plot(rule.a[1:20], method="grouped") # 57/71 Titanic # 58/71 str(Titanic) Titanic.df <- as.data.frame(Titanic) Titanic.df Titanic.raw <- NULL for(i in 1:4) { Titanic.raw <- cbind(Titanic.raw, rep(as.character(Titanic.df[,i]), Titanic.df$Freq)) } Titanic.raw <- as.data.frame(Titanic.raw) names(Titanic.raw) <- names(Titanic.df)[1:4] dim(Titanic.raw) str(Titanic.raw) head(Titanic.raw) summary(Titanic.raw) # 59/71 str(Titanic.raw) head(Titanic.raw) summary(Titanic.raw) # 60/71 library(arules) # find association rules with default settings rules.all <- apriori(Titanic.raw) quality(rules.all) <- round(quality(rules.all), digits=3) rules.all inspect(rules.all) # or use arules::inspect(rules.all) # 61/71 inspect(rules.all) # or use > arules::inspect(rules.all) # 62/71 # All other items can appear in the lhs, as set with default="lhs". # set minlen to 2 to exclude empty at the left-hand side (lhs) of the first rule rules <- apriori(Titanic.raw, control = list(verbose=F), parameter = list(minlen=2, supp=0.005, conf=0.8), appearance = list(rhs=c("Survived=No", "Survived=Yes"), default="lhs")) quality(rules) <- round(quality(rules), digits=3) # Rules are sorted by lift to make high-lift rules appear first rules.sorted <- sort(rules, by="lift") inspect(rules.sorted) # 64/71 # find redundant rules # finding subsets in associations and itemMatrix objects subset.matrix <- is.subset(rules.sorted, rules.sorted) subset.matrix[lower.tri(subset.matrix, diag=T)] <- NA redundant <- colSums(subset.matrix, na.rm=T) = 1 which(redundant) # 65/71 # remove redundant rules rules.pruned <- rules.sorted[!redundant] inspect(rules.pruned) # 66/71 rules <- apriori(Titanic.raw, parameter = list(minlen=3, supp=0.002, conf=0.2), appearance = list(rhs=c("Survived=Yes"), lhs=c("Class=1st", "Class=2nd", "Class=3rd", "Age=Child", "Age=Adult"), default="none"), control = list(verbose=F)) rules.sorted <- sort(rules, by="confidence") inspect(rules.sorted) # 67/71 plot(rules.all) plot(rules.all, cex=2) x <- rules.all@quality$support y <- rules.all@quality$confidence text(x, y, rownames(rules.all@quality)) # 68/71 plot(rules.all, method="grouped") # 69/71 plot(rules.all, method="graph") plot(rules.all, method="graph", control=list(type="items")) # 70/71 plot(rules.all, method="paracoord", control=list(reorder=TRUE))