-->

arules: How find the data matching an lhs(rule) in

2019-08-10 03:32发布

问题:

I'm finding working with the arule package a bit tricky. I'm using the apriori algorithm to find association rules; something similar to an example in the arules documentation.

data("AdultUCI")
dim(AdultUCI)
AdultUCI[1:2,]

#Ignore everything from here to the last two lines, this is just data preparation

## remove attributes
AdultUCI[["fnlwgt"]] <- NULL
AdultUCI[["education-num"]] <- NULL

## map metric attributes
AdultUCI[[ "age"]] <- ordered(cut(AdultUCI[[ "age"]], c(15,25,45,65,100)),
                              labels = c("Young", "Middle-aged", "Senior", "Old"))

AdultUCI[[ "hours-per-week"]] <- ordered(cut(AdultUCI[[ "hours-per-week"]],
                                             c(0,25,40,60,168)),
                                         labels = c("Part-time", "Full-time", "Over-time", "Workaholic"))

AdultUCI[[ "capital-gain"]] <- ordered(cut(AdultUCI[[ "capital-gain"]],
                                           c(-Inf,0,median(AdultUCI[[ "capital-gain"]][AdultUCI[[ "capital-gain"]]>0]),
                                             Inf)), labels = c("None", "Low", "High"))

AdultUCI[[ "capital-loss"]] <- ordered(cut(AdultUCI[[ "capital-loss"]],
                                           c(-Inf,0, median(AdultUCI[[ "capital-loss"]][AdultUCI[[ "capital-loss"]]>0]),
                                             Inf)), labels = c("None", "Low", "High"))

#resume reading here
rules <- apriori(AdultUCI, parameter=list(support=0.6, confidence=0.75, minlen=4))
inspect(rules)

Which returns the following four rules

lhs                               rhs                             support confidence      lift
1 {race=White,                                                                                  
   capital-gain=None,                                                                           
   native-country=United-States} => {capital-loss=None}            0.680398  0.9457029 0.9920537
2 {race=White,                                                                                  
   capital-loss=None,                                                                           
   native-country=United-States} => {capital-gain=None}            0.680398  0.9083504 0.9901500
3 {race=White,                                                                                  
   capital-gain=None,                                                                           
   capital-loss=None}            => {native-country=United-States} 0.680398  0.9189249 1.0239581
4 {capital-gain=None,                                                                           
   capital-loss=None,                                                                           
   native-country=United-States} => {race=White}                   0.680398  0.8730100 1.0210133

I must be missing something: how do you find the rows in the source data that match an lhs rule using just arules functions?

Is there an easy way to build an SQL WHERE clause from the lhs(rules)?

Thanks

回答1:

This answer is based in the following answer: https://stats.stackexchange.com/questions/21340/finding-suitable-rules-for-new-data-using-arules. The solution is very slow, i´m not sure if will work for large aplications.

library(arules)

rules <- apriori(AdultUCI, parameter=list(support=0.4, confidence=0.75, minlen=4))
inspect(rules)

rec <- function(rules, data, iter){
  basket <- data[iter]
  rulesMatchLHS <- is.subset(rules@lhs,basket)
  suitableRules <-  rulesMatchLHS & !(is.subset(rules@rhs,basket))
  rules <- sort(rules[rulesMatchLHS], decreasing=TRUE, by="lift")
  as(head(rules, 1), "data.frame")
}

recom_loop <- function(rules, data){
  temp <- lapply(seq_along(data), function(x) rec(rules, data, x))
  temp <- do.call("rbind", temp)
  recom <- gsub(".*=> |\\{|\\}", "", temp$rules)
  as.data.frame(cbind(as(data, "data.frame"), recom))  
}

trans <- as(AdultUCI, "transactions")
recom <- recom_loop(rules, trans[1:50])

Here is some example output:

head(recom)
  transactionID
1             1
2             2
3             3
4             4
5             5
6             6
                                                                                                                                                                                                                                                                     items
1      {age=Middle-aged,workclass=State-gov,education=Bachelors,marital-status=Never-married,occupation=Adm-clerical,relationship=Not-in-family,race=White,sex=Male,capital-gain=Low,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
2 {age=Senior,workclass=Self-emp-not-inc,education=Bachelors,marital-status=Married-civ-spouse,occupation=Exec-managerial,relationship=Husband,race=White,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Part-time,native-country=United-States,income=small}
3         {age=Middle-aged,workclass=Private,education=HS-grad,marital-status=Divorced,occupation=Handlers-cleaners,relationship=Not-in-family,race=White,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
4             {age=Senior,workclass=Private,education=11th,marital-status=Married-civ-spouse,occupation=Handlers-cleaners,relationship=Husband,race=Black,sex=Male,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
5                {age=Middle-aged,workclass=Private,education=Bachelors,marital-status=Married-civ-spouse,occupation=Prof-specialty,relationship=Wife,race=Black,sex=Female,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=Cuba,income=small}
6        {age=Middle-aged,workclass=Private,education=Masters,marital-status=Married-civ-spouse,occupation=Exec-managerial,relationship=Wife,race=White,sex=Female,capital-gain=None,capital-loss=None,hours-per-week=Full-time,native-country=United-States,income=small}
              recom
1        race=White
2        race=White
3        race=White
4        race=White
5        race=White
6 capital-gain=None


回答2:

As for the first question, transactions supporting may be found using this function (should work faster, than the one from the other response):

supp_trans_ids = function(items, transactions){
  # makes a logical matrix showing which set of items in rows are fully contains in transactions on rows 
  tmp = is.subset(items, transactions)

  tmp2 = lapply(
    seq_len(nrow(tmp)),
    # 'which' alone would leave a name for each index, which is a complete rule (and would use a lot of memory therefore)
    function(i) {
      t = which(tmp[i,])
      names(t) = NULL
      t
    }
  )

  # to easily idenfify sets of items
  names(tmp2) = rownames(tmp)

  tmp2
}

Now, you may find which transactions support each rule's lhs with:

AdultUCI_trans = as(AdultUCI, 'transactions')
trans_supporting = supp_trans_ids(lhs(rules), AdultUCI_trans)

e.g.

> str(trans_supporting)
List of 4
 $ {race=White,capital-gain=None,native-country=United-States}       : int [1:35140] 2 3 6 8 13 17 18 19 20 21 ...
 $ {race=White,capital-loss=None,native-country=United-States}       : int [1:36585] 1 2 3 6 8 9 10 13 17 18 ...
 $ {race=White,capital-gain=None,capital-loss=None}                  : int [1:36164] 2 3 6 8 13 17 18 19 20 21 ...
 $ {capital-gain=None,capital-loss=None,native-country=United-States}: int [1:38066] 2 3 4 6 8 11 13 14 17 18 ...

And data you may find with:

AdultUCI_trans[trans_supporting[[1]]] # transactions supporting
AdultUCI[trans_supporting[[1]],] # data on which these transactions are based


标签: r apriori arules