Faster method than “while” loop to find chain of i

2019-05-06 10:29发布

问题:

I'm analyzing large tables (300 000 - 500 000 rows) that store data output by a disease simulation model. In the model, animals on a landscape infect other animals. For example, in the example pictured below, animal a1 infects every animal on the landscape, and the infection moves from animal to animal, branching off into "chains" of infection.

In my example below, I want to take the table that stores information about each animal (in my example below, table = allanimals) and slice out just the information about animal d2's chain of infection (I've highlighted d2's chain in green) so I can calculate the average habitat value for that chain of infection.

Although my while loop works, it is slow like molasses when the table stores hundreds of thousands of rows, and the chain has 40-100 members.

Any ideas on how to speed this up? Hoping for a tidyverse solution. I know it "looks fast enough" with my example dataset, but it really is slow with my data...

Schematic:

Desired output from sample data below:

   AnimalID InfectingAnimal habitat
1        d2              d1       1
2        d1              c3       1
3        c3              c2       3
4        c2              c1       2
5        c1              b3       3
6        b3              b2       6
7        b2              b1       5
8        b1              a2       4
9        a2              a1       2
10       a1               x       1

Sample code:

library(tidyverse)

# make some data
allanimals <- structure(list(AnimalID = c("a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8",
"b1", "b2", "b3", "b4", "b5", "c1", "c2", "c3", "c4", "d1", "d2", "e1", "e2",
"e3", "e4", "e5", "e6", "f1", "f2", "f3", "f4", "f5", "f6", "f7"),
InfectingAnimal = c("x", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a2", "b1",
"b2", "b3", "b4", "b3", "c1", "c2", "c3", "c3", "d1", "b1", "e1", "e2", "e3",
"e4", "e5", "e1", "f1", "f2", "f3", "f4", "f5", "f6"), habitat = c(1L, 2L, 1L,
2L, 2L, 1L, 3L, 2L, 4L, 5L, 6L, 1L, 2L, 3L, 2L, 3L, 2L, 1L, 1L, 2L, 5L, 4L,
1L, 1L, 1L, 1L, 4L, 5L, 4L, 5L, 4L, 3L)), .Names = c("AnimalID",
"InfectingAnimal", "habitat"), class = "data.frame", row.names = c(NA, -32L))

# check it out
head(allanimals)

# Start with animal I'm interested in - say, d2
Focal.Animal <- "d2"

# Make a 1-row data.frame with d2's information
Focal.Animal <- allanimals %>% 
  filter(AnimalID == Focal.Animal)

# This is the animal we start with
Focal.Animal

# Make a new data.frame to store our results of the while loop in
Chain <- Focal.Animal

# make a condition to help while loop
InfectingAnimalInTable <- TRUE

# time it 
ptm <- proc.time()

# Run loop until you find an animal that isn't in the table, then stop
while(InfectingAnimalInTable == TRUE){
    # Who is the next infecting animal?
    NextAnimal <- Chain %>% 
      slice(n()) %>% 
      select(InfectingAnimal) %>% 
      unlist()

    NextRow <- allanimals %>% 
      filter(AnimalID == NextAnimal)


    # If there is an infecting animal in the table, 
    if (nrow(NextRow) > 0) {
      # Add this to the Chain table
      Chain[(nrow(Chain)+1),] <- NextRow
      #Otherwise, if there is no infecting animal in the  table, 
      # define the Infecting animal follows, this will stop the loop.
    } else {InfectingAnimalInTable <- FALSE}
  }

proc.time() - ptm

# did it work? Check out the Chain data.frame
Chain

回答1:

So the problem here is with your data structure. You will need a vector that stores who is infected by who (keeping the who as integers):

allanimals_ID <- unique(c(allanimals$AnimalID, allanimals$InfectingAnimal))

infected <- rep(NA_integer_, length(allanimals_ID))
infected[match(allanimals$AnimalID, allanimals_ID)] <-
  match(allanimals$InfectingAnimal, allanimals_ID)

path <- rep(NA_integer_, length(allanimals_ID))
curOne <- match("d2", allanimals_ID)
i <- 1
while (!is.na(nextOne <- infected[curOne])) {
  path[i] <- curOne
  i <- i + 1
  curOne <- nextOne
}

allanimals[path[seq_len(i - 1)], ]

For extra performance gain, recode this loop with Rcpp :')



回答2:

You can be able to write a function that does this:

path= function(animals,dat){

  .path=function(x,d=""){
    k=match(x,dat[,1])
    d = paste(d,do.call(paste,dat[k,]),sep="\n ")
    ifelse(is.na(k),d,.path(dat[k,2],d))}

  n = .path(animals)
  regmatches(n,gregexpr("(?<=\\n)",n,perl = T)) = animals

  tab = na.omit(read.table(text=n,col.names = c("grp",names(dat))))
  split(tab[-1],tab$grp)# This is not necessary. You can decide to return the tab

}

path("d2",allanimals)
$`d2`
   AnimalID InfectingAnimal habitat
1        d2              d1       1
2        d1              c3       1
3        c3              c2       3
4        c2              c1       2
5        c1              b3       3
6        b3              b2       6
7        b2              b1       5
8        b1              a2       4
9        a2              a1       2
10       a1               x       1

This function can also give the paths for all the other animals in 4 miliseconds:

allanimals_ID <- unique(c(allanimals$AnimalID, allanimals$InfectingAnimal)
path(allanimals_ID,allanimals)
$`a1`
  AnimalID InfectingAnimal habitat
1       a1               x       1

$a2
  AnimalID InfectingAnimal habitat
3       a2              a1       2
4       a1               x       1

$a3
  AnimalID InfectingAnimal habitat
6       a3              a2       1
7       a2              a1       2
8       a1               x       1

$a4
   AnimalID InfectingAnimal habitat
10       a4              a3       2
11       a3              a2       1
12       a2              a1       2
13       a1               x       1

$a5
   AnimalID InfectingAnimal habitat
15       a5              a4       2
16       a4              a3       2
17       a3              a2       1
18       a2              a1       2
19       a1               x       1

$a6
   AnimalID InfectingAnimal habitat
21       a6              a5       1
22       a5              a4       2
23       a4              a3       2
24       a3              a2       1
25       a2              a1       2
26       a1               x       1

$a7
   AnimalID InfectingAnimal habitat
28       a7              a6       3
29       a6              a5       1
30       a5              a4       2
31       a4              a3       2
32       a3              a2       1
33       a2              a1       2
34       a1               x       1

$a8
   AnimalID InfectingAnimal habitat
36       a8              a7       2
37       a7              a6       3
38       a6              a5       1
39       a5              a4       2
40       a4              a3       2
41       a3              a2       1
42       a2              a1       2
43       a1               x       1

$b1
   AnimalID InfectingAnimal habitat
45       b1              a2       4
46       a2              a1       2
47       a1               x       1

$b2
   AnimalID InfectingAnimal habitat
49       b2              b1       5
50       b1              a2       4
51       a2              a1       2
52       a1               x       1

$b3
   AnimalID InfectingAnimal habitat
54       b3              b2       6
55       b2              b1       5
56       b1              a2       4
57       a2              a1       2
58       a1               x       1

$b4
   AnimalID InfectingAnimal habitat
60       b4              b3       1
61       b3              b2       6
62       b2              b1       5
63       b1              a2       4
64       a2              a1       2
65       a1               x       1

$b5
   AnimalID InfectingAnimal habitat
67       b5              b4       2
68       b4              b3       1
69       b3              b2       6
70       b2              b1       5
71       b1              a2       4
72       a2              a1       2
73       a1               x       1

$c1
   AnimalID InfectingAnimal habitat
75       c1              b3       3
76       b3              b2       6
77       b2              b1       5
78       b1              a2       4
79       a2              a1       2
80       a1               x       1

$c2
   AnimalID InfectingAnimal habitat
82       c2              c1       2
83       c1              b3       3
84       b3              b2       6
85       b2              b1       5
86       b1              a2       4
87       a2              a1       2
88       a1               x       1

$c3
   AnimalID InfectingAnimal habitat
90       c3              c2       3
91       c2              c1       2
92       c1              b3       3
93       b3              b2       6
94       b2              b1       5
95       b1              a2       4
96       a2              a1       2
97       a1               x       1

$c4
    AnimalID InfectingAnimal habitat
99        c4              c3       2
100       c3              c2       3
101       c2              c1       2
102       c1              b3       3
103       b3              b2       6
104       b2              b1       5
105       b1              a2       4
106       a2              a1       2
107       a1               x       1

$d1
    AnimalID InfectingAnimal habitat
109       d1              c3       1
110       c3              c2       3
111       c2              c1       2
112       c1              b3       3
113       b3              b2       6
114       b2              b1       5
115       b1              a2       4
116       a2              a1       2
117       a1               x       1

$d2
    AnimalID InfectingAnimal habitat
119       d2              d1       1
120       d1              c3       1
121       c3              c2       3
122       c2              c1       2
123       c1              b3       3
124       b3              b2       6
125       b2              b1       5
126       b1              a2       4
127       a2              a1       2
128       a1               x       1

$e1
    AnimalID InfectingAnimal habitat
130       e1              b1       2
131       b1              a2       4
132       a2              a1       2
133       a1               x       1

$e2
    AnimalID InfectingAnimal habitat
135       e2              e1       5
136       e1              b1       2
137       b1              a2       4
138       a2              a1       2
139       a1               x       1

$e3
    AnimalID InfectingAnimal habitat
141       e3              e2       4
142       e2              e1       5
143       e1              b1       2
144       b1              a2       4
145       a2              a1       2
146       a1               x       1

$e4
    AnimalID InfectingAnimal habitat
148       e4              e3       1
149       e3              e2       4
150       e2              e1       5
151       e1              b1       2
152       b1              a2       4
153       a2              a1       2
154       a1               x       1

$e5
    AnimalID InfectingAnimal habitat
156       e5              e4       1
157       e4              e3       1
158       e3              e2       4
159       e2              e1       5
160       e1              b1       2
161       b1              a2       4
162       a2              a1       2
163       a1               x       1

$e6
    AnimalID InfectingAnimal habitat
165       e6              e5       1
166       e5              e4       1
167       e4              e3       1
168       e3              e2       4
169       e2              e1       5
170       e1              b1       2
171       b1              a2       4
172       a2              a1       2
173       a1               x       1

$f1
    AnimalID InfectingAnimal habitat
175       f1              e1       1
176       e1              b1       2
177       b1              a2       4
178       a2              a1       2
179       a1               x       1

$f2
    AnimalID InfectingAnimal habitat
181       f2              f1       4
182       f1              e1       1
183       e1              b1       2
184       b1              a2       4
185       a2              a1       2
186       a1               x       1

$f3
    AnimalID InfectingAnimal habitat
188       f3              f2       5
189       f2              f1       4
190       f1              e1       1
191       e1              b1       2
192       b1              a2       4
193       a2              a1       2
194       a1               x       1

$f4
    AnimalID InfectingAnimal habitat
196       f4              f3       4
197       f3              f2       5
198       f2              f1       4
199       f1              e1       1
200       e1              b1       2
201       b1              a2       4
202       a2              a1       2
203       a1               x       1

$f5
    AnimalID InfectingAnimal habitat
205       f5              f4       5
206       f4              f3       4
207       f3              f2       5
208       f2              f1       4
209       f1              e1       1
210       e1              b1       2
211       b1              a2       4
212       a2              a1       2
213       a1               x       1

$f6
    AnimalID InfectingAnimal habitat
215       f6              f5       4
216       f5              f4       5
217       f4              f3       4
218       f3              f2       5
219       f2              f1       4
220       f1              e1       1
221       e1              b1       2
222       b1              a2       4
223       a2              a1       2
224       a1               x       1

$f7
    AnimalID InfectingAnimal habitat
226       f7              f6       3
227       f6              f5       4
228       f5              f4       5
229       f4              f3       4
230       f3              f2       5
231       f2              f1       4
232       f1              e1       1
233       e1              b1       2
234       b1              a2       4
235       a2              a1       2
236       a1               x       1

$x
[1] AnimalID        InfectingAnimal habitat        
<0 rows> (or 0-length row.names)

when comparing it to the while loop above using microbenchmark, this function is twice as fast as thewhile loop`.

microbenchmark::microbenchmark(
  path_= {path= function(animals,dat){

    .path=function(x,d=""){
      k=match(x,dat[,1])
      d = paste(d,do.call(paste,dat[k,]),sep="\n ")
      ifelse(is.na(k),d,.path(dat[k,2],d))}

    n = .path(animals)
    regmatches(n,gregexpr("(?<=\\n)",n,perl = T)) = animals

    tab = na.omit(read.table(text=n,col.names = c("grp",names(dat))))
    split(tab[-1],tab$grp)# This is not necessary. You can decide to return the tab

  }
  path("d2",allanimals)
  },

  answer_above= {allanimals_ID <- unique(c(allanimals$AnimalID, allanimals$InfectingAnimal))

  infected <- rep(NA_integer_, length(allanimals_ID))
  infected[match(allanimals$AnimalID, allanimals_ID)] <-
    match(allanimals$InfectingAnimal, allanimals_ID)

  path <- rep(NA_integer_, length(allanimals_ID))
  curOne <- match("d2", allanimals_ID)
  i <- 1
  while (!is.na(nextOne <- infected[curOne])) {
    path[i] <- curOne
    i <- i + 1
    curOne <- nextOne
  }

  allanimals[path[seq_len(i - 1)], ]}
)
Unit: milliseconds
         expr      min       lq     mean   median       uq       max neval
        path_ 1.347699 1.394348 1.606106 1.448677 1.526331 11.800467   100
 answer_above 2.655575 2.734935 2.897814 2.800926 2.882846  6.433567   100