I'm analyzing large tables (300 000 - 500 000 rows) that store data output by a disease simulation model. In the model, animals on a landscape infect other animals. For example, in the example pictured below, animal a1 infects every animal on the landscape, and the infection moves from animal to animal, branching off into "chains" of infection.
In my example below, I want to take the table that stores information about each animal (in my example below, table = allanimals
) and slice out just the information about animal d2
's chain of infection (I've highlighted d2
's chain in green) so I can calculate the average habitat value for that chain of infection.
Although my while loop works, it is slow like molasses when the table stores hundreds of thousands of rows, and the chain has 40-100 members.
Any ideas on how to speed this up? Hoping for a tidyverse
solution. I know it "looks fast enough" with my example dataset, but it really is slow with my data...
Schematic:
Desired output from sample data below:
AnimalID InfectingAnimal habitat
1 d2 d1 1
2 d1 c3 1
3 c3 c2 3
4 c2 c1 2
5 c1 b3 3
6 b3 b2 6
7 b2 b1 5
8 b1 a2 4
9 a2 a1 2
10 a1 x 1
Sample code:
library(tidyverse)
# make some data
allanimals <- structure(list(AnimalID = c("a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8",
"b1", "b2", "b3", "b4", "b5", "c1", "c2", "c3", "c4", "d1", "d2", "e1", "e2",
"e3", "e4", "e5", "e6", "f1", "f2", "f3", "f4", "f5", "f6", "f7"),
InfectingAnimal = c("x", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a2", "b1",
"b2", "b3", "b4", "b3", "c1", "c2", "c3", "c3", "d1", "b1", "e1", "e2", "e3",
"e4", "e5", "e1", "f1", "f2", "f3", "f4", "f5", "f6"), habitat = c(1L, 2L, 1L,
2L, 2L, 1L, 3L, 2L, 4L, 5L, 6L, 1L, 2L, 3L, 2L, 3L, 2L, 1L, 1L, 2L, 5L, 4L,
1L, 1L, 1L, 1L, 4L, 5L, 4L, 5L, 4L, 3L)), .Names = c("AnimalID",
"InfectingAnimal", "habitat"), class = "data.frame", row.names = c(NA, -32L))
# check it out
head(allanimals)
# Start with animal I'm interested in - say, d2
Focal.Animal <- "d2"
# Make a 1-row data.frame with d2's information
Focal.Animal <- allanimals %>%
filter(AnimalID == Focal.Animal)
# This is the animal we start with
Focal.Animal
# Make a new data.frame to store our results of the while loop in
Chain <- Focal.Animal
# make a condition to help while loop
InfectingAnimalInTable <- TRUE
# time it
ptm <- proc.time()
# Run loop until you find an animal that isn't in the table, then stop
while(InfectingAnimalInTable == TRUE){
# Who is the next infecting animal?
NextAnimal <- Chain %>%
slice(n()) %>%
select(InfectingAnimal) %>%
unlist()
NextRow <- allanimals %>%
filter(AnimalID == NextAnimal)
# If there is an infecting animal in the table,
if (nrow(NextRow) > 0) {
# Add this to the Chain table
Chain[(nrow(Chain)+1),] <- NextRow
#Otherwise, if there is no infecting animal in the table,
# define the Infecting animal follows, this will stop the loop.
} else {InfectingAnimalInTable <- FALSE}
}
proc.time() - ptm
# did it work? Check out the Chain data.frame
Chain
So the problem here is with your data structure. You will need a vector that stores who is infected by who (keeping the who as integers):
allanimals_ID <- unique(c(allanimals$AnimalID, allanimals$InfectingAnimal))
infected <- rep(NA_integer_, length(allanimals_ID))
infected[match(allanimals$AnimalID, allanimals_ID)] <-
match(allanimals$InfectingAnimal, allanimals_ID)
path <- rep(NA_integer_, length(allanimals_ID))
curOne <- match("d2", allanimals_ID)
i <- 1
while (!is.na(nextOne <- infected[curOne])) {
path[i] <- curOne
i <- i + 1
curOne <- nextOne
}
allanimals[path[seq_len(i - 1)], ]
For extra performance gain, recode this loop with Rcpp :')
You can be able to write a function that does this:
path= function(animals,dat){
.path=function(x,d=""){
k=match(x,dat[,1])
d = paste(d,do.call(paste,dat[k,]),sep="\n ")
ifelse(is.na(k),d,.path(dat[k,2],d))}
n = .path(animals)
regmatches(n,gregexpr("(?<=\\n)",n,perl = T)) = animals
tab = na.omit(read.table(text=n,col.names = c("grp",names(dat))))
split(tab[-1],tab$grp)# This is not necessary. You can decide to return the tab
}
path("d2",allanimals)
$`d2`
AnimalID InfectingAnimal habitat
1 d2 d1 1
2 d1 c3 1
3 c3 c2 3
4 c2 c1 2
5 c1 b3 3
6 b3 b2 6
7 b2 b1 5
8 b1 a2 4
9 a2 a1 2
10 a1 x 1
This function can also give the paths for all the other animals in 4 miliseconds:
allanimals_ID <- unique(c(allanimals$AnimalID, allanimals$InfectingAnimal)
path(allanimals_ID,allanimals)
$`a1`
AnimalID InfectingAnimal habitat
1 a1 x 1
$a2
AnimalID InfectingAnimal habitat
3 a2 a1 2
4 a1 x 1
$a3
AnimalID InfectingAnimal habitat
6 a3 a2 1
7 a2 a1 2
8 a1 x 1
$a4
AnimalID InfectingAnimal habitat
10 a4 a3 2
11 a3 a2 1
12 a2 a1 2
13 a1 x 1
$a5
AnimalID InfectingAnimal habitat
15 a5 a4 2
16 a4 a3 2
17 a3 a2 1
18 a2 a1 2
19 a1 x 1
$a6
AnimalID InfectingAnimal habitat
21 a6 a5 1
22 a5 a4 2
23 a4 a3 2
24 a3 a2 1
25 a2 a1 2
26 a1 x 1
$a7
AnimalID InfectingAnimal habitat
28 a7 a6 3
29 a6 a5 1
30 a5 a4 2
31 a4 a3 2
32 a3 a2 1
33 a2 a1 2
34 a1 x 1
$a8
AnimalID InfectingAnimal habitat
36 a8 a7 2
37 a7 a6 3
38 a6 a5 1
39 a5 a4 2
40 a4 a3 2
41 a3 a2 1
42 a2 a1 2
43 a1 x 1
$b1
AnimalID InfectingAnimal habitat
45 b1 a2 4
46 a2 a1 2
47 a1 x 1
$b2
AnimalID InfectingAnimal habitat
49 b2 b1 5
50 b1 a2 4
51 a2 a1 2
52 a1 x 1
$b3
AnimalID InfectingAnimal habitat
54 b3 b2 6
55 b2 b1 5
56 b1 a2 4
57 a2 a1 2
58 a1 x 1
$b4
AnimalID InfectingAnimal habitat
60 b4 b3 1
61 b3 b2 6
62 b2 b1 5
63 b1 a2 4
64 a2 a1 2
65 a1 x 1
$b5
AnimalID InfectingAnimal habitat
67 b5 b4 2
68 b4 b3 1
69 b3 b2 6
70 b2 b1 5
71 b1 a2 4
72 a2 a1 2
73 a1 x 1
$c1
AnimalID InfectingAnimal habitat
75 c1 b3 3
76 b3 b2 6
77 b2 b1 5
78 b1 a2 4
79 a2 a1 2
80 a1 x 1
$c2
AnimalID InfectingAnimal habitat
82 c2 c1 2
83 c1 b3 3
84 b3 b2 6
85 b2 b1 5
86 b1 a2 4
87 a2 a1 2
88 a1 x 1
$c3
AnimalID InfectingAnimal habitat
90 c3 c2 3
91 c2 c1 2
92 c1 b3 3
93 b3 b2 6
94 b2 b1 5
95 b1 a2 4
96 a2 a1 2
97 a1 x 1
$c4
AnimalID InfectingAnimal habitat
99 c4 c3 2
100 c3 c2 3
101 c2 c1 2
102 c1 b3 3
103 b3 b2 6
104 b2 b1 5
105 b1 a2 4
106 a2 a1 2
107 a1 x 1
$d1
AnimalID InfectingAnimal habitat
109 d1 c3 1
110 c3 c2 3
111 c2 c1 2
112 c1 b3 3
113 b3 b2 6
114 b2 b1 5
115 b1 a2 4
116 a2 a1 2
117 a1 x 1
$d2
AnimalID InfectingAnimal habitat
119 d2 d1 1
120 d1 c3 1
121 c3 c2 3
122 c2 c1 2
123 c1 b3 3
124 b3 b2 6
125 b2 b1 5
126 b1 a2 4
127 a2 a1 2
128 a1 x 1
$e1
AnimalID InfectingAnimal habitat
130 e1 b1 2
131 b1 a2 4
132 a2 a1 2
133 a1 x 1
$e2
AnimalID InfectingAnimal habitat
135 e2 e1 5
136 e1 b1 2
137 b1 a2 4
138 a2 a1 2
139 a1 x 1
$e3
AnimalID InfectingAnimal habitat
141 e3 e2 4
142 e2 e1 5
143 e1 b1 2
144 b1 a2 4
145 a2 a1 2
146 a1 x 1
$e4
AnimalID InfectingAnimal habitat
148 e4 e3 1
149 e3 e2 4
150 e2 e1 5
151 e1 b1 2
152 b1 a2 4
153 a2 a1 2
154 a1 x 1
$e5
AnimalID InfectingAnimal habitat
156 e5 e4 1
157 e4 e3 1
158 e3 e2 4
159 e2 e1 5
160 e1 b1 2
161 b1 a2 4
162 a2 a1 2
163 a1 x 1
$e6
AnimalID InfectingAnimal habitat
165 e6 e5 1
166 e5 e4 1
167 e4 e3 1
168 e3 e2 4
169 e2 e1 5
170 e1 b1 2
171 b1 a2 4
172 a2 a1 2
173 a1 x 1
$f1
AnimalID InfectingAnimal habitat
175 f1 e1 1
176 e1 b1 2
177 b1 a2 4
178 a2 a1 2
179 a1 x 1
$f2
AnimalID InfectingAnimal habitat
181 f2 f1 4
182 f1 e1 1
183 e1 b1 2
184 b1 a2 4
185 a2 a1 2
186 a1 x 1
$f3
AnimalID InfectingAnimal habitat
188 f3 f2 5
189 f2 f1 4
190 f1 e1 1
191 e1 b1 2
192 b1 a2 4
193 a2 a1 2
194 a1 x 1
$f4
AnimalID InfectingAnimal habitat
196 f4 f3 4
197 f3 f2 5
198 f2 f1 4
199 f1 e1 1
200 e1 b1 2
201 b1 a2 4
202 a2 a1 2
203 a1 x 1
$f5
AnimalID InfectingAnimal habitat
205 f5 f4 5
206 f4 f3 4
207 f3 f2 5
208 f2 f1 4
209 f1 e1 1
210 e1 b1 2
211 b1 a2 4
212 a2 a1 2
213 a1 x 1
$f6
AnimalID InfectingAnimal habitat
215 f6 f5 4
216 f5 f4 5
217 f4 f3 4
218 f3 f2 5
219 f2 f1 4
220 f1 e1 1
221 e1 b1 2
222 b1 a2 4
223 a2 a1 2
224 a1 x 1
$f7
AnimalID InfectingAnimal habitat
226 f7 f6 3
227 f6 f5 4
228 f5 f4 5
229 f4 f3 4
230 f3 f2 5
231 f2 f1 4
232 f1 e1 1
233 e1 b1 2
234 b1 a2 4
235 a2 a1 2
236 a1 x 1
$x
[1] AnimalID InfectingAnimal habitat
<0 rows> (or 0-length row.names)
when comparing it to the while
loop above using m
icrobenchmark, this function is twice as fast as the
while loop`.
microbenchmark::microbenchmark(
path_= {path= function(animals,dat){
.path=function(x,d=""){
k=match(x,dat[,1])
d = paste(d,do.call(paste,dat[k,]),sep="\n ")
ifelse(is.na(k),d,.path(dat[k,2],d))}
n = .path(animals)
regmatches(n,gregexpr("(?<=\\n)",n,perl = T)) = animals
tab = na.omit(read.table(text=n,col.names = c("grp",names(dat))))
split(tab[-1],tab$grp)# This is not necessary. You can decide to return the tab
}
path("d2",allanimals)
},
answer_above= {allanimals_ID <- unique(c(allanimals$AnimalID, allanimals$InfectingAnimal))
infected <- rep(NA_integer_, length(allanimals_ID))
infected[match(allanimals$AnimalID, allanimals_ID)] <-
match(allanimals$InfectingAnimal, allanimals_ID)
path <- rep(NA_integer_, length(allanimals_ID))
curOne <- match("d2", allanimals_ID)
i <- 1
while (!is.na(nextOne <- infected[curOne])) {
path[i] <- curOne
i <- i + 1
curOne <- nextOne
}
allanimals[path[seq_len(i - 1)], ]}
)
Unit: milliseconds
expr min lq mean median uq max neval
path_ 1.347699 1.394348 1.606106 1.448677 1.526331 11.800467 100
answer_above 2.655575 2.734935 2.897814 2.800926 2.882846 6.433567 100