I have a MongoDB collection, whose docs use several levels of nesting, from which I would like to extract a multidimensional array compiled from a subset of their fields. I have a solution that works for me right now, but I want to better understand this concept of 'idempotency' and its consequences related to the reduce function.
{
"host_name" : "gateway",
"service_description" : "PING",
"last_update" : 1305777787,
"performance_object" : [
[ "rta", 0.105, "ms", 100, 500, 0 ],
[ "pl", 0, "%", 20, 60, 0 ]
]
}
And here are the map/reduce functions
var M = function() {
var hn = this.host_name,
sv = this.service_description,
ts = this.last_update;
this.performance_object.forEach(function(P){
emit( {
host: hn,
service: sv,
metric: P[0]
}, {
time: ts,
value: P[1]
} );
});
}
var R = function(key,values) {
var result = {
time: [],
value: []
};
values.forEach(function(V){
result.time.push(V.time);
result.value.push(V.value);
});
return result;
}
db.runCommand({
mapreduce: <colname>,
out: <col2name>,
map: M,
reduce: R
});
Data is returned in a useful structure, which I reformat/sort with finalize for graphing.
{
"_id" : {
"host" : "localhost",
"service" : "Disk Space",
"metric" : "/var/bck"
},
"value" : {
"time" : [
[ 1306719302, 1306719601, 1306719903, ... ],
[ 1306736404, 1306736703, 1306737002, ... ],
[ 1306766401, 1306766701, 1306767001, ... ]
],
"value" : [
[ 122, 23423, 25654, ... ],
[ 336114, 342511, 349067, ... ],
[ 551196, 551196, 551196, ... ]
]
}
}
Finally...
[ [1306719302,122], [1306719601,23423], [1306719903,25654], ... ]
TL;DR: What is the expected behavior with the oberved "chunking" of the array results?
I understand that the reduce function may be called multiple times on array(s) of emitted values, which is why there are several "chunks" of the complete arrays, rather than a single array. The array chunks are typically 25-50 items and it's easy enough to clean this up in finalize(). I concat() the arrays, interleave them as [time,value] and sort. But what I really want to know is if this can get more complex:
1) Is the chunking observed because of my code, MongoDB's implementation or the Map/Reduce algorithm itself?
2) Will there ever be deeper (recursive) nesting of array chunks in sharded configurations or even just because of my hasty implementation? This would break the concat() method.
3) Is there simply a better strategy for getting array results as shown above?
EDIT: Modified to emit arrays:
I took Thomas' advise and re-wrote it to emit arrays. It absolutely doesn't make any sense to split up the values.
var M = function() {
var hn = this.host_name,
sv = this.service_description,
ts = this.last_update;
this.performance_object.forEach(function(P){
emit( {
host: hn,
service: sv,
metric: P[0]
}, {
value: [ ts, P[1] ]
} );
});
}
var R = function(key,values) {
var result = {
value: []
};
values.forEach(function(V){
result.value.push(V.value);
});
return result;
}
db.runCommand({
mapreduce: <colname>,
out: <col2name>,
map: M,
reduce: R
});
Now the output is similar to this:
{
"_id" : {
"host" : "localhost",
"service" : "Disk Space",
"metric" : "/var/bck"
},
"value" : {
"value" : [
[ [1306736404,336114],[1306736703,342511],[1306737002,349067], ... ],
[ [1306766401,551196],[1306766701,551196],[1306767001,551196], ... ],
[ [1306719302,122],[1306719601,122],[1306719903,122], ... ]
]
}
}
And I used this finalize function to concatenate the array chunks and sort them.
...
var F = function(key,values) {
return (Array.concat.apply([],values.value)).sort(function(a,b){
if (a[0] < b[0]) return -1;
if (a[0] > b[0]) return 1;
return 0;
});
}
db.runCommand({
mapreduce: <colname>,
out: <col2name>,
map: M,
reduce: R,
finalize: F
});
Which works nicely:
{
"_id" : {
"host" : "localhost",
"service" : "Disk Space",
"metric" : "/mnt/bck"
},
"value" : [ [1306719302,122],[1306719601,122],[1306719903,122],, ... ]
}
I guess the only question that's gnawing at me is whether this Array.concat.apply([],values.value) can be trusted to clean up the output of reduce all of the time.
LAST EDIT: Much simpler...
I have modified the document structure since the original example given above, but this only changes the example by making the map function really simple.
I'm still trying to wrap my brain around why Array.prototype.push.apply(result, V.data) works so differently from result.push(V.data)... but it works.
var M = function() {
emit( {
host: this.host,
service: this.service,
metric: this.metric
} , {
data: [ [ this.timestamp, this.data ] ]
} );
}
var R = function(key,values) {
var result = [];
values.forEach(function(V){
Array.prototype.push.apply(result, V.data);
});
return { data: result };
}
var F = function(key,values) {
return values.data.sort(function(a,b){
return (a[0]<b[0]) ? -1 : (a[0]>b[0]) ? 1 : 0;
});
}
It has the same output as shown just above the LAST EDIT heading.
Thanks, Thomas!