I'm new with mongo aggregations, and I need a help with creating one,
I have a collection of the following document as an example:
{
"_id" : ObjectId("5afc2f06e1da131c9802071e"),
"_class" : "Traveler",
"name" : "John Due",
"startTimestamp" : 1526476550933,
"endTimestamp" : 1526476554823,
"source" : "istanbul",
"cities" : [
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
"name" : "Moscow",
"timestamp" : 1526476550940,
"timeSpent" : 3180
},
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-1122",
"name" : "Cairo",
"timestamp" : 1625476550940,
"timeSpent" : 318000,
},
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
"name" : "Moscow",
"timestamp" : 15211276550940,
"timeSpent" : 318011
}
],
"variables" : [
{
"_id" : "cd4318a83c9b-a8478d76bfd3e4b6-5967",
"name" : "Customer Profile",
"lastValue" : "",
"values" : [],
"additionalData" : {}
},
{
"_id" : "366cb8c07996-c62c37a87a86d526-d3e7",
"name" : "Target Telephony Queue",
"lastValue" : "",
"values" : [],
"additionalData" : {}
},
{
"_id" : "4ed84742da33-d70ba8a809b712f3-bdf4",
"name" : "IMEI",
"lastValue" : "",
"values" : [],
"additionalData" : {}
},
{
"_id" : "c8103687c1c8-97d749e349d785c8-9154",
"name" : "Budget",
"defaultValue" : "",
"lastValue" : "",
"values" : [
{
"value" : "3000",
"timestamp" : NumberLong(1526476550940),
"element" : "c8103687c1c8-97d749e349d785c8-9154"
}
],
"additionalData" : {}
}
]
}
I need to have a resulting document showing the how many times each city have been visited by each traveler in the collection, and the average budget (budget is an element in the variables array
so the resulting document will be similar to:
{
"_id" : ObjectId("5afc2f06e1da131c9802071e"),
"_class" : "Traveler",
"name" : "John Due",
"startTimestamp" : 1526476550933,
"endTimestamp" : 1526476554823,
"source" : "istanbul",
"cities" : [
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
"name" : "Moscow",
"visited":2
},
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-1122",
"name" : "Cairo",
"visited":1
}
],
"variables" : [
{
"_id" : "c8103687c1c8-97d749e349d785c8-9154",
"name" : "Budget",
"defaultValue" : "",
"lastValue" : "",
"values" : [
{
"value" : "3000",
}
],
}
],
}
Thank you for your help
As a quick note, you need to change your "value"
field inside the "values"
to be numeric, since it's presently a string. But on to the answer:
If you have access to $reduce
from MongoDB 3.4, then you can actually do something like this:
db.collection.aggregate([
{ "$addFields": {
"cities": {
"$reduce": {
"input": "$cities",
"initialValue": [],
"in": {
"$cond": {
"if": { "$ne": [{ "$indexOfArray": ["$$value._id", "$$this._id"] }, -1] },
"then": {
"$concatArrays": [
{ "$filter": {
"input": "$$value",
"as": "v",
"cond": { "$ne": [ "$$this._id", "$$v._id" ] }
}},
[{
"_id": "$$this._id",
"name": "$$this.name",
"visited": {
"$add": [
{ "$arrayElemAt": [
"$$value.visited",
{ "$indexOfArray": [ "$$value._id", "$$this._id" ] }
]},
1
]
}
}]
]
},
"else": {
"$concatArrays": [
"$$value",
[{
"_id": "$$this._id",
"name": "$$this.name",
"visited": 1
}]
]
}
}
}
}
},
"variables": {
"$map": {
"input": {
"$filter": {
"input": "$variables",
"cond": { "$eq": ["$$this.name", "Budget"] }
}
},
"in": {
"_id": "$$this._id",
"name": "$$this.name",
"defaultValue": "$$this.defaultValue",
"lastValue": "$$this.lastValue",
"value": { "$avg": "$$this.values.value" }
}
}
}
}}
])
If you have MongoDB 3.6, you can clean that up a bit with $mergeObjects
:
db.collection.aggregate([
{ "$addFields": {
"cities": {
"$reduce": {
"input": "$cities",
"initialValue": [],
"in": {
"$cond": {
"if": { "$ne": [{ "$indexOfArray": ["$$value._id", "$$this._id"] }, -1] },
"then": {
"$concatArrays": [
{ "$filter": {
"input": "$$value",
"as": "v",
"cond": { "$ne": [ "$$this._id", "$$v._id" ] }
}},
[{
"_id": "$$this._id",
"name": "$$this.name",
"visited": {
"$add": [
{ "$arrayElemAt": [
"$$value.visited",
{ "$indexOfArray": [ "$$value._id", "$$this._id" ] }
]},
1
]
}
}]
]
},
"else": {
"$concatArrays": [
"$$value",
[{
"_id": "$$this._id",
"name": "$$this.name",
"visited": 1
}]
]
}
}
}
}
},
"variables": {
"$map": {
"input": {
"$filter": {
"input": "$variables",
"cond": { "$eq": ["$$this.name", "Budget"] }
}
},
"in": {
"$mergeObjects": [
"$$this",
{ "values": { "$avg": "$$this.values.value" } }
]
}
}
}
}}
])
But it's more or less the same thing except we keep the additionalData
Going back a little before that, then you can always $unwind
the "cities"
to accumulate:
db.collection.aggregate([
{ "$unwind": "$cities" },
{ "$group": {
"_id": {
"_id": "$_id",
"cities": {
"_id": "$cities._id",
"name": "$cities.name"
}
},
"_class": { "$first": "$class" },
"name": { "$first": "$name" },
"startTimestamp": { "$first": "$startTimestamp" },
"endTimestamp" : { "$first": "$endTimestamp" },
"source" : { "$first": "$source" },
"variables": { "$first": "$variables" },
"visited": { "$sum": 1 }
}},
{ "$group": {
"_id": "$_id._id",
"_class": { "$first": "$class" },
"name": { "$first": "$name" },
"startTimestamp": { "$first": "$startTimestamp" },
"endTimestamp" : { "$first": "$endTimestamp" },
"source" : { "$first": "$source" },
"cities": {
"$push": {
"_id": "$_id.cities._id",
"name": "$_id.cities.name",
"visited": "$visited"
}
},
"variables": { "$first": "$variables" },
}},
{ "$addFields": {
"variables": {
"$map": {
"input": {
"$filter": {
"input": "$variables",
"cond": { "$eq": ["$$this.name", "Budget"] }
}
},
"in": {
"_id": "$$this._id",
"name": "$$this.name",
"defaultValue": "$$this.defaultValue",
"lastValue": "$$this.lastValue",
"value": { "$avg": "$$this.values.value" }
}
}
}
}}
])
All return (almost) the same thing:
{
"_id" : ObjectId("5afc2f06e1da131c9802071e"),
"_class" : "Traveler",
"name" : "John Due",
"startTimestamp" : 1526476550933,
"endTimestamp" : 1526476554823,
"source" : "istanbul",
"cities" : [
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-1122",
"name" : "Cairo",
"visited" : 1
},
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
"name" : "Moscow",
"visited" : 2
}
],
"variables" : [
{
"_id" : "c8103687c1c8-97d749e349d785c8-9154",
"name" : "Budget",
"defaultValue" : "",
"lastValue" : "",
"value" : 3000
}
]
}
The first two forms are of course the most optimal thing to do since they are simply working "within" the same document at all times.
Operators like $reduce
allow "accumulation" expressions on arrays, so we can use it here to keep a "reduced" array which we test for the unique "_id"
value using $indexOfArray
in order to see if the there already is an accumulated item that matches. A result of -1
means it's not there.
In order construct a "reduced array" we take the "initialValue"
of []
as an empty array and then add to it via $concatArrays
. All of that process is decided via the "ternary" $cond
operator which considers the "if"
condition and "then"
either "joins" the output of the $filter
on the current $$value
to exclude the current index _id
entry, with of course another "array" representing the singular object.
For that "object" we again use the $indexOfArray
to actually get the matched index since we know that the item "is there", and use that to extract the current "visited"
value from that entry via $arrayElemAt
and $add
to it in order to increment.
In the "else"
case we simply add an "array" as an "object" which just has a default "visited"
value of 1
. Using both those cases effectively accumulates unique values within the array to output.
In the latter version, we just $unwind
the array and use successive $group
stages in order to first "count" on the unique inner entries, and then "re-construct the array" into the similar form.
Using $unwind
looks far more simple, but since what it actually does is take a copy of the document for every array entry, then this actually adds considerable overhead to processing. In modern versions there are generally array operators which mean you don't need to use this unless your intention is to "accumulate across documents". So if you actually need to $group
on a value of a key from "inside" an array, then that is where you actually do need to use it.
As for the "variables"
then we can simply use the $filter
again here to get the matching "Budget"
entry. We do this as the input to the $map
operator which allows "re-shaping" of the array content. We mainly want that so you can take the content of the "values"
( once you make it all numeric ) and use the $avg
operator, which is supplied that "field path notation" form directly to the array values because it can in fact return a result from such an input.
That generally makes the tour of pretty much ALL of the main "array operators" for the aggregation pipeline ( excluding the "set" operators ) all within a single pipeline stage.
Also don't ever forget that you just about always want to $match
with regular Query Operators as the "very first stage" of any aggregation pipeline in order to just select the documents you need. Ideally using an index.
Alternates
Alternates are working through the documents in client code. It generally would not be recommended since all methods above show they actually "reduce" the content as returned from the server, as is generally the point of "server aggregations".
It "may" be possible due to the "document based" nature that larger result sets may take considerably more time using $unwind
and client processing could be an option, but I would consider it far more likely
Below is a listing that demonstrates applying a transform to the cursor stream as results are returned doing the same thing. There are three demonstrated versions of the transform, showing "exactly" the same logic as above, a implementation with lodash
methods for accumulation, and a "natural" accumulation on the Map
implementation:
const { MongoClient } = require('mongodb');
const { chain } = require('lodash');
const uri = 'mongodb://localhost:27017';
const opts = { useNewUrlParser: true };
const log = data => console.log(JSON.stringify(data, undefined, 2));
const transform = ({ cities, variables, ...d }) => ({
...d,
cities: cities.reduce((o,{ _id, name }) =>
(o.map(i => i._id).indexOf(_id) != -1)
? [
...o.filter(i => i._id != _id),
{ _id, name, visited: o.find(e => e._id === _id).visited + 1 }
]
: [ ...o, { _id, name, visited: 1 } ]
, []).sort((a,b) => b.visited - a.visited),
variables: variables.filter(v => v.name === "Budget")
.map(({ values, additionalData, ...v }) => ({
...v,
values: (values != undefined)
? values.reduce((o,e) => o + e.value, 0) / values.length
: 0
}))
});
const alternate = ({ cities, variables, ...d }) => ({
...d,
cities: chain(cities)
.groupBy("_id")
.toPairs()
.map(([k,v]) =>
({
...v.reduce((o,{ _id, name }) => ({ ...o, _id, name }),{}),
visited: v.length
})
)
.sort((a,b) => b.visited - a.visited)
.value(),
variables: variables.filter(v => v.name === "Budget")
.map(({ values, additionalData, ...v }) => ({
...v,
values: (values != undefined)
? values.reduce((o,e) => o + e.value, 0) / values.length
: 0
}))
});
const natural = ({ cities, variables, ...d }) => ({
...d,
cities: [
...cities
.reduce((o,{ _id, name }) => o.set(_id,
[ ...(o.has(_id) ? o.get(_id) : []), { _id, name } ]), new Map())
.entries()
]
.map(([k,v]) =>
({
...v.reduce((o,{ _id, name }) => ({ ...o, _id, name }),{}),
visited: v.length
})
)
.sort((a,b) => b.visited - a.visited),
variables: variables.filter(v => v.name === "Budget")
.map(({ values, additionalData, ...v }) => ({
...v,
values: (values != undefined)
? values.reduce((o,e) => o + e.value, 0) / values.length
: 0
}))
});
(async function() {
try {
const client = await MongoClient.connect(uri, opts);
let db = client.db('test');
let coll = db.collection('junk');
let cursor = coll.find().map(natural);
while (await cursor.hasNext()) {
let doc = await cursor.next();
log(doc);
}
client.close();
} catch(e) {
console.error(e)
} finally {
process.exit()
}
})()