How to get average value from a hashmap in MongoDB

2019-04-15 09:33发布

问题:

I have a time data in my Mongo database. Each document equal a minute and contain 60 seconds as objects with value for each. How to get average value of all seconds in one minute?

A document looking like that:

{
    "_id" : ObjectId("55575e4062771c26ec5f2287"),
    "timestamp" : "2015-05-16T18:12:00.000Z",
    "values" : {
        "0" : "26.17",
        "1" : "26.17",
        "2" : "26.17",
        ...
        "58" : "24.71",
        "59" : "25.20"
    }
}

回答1:

You could take two approaches here:

  1. Changing the schema and use the aggregation framework to get the average by using the $avg operator OR
  2. Apply Map-Reduce.

Let's look at the first option. Currently as it is, the schema will not make it possible to use the aggregation framework because of the dynamic keys in the values subdocument. The ideal schema that would favour the aggregation framework would have the values field be an array which contains embedded key/value documents like this:

/* 0 */
{
    "_id" : ObjectId("5559d66c9bbec0dd0344e4b0"),
    "timestamp" : "2015-05-16T18:12:00.000Z",
    "values" : [ 
        {
            "k" : "0",
            "v" : 26.17
        }, 
        {
            "k" : "1",
            "v" : 26.17
        }, 
        {
            "k" : "2",
            "v" : 26.17
        },
        ...         
        {
            "k" : "58",
            "v" : 24.71
        }, 
        {
            "k" : "59",
            "v" : 25.20
        }
    ]
}

With MongoDB 3.6 and newer, use the aggregation framework to transform the hashmap into an array by using the $objectToArray operator, then use $avg to calculate the average.

Consider running the following aggregate pipeline:

// Convert the dynamic-key "values" subdocument into an array of
// { k, v } documents so later stages can aggregate over it.
db.test.aggregate([
    { "$addFields": { "values": { "$objectToArray": "$values" } } }
])

Armed with this new schema, you would then need to update your collection to change the string values to int by iterating the cursor returned from the aggregate method and using bulkWrite as follows:

// Stream every document with "values" already converted to [{ k, v }]
// pairs, rewrite each pair as { key, value } with a numeric value, and
// flush the updates to the server in batches of 1000 via bulkWrite.
var pendingOps = [];
var aggCursor = db.test.aggregate([
    { "$addFields": { "values": { "$objectToArray": "$values" } } }
]);

aggCursor.forEach(doc => {
    // Build fresh objects instead of mutating the aggregation output.
    const converted = doc.values.map(({ k, v }) => ({
        key: k,
        value: parseFloat(v) || 0
    }));

    pendingOps.push({
        "updateOne": {
            "filter": { _id: doc._id },
            "update": { "$set": { values: converted } },
            "upsert": true
        }
    });

    // Flush a full batch so memory stays bounded on large collections.
    if (pendingOps.length === 1000) {
        db.test.bulkWrite(pendingOps);
        pendingOps = [];
    }
});

// Flush whatever is left in the final, partial batch.
if (pendingOps.length > 0) {
    db.test.bulkWrite(pendingOps);
}

If your MongoDB version does not support the $objectToArray operator in the aggregation framework, then to convert the current schema into the one above takes a bit of native JavaScript functions with the MongoDB find() cursor's forEach() function as follows (assuming you have a test collection):

// Same migration for servers without $objectToArray (pre-3.6): read each
// raw document, turn the "values" subdocument into an array of
// { key, value } pairs with numeric values, and batch the updates.
var pendingOps = [];

db.test.find().forEach(doc => {
    const converted = [];
    for (const second of Object.keys(doc.values)) {
        converted.push({
            key: second,
            value: parseFloat(doc.values[second]) || 0
        });
    }

    pendingOps.push({
        "updateOne": {
            "filter": { _id: doc._id },
            "update": { "$set": { values: converted } },
            "upsert": true
        }
    });

    // Send a full batch of 1000 ops, then start a new one.
    if (pendingOps.length === 1000) {
        db.test.bulkWrite(pendingOps);
        pendingOps = [];
    }
});

// Don't forget the trailing partial batch.
if (pendingOps.length > 0) {
    db.test.bulkWrite(pendingOps);
}

or

// Simplest (slowest) variant: rewrite and save each document one by one.
db.test.find().forEach(function (doc) {
    // Rebuild "values" as [{ key, value }] with numeric values.
    var converted = Object.keys(doc.values).map(function (second) {
        return {
            key: second,
            value: parseFloat(doc.values[second]) || 0
        };
    });
    doc.values = converted;
    db.test.save(doc);
});

The collection will now have the above schema and thus follows the aggregation pipeline that will give you the average value in one minute:

// "$fields" is not a valid aggregation pipeline stage; "$addFields" is
// the stage that appends the computed average to each (already converted)
// document.
db.test.aggregate([
    {
        "$addFields": {
            "average": { "$avg": "$values.value" }
        }
    }    
])

Or for MongoDB 3.0 and lower

// Pre-$addFields alternative: flatten the values array with $unwind,
// then regroup the seconds by timestamp while $avg computes the mean.
db.test.aggregate([
    { "$unwind": "$values" },
    {
        "$group": {
            "_id": "$timestamp",
            "average": { "$avg": "$values.value" }
        }
    }
])

For the above document, the output would be:

/* 0 */
{
    "result" : [ 
        {
            "_id" : "2015-05-16T18:12:00.000Z",
            "average" : 25.684
        }
    ],
    "ok" : 1
}

As for the other Map-Reduce option, the intuition behind the operation is you would use JavaScript to make the necessary transformations and calculate the final average. You would need to define three functions:

Map

When you tell Mongo to MapReduce, the function you provide as the map function will receive each document as the this parameter. The purpose of the map is to exercise whatever logic you need in JavaScript and then call emit 0 or more times to produce a reducible value.

var map = function () {
    // Inside the forEach callback below, `this` no longer refers to the
    // document being mapped, so `this.timestamp` there would be undefined
    // (or throw in strict mode). Capture the grouping key and the values
    // subdocument up front.
    var minute = this.timestamp;
    var obj = this.values;
    Object.keys(obj).forEach(function (key) {
        // Stored values are strings; parse before emitting.
        var val = parseFloat(obj[key]);
        // Emit one { count, qty } pair per second; reduce sums them later.
        emit(minute, { count: 1, qty: val });
    });
};

For each document you need to emit a key and a value. The key is the first parameter to the emit function and represents how you want to group the values (in this case you will be grouping by the timestamp). The second parameter to emit is the value, which in this case is a little object containing the count of documents (always 1) and total value of each individual value object key i.e. for each second within the minute.

Reduce

Next you need to define the reduce function, where Mongo will group the items you emit and pass them as an array to this reduce function. It's inside the reduce function where you want to do the aggregation calculations and reduce all the objects to a single object.

var reduce = function (key, values) {
    // Fold the partial { count, qty } pairs emitted for this key into a
    // single accumulator of the shape finalize expects ({ count, total }).
    return values.reduce(function (acc, value) {
        acc.count += value.count;
        acc.total += value.qty;
        return acc;
    }, { count: 0, total: 0 });
};

This reduce function returns a single result. It's important for the return value to have the same shape as the emitted values. It's also possible for MongoDB to call the reduce function multiple times for a given key and ask you to process a partial set of values, so if you need to perform some final calculation, you can also give MapReduce a finalize function.

Finalize

The finalize function is optional, but if you need to calculate something based on a fully reduced set of data, you'll want to use a finalize function. Mongo will call the finalize function after all the reduce calls for a set are complete. This would be the place to calculate the average of all the second values in a document/timestamp:

var finalize = function (key, reduced) {
    // All partial reductions are merged by now, so the per-minute mean
    // is simply total / count.
    reduced.average = reduced.total / reduced.count;
    return reduced;
};

Putting It Together

With the JavaScript in place, all that is left is to tell MongoDB to execute a MapReduce:

var map = function () {
    // `this` is the document only in the map function's own body; inside
    // the forEach callback it would be undefined (or the global object),
    // so capture the grouping key and values subdocument first.
    var minute = this.timestamp;
    var obj = this.values;
    Object.keys(obj).forEach(function (key) {
        // Stored values are strings; parse before emitting.
        var val = parseFloat(obj[key]);
        emit(minute, { count: 1, qty: val });
    });
};

var reduce = function (key, values) {
    // Merge the emitted partials into one { count, total } accumulator.
    return values.reduce(function (acc, value) {
        acc.count += value.count;
        acc.total += value.qty;
        return acc;
    }, { count: 0, total: 0 });
};

var finalize = function (key, reduced) {
    // Compute the mean once every partial reduction has been merged.
    reduced.average = reduced.total / reduced.count;
    return reduced;
};

// Execute the map-reduce job: reduced values are post-processed by
// finalize and the results merged into the "map_reduce_example" collection.
db.collection.mapReduce(
    map,
    reduce,
    {
        finalize: finalize,
        out: { merge: "map_reduce_example" }
    }
)

And when you query the output collection map_reduce_example, db.map_reduce_example.find(), you get the result:

/* 0 */
{
    "_id" : null,
    "value" : {
        "count" : 5,
        "total" : 128.42,
        "average" : 25.684
    }
}

References:

  1. A Simple MapReduce with MongoDB and C#
  2. MongoDB documentation on mapReduce


回答2:

This kind of data structure creates lots of conflicts and makes MongoDB operations difficult to handle. In this case you should change your schema design. But if you are not able to change the schema, then follow this:


Your schema has two major problems: 1) the keys are dynamic, and 2) the values of the given keys are strings, so you should use some programming code to calculate the average. Check the scripts below.

From this reference, first calculate the size of the values object:

// Count an object's own enumerable keys (here, the number of seconds
// stored in "values"). NOTE(review): extending the built-in Object is an
// anti-pattern; it is kept only because the script below calls Object.size.
Object.size = function (obj) {
    return Object.keys(obj).length;
};

// For each document, sum the (string) second values and print the sum,
// the number of seconds, and their average.
db.collectionName.find().forEach(function (myDoc) {
    var objects = myDoc.values;
    var total = 0;
    // Number of seconds stored in this document.
    var size = Object.size(objects);
    for (var key in objects) {
        // Values are stored as strings, so parse before summing.
        total = total + parseFloat(objects[key]);
    }
    // Guard against an empty "values" subdocument, which would otherwise
    // produce NaN from 0 / 0.
    var avg = size > 0 ? total / size : 0;
    print(total);
    print(size);
    print(avg);
});