calculate frequency using mongodb aggregate framew

2019-05-20 22:44发布

问题:

I'm trying to calculate frequency of documents in my db based on 10 seconds intervals.

this is how my database objects look like:

[
  {
     created_at: "2014-03-31T22:30:48.000Z",
     id: 450762158586880000,
     _id: "5339ec9808eb125965f2eae1"
  },
  {
     created_at: "2014-03-31T22:30:48.000Z",
     id: 450762160407597060,
     _id: "5339ec9808eb125965f2eae2"
  },
  {
     created_at: "2014-03-31T22:30:49.000Z",
     id: 450762163482017800,
     _id: "5339ec9908eb125965f2eae3"
  },
  {
     created_at: "2014-03-31T22:30:49.000Z",
     id: 450762166367707140,
     _id: "5339ec9908eb125965f2eae4"
  },
  {
     created_at: "2014-03-31T22:30:50.000Z",
     id: 450762167412064260,
     _id: "5339ec9a08eb125965f2eae5"
  }
]

I have managed to display the frequency in the given interval, but I would like to get that for every 10 seconds. So preferably my JSON would look like:

[
  {
     time_from: "2014-03-31T22:30:48.000Z",
     time_to: "2014-03-31T22:30:58.000Z",
     count: 6
  },
  {
     time_from: "2014-03-31T22:30:58.000Z",
     time_to: "2014-03-31T22:31:08.000Z",
     count: 3
  },
  {
     time_from: "2014-03-31T22:31:08.000Z",
     time_to: "2014-03-31T22:31:18.000Z",
     count: 10
  },
  {
     time_from: "2014-03-31T22:31:18.000Z",
     time_to: "2014-03-31T22:31:28.000Z",
     count: 1
  },
  {
     time_from: "2014-03-31T22:31:28.000Z",
     time_to: "2014-03-31T22:31:38.000Z",
     count: 3
  }
]

this is what I have done so far:

exports.findAll = function (req, res) {
    db.collection(collection_name, function (err, collection) {
        collection.find().toArray(function (err, items) {
            collection.find().sort({"_id": 1}).limit(1).toArray(function (err, doc) {
                var interval = 100000; // in milliseconds
                var startTime = doc[0].created_at;
                var endTime = new Date(+startTime + interval);

                collection.aggregate([
                    {$match: {"created_at": {$gte: startTime, $lt: endTime}}},
                    {$group: {"_id": 1, "count":{$sum: 1}}}
                ], function(err, result){
                    console.log(result);
                    res.send(result);
                });
            });
        })
    });
};

and this is the result of that:

[
  {
     _id: 1,
     count: 247
  }
]

EDIT:

collection.aggregate([
                    { $group: {
                        _id: {
                            year: { '$year': '$created_at'},
                            month: {'$month': '$created_at'},
                            day: {'$dayOfMonth': '$created_at'},
                            hour: {'$hour': '$created_at'},
                            minute: {'$minute': '$created_at'},
                            second: {'$second': '$created_at'}
                        },
                        count: { $sum : 1 }
                    } }
                ], function (err, result) {
                    console.log(result);
                    res.send(result);
                });

which results in:

[
  {
     _id: {
        year: 2014,
        month: 3,
        day: 31,
        hour: 22,
        minute: 37,
        second: 10
     },
     count: 6
  }, ...

new progress, now how would I display it in 10 seconds interval?

回答1:

If it just about getting things within 10 second intervals, you can do a little math and run this through aggregate:

db.collection.aggregate([
    { "$group": {
        "_id": {
             "year": { "$year": "$created_at" },
             "month":{ "$month": "$created_at" },
             "day": { "$dayOfMonth": "$created_at" },
             "hour": { "$hour": "$created_at" },
             "minute": { "$minute": "$created_at" },
             "second": { "$subtract": [
                 { "$second": "$created_at" },
                 { "$mod": [
                     { "$second": "$created_at" },
                     10
                 ]}
             ]}
        },
        "count": { "$sum" : 1 }
    }}
])

So that breaks things down to the intervals of 10 seconds in a minute where they occur with a little mod 10 math.

I think that is reasonable, and would be the fastest runner since it uses aggregate. If you really need your sequence as shown to be a running 10 seconds from an initially matched time, then you can do the process with mapReduce:

First a mapper:

var mapper = function () {

    if ( this.created_at.getTime() > ( last_date + 10000 ) ) {
        if ( last_date == 0 ) {
            last_date = this.created_at.getTime();
        } else {
            last_date += 10000;
        }
    }

    emit(
        {
            start: new Date( last_date ),
            end: new Date( last_date + 10000 )
        },
        this.created_at
    );

}

So this is going to emit dates within a 10 second interval, starting with the first date and then increasing the interval each time something is found out of range

Now you need a reducer:

var reducer = function (key, values) {
    return values.length;
};

Very simple. Just return the length of the array passed in.

Because mapReduce works the way it does, anything that did not have more than one value is not passed to the reducer, so clean this up with finalize:

var finalize = function (key, value) {
    if ( typeof(value) == "object" ) {
        value = 1;
    }
    return value;
};

Then just run it to get the results. Note the "scope" section that passes a global variable to be used in the mapper:

db.collection.mapReduce(
    mapper,
    reducer,
    { 
        "out": { "inline": 1 }, 
        "scope": { "last_date": 0 }, 
        "finalize": finalize 
    }
)

Each approach is likely to give slightly different results, but that is the point. It depends on which one you actually want to use.


Considering your comment you could either "inspect" the output from either statement and "fill in the gaps" programatically as it were. I do generally prefer that option, but It's not my program and I do not know how large a series you are trying to retrieve from this query.

On the server side, you can patch up the "mapper" to do something such as this:

var mapper = function () {

    if ( this.created_at.getTime() > ( last_date + 10000 ) ) {

        if ( last_date == 0 ) {
            last_date = this.created_at.getTime();
        } else {
            // Patching for empty blocks
            var times = Math.floor( 
                 ( this.created_at.getTime() - last_date ) / 10000
            );

            if ( times > 1 ) {
                for ( var i=1; i < times; i++ ) {
                    last_date += 10000;
                    emit(
                        {
                            start: new Date( last_date ),
                            end: new Date( last_date + 10000 )
                        },
                        0
                    );
                }
            }
            // End patch
            last_date += 10000;
        }
    }

    emit(
        {
            start: new Date( last_date ),
            end: new Date( last_date + 10000 )
        },
        this.created_at
    );

}