MongoDB merge related collection item count with o

I'm new to mongodb and trying figure out how to efficiently query on each item within a collection.

I have projects collection & tasks collections

//projects
{
   _id: ObjectId(),
   name: String
}
//tasks
{
   _id: ObjectId(),
   projectId: ObjectId(), //reference project id
   completed: Bool
}

I would like to get all projects and then count of completed and incomplete tasks of each project

db.projects.find({})...
//perhaps something similar in output
[
 {
   _id: ObjectId(), //projectId
   name: String
   completed: Number,
   incomplete: Number
 }
]

I'm using mongoose as ORM. I don't know if this is possible in mongoose or even native mongodb query. Appreciate any help. Thanks!

回答1:

Whatever way you look at this, as long as you have a normalized relationship like this then you would need two queries to get a result containing details from the "tasks" collection and filling out with details from the "projects" collection. MongoDB does not use joins in any way, and mongoose is no different. Mongoose does offer .populate(), but that is only convenience magic for what is essentially running another query and merging the results on the referenced field value.

So this is one case where maybe you may ultimately consider embedding the project information in the task. Of course there will be duplication, but it makes the query patterns much more simple with a singular collection.

Keeping the collections separated with a referenced model you basically then have two approaches. But firstly you can use aggregate in order to get results more along your actual requirements:

      Task.aggregate(
        [
          { "$group": {
            "_id": "$projectId",
            "completed": {
              "$sum": {
                "$cond": [ "$completed", 1, 0 ]
              }
            },
            "incomplete": {
              "$sum": {
                "$cond": [ "$completed", 0, 1 ]
              }
            }
          }}
        ],
        function(err,results) {

        }
    );

This merely uses a $group pipeline in order to accumulate on the values of "projectid" witin the "tasks" collection. In order to count the values for "completed" and "incomplete" we use the $cond operator which is a ternary to decide which value to pass to $sum. Since the first or "if" condition here is a boolean evaluation, then the existing boolean "complete" field will do, passing on where true to "then" or "else" passing the third argument.

Those results are okay but they do not contain any information from the "project" collection for the collected "_id" values. One approach to making the output look this way is to call the model form of .populate() from within the aggregation results callback on the returned "results" object:

    Project.populate(results,{ "path": "_id" },callback);

In this form the .populate() call takes an object or array of data as it's first argument, with the second being an options document for the population, where the mandatory field here is for "path". This will process any items and "populate" from the model that was called inserting those objects into the results data in the callback.

As a complete example listing:

var async = require('async'),
    mongoose = require('mongoose'),
    Schema = mongoose.Schema;

var projectSchema = new Schema({
  "name": String
});

var taskSchema = new Schema({
  "projectId": { "type": Schema.Types.ObjectId, "ref": "Project" },
  "completed": { "type": Boolean, "default": false }
});

var Project = mongoose.model( "Project", projectSchema );
var Task = mongoose.model( "Task", taskSchema );

mongoose.connect('mongodb://localhost/test');

async.waterfall(
  [
    function(callback) {
      async.each([Project,Task],function(model,callback) {
        model.remove({},callback);
      },
      function(err) {
        callback(err);
      });
    },

    function(callback) {
      Project.create({ "name": "Project1" },callback);
    },

    function(project,callback) {
      Project.create({ "name": "Project2" },callback);
    },

    function(project,callback) {
      Task.create({ "projectId": project },callback);
    },

    function(task,callback) {
      Task.aggregate(
        [
          { "$group": {
            "_id": "$projectId",
            "completed": {
              "$sum": {
                "$cond": [ "$completed", 1, 0 ]
              }
            },
            "incomplete": {
              "$sum": {
                "$cond": [ "$completed", 0, 1 ]
              }
            }
          }}
        ],
        function(err,results) {
          if (err) callback(err);
          Project.populate(results,{ "path": "_id" },callback);
        }
      );
    }
  ],
  function(err,results) {
    if (err) throw err;
    console.log( JSON.stringify( results, undefined, 4 ));
    process.exit();
  }
);

And this will gives results like so:

[
    {
        "_id": {
            "_id": "54beef3178ef08ca249b98ef",
            "name": "Project2",
            "__v": 0
        },
        "completed": 0,
        "incomplete": 1
    }
]

So .populate() works well for this sort of aggregation result, even as effectively another query, and should generally be suitable for most purposes. There was however a specific example included in the listing where there are "two" projects created but of course only "one" task referencing just one of the projects.

Since aggregation is working on the "tasks" collection, it has no knowledge whatsoever of any "project" that is not referenced there. In order to get a complete list of "projects" with the calculated totals you need to be more specific in running two queries and "merging" the results.

This is basically a "hash merge" on distinct keys and data, however nice helper for this is a module called nedb, which allows you to apply the logic in a manner more consistent with MongoDB queries and operations.

Basically you want a copy of the data from the "projects" collection with augmented fields, then you want to "merge" or .update() that information with the aggregation results. Again as a complete listing to demonstrate:

var async = require('async'),
    mongoose = require('mongoose'),
    Schema = mongoose.Schema,
    DataStore = require('nedb'),
    db = new DataStore();


var projectSchema = new Schema({
  "name": String
});

var taskSchema = new Schema({
  "projectId": { "type": Schema.Types.ObjectId, "ref": "Project" },
  "completed": { "type": Boolean, "default": false }
});

var Project = mongoose.model( "Project", projectSchema );
var Task = mongoose.model( "Task", taskSchema );

mongoose.connect('mongodb://localhost/test');

async.waterfall(
  [
    function(callback) {
      async.each([Project,Task],function(model,callback) {
        model.remove({},callback);
      },
      function(err) {
        callback(err);
      });
    },

    function(callback) {
      Project.create({ "name": "Project1" },callback);
    },

    function(project,callback) {
      Project.create({ "name": "Project2" },callback);
    },

    function(project,callback) {
      Task.create({ "projectId": project },callback);
    },

    function(task,callback) {
      async.series(
        [

          function(callback) {
            Project.find({},function(err,projects) {
              async.eachLimit(projects,10,function(project,callback) {
                db.insert({
                  "projectId": project._id.toString(),
                  "name": project.name,
                  "completed": 0,
                  "incomplete": 0
                },callback);
              },callback);
            });
          },

          function(callback) {
            Task.aggregate(
              [
                { "$group": {
                  "_id": "$projectId",
                  "completed": {
                    "$sum": {
                      "$cond": [ "$completed", 1, 0 ]
                    }
                  },
                  "incomplete": {
                    "$sum": {
                      "$cond": [ "$completed", 0, 1 ]
                    }
                  }
                }}
              ],
              function(err,results) {
                async.eachLimit(results,10,function(result,callback) {
                  db.update(
                    { "projectId": result._id.toString() },
                    { "$set": {
                        "complete": result.complete,
                        "incomplete": result.incomplete
                      }
                    },
                    callback
                  );
                },callback);
              }
            );
          },

        ],

        function(err) {
          if (err) callback(err);
          db.find({},{ "_id": 0 },callback);
        }
      );
    }
  ],
  function(err,results) {
    if (err) throw err;
    console.log( JSON.stringify( results, undefined, 4 ));
    process.exit();
  }

And the results here:

[
    {
        "projectId": "54beef4c23d4e4e0246379db",
        "name": "Project2",
        "completed": 0,
        "incomplete": 1
    },
    {
        "projectId": "54beef4c23d4e4e0246379da",
        "name": "Project1",
        "completed": 0,
        "incomplete": 0
    }
]

That lists data from every "project" and includes the calculated values from the "tasks" collection that is related to it.

So there are a few approaches you can do. Again, you might ultimately be best off just embedding "tasks" into the "project" items instead, which would again be a simple aggregation approach. And if you are going to embed the task information, then you may as well maintain counters for "complete" and "incomplete" on the "project" object and simply update these as items are marked completed in the tasks array with the $inc operator.

var taskSchema = new Schema({
  "completed": { "type": Boolean, "default": false }
});

var projectSchema = new Schema({
  "name": String,
  "completed": { "type": Number, "default": 0 },
  "incomplete": { "type": Number, "default": 0 }
  "tasks": [taskSchema]
});

var Project = mongoose.model( "Project", projectSchema );
// cheat for a model object with no collection
var Task = mongoose.model( "Task", taskSchema, undefined );

// Then in later code

// Adding a task
var task = new Task();
Project.update(
    { "task._id": { "$ne": task._id } },
    { 
        "$push": { "tasks": task },
        "$inc": {
            "completed": ( task.completed ) ? 1 : 0,
            "incomplete": ( !task.completed ) ? 1 : 0;
        }
    },
    callback
 );

// Removing a task
Project.update(
    { "task._id": task._id },
    { 
        "$pull": { "tasks": { "_id": task._id } },
        "$inc": {
            "completed": ( task.completed ) ? -1 : 0,
            "incomplete": ( !task.completed ) ? -1 : 0;
        }
    },
    callback
 );


 // Marking complete
Project.update(
    { "tasks": { "$elemMatch": { "_id": task._id, "completed": false } }},
    { 
        "$set": { "tasks.$.completed": true },
        "$inc": {
            "completed": 1,
            "incomplete": -1
        }
    },
    callback
);

You have to know the current task status though for the counter updates to work correctly, but this is easy to code for and you probably should have at least those details in an object passing into your methods.

Personally I would re-model to the latter form and do that. You can do query "merging" as has been shown in two examples here, but it does of course come at a cost.

回答2:

When you need to group or count things in MongoDB then you often need to use the aggregation framework. Here is how to count your data in the shell:

db.tasks.aggregate([ {$group: {
  _id: {projectID: "$projectID", completed: "$completed"},
  count: {$sum: 1}
  }});

This will return two documents for each task in a project - one with a count of the completed tasks and one with those yet to be completed.

I have never used Mongoose, but now you have something to start from :)