Remove all fields that are null

2020-06-16 03:17发布

问题:

How can I remove all fields that are null from all documents of a given collection?


I have a collection of documents such as:

{
    'property1': 'value1',
    'property2': 'value2',
    ...
}

but each document may have a null entry instead of a value entry.

I would like to save disk space by removing all null entries. The existence of the null entries does not contain any information in my case because I know the format of the JSON document a priori.

回答1:

// run in mongo shell  

var coll = db.getCollection("collectionName");
var cursor = coll.find();
while (cursor.hasNext()) {
    var doc = cursor.next();
    var keys = {};
    var hasNull = false;
    for ( var x in doc) {
        if (x != "_id" && doc[x] == null) {
            keys[x] = 1;
            hasNull = true;
        }
    }
    if (hasNull) {
        coll.update({_id: doc._id}, {$unset:keys});
    }
}


回答2:

Starting Mongo 4.2, db.collection.update() can accept an aggregation pipeline, finally allowing the removal of a field based on its value:

// { _id: ObjectId("5d0e8...d2"), property1: "value1", property2: "value2" }
// { _id: ObjectId("5d0e8...d3"), property1: "value1", property2: null, property3: "value3" }
db.collection.update(
  {},
  [{ $replaceWith: {
    $arrayToObject: {
      $filter: {
        input: { $objectToArray: "$$ROOT" },
        as: "item",
        cond: { $ne: ["$$item.v", null] }
      }
    }
  }}],
  { multi: true }
)
// { _id: ObjectId("5d0e8...d2"), property1: "value1", property2: "value2" }
// { _id: ObjectId("5d0e8...d3"), property1: "value1", property3: "value3" }

In details:

  • The first part {} is the match query, filtering which documents to update (in our case all documents).

  • The second part [{ $replaceWith: { ... }] is the update aggregation pipeline (note the squared brackets signifying the use of an aggregation pipeline):

    • With $objectToArray, we first transform the document to an array of key/values such as [{ k: "property1", v: "value1" }, { k: "property2", v: null }, ...].
    • With $filter, we filter this array of key/values by removing items for which v is null.
    • We then transform back the filtered array of key/values to an object using $arrayToObject.
    • Finally, we replace the whole document by the modified one with $replaceWith.
  • Don't forget { multi: true }, otherwise only the first matching document will be updated.



回答3:

Like this question mentioned (mongodb query without field name):

Unfortunately, MongoDB does not support any method of querying all fields with a particular value.

So, you can either iterate the document (like Wizard's example) or do it in non-mongodb way.

If this is a JSON file, remove all the lines with null in sed might works:

sed '/null/d' ./mydata.json


回答4:

You can use the mongo updateMany functionality, but you must do this by specifying the parameter you are going to update, such as the year parameter:

db.collection.updateMany({year: null}, { $unset : { year : 1 }})


回答5:

This is an important question since mongodb cannot index null values (i.e. do not query for nulls or you will be waiting for a long time), so it is best to entirely avoid nulls and set default values using setOnInsert.

Here is a recursive solution to removing nulls:

/**
 * RETRIEVES A LIST OF ALL THE KEYS IN A DOCUMENT, WHERE THE VALUE IS 'NULL' OR 'UNDEFINED'
 *
 * @param doc
 * @param keyName
 * @param nullKeys
 */
function getNullKeysRecursively(doc, keyName, nullKeys)
{
    for (var item_property in doc)
    {
        // SKIP BASE-CLASS STUFF
        if (!doc.hasOwnProperty(item_property))
            continue;
        // SKIP ID FIELD
        if (item_property === "_id")
            continue;

        // FULL KEY NAME (FOR SUB-DOCUMENTS)
        var fullKeyName;
        if (keyName)
            fullKeyName = keyName + "." + item_property;
        else
            fullKeyName = item_property;

        // DEBUGGING
        // print("fullKeyName: " + fullKeyName);

        // NULL FIELDS - MODIFY THIS BLOCK TO ADD CONSTRAINTS
        if (doc[item_property] === null || doc[item_property] === undefined)
            nullKeys[fullKeyName] = 1;

        // RECURSE OBJECTS / ARRAYS
        else if (doc[item_property] instanceof Object || doc[item_property] instanceof Array)
            getNullKeysRecursively(doc[item_property], fullKeyName, nullKeys);
    }
}

/**
 * REMOVES ALL PROPERTIES WITH A VALUE OF 'NULL' OR 'UNDEFINED'.
 * TUNE THE 'LIMIT' VARIABLE TO YOUR MEMORY AVAILABILITY.
 * ONLY CLEANS DOCUMENTS THAT REQUIRE CLEANING, FOR EFFICIENCY.
 * USES bulkWrite FOR EFFICIENCY.
 *
 * @param collectionName
 */
function removeNulls(collectionName)
{
    var coll = db.getCollection(collectionName);
    var lastId = ObjectId("000000000000000000000000");
    var LIMIT = 10000;
    while (true)
    {
        // GET THE NEXT PAGE OF DOCUMENTS
        var page = coll.find({ _id: { $gt: lastId } }).limit(LIMIT);
        if (! page.hasNext())
            break;

        // BUILD BULK OPERATION
        var arrBulkOps = [];
        page.forEach(function(item_doc)
        {
            lastId = item_doc._id;

            var nullKeys = {};
            getNullKeysRecursively(item_doc, null, nullKeys);

            // ONLY UPDATE MODIFIED DOCUMENTS
            if (Object.keys(nullKeys).length > 0)
            // UNSET INDIVIDUAL FIELDS, RATHER THAN REWRITE THE ENTIRE DOC
                arrBulkOps.push(
                    { updateOne: {
                            "filter": { _id: item_doc._id },
                            "update": { $unset: nullKeys }
                        } }
                );
        });

        coll.bulkWrite(arrBulkOps, { ordered: false } );
    }
}

// GO GO GO
removeNulls('my_collection');

document before:

{
    "_id": ObjectId("5a53ed8f6f7c4d95579cb87c"),
    "first_name": null,
    "last_name": "smith",
    "features": {
        "first": {
            "a": 1,
            "b": 2,
            "c": null
        },
        "second": null,
        "third" : {},
        "fourth" : []
    },
    "other": [ 
        null, 
        123, 
        {
            "a": 1,
            "b": "hey",
            "c": null
        }
    ]
}

document after:

{
    "_id" : ObjectId("5a53ed8f6f7c4d95579cb87c"),
    "last_name" : "smith",
    "features" : {
        "first" : {
            "a" : 1,
            "b" : 2
        }
    },
    "other" : [ 
        null, 
        123, 
        {
            "a" : 1,
            "b" : "hey"
        }
    ]
}

As you can see, it removes null, undefined, empty objects and empty arrays. If you need it to be more/less aggressive, it is a matter of modifying the block "NULL FIELDS - MODIFY THIS BLOCK TO ADD CONSTRAINTS".

edits welcome, especially @stennie



标签: mongodb null