How can I remove all fields that are null
from all documents of a given collection?
I have a collection of documents such as:
{
'property1': 'value1',
'property2': 'value2',
...
}
but each document may have a null
entry instead of a value entry.
I would like to save disk space by removing all null
entries. The existence of the null
entries does not contain any information in my case because I know the format of the JSON document a priori.
// run in mongo shell
var coll = db.getCollection("collectionName");
var cursor = coll.find();
while (cursor.hasNext()) {
var doc = cursor.next();
var keys = {};
var hasNull = false;
for ( var x in doc) {
if (x != "_id" && doc[x] == null) {
keys[x] = 1;
hasNull = true;
}
}
if (hasNull) {
coll.update({_id: doc._id}, {$unset:keys});
}
}
Starting Mongo 4.2
, db.collection.update()
can accept an aggregation pipeline, finally allowing the removal of a field based on its value:
// { _id: ObjectId("5d0e8...d2"), property1: "value1", property2: "value2" }
// { _id: ObjectId("5d0e8...d3"), property1: "value1", property2: null, property3: "value3" }
db.collection.update(
{},
[{ $replaceWith: {
$arrayToObject: {
$filter: {
input: { $objectToArray: "$$ROOT" },
as: "item",
cond: { $ne: ["$$item.v", null] }
}
}
}}],
{ multi: true }
)
// { _id: ObjectId("5d0e8...d2"), property1: "value1", property2: "value2" }
// { _id: ObjectId("5d0e8...d3"), property1: "value1", property3: "value3" }
In details:
The first part {}
is the match query, filtering which documents to update (in our case all documents).
The second part [{ $replaceWith: { ... }]
is the update aggregation pipeline (note the squared brackets signifying the use of an aggregation pipeline):
- With
$objectToArray
, we first transform the document to an array of key/values such as [{ k: "property1", v: "value1" }, { k: "property2", v: null }, ...]
.
- With
$filter
, we filter this array of key/values by removing items for which v
is null
.
- We then transform back the filtered array of key/values to an object using
$arrayToObject
.
- Finally, we replace the whole document by the modified one with
$replaceWith
.
Don't forget { multi: true }
, otherwise only the first matching document will be updated.
Like this question mentioned (mongodb query without field name):
Unfortunately, MongoDB does not support any method of querying all fields with a particular value.
So, you can either iterate the document (like Wizard's example) or do it in non-mongodb way.
If this is a JSON file, remove all the lines with null
in sed might works:
sed '/null/d' ./mydata.json
You can use the mongo updateMany functionality, but you must do this by specifying the parameter you are going to update, such as the year
parameter:
db.collection.updateMany({year: null}, { $unset : { year : 1 }})
This is an important question since mongodb cannot index null values (i.e. do not query for nulls or you will be waiting for a long time), so it is best to entirely avoid nulls and set default values using setOnInsert
.
Here is a recursive solution to removing nulls:
/**
* RETRIEVES A LIST OF ALL THE KEYS IN A DOCUMENT, WHERE THE VALUE IS 'NULL' OR 'UNDEFINED'
*
* @param doc
* @param keyName
* @param nullKeys
*/
function getNullKeysRecursively(doc, keyName, nullKeys)
{
for (var item_property in doc)
{
// SKIP BASE-CLASS STUFF
if (!doc.hasOwnProperty(item_property))
continue;
// SKIP ID FIELD
if (item_property === "_id")
continue;
// FULL KEY NAME (FOR SUB-DOCUMENTS)
var fullKeyName;
if (keyName)
fullKeyName = keyName + "." + item_property;
else
fullKeyName = item_property;
// DEBUGGING
// print("fullKeyName: " + fullKeyName);
// NULL FIELDS - MODIFY THIS BLOCK TO ADD CONSTRAINTS
if (doc[item_property] === null || doc[item_property] === undefined)
nullKeys[fullKeyName] = 1;
// RECURSE OBJECTS / ARRAYS
else if (doc[item_property] instanceof Object || doc[item_property] instanceof Array)
getNullKeysRecursively(doc[item_property], fullKeyName, nullKeys);
}
}
/**
* REMOVES ALL PROPERTIES WITH A VALUE OF 'NULL' OR 'UNDEFINED'.
* TUNE THE 'LIMIT' VARIABLE TO YOUR MEMORY AVAILABILITY.
* ONLY CLEANS DOCUMENTS THAT REQUIRE CLEANING, FOR EFFICIENCY.
* USES bulkWrite FOR EFFICIENCY.
*
* @param collectionName
*/
function removeNulls(collectionName)
{
var coll = db.getCollection(collectionName);
var lastId = ObjectId("000000000000000000000000");
var LIMIT = 10000;
while (true)
{
// GET THE NEXT PAGE OF DOCUMENTS
var page = coll.find({ _id: { $gt: lastId } }).limit(LIMIT);
if (! page.hasNext())
break;
// BUILD BULK OPERATION
var arrBulkOps = [];
page.forEach(function(item_doc)
{
lastId = item_doc._id;
var nullKeys = {};
getNullKeysRecursively(item_doc, null, nullKeys);
// ONLY UPDATE MODIFIED DOCUMENTS
if (Object.keys(nullKeys).length > 0)
// UNSET INDIVIDUAL FIELDS, RATHER THAN REWRITE THE ENTIRE DOC
arrBulkOps.push(
{ updateOne: {
"filter": { _id: item_doc._id },
"update": { $unset: nullKeys }
} }
);
});
coll.bulkWrite(arrBulkOps, { ordered: false } );
}
}
// GO GO GO
removeNulls('my_collection');
document before:
{
"_id": ObjectId("5a53ed8f6f7c4d95579cb87c"),
"first_name": null,
"last_name": "smith",
"features": {
"first": {
"a": 1,
"b": 2,
"c": null
},
"second": null,
"third" : {},
"fourth" : []
},
"other": [
null,
123,
{
"a": 1,
"b": "hey",
"c": null
}
]
}
document after:
{
"_id" : ObjectId("5a53ed8f6f7c4d95579cb87c"),
"last_name" : "smith",
"features" : {
"first" : {
"a" : 1,
"b" : 2
}
},
"other" : [
null,
123,
{
"a" : 1,
"b" : "hey"
}
]
}
As you can see, it removes null
, undefined
, empty objects and empty arrays. If you need it to be more/less aggressive, it is a matter of modifying the block "NULL FIELDS - MODIFY THIS BLOCK TO ADD CONSTRAINTS".
edits welcome, especially @stennie