I have a collection filled with documents that look like this:
{
data: 11,
version: "0.0.32"
}
and some have a test
suffix to version
:
{
data: 55,
version: "0.0.42-test"
}
The version
field has different values but it always conforms to the pattern: 0.0.XXX
. I would like to update all the documents to look like this:
{
data: 11,
version: 32
}
and the suffixed version (for test documents - version
should be negative):
{
data: 55,
version: -42
}
The collection with these documents is used by our critical system, that needs to be turned off while updating the data - so I want the update/change to be as fast as possible. There are about 66_000_000
documents in this collection, and it's about 100GB in size.
Which type of mongodb operation would be the most efficient one?
The most efficient way to do this is in the upcoming release of MongoDB as of this writing using the $split
operator to split our string as shown here then assign the last element in the array to a variable using the $let
variable operator and the $arrayElemAt
operators.
Next, we use the $switch
operator to perform a logical condition processing or case statement against that variable.
The condition here is $gt
which returns true if the value contains "test"
, and in which case in the in expression we split that string and simply return the $concat
enated value of the first element in the newly computed array and the -
. If the condition evaluates to false, we just return the variable.
Of course in our case statement, we use the $indexOfCP
which returns -1
if there were no occurrences of "test"
.
let cursor = db.collection.aggregate(
[
{ "$project": {
"data": 1,
"version": {
"$let": {
"vars": {
"v": {
"$arrayElemAt": [
{ "$split": [ "$version", "." ] },
-1
]
}
},
"in": {
"$switch": {
"branches": [
{
"case": {
"$gt": [
{ "$indexOfCP": [ "$$v", "test" ] },
-1
]
},
"then": {
"$concat": [
"-",
"",
{ "$arrayElemAt": [
{ "$split": [ "$$v", "-" ] },
0
]}
]
}
}
],
"default": "$$v"
}
}
}
}
}}
]
)
The aggregation query produces something like this:
{ "_id" : ObjectId("57a98773cbbd42a2156260d8"), "data" : 11, "version" : "32" }
{ "_id" : ObjectId("57a98773cbbd42a2156260d9"), "data" : 55, "version" : "-42" }
As you can see, the "version" field data are string. If the data type for that field does not matter then, you can simply use the $out
aggregation pipeline stage operator to write the result into a new collection or replace your collection.
{ "out": "collection" }
If you need to convert your data to floating point number then, the only way to do this, simply because MongoDB doesn't not provides a way to do type conversion out of the box except for integer to string, is to iterate the aggregation Cursor object and convert your value using parseFloat
or Number
then update your documents using the $set
operator and the bulkWrite()
method for maximum efficiency.
let requests = [];
cursor.forEach(doc => {
requests.push({
"updateOne": {
"filter": { "_id": doc._id },
"update": {
"$set": {
"data": doc.data,
"version": parseFloat(doc.version)
},
"$unset": { "person": " " }
}
}
});
if ( requests.length === 1000 ) {
// Execute per 1000 ops and re-init
db.collection.bulkWrite(requests);
requests = [];
}}
);
// Clean up queues
if(requests.length > 0) {
db.coll.bulkWrite(requests);
}
While the aggregation query will perfectly work in MongoDB 3.4 or newer our best bet from MongoDB 3.2 backwards is mapReduce
with the bulkWrite()
method.
var results = db.collection.mapReduce(
function() {
var v = this.version.split(".")[2];
emit(this._id, v.indexOf("-") > -1 ? "-"+v.replace(/\D+/g, '') : v)
},
function(key, value) {},
{ "out": { "inline": 1 } }
)["results"];
results
looks like this:
[
{
"_id" : ObjectId("57a98773cbbd42a2156260d8"),
"value" : "32"
},
{
"_id" : ObjectId("57a98773cbbd42a2156260d9"),
"value" : "-42"
}
]
From here you use the previous .forEach
loop to update your documents.
From MongoDB 2.6 to 3.0 you will need to use the now deprecated Bulk()
API and it associated method as show in my answer here.