I have a function that runs periodically that updates the item.price of some documents in my Prices collection. The Prices collection has 100k+ items. The function looks like this:
// Just a helper function for multiple GET requests with request.
const request = require('request');
let _request = (urls, cb) => {
    let results = {}, i = urls.length, c = 0;
    const handler = (err, response, body) => {
        let url = response.request.uri.href;
        results[url] = { err, response, body };
        if (++c === urls.length) {
            cb(results);
        }
    };
    while (i--) {
        request(urls[i], handler);
    }
};
// function to update the prices in our Prices collection.
const update = (cb) => {
    Price.remove({}, (err, remove) => {
        if (err) {
            return logger.error(`Error removing items...`);
        }
        logger.info(`Removed all items... Beginning to update.`);
        _request(urls, (responses) => {
            let url, response, id;
            for (url in responses) {
                id = url.split('/')[5].split('?')[0];
                response = responses[url];
                if (response.err) {
                    logger.error(`Error in request to ${url}: ${response.err}`);
                    return;
                }
                if (response.body) {
                    logger.info(`Request to ${url} successful.`);
                    let jsonResult = {};
                    try {
                        jsonResult = JSON.parse(response.body);
                    } catch (e) {
                        logger.error(`Could not parse.`);
                    }
                    logger.info(`Response body for ${id} is ${Object.keys(jsonResult).length}.`);
                    let allItemsArray = Object.keys(jsonResult).map((key, index) => {
                        return {
                            itemid: id,
                            hash_name: key,
                            price: jsonResult[key]
                        };
                    });
                    Price.insertMany(allItemsArray).then(docs => {
                        logger.info(`Saved docs for ${id}`);
                    }, (e) => {
                        logger.error(`Error saving docs.`);
                    });
                }
            }
            if (cb && typeof cb === 'function') {
                cb();
            }
        });
    });
};
As you can see, to avoid iterating through 100k+ documents and updating each of them separately, I delete them all at the beginning, call the API that gives me these items with prices, and use insertMany to insert all of them into my Prices collection.
This updating process will happen every 30 minutes. But I just now realised: what if a user wants to check the prices while my Prices collection is empty because it's in the middle of updating itself?
The Question
So do I have to iterate through all of them in order to avoid deleting them? (Remember, there are MANY documents to be updated every 30 mins.) Or is there another solution?
Here's a picture of how my Prices collection looks (there are 100k docs like these; I just want to update the price property):
Update:
I have re-written my update function a bit and now it looks like this:
const update = (cb = null) => {
    Price.remove({}, (err, remove) => {
        if (err) {
            return logger.error(`Error removing items...`);
        }
        logger.info(`Removed all items... Beginning to update.`);
        _request(urls, (responses) => {
            let url, response, gameid;
            for (url in responses) {
                gameid = url.split('/')[5].split('?')[0];
                response = responses[url];
                if (response.err) {
                    logger.error(`Error in request to ${url}: ${response.err}`);
                    return;
                }
                if (response.body) {
                    logger.info(`Request to ${url} successful.`);
                    let jsonResult = {};
                    try {
                        jsonResult = JSON.parse(response.body);
                    } catch (e) {
                        logger.error(`Could not parse.`);
                    }
                    logger.info(`Response body for ${gameid} is ${Object.keys(jsonResult).length}.`);
                    let allItemsArray = Object.keys(jsonResult).map((key, index) => {
                        return {
                            game_id: gameid,
                            market_hash_name: key,
                            price: jsonResult[key]
                        };
                    });
                    let bulk = Price.collection.initializeUnorderedBulkOp();
                    allItemsArray.forEach(item => {
                        bulk.find({ market_hash_name: item.market_hash_name })
                            .upsert().updateOne(item);
                    });
                    bulk.execute((err, bulkers) => {
                        if (err) {
                            return logger.error(`Error bulking: ${err}`);
                        }
                        logger.info(`Updated Items for ${gameid}`);
                    });
                    // Price.insertMany(allItemsArray).then(docs => {
                    //     logger.info(`Saved docs for ${gameid}`);
                    // }, (e) => {
                    //     logger.error(`Error saving docs.`);
                    // });
                }
            }
            if (cb && typeof cb === 'function') {
                cb();
            }
        });
    });
};
Notice the bulk variable now (thanks @Rahul), but now the collection takes ages to update. My processor is burning up, and it literally takes 3+ minutes to update 60k+ documents. I honestly feel like the previous method, even though it deletes all of them and then reinserts them, is about 10x faster.
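One thing I am not sure about: if market_hash_name isn't indexed, every bulk.find({ market_hash_name: ... }) above has to scan the whole collection, which could explain the CPU burn. This is roughly the index I could add — a sketch only, since my real schema isn't shown here and the field types are guesses:
const mongoose = require('mongoose');
// Sketch of the Prices schema; game_id / market_hash_name / price come from the
// update code above, the types are assumptions.
const PriceSchema = new mongoose.Schema({
    game_id: String,
    market_hash_name: { type: String, index: true }, // the bulk upserts filter on this field
    price: mongoose.Schema.Types.Mixed // type guessed; adjust to the real schema
});
// Or a compound index if lookups also filter on game_id:
// PriceSchema.index({ game_id: 1, market_hash_name: 1 });
module.exports = mongoose.model('Price', PriceSchema);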
Anyone?
2 Comments:
- Instead of deleting and adding data every time, use MongoDB's bulk upsert; another thing: handle the data returned from _request in chunks of 100/1000. – Rahul Sharma, Feb 18, 2018 at 14:17
- I've updated my question thanks to your comment. Care to take a look now? – filipbarak, Feb 18, 2018 at 14:45
3 Answers
From my experience (updating millions of mongo docs on an hourly basis), here's a realistic approach to very large bulk updates:
- do all your API calls separately and write the results as BSON into a file
- invoke mongoimport and import that BSON file into a new, empty collection prices_new. JavaScript, let alone high-level OO wrappers, is just too slow for that
- rename prices_new -> prices with dropTarget=true (this will be atomic, hence no downtime)
Schematically, it would look like this in JS:
let fname = '/tmp/data.bson';
let apiUrls = [...];
async function doRequest(url) {
    // perform a request and return an array of records
}
let responses = await Promise.all(apiUrls.map(doRequest));
// if the data is too big to fit in memory, use streams instead of this:
let data = flatMap(responses, BSON.serialize).join('\n');
await fs.writeFile(fname, data);
await child_process.exec(`mongoimport --collection prices_new --drop ${fname}`);
await db.prices_new.renameCollection('prices', true);
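If that last rename runs from Node rather than the mongo shell, renameCollection is an admin command that takes fully-qualified namespaces. A rough equivalent, assuming a connected MongoClient called client and a database named test (both assumptions):
// Sketch: atomically swap prices_new over prices from the Node driver.
// `client` (a connected MongoClient) and the 'test' database name are assumptions.
const dbName = 'test';
await client.db(dbName).admin().command({
    renameCollection: `${dbName}.prices_new`,
    to: `${dbName}.prices`,
    dropTarget: true // drop the old prices collection as part of the swap
});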
There's no need to clear the database and do a fresh insert. You can use the bulkWrite() method for this, or use the updateMany() method to do the updates.
You can refactor the existing code to:
const update = (cb) => {
    _request(urls, responses => {
        let bulkUpdateOps = [], gameid;
        Object.keys(responses).forEach(url => {
            let response = responses[url];
            gameid = url.split('/')[5].split('?')[0];
            if (response.err) {
                logger.error(`Error in request to ${url}: ${response.err}`);
                return;
            }
            if (response.body) {
                logger.info(`Request to ${url} successful.`);
                let jsonResult = {};
                try {
                    jsonResult = JSON.parse(response.body);
                } catch (e) {
                    logger.error(`Could not parse.`);
                }
                Object.keys(jsonResult).forEach(key => {
                    bulkUpdateOps.push({
                        "updateOne": {
                            "filter": { market_hash_name: key },
                            "update": { "$set": {
                                game_id: gameid,
                                price: jsonResult[key]
                            } },
                            "upsert": true
                        }
                    });
                });
            }
            // flush in batches of roughly 1000 operations
            if (bulkUpdateOps.length >= 1000) {
                Price.bulkWrite(bulkUpdateOps).then(result => {
                    logger.info(`Updated Items`);
                }).catch(e => logger.error(`Error bulking: ${e}`));
                bulkUpdateOps = [];
            }
        });
        // flush whatever is left over
        if (bulkUpdateOps.length > 0) {
            Price.bulkWrite(bulkUpdateOps).then(result => {
                logger.info(`Updated Items`);
            }).catch(e => logger.error(`Error bulking: ${e}`));
        }
        if (cb && typeof cb === 'function') {
            cb();
        }
    });
};
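Since you run this every 30 minutes, a rough usage sketch — a plain timer is assumed here, but whatever scheduler you already use works the same way:
// Sketch: kick off the refactored update every 30 minutes (the cadence from the question).
const THIRTY_MINUTES = 30 * 60 * 1000;
update(() => logger.info('Price update dispatched.')); // run once at startup
setInterval(() => {
    update(() => logger.info('Price update dispatched.'));
}, THIRTY_MINUTES);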
I have not tested anything, but you can try this; it might be helpful. I am using the bluebird library for concurrency.
const Bluebird = require('bluebird');
const request = require('request');

let _request = (url) => {
    return new Promise((resolve, reject) => {
        request(url, (err, response, body) => {
            if (err) {
                return reject(err);
            }
            resolve(body);
        });
    });
};
const formatRespose = async (response) => {
    // do stuff
    return {
        query: {}, // itemid: id,
        body: {}
    };
};
const bulkUpsert = (allItemsArray) => {
    let bulk = Price.collection.initializeUnorderedBulkOp();
    return new Promise((resolve, reject) => {
        allItemsArray.forEach(item => {
            bulk.find(item.query).upsert().updateOne(item.body);
        });
        bulk.execute((err, bulkers) => {
            if (err) {
                return reject(err);
            }
            return resolve(bulkers);
        });
    });
};
const getAndUpdateData = async (urls) => {
    const allItemsArray = await Promise.all(urls.map(async (url) => {
        const requestData = await _request(url); // requests in this chunk run in parallel
        const formattedData = await formatRespose(requestData); // returns { query: {}, body: {} }
        return formattedData;
    }));
    return await bulkUpsert(allItemsArray);
};
function update() {
    // split urls into chunks as per your need, 100/1000
    var i, j, chunkUrls = [],
        chunk = 100;
    for (i = 0, j = urls.length; i < j; i += chunk) {
        chunkUrls.push(urls.slice(i, i + chunk));
    }
    Bluebird.map(chunkUrls, function (chunk) {
        return getAndUpdateData(chunk);
    }, {
        concurrency: 1 // depends on concurrent requests; 1 = 100 requests fetched and inserted into the db at a time
    }).then(function () {
        console.log("done");
    }).catch(function () {
        console.log("error");
    });
}
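formatRespose is deliberately a stub above. For the data in this question each response body holds many items, so a hypothetical version might look like the sketch below. Note that it takes the gameid as a second argument and returns an array of { query, body } pairs per response rather than the single object hinted at in the stub, so you would flatten the results before handing them to bulkUpsert:
// Hypothetical sketch only: map one response body to upsert descriptors shaped for
// bulkUpsert's bulk.find(query).upsert().updateOne(body). Assumes the body is the
// same market_hash_name -> price JSON object used in the question, and that the
// caller passes in the gameid it derived from the URL.
const formatRespose = async (body, gameid) => {
    let jsonResult = {};
    try {
        jsonResult = JSON.parse(body);
    } catch (e) {
        // on a parse failure, upsert nothing for this response
    }
    return Object.keys(jsonResult).map(key => ({
        query: { market_hash_name: key },
        body: { $set: { game_id: gameid, market_hash_name: key, price: jsonResult[key] } }
    }));
};
// In getAndUpdateData you would then flatten before upserting, e.g.
// return bulkUpsert([].concat(...allItemsArray));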