Promise.all consumes all my RAM - javascript

I've got a rate limiter for an API I am using which allows 20 requests per second. All requests are promise based and the promise will be resolved with the API data once there is a response.
The problem:
I set up a promiseArray which contains 58k promises, all waiting for a response. The memory usage slowly increases until I run out of memory. In my specific situation I don't need to pass the resolved data to my then(), and that data is eating up all my RAM.
The code:
}).then(() => {
    // 2. Crawl for all clanprofiles from these leaderboards
    const promiseArray = []
    for (let i = 0; i < clanTags.length; i++) {
        // Resolved data from getClanProfile() is eating up all my RAM
        const p = backgroundScheduler.getClanProfile(clanTags[i], true)
        promiseArray.push(p)
    }
    return Promise.all(promiseArray)
}).then(() => {
So is there a way to await until the promiseArray is resolved without needing the resolved data?

You will use less memory if you never have 58k promises, their associated async operations, and their result data active at once.
Instead, you want to run X operations at once: when one finishes, you start the next one, so there are never more than X requests in flight and never more than X promises in use at the same time.
You can experiment with an appropriate value of X. A value of 1 gives you sequential operations, but you can often improve overall end-to-end time by using some higher value of X. If all requests are hitting the same host, then X is probably no more than 5-10 (since a given host can't really do a lot of things at once and asking it to do more than it can handle just slows it down).
If every request is to a different host, then you may be able to make X higher. Experimentation would give you an optimal value for both peak memory usage and overall throughput and somewhat depends upon your specific circumstances.
Bluebird's Promise.map() has a concurrency option that will do this for you, but there are also numerous ways to code for only X in flight at the same time.
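For reference, here is a minimal sketch of the Bluebird approach, assuming the clanTags array and getClanProfile() from the question and an example concurrency of 10:

const Promise = require('bluebird');

Promise.map(clanTags, (tag) => {
    // discard the resolved data so it becomes eligible for GC
    return backgroundScheduler.getClanProfile(tag, true).then(() => 0);
}, { concurrency: 10 }).then(() => {
    // all profiles crawled, never more than 10 requests in flight
});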
Here are some other coding examples of managing how many are in flight at a time:
Make several requests to an API that can only handle 20 request a minute
How to execute promises in series?
unable to complete promises due to out of memory
Fire off 1,000,000 requests 100 at a time
How to make it so that I can execute say 10 promises at a time in javascript to prevent rate limits on api calls?
If you don't need the resolved data, you can allow it to be GCed sooner by replacing it like this:
const p = backgroundScheduler.getClanProfile(clanTags[i], true).then(data => {
    return 0; // make resolved value just be a simple number
              // so other data is now eligible for GC
});
promiseArray.push(p)
And, here's a simple implementation that iterates an array with no more than X requests in flight at the same time:
// takes an array of items and a function that returns a promise
// runs no more than maxConcurrent requests at once
function mapConcurrent(items, maxConcurrent, fn) {
    let index = 0;
    let inFlightCntr = 0;
    let doneCntr = 0;
    let results = new Array(items.length);
    let stop = false;

    return new Promise(function(resolve, reject) {
        function runNext() {
            let i = index;
            ++inFlightCntr;
            fn(items[index], index++).then(function(val) {
                ++doneCntr;
                --inFlightCntr;
                results[i] = val;
                run();
            }, function(err) {
                // set flag so we don't launch any more requests
                stop = true;
                reject(err);
            });
        }

        function run() {
            // launch as many as we're allowed to
            while (!stop && inFlightCntr < maxConcurrent && index < items.length) {
                runNext();
            }
            // if all are done, then resolve parent promise with results
            if (doneCntr === items.length) {
                resolve(results);
            }
        }

        run();
    });
}
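Applied to the question (clanTags and getClanProfile() taken from the question, a concurrency of 5 chosen as an example), usage could look like:

mapConcurrent(clanTags, 5, (tag) => {
    // discard the resolved data so only a small number is retained per item
    return backgroundScheduler.getClanProfile(tag, true).then(() => 0);
}).then(() => {
    // all profiles crawled
});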

Related

Why an async function takes more time to execute than a sync one?

I wrote two recursive functions that sum the numbers in an array. They do the same thing, one asynchronously and the other synchronously. The async function took about 9x as long as the sync one.
Shouldn't the async function take advantage from the fact of running more tasks at the same time?
The functions
// Asynchronously sum the numbers in array
async function sumAsync(arr){
    if(arr.length == 1) return arr[0];
    const half = arr.length/2;
    // Numbers on the left half
    const left = arr.filter((e, i) => {
        return i < half;
    });
    // Numbers on the right half
    const right = arr.filter((e, i) => {
        return i >= half;
    });
    // Recursive call
    const leftR = sumAsync(left);
    const rightR = sumAsync(right);
    // Wait for resolves
    await Promise.all([leftR, rightR]);
    return await leftR + await rightR;
}

// Synchronously sum the numbers in array
function sumSync(arr){
    if(arr.length == 1) return arr[0];
    const half = arr.length/2;
    // Numbers on the left half
    const left = arr.filter((e, i) => {
        return i < half;
    });
    // Numbers on the right half
    const right = arr.filter((e, i) => {
        return i >= half;
    });
    // Recursive call
    const leftR = sumSync(left);
    const rightR = sumSync(right);
    return leftR + rightR;
}
Testing them
(async () => {
    const LENGTH = 1048576; // 1024^2
    const arr = Array.from(Array(LENGTH), num => Math.random()*10 | 0);
    // arr[1048576] <- random (0 ~ 9)

    // Async sum
    console.log('ASYNC');
    before = Date.now();
    console.log(`SUM: ${await sumAsync(arr)}`);
    after = Date.now();
    console.log(`TIME: ${after - before} ms`);

    // Sync sum
    console.log('SYNC');
    before = Date.now();
    console.log(`SUM: ${sumSync(arr)}`);
    after = Date.now();
    console.log(`TIME: ${after - before} ms`);
})();
Results
// ASYNC
// SUM: 4720832
// TIME: 5554 ms
// SYNC
// SUM: 4720832
// TIME: 613 ms
The return value of an async function is always a Promise, even if the function only carries out synchronous operations, and the await (or .then) of a Promise will only run what follows during a microtask (after the current synchronous code is finished running). With a large array, that'll result in a lot of unnecessary microtasks wrapping synchronous code.
When nothing actually asynchronous is going on, this is just extra baggage that results in additional processing time and power required.
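A minimal illustration of that overhead (not from the question): even when an async function does purely synchronous work, awaiting or then-ing its result defers the continuation to a microtask.

async function addAsync(a, b) {
    return a + b; // purely synchronous work, but the result is still wrapped in a Promise
}

console.log('before');
addAsync(1, 2).then(sum => console.log('sum:', sum)); // continuation runs in a microtask
console.log('after');
// logs: before, after, sum: 3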
Shouldn't the async function take advantage from the fact of running more tasks at the same time?
Javascript is single-threaded, even with async functions. If multiple async functions are called at once, only one path through the code may be "active" at any one time. If the total processing time required for all tasks is, say, 1000 ms, in standard Javascript, there's no way around spending at least 1000 ms.
You're not actually running more tasks at the same time - you're just wrapping the tasks in Promises (unnecessarily), while doing the same work.
For truly parallel actions, you'll have to use something provided by your current environment, such as child_process in Node, or a web worker.
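For what it's worth, here is a hedged sketch (not part of the original answer) of what true parallelism could look like for the question's summing task using Node's worker_threads; note that for an array of this size the cost of copying data to the workers may well outweigh the gain.

const { Worker, isMainThread, parentPort, workerData } = require('worker_threads');

if (isMainThread) {
    const arr = Array.from(Array(1048576), () => Math.random() * 10 | 0);
    const half = Math.floor(arr.length / 2);

    // run one worker per half and sum the partial results
    const sumInWorker = (part) => new Promise((resolve, reject) => {
        const worker = new Worker(__filename, { workerData: part });
        worker.on('message', resolve);
        worker.on('error', reject);
    });

    Promise.all([sumInWorker(arr.slice(0, half)), sumInWorker(arr.slice(half))])
        .then(([left, right]) => console.log('SUM:', left + right));
} else {
    // worker thread: sum its slice and send the result back to the main thread
    parentPort.postMessage(workerData.reduce((a, b) => a + b, 0));
}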
Short version: async doesn't do more than one thing at a time. It switches between tasks (with overhead for each switch) in a queue, and when one task blocks, it hands control off to another (with overhead for the switch, and requeueing the blocked task when it unblocks).
Long version: Async doesn't mean parallel processing, it means interleaved (concurrent, cooperative) processing. JavaScript is still single-threaded even with async usage, and all of the actual work you do is purely CPU bound. In fact, your only real concurrency is that the async code will be scheduling, pausing and resuming your recursive calls repeatedly (but still only doing work for one at a time), while the sync code will just do them in order as fast as possible, with no event loop involvement.
The benefit of async code is that when blocking I/O (including things like waiting for user input) is being performed, that task can be suspended until it's unblocked by some out-of-band signal (I/O done, user clicked the mouse, whatever), and other tasks can run. The payoff is concurrent (but not parallel) processing in situations where most tasks, most of the time, are waiting for something, so the few that are ready to run can begin running immediately (and since tasks are usually not running, the overhead of task switching doesn't matter much; most of the time there's nothing to switch to, so much of the overhead is paid when you've got nothing better to do anyway). But it's definitely higher overhead than just doing number-crunching without a pause.

Prevent overload due to Promise resolving

I need to perform lots of findOneAndUpdate() operations using mongoose as there is no way to perform an atomic operation in bulk. Therefore I create a promise array in a for loop which will be resolved afterwards. Unfortunately this takes ~2-3 seconds and during that time my Express application can't process any new requests.
The code:
const promiseArray = []
for (let i = 0; i < 1500; i++) {
    const p = PlayerProfile.findOneAndUpdate(filter, updateDoc)
    promiseArray.push(p)
}
return Promise.all(promiseArray).then((values) => {
    // Process the values
})
Question:
How can I avoid that my Express application becomes completely unresponsive to new requests while it's resolving this promise?
More context information:
I am trying to update and return many documents with an atomic operation, hence the big for loop. It's basically selecting a document and setting up a lock on it.
Try using update with the multi option:
PlayerProfile.update(filter, updateDoc, { multi: true }, function(err, result) {
    // Do something
})
The signature is:
Model.update(conditions, update, options, callback)
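If you prefer to stay with promises, mongoose queries are thenable, so (depending on your mongoose version) something along these lines should also work:

PlayerProfile.update(filter, updateDoc, { multi: true })
    .exec()
    .then((result) => {
        // result reports how many documents were matched/modified
        // (the exact shape depends on the mongoose version)
    });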

Run 1000 requests so that only 10 runs at a time

With node.js I want to http.get a number of remote urls in a way that only 10 (or n) run at a time.
I also want to retry a request if an exception occurs locally (m times), but when the status code indicates an error (5XX, 4XX, etc.) the request counts as valid.
This is really hard for me to wrap my head around.
Problems:
Cannot try-catch http.get as it is async.
Need a way to retry a request on failure.
I need some kind of semaphore that keeps track of the currently active request count.
When all requests finished I want to get the list of all request urls and response status codes in a list which I want to sort/group/manipulate, so I need to wait for all requests to finish.
Seems like for every async problem promises are recommended, but I end up nesting too many promises and it quickly becomes indecipherable.
There are lots of ways to approach the 10 requests running at a time.
Async Library - Use the async library with the .parallelLimit() method where you can specify the number of requests you want running at one time.
Bluebird Promise Library - Use the Bluebird promise library and the request library to wrap your http.get() into something that can return a promise and then use Promise.map() with a concurrency option set to 10.
Manually coded - Code your requests manually to start up 10 and then each time one completes, start another one.
In all cases, you will have to write some retry code yourself, and as with all retry code, you will have to carefully decide which types of errors you retry, how soon you retry them, how much you back off between retry attempts, and when you eventually give up (all things you have not specified).
Other related answers:
How to make millions of parallel http requests from nodejs app?
Million requests, 10 at a time - manually coded example
My preferred method is with Bluebird and promises. Including retry and result collection in order, that could look something like this:
const request = require('request');
const Promise = require('bluebird');
const get = Promise.promisify(request.get);

let remoteUrls = [...]; // large array of URLs
const maxRetryCnt = 3;
const retryDelay = 500;

Promise.map(remoteUrls, function(url) {
    let retryCnt = 0;
    function run() {
        return get(url).then(function(result) {
            // do whatever you want with the result here
            return result;
        }).catch(function(err) {
            // decide what your retry strategy is here
            // catch all errors here so other URLs continue to execute
            // isRetryable() is a placeholder for your own "should this error be retried?" test
            if (isRetryable(err) && retryCnt < maxRetryCnt) {
                ++retryCnt;
                // try again after a short delay
                // chain onto previous promise so Promise.map() is still
                // respecting our concurrency value
                return Promise.delay(retryDelay).then(run);
            }
            // make value be null if no retries succeeded
            return null;
        });
    }
    return run();
}, {concurrency: 10}).then(function(allResults) {
    // everything done here and allResults contains results with null for failed URLs
});
The simple way is to use the async library; it has a .parallelLimit() method that does exactly what you need.
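A hedged sketch of that approach, reusing the remoteUrls array from the Bluebird example above and assuming plain http:// URLs (error handling kept deliberately simple):

const async = require('async');
const http = require('http');

// each task is a function that takes a Node-style callback
const tasks = remoteUrls.map((url) => (callback) => {
    http.get(url, (res) => {
        res.resume(); // drain the body so the socket is released
        callback(null, { url, statusCode: res.statusCode });
    }).on('error', (err) => callback(null, { url, error: err.message })); // report, don't abort the batch
});

async.parallelLimit(tasks, 10, (err, results) => {
    // results is in the same order as remoteUrls
    console.log(results);
});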

Progress of promises

Theory:
I have around 100 promises which I create up front and later wait on with Promise.all().
Each of those 100 promises in turn makes some async REST calls whose response time may vary widely (for example due to network connectivity).
Resolving all 100 promises takes around 20 seconds. During that time the user should be given live feedback on progress to keep them engaged.
In order to implement progress reporting for these async operations, I am thinking of using a progressCounter on the client end whose value is updated by each promise as soon as it resolves.
The thing is, if progressCounter = 1 and all those operations are async, I fear I may hit a race condition where, for example, the current value of progressCounter retrieved by two distinct promises is the same, i.e. 1, so they both try to increment progressCounter to the same value, i.e. 2. The final value then won't be 3 because of the race condition.
Experiment:
I tried to reproduce this but couldn't, using the following:
var progress = {};
progress.counter = 1;

var promise1 = new Promise(function(resolve, reject) {
    resolve();
});
var promise2 = new Promise(function(resolve, reject) {
    resolve();
});

promise1.then(function() {
    progress.counter += 1;
});
promise2.then(function() {
    progress.counter += 1;
});

setTimeout(function() {
    alert(progress.counter); // progress.counter: 3
}, 1000);
Question:
The question is: can such a race condition, as described in the theory above, be hit? If not, how is the theory flawed?
If yes, what is a good way to track progress of resolution of the promises?
Question: can such a race condition, as described in the theory above, be hit? If not, how is the theory flawed?
The answer is no, such a race condition can not occur in Javascript, because Javascript is single-threaded. (see: Concurrency Model and Event Loop on MDN)
This means that while one callback handler is working with the data (assuming that updating the counter is a synchronous operation, which += is), nothing can force it to "yield" its execution; the next handler can only run when the previous one has finished.
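So a plain counter is safe. A small sketch of what that could look like (urls and updateProgressBar() are hypothetical names, not from the question):

let completed = 0;
const tasks = urls.map((url) =>
    fetch(url).then((result) => {
        completed += 1;                              // safe: handlers never interrupt each other
        updateProgressBar(completed / urls.length);  // hypothetical UI update
        return result;
    })
);
Promise.all(tasks).then((results) => {
    console.log('all done:', results.length);
});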

asynchronously iterate over massive array in JavaScript without triggering stack size exceeded

My environment is NodeJS, although this could be a web-related problem as well. I have a large set of data from a database which I am attempting to enumerate over. However, for the sake of argument let's say that I have an array of 20,000 strings:
var y = 'strstrstrstrstrstrstrstrstrstr';
var x = [];
for(var i = 0; i < 20000; i++)
    x.push(y);
and I want to enumerate this list asynchronously, let's say using the async library, and let's say, because I'm super cautious, that I even limit my enumeration to 5 iterations at once:
var allDone = function() { console.log('done!') };
require('async').eachLimit(x, 5, function(item, cb){
    ...
    someAsyncCall(.., cb);
}, allDone);
The expectation is that 5 items of x would be iterated concurrently above and that eventually all 20,000 items would be iterated over and the console would print 'done!'. What actually happens is:
Uncaught exception: [RangeError: Maximum call stack size exceeded]
And at this point I assumed that this must be some sort of bug with the async library, so I wrote my own version of eachLimit which follows:
function eachLimit(data, limit, iterator, cb) {
    var consumed = 0;
    var consume;
    var finished = false;

    consume = function() {
        if(!finished && consumed >= data.length) {
            finished = true;
            cb();
        }else if(!finished) {
            return iterator(data[consumed++], consume);
        }
    };

    var concurrent = limit > data.length ? data.length : limit;
    for(var i = 0; i < concurrent; i++)
        consume();
}
Interestingly enough, this solved my problem. But when I moved my experiment from NodeJS over to Chrome, even with my solution above I still received a stack size exceeded error.
Clearly, my method does not grow the stack as much as the eachLimit method contained within async. However, I still consider my approach to be bad, because maybe not for 20k items, but for some larger array I could still exceed the stack size using my method. I feel like I need to design some sort of solution to this problem using tail recursion, but I'm not sure if v8 will even optimize for this case, or if it's possible given the problem.
I feel like I need to design some sort of solution to this problem using tail recursion, but I'm not sure if v8 will even optimize for this case, or if it's possible given the problem.
The continuation-passing style you are using is already tail recursive (or close to it anyway). The problem is that most JS engines tend to overflow the stack in these sorts of situations.
There are two main ways to work around this issue:
1) Force the code to be async using setTimeout.
What is happening with your code is that you are calling the return callbacks before the original function returns. In some async libraries this will end up resulting in a stack overflow. One simple workaround is to force the callback to run only in the next iteration of the event handling loop, by wrapping it inside a setTimeout. Translate
//Turns out this was actually "someSyncCall"...
someAsyncCall(.., cb);
into
someAsyncCall(..., function(){
    setTimeout(cb, 0)
});
The main advantage here is that this is very simple to do. The disadvantage is that it adds some latency to your loop, because setTimeout is implemented so that there will always be some nonzero delay before the callback runs (even if you set it to zero). On the server you can use process.nextTick (or setImmediate) to do something similar as well.
That said, it's already a bit weird to have a large loop of sequential async operations. If your operations are all actually async then it's going to take ages to complete due to the network latency.
2) Use trampolining to handle the sync code.
The only way to 100% avoid a stack overflow is to use bona-fide while loops. With promises, the pseudocode is a bit easier to write:
//vastly incomplete pseudocode
function loopStartingFrom(array, i){
    for(;i<array.length; i++){
        var x = run_next_item(i);
        if(is_promise(x)){
            return x.then(function(){
                loopStartingFrom(array, i+1)
            });
        }
    }
}
Basically, you run your loop in an actual loop, with some way to detect if one of your iterations is returning immediately or deferring to an async computation. When things return immediately you keep the loop running and when you finally get a real async result you stop the loop and resume it when the async iteration result completes.
The downside of trampolining is that it's a bit more complicated. That said, there are some async libraries out there that guarantee a stack overflow does not occur (by using one of the two tricks I mentioned under the hood).
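As an illustration, here is a hedged, slightly more complete sketch of that trampoline idea (filled in here, not the answer's own code): only recurse when an iteration is actually asynchronous, and let a plain for loop handle the synchronous results.

function runSerially(items, iterator) {
    function loopStartingFrom(i) {
        for (; i < items.length; i++) {
            const x = iterator(items[i], i);
            if (x && typeof x.then === 'function') {
                // async iteration: suspend the loop, resume after the promise settles
                return x.then(() => loopStartingFrom(i + 1));
            }
            // synchronous iteration: keep looping, the stack does not grow
        }
        return Promise.resolve();
    }
    return loopStartingFrom(0);
}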
To prevent a stack overflow, you need to prevent consume from recursing into itself. You can do that using a simple flag:
function eachLimit(data, limit, iterator, cb) {
    var consumed = 0,
        running = 0,
        isAsync = true;

    function consume() {
        running--;
        if (!isAsync)
            return;
        while (running < limit && consumed < data.length) {
            isAsync = false;
            running++;
            iterator(data[consumed++], consume);
            isAsync = true;
        }
        if (running == 0)
            cb();
    }

    running++;
    consume();
}
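Applied to the question's snippet (x, someAsyncCall and the 'done!' log taken from the question, assuming someAsyncCall takes the item and a completion callback), usage would look something like:

eachLimit(x, 5, function(item, cb) {
    someAsyncCall(item, cb);
}, function() {
    console.log('done!');
});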
Have you considered using promises for this? They should resolve the issue of an ever-increasing stack (and also you get to use promises, which is a big plus in my book):
// Here, iterator() should take a single data value as input and return
// a promise for the asynchronous behavior (if it is asynchronous)
// or any value if it is synchronous
function eachLimit(data, limit, iterator) {
    return new Promise(function (resolve, reject) {
        var i = 0;
        var completed = 0;
        var failed = false;

        function handleFailure(error) {
            failed = true;
            reject(error);
        }

        function queueAction() {
            try {
                // Promise.resolve() wraps plain values and promises alike
                Promise.resolve(iterator(data[i]))
                    .then(handleSuccess, handleFailure);
            } catch (error) {
                reject(error);
            }
        }

        function handleSuccess() {
            completed += 1;
            if (!failed) {
                if (i < data.length) {
                    queueAction();
                    i += 1;
                } else if (completed === data.length) {
                    // only resolve once every queued item has finished
                    resolve();
                }
            }
        }

        for (; i < data.length && i < limit; i += 1) {
            queueAction();
        }
    });
}
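And a hypothetical usage mirroring the question (x and someAsyncCall from the question, wrapped so the iterator returns a promise; someAsyncCall is assumed to take a Node-style callback):

eachLimit(x, 5, function(item) {
    return new Promise(function(resolve, reject) {
        someAsyncCall(item, function(err) {
            if (err) reject(err);
            else resolve();
        });
    });
}).then(function() {
    console.log('done!');
});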
