How to use pool of tabs #92

anandanand84 · 2017-04-11T20:02:42Z

This is more of a how-to question than an issue. Let's assume the scenario of generating screenshots of webpages concurrently. My thought was to create a pool of tabs like the one below and use the workers to take the screenshots.

//headless-page-pool
let CDP = require('chrome-remote-interface');
var genericPool:any = require('generic-pool');

/**
 * Factory handed to generic-pool: each pooled "worker" is a
 * chrome-remote-interface client attached to its own freshly opened tab.
 * The tab descriptor returned by CDP.New is stored on the client as
 * `_target` so that `destroy` can close that tab again.
 */
const sandBoxFactory = {
    /**
     * Opens a new tab and connects a client to it.
     * Rejects when either step fails, so that the pool reports the failure
     * instead of handing out `undefined` as a worker.
     */
    create:async function () {
        let tabMeta:any;
        try {
            tabMeta = await CDP.New({ remote : true });
            let client = await CDP({ tab : tabMeta});
            client._target = tabMeta;
            return client;
        } catch (e) {
            console.error(e);
            if (tabMeta) {
                // The tab was opened but could not be attached to: close it so
                // that it does not linger in the browser. Best effort only.
                await CDP.Close({ id : tabMeta.id }).catch(console.error);
            }
            // Re-throw: resolving with undefined would put an unusable
            // worker into the pool.
            throw e;
        }
    },
    validate: function(client:any) {
        //TODO: Find a way to validate dev tools connection
    },
    /**
     * Closes the tab that backs the given client.
     * Errors are logged and not propagated (best-effort teardown).
     */
    destroy: async function (client:any){
        try {
            // Target.closeTarget takes a parameter object, not a bare id.
            return await client.Target.closeTarget({ targetId : client._target.id });
        } catch (e) {
            console.error(e);
        }
    }
}

// Sizing limits passed to genericPool.createPool.
const opts = {
    // maximum size of the pool
    max: 10,
    // minimum size of the pool
    min: 3,
};

// Module-level handle to the pool; stays undefined until createWorkerPool() runs.
let WorkerPool:any;

/**
 * Returns the current pool instance (undefined if none has been created yet).
 */
function getWorkerPool() {
    const currentPool = WorkerPool;
    return currentPool;
}

/**
 * Builds the pool from sandBoxFactory and opts, stores it in the module-level
 * WorkerPool variable and returns it. Errors reported by the pool for factory
 * create/destroy failures are written to the console.
 */
function createWorkerPool() {
    console.log('creating pool');

    const logPoolError = function(err:any){
        console.error(err);
    };

    WorkerPool = genericPool.createPool(sandBoxFactory, opts);
    for (const eventName of ['factoryCreateError', 'factoryDestroyError']) {
        WorkerPool.on(eventName, logPoolError);
    }

    return WorkerPool;
}

/**
 * Asks the pool for a worker and returns whatever WorkerPool.acquire() yields.
 * The pool must already have been created with createWorkerPool().
 */
function getWorkerFromPool() {
    const pendingWorker = WorkerPool.acquire();
    return pendingWorker;
}

export { getWorkerPool, createWorkerPool, getWorkerFromPool };

This creates 3 tabs in Chrome. My request handler then looks like this:

import {createWorkerPool, getWorkerPool, getWorkerFromPool } from './headless-page-pool';
import fs = require('fs');
createWorkerPool();

/**
 * Loads `url` in a pooled tab and resolves with a Buffer holding the
 * screenshot returned by Page.captureScreenshot.
 *
 * @param url      page to load
 * @param options  currently unused; kept so existing callers keep working
 * @param timeout  milliseconds allowed for navigate + load + capture before
 *                 the returned promise rejects (default 10000)
 * @returns        promise of the decoded screenshot bytes
 * @throws         rejects when any protocol call fails or the timeout expires
 *
 * The worker is only handed back to the pool once the capture has finished.
 * After a failure the worker is destroyed instead of released, because the
 * tab may still be busy with the abandoned navigation.
 */
export async function captureScreenshot(url:string, options:any, timeout=10000):Promise<any> {
    const pool = getWorkerPool();
    const worker = await getWorkerFromPool();
    let timer:any;
    let onLoad:any;
    let failed = false;
    try {
        await worker.Page.enable();

        // Subscribe before navigating so the load event cannot be missed.
        // `once` plus the explicit removal below keeps listeners from piling
        // up on a client that is reused across requests.
        const loaded = new Promise<any>(function(resolve) {
            onLoad = function() {
                console.log('page loaded');
                resolve(undefined);
            };
            worker.once('Page.loadEventFired', onLoad);
        });

        const expired = new Promise<any>(function(_resolve, reject) {
            timer = setTimeout(function() {
                reject(new Error(`Timed out after ${timeout}ms while capturing ${url}`));
            }, timeout);
        });

        const capture = (async function() {
            await worker.Page.navigate({ url });
            await loaded;
            const result = await worker.Page.captureScreenshot();
            return Buffer.from(result.data, 'base64');
        })();

        // Promise.race subscribes to both promises, so a late rejection of
        // the losing one does not surface as an unhandled rejection.
        return await Promise.race([capture, expired]);
    } catch (error) {
        failed = true;
        console.error(error);
        throw error;
    } finally {
        clearTimeout(timer);
        if (onLoad) {
            worker.removeListener('Page.loadEventFired', onLoad);
        }
        // Do not await: a pool error must not mask the screenshot result or
        // the original failure.
        const settled = failed ? pool.destroy(worker) : pool.release(worker);
        Promise.resolve(settled).catch(console.error);
    }
}

The problem is that await worker.Page.captureScreenshot(); never resolves because the tab is not active. As soon as I click on the tab containing the URL in Chrome, it resolves.

This can be worked around by calling

worker.Target.activateTarget({ targetId : worker._target.id})

After doing this I just get a blank screen as the image, unless I add a pause, which gives the actual image. The bottom line is that whatever I do, there is no way to process multiple images simultaneously, because only the active tab can capture the image. So how do we use multiple tabs and capture images in all of them concurrently? Is this a bug in Chrome, which does not allow capturing screenshots when the tab is not active, or am I missing something?

Let me know if I am not clear.

The text was updated successfully, but these errors were encountered:

cyrus-and · 2017-04-12T15:26:28Z

It seems that you're right, the tab must be exposed during the screenshot phase. As you say, activating the target on page load probably won't work, because another page load event steals the focus before the screenshot is completed, thus producing empty/partial images.

Luckily though it's the page load phase which is usually time consuming and that part can be fully parallelized by spawning multiple tabs/targets. Try this, instead of calling Page.captureScreenshot as soon as the page is loaded, simply enqueue the task to a common array (or use promises, see below), then when all the pages in the batch are finished start the serial activate-screenshot-repeat phase.

Here's what I mean:

const fs = require('fs');

const CDP = require('chrome-remote-interface');

/**
 * Opens a new tab, navigates it to `url` and waits for the page load event.
 *
 * @param {string} url - page to load
 * @returns {Promise<{client: object, tab: object}>} the connected client and
 *     the tab descriptor returned by CDP.New, once the load event has fired
 * @throws rejects if opening the tab, connecting, enabling the Page domain
 *     or navigating fails
 */
async function loadForScrot(url) {
    const tab = await CDP.New();
    const client = await CDP({tab});
    const {Page} = client;
    try {
        // Register the listener before navigating so the event is not missed.
        const loaded = new Promise((fulfill) => {
            Page.loadEventFired(() => {
                fulfill();
            });
        });
        await Page.enable();
        await Page.navigate({url});
        await loaded;
        return {client, tab};
    } catch (err) {
        // Do not leave the connection open when setup fails. A secondary
        // close error is ignored on purpose so the original error is the
        // one the caller sees.
        await client.close().catch(() => {});
        throw err;
    }
}

/**
 * Closes the connection of one loaded page and then the tab itself.
 * Failures are logged instead of thrown so that the cleanup of the other
 * pages can continue.
 */
async function releaseScrotHandler({client, tab}) {
    try {
        await client.close();
        await CDP.Close({id: tab.id});
    } catch (err) {
        console.error(err);
    }
}

/**
 * Loads all `urls` in parallel (one tab each), then activates the tabs one
 * at a time and writes a screenshot of each to /tmp/scrot_<tab id>.png,
 * printing the file name.
 *
 * Errors are logged, not thrown; the first failure stops the remaining
 * screenshots. Every page that finished loading is closed again (connection
 * and tab), whether or not its screenshot succeeded.
 *
 * @param {string[]} urls - pages to capture
 * @returns {Promise<void>}
 */
async function process(urls) {
    // Pages that are loaded but not yet captured and closed.
    const pending = [];
    try {
        pending.push(...await Promise.all(urls.map(loadForScrot)));
        while (pending.length > 0) {
            const handler = pending.shift();
            const {client, tab} = handler;
            try {
                // The tab has to be the active one while it is captured.
                await CDP.Activate({id: tab.id});
                const filename = `/tmp/scrot_${tab.id}.png`;
                const result = await client.Page.captureScreenshot();
                const image = Buffer.from(result.data, 'base64');
                fs.writeFileSync(filename, image);
                console.log(filename);
            } finally {
                await releaseScrotHandler(handler);
            }
        }
    } catch (err) {
        console.error(err);
    } finally {
        // After an aborted run, close whatever was never processed.
        await Promise.all(pending.map(releaseScrotHandler));
    }
}

// Demo run: capture the same page eight times to exercise parallel loading.
process(new Array(8).fill('http://example.com'));

Please let me know if this can work for you.

anandanand84 · 2017-04-12T16:12:50Z

@cyrus-and Thanks for taking the time to look into this issue. Yes, the above code should work and I get what you are saying, but I was planning something like an API which returns screenshots/PDFs for a given URL. If taking screenshots serially in batches is the only way, it would block the other requests from loading their pages in the meantime. Generating PDFs was my use case, and yesterday I tried the latest headless build for PDF generation using the printToPDF API; it doesn't require the tab to be activated or exposed.

Thanks once again for this awesome project !

cyrus-and · 2017-04-12T19:05:23Z

OK great. You're welcome! :)

pthieu · 2017-04-20T08:43:04Z

@anandanand84: by any chance do you know how to close all tabs at the beginning of the process? I'm using Docker for headless Chrome, and it seems that if you cancel your script prematurely, the tab remains open, and the next time you connect, you connect to the tab you were on before.

anandanand84 · 2017-04-20T14:27:27Z

@pthieu I used client.Target.closeTarget(client._target.id); not sure if that is the right way but it does close the browser tab.

vvo · 2017-04-21T20:08:12Z

Closing all tabs:

/**
 * Closes every target currently reported by CDP.List().
 *
 * @returns {Promise<Array>} resolves with the results of the individual
 *     CDP.Close calls; rejects as soon as one of them fails
 */
async function cleanup() {
  console.log('Closing existing targets if any');
  const closing = [];
  for (const target of await CDP.List()) {
    // Start all close requests first; they are awaited together below.
    closing.push(CDP.Close({id: target.id}));
  }
  return Promise.all(closing);
}

/**
 * Script entry point: waits for cleanup() to finish and resolves with
 * nothing.
 */
async function run() {
  const closingTargets = cleanup();
  await closingTargets;
}

// Kick off the script.
run();

pthieu · 2017-04-22T00:06:54Z

@vvo do you know if headless chrome allows for tabs or new windows? I tried to create a new tab and got the error: Could not create new page

It seems that creating a new tab in headless mode is not supported on Windows? #96 <- are you on Linux?

Nevermind. I was using the justinribeiro/chrome-headless docker image. Switched over to the yukinying/chrome-headless-browser image and things are working as expected.

Not sure what's up, both are advertised to be using tip of tree for chromium. Only difference is that the former is using headless_shell and the other is using chrome --headless.

cyrus-and · 2017-04-22T08:01:43Z

@pthieu

do you know if headless chrome allows for tabs or new windows?

I don't think Chrome distinguishes between the two, and you shouldn't care IMO.

Not sure what's up, both are advertised to be using tip of tree for chromium.

There are known issues with tab management in headless mode. If you use version 59 you should be OK.

utkuturunc · 2017-04-30T12:19:43Z

I would like to accomplish the same goal as @anandanand84 but with images. Is there any way I can make chrome take two screenshots at the same time?

My other options are

- have more than one active browser
- first write to PDF, then convert to PNG

cyrus-and · 2017-04-30T12:27:26Z

@utkuturunc this solution should work if you can live with the fact that only the page loading phase happens in parallel, while screenshots are taken sequentially once all the URLs are loaded.

utkuturunc · 2017-04-30T12:49:04Z

I am not sure how to manage the URL array. I am trying to write a script that takes HTML from stdin and pipes the rendered image to stdout (which I then stream to the user).

The script should load the html immediately then return the image when it is the client's turn.

The current idea is:
Client requests screenshot -> Tab is opened and loaded -> Client is added to a task queue -> When its their turn the tab is activated

Also to limit the number of opened tabs, I may need a pooling implementation.

This looks like an overly complicated architecture. (especially comparing to my current solution => start a new phantomjs instance every time)

Edit:
I am ok with taking one screenshot at a time. If I can guarantee the active tab is not changed by other requests before capture is complete, that is good enough.

cyrus-and · 2017-04-30T13:39:41Z

@utkuturunc with the proper design it should be possible to do that fairly easily, the complication arises from the fact that you have to babysit the screenshotting phase due to a limitation of Chrome which I think (hope!) is just temporary.

If I can guarantee the active tab is not changed by other requests before capture is complete

Luckily enough you don't have to care about race conditions in Node.js, as long as everything stays in the same synchronous block. Of course your script would actually be a server which would act as a broker for client requests.

start a new phantomjs instance every time

You can always start a new Chrome instance every time on a different port but that's probably not a great idea.

utkuturunc · 2017-04-30T18:07:15Z

So, funny story...

I was not using headless mode since it was producing blank images in macOS.

Turns out when I use chrome v60 in a headless docker, there is no issue at all.

cyrus-and · 2017-04-30T18:37:46Z

@utkuturunc well I don't know about the OP but I can only reproduce this issue in non-headless mode. Both version 59 and 60 work fine in headless mode on Linux.

cyrus-and added the question label Apr 12, 2017

cyrus-and closed this as completed Apr 12, 2017

westy92 mentioned this issue Jun 29, 2017

Allow creating multiple PDFs in parallel westy92/html-pdf-chrome#14

Closed

bliu23 mentioned this issue Aug 25, 2017

Unable to take multiple screen captures (printToPDF) at the same time #246

Closed

dominiclovell mentioned this issue Mar 23, 2018

Clearing state between request-logger requests adieuadieu/serverless-chrome#126

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

How to use pool of tabs #92

How to use pool of tabs #92

anandanand84 commented Apr 11, 2017 •

edited by cyrus-and

Loading

cyrus-and commented Apr 12, 2017

anandanand84 commented Apr 12, 2017

cyrus-and commented Apr 12, 2017

pthieu commented Apr 20, 2017

anandanand84 commented Apr 20, 2017

vvo commented Apr 21, 2017

pthieu commented Apr 22, 2017 •

edited

Loading

cyrus-and commented Apr 22, 2017

utkuturunc commented Apr 30, 2017

cyrus-and commented Apr 30, 2017

utkuturunc commented Apr 30, 2017 •

edited

Loading

cyrus-and commented Apr 30, 2017

utkuturunc commented Apr 30, 2017

cyrus-and commented Apr 30, 2017

How to use pool of tabs #92

How to use pool of tabs #92

Comments

anandanand84 commented Apr 11, 2017 • edited by cyrus-and Loading

cyrus-and commented Apr 12, 2017

anandanand84 commented Apr 12, 2017

cyrus-and commented Apr 12, 2017

pthieu commented Apr 20, 2017

anandanand84 commented Apr 20, 2017

vvo commented Apr 21, 2017

pthieu commented Apr 22, 2017 • edited Loading

cyrus-and commented Apr 22, 2017

utkuturunc commented Apr 30, 2017

cyrus-and commented Apr 30, 2017

utkuturunc commented Apr 30, 2017 • edited Loading

cyrus-and commented Apr 30, 2017

utkuturunc commented Apr 30, 2017

cyrus-and commented Apr 30, 2017

anandanand84 commented Apr 11, 2017 •

edited by cyrus-and

Loading

pthieu commented Apr 22, 2017 •

edited

Loading

utkuturunc commented Apr 30, 2017 •

edited

Loading