-
Notifications
You must be signed in to change notification settings - Fork 60
/
wrapper.js
67 lines (59 loc) · 2.33 KB
/
wrapper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
const { loadNotBlankPage, getChromeExecutablePath } = require('./helpers');
const { Cluster } = require('puppeteer-cluster');
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
// Register the stealth plugin on the puppeteer-extra instance at module load time.
// This same `puppeteer` object is what clusterWrapper passes to Cluster.launch below,
// so the plugin is in place before any browser is started from this module.
puppeteer.use(StealthPlugin());
/**
 * Run `func` once for every queue entry on a puppeteer-cluster that uses one
 * browser per worker (at most 5 browsers, never more than there are entries).
 *
 * Before `func` is invoked, each task loads https://ipinfo.io/json through the
 * `loadNotBlankPage` helper and logs the page body, so the outgoing IP can be
 * checked in the console.
 *
 * @param {object} options
 * @param {Function} options.func - Called as `func(page, queueData)` for each entry.
 *   When it is not a function, 'Function not found.' is logged instead.
 * @param {Array|Iterable|object} options.queueEntries - Entries to queue. Arrays are
 *   used as-is, other iterables are expanded, and plain objects contribute their values.
 * @param {string} [options.proxyEndpoint=''] - Proxy URL of the form
 *   http://username:password@host:port. Empty or unparsable values disable the proxy.
 * @param {boolean} [options.monitor=false] - Forwarded to `Cluster.launch` as `monitor`.
 * @param {boolean} [options.useProfile=false] - When true, each browser gets its own
 *   `./tmp/profile<N>` user data directory.
 * @param {object} [options.otherConfigs={}] - Extra launch options merged over the
 *   defaults for every browser; a caller-supplied `args` array is preserved.
 * @returns {Promise<void>} Resolves after the cluster has gone idle and been closed.
 * @throws {Error} If `queueEntries` is neither an array nor a non-null object.
 */
async function clusterWrapper({
  func,
  queueEntries,
  proxyEndpoint = '',
  monitor = false,
  useProfile = false, // After solving a Captcha, save your profile so you may avoid doing it next time
  otherConfigs = {},
}) {
  if (!Array.isArray(queueEntries) && (typeof queueEntries !== 'object' || queueEntries === null)) {
    throw new Error('queueEntries must be an array or an object');
  }

  // Normalise to an array so that sizing the cluster and queueing agree on what
  // "the entries" are; a plain object is not iterable with for...of.
  let entries;
  if (Array.isArray(queueEntries)) {
    entries = queueEntries;
  } else if (typeof queueEntries[Symbol.iterator] === 'function') {
    entries = [...queueEntries];
  } else {
    entries = Object.values(queueEntries);
  }

  // Nothing to do: avoid launching a cluster with a concurrency of zero.
  if (entries.length === 0) {
    console.log('No queue entries provided => nothing to scrape.');
    return;
  }

  let origin = null;
  let username = null;
  let password = null;
  try {
    ({ origin, username, password } = new URL(proxyEndpoint));
  } catch (_) {
    console.log('Proxy disabled => To use Proxy, provide an endpoint in the form of http://username:password@host:port');
  }
  // NOTE(review): for non-http(s) schemes (e.g. socks5://) URL.origin is the string
  // 'null', which would end up in --proxy-server; confirm whether such endpoints
  // need to be supported.

  const maxConcurrency = Math.min(entries.length, 5);
  const callerArgs = otherConfigs && Array.isArray(otherConfigs.args) ? otherConfigs.args : [];

  const perBrowserOptions = Array.from({ length: maxConcurrency }, (_, i) => {
    const puppeteerOptions = {
      headless: false,
      defaultViewport: false,
      executablePath: getChromeExecutablePath(), // Avoid Bot detection
      ...otherConfigs,
    };
    // Must use a different profile for each browser
    if (useProfile) puppeteerOptions.userDataDir = `./tmp/profile${i + 1}`;
    // Only add the flag when the endpoint actually parsed, and keep the caller's args.
    if (origin) puppeteerOptions.args = [...callerArgs, `--proxy-server=${origin}`];
    return puppeteerOptions;
  });
  console.log(`Configuration for ${maxConcurrency} browsers in Cluster:`, perBrowserOptions);

  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_BROWSER,
    maxConcurrency,
    perBrowserOptions,
    puppeteer,
    monitor,
    timeout: 1e7,
  });

  try {
    // Log failures of individual tasks together with the entry that caused them.
    cluster.on('taskerror', (err, data) => {
      console.log(err.message, data);
    });

    await cluster.task(async ({ page, data: queueData }) => {
      // The proxy credentials are handed to the helper together with the page;
      // what it does with them is defined in ./helpers.
      const notBlankPage = await loadNotBlankPage(page, 'https://ipinfo.io/json', username, password);
      const content = await notBlankPage.$eval('body', el => el.innerText);
      console.log(`IP Information for scraping ${queueData}: ${content}`);
      if (typeof func === 'function') await func(notBlankPage, queueData);
      else console.log('Function not found.');
    });

    for (const queueData of entries) await cluster.queue(queueData);
    await cluster.idle();
  } finally {
    // Always shut the browsers down, even when queueing or waiting fails.
    await cluster.close();
  }
}
// CommonJS export: clusterWrapper is the only name this module exposes.
module.exports = { clusterWrapper };