I've made a Node.js web scraper that runs fine on my computer; however, when I deploy it to my Google Cloud VM instance running Debian, it returns a timeout error for a specific website. I've tried many different setups for Puppeteer, but none seems to work. I believe the website I'm trying to scrape is blocking my code when I run it from the Google Cloud server, but not when I run it from my computer. The scraping part works fine on my computer: Puppeteer finds the HTML tags and retrieves the info.
const puppeteer = require('puppeteer');
const GoogleSpreadsheet = require('google-spreadsheet');
const { promisify } = require('util');
const credentials = require('./credentials.json');

async function main() {
    const scrapCopasa = await scrapCopasaFunction();
    console.log('Done!');
}

async function scrapCopasaFunction() {
    const browser = await puppeteer.launch({
        args: ['--no-sandbox'],
    });
    const page = await browser.newPage();
    //await page.setDefaultNavigationTimeout(0);
    //await page.setViewport({width: 1366, height: 768});
    await page.setUserAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36');
    await page.goto('http://www.copasa.com.br/wps/portal/internet/abastecimento-de-agua/nivel-dos-reservatorios');
    //await new Promise(resolve => setTimeout(resolve, 5000));
    let isUsernameNotFound = await page.evaluate(() => {
        if (document.getElementsByTagName('h2')[0]) {
            if (document.getElementsByTagName('h2')[0].textContent == "Sorry, this page isn't available.") {
                return true;
            }
        }
    });
    if (isUsernameNotFound) {
        console.log('Account does not exist!');
        await browser.close();
        return;
    }
    let reservoirLevelsCopasa = await page.evaluate(() => {
        const tds = Array.from(document.querySelectorAll('table tr td'));
        return tds.map(td => td.innerText);
    });
    const riomanso = reservoirLevelsCopasa[13].replace(",", ".").substring(0, 5);
    const serraazul = reservoirLevelsCopasa[17].replace(",", ".").substring(0, 5);
    const vargemdasflores = reservoirLevelsCopasa[21].replace(",", ".").substring(0, 5);
    await browser.close();
    return [riomanso, serraazul, vargemdasflores];
}

main();
And the error I'm getting is the following:
(node:6425) UnhandledPromiseRejectionWarning: TimeoutError: Navigation Timeout Exceeded: 30000ms exceeded
at /home/xxx/reservoirs/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
at async FrameManager.navigateFrame (/home/xxx/reservoirs/node_modules/puppeteer/lib/FrameManager.js:94:17)
at async Frame.goto (/home/xxx/reservoirs/node_modules/puppeteer/lib/FrameManager.js:406:12)
at async Page.goto (/home/xxx/reservoirs/node_modules/puppeteer/lib/Page.js:674:12)
at async scrapCopasaFunction (/home/xxx/reservoirs/reservatorios.js:129:5)
at async main (/home/xxx/reservoirs/reservatorios.js:9:25)
-- ASYNC --
at Frame.<anonymous> (/home/xxx/reservoirs/node_modules/puppeteer/lib/helper.js:111:15)
at Page.goto (/home/xxx/reservoirs/node_modules/puppeteer/lib/Page.js:674:49)
at Page.<anonymous> (/home/xxx/reservoirs/node_modules/puppeteer/lib/helper.js:112:23)
at scrapCopasaFunction (/home/xxx/reservoirs/reservatorios.js:129:16)
at processTicksAndRejections (internal/process/task_queues.js:93:5)
at async main (/home/xxx/reservoirs/reservatorios.js:9:25)
(Use `node --trace-warnings ...` to show where the warning was created)
(node:6425) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). To terminate the node process on unhandled promise rejection, use the CLI flag `--unhandled-rejections=strict` (see https://nodejs.org/api/cli.html#cli_unhandled_rejections_mode). (rejection id: 1)
(node:6425) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
Answer
The cloud functions are a bit slow for Puppeteer; there was a GitHub issue (#3120) regarding this. You can assign more CPU/RAM to the function, if that's a possibility: the more CPU and RAM you give Chrome, the faster it will be.
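As an aside (not from the original answer), a common launch configuration for memory-constrained cloud machines looks like the sketch below. These are standard Chromium switches rather than anything specific to this site, so treat them as a starting point, not a fix:
const puppeteer = require('puppeteer');

// Sketch: launch flags commonly used on small headless cloud machines.
async function launchBrowser() {
    return puppeteer.launch({
        args: [
            '--no-sandbox',            // often required when Chrome runs as root on a VM
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage', // /dev/shm is small on some cloud images
            '--disable-gpu',           // headless VMs have no GPU
        ],
    });
}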
You can add a timeout option to goto, which is the maximum navigation time in milliseconds; it defaults to 30 seconds, and passing 0 disables the timeout.
await page.goto('http://www.copasa.com.br', { timeout: 60000 });
You can also set up the timeouts globally with setDefaultTimeout and setDefaultNavigationTimeout; setDefaultNavigationTimeout takes priority over setDefaultTimeout for navigation.
page.setDefaultNavigationTimeout(60000)
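The stack trace above also shows the TimeoutError escaping as an unhandled rejection. A minimal sketch of handling it, assuming the page and browser variables from the question's scrapCopasaFunction:
try {
    await page.goto('http://www.copasa.com.br', { timeout: 60000 });
} catch (err) {
    // Report the navigation failure instead of letting it surface as an
    // UnhandledPromiseRejectionWarning, then clean up the browser.
    console.error('Navigation failed:', err.message);
    await browser.close();
    return;
}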
The data you're extracting is already in the HTML, so you can fetch the HTML with a plain HTTP request and extract the data in your Node.js script instead of in a browser. This is faster and requires fewer resources. If you need to authenticate, you can send a POST request and reuse the cookie in the following GET request, as sketched below. Example in this answer.
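A minimal sketch of that POST-then-GET cookie reuse with got and tough-cookie; the login URL and form fields here are hypothetical placeholders, not taken from the original answer:
const got = require('got');
const { CookieJar } = require('tough-cookie');

async function fetchAuthenticated() {
    // got stores cookies from responses and resends them when given a jar.
    const cookieJar = new CookieJar();

    // Hypothetical login endpoint and credentials; substitute the real ones.
    await got.post('https://example.com/login', {
        cookieJar,
        form: { username: 'user', password: 'pass' },
    });

    // The session cookie set by the POST is reused on this GET.
    const response = await got('https://example.com/protected-page', { cookieJar });
    return response.body;
}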
Full example
const cheerio = require('cheerio');
const got = require('got');

const URL = 'http://www.copasa.com.br/wps/portal/internet/abastecimento-de-agua/nivel-dos-reservatorios';

function reportAndExit(error) {
    console.error(error);
    process.exit(1);
}

async function main() {
    const headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    };
    // Headers must go inside got's options object.
    const response = await got(URL, { headers });
    const $ = cheerio.load(response.body);
    // Take the 4th column of each data row in the first table and parse the
    // Brazilian decimal format (comma as decimal separator).
    const reservoirLevelsCopasa = $('#conteudo-principal table:first-of-type tr:nth-of-type(n+3) td:nth-child(4)')
        .map((i, el) => parseFloat($(el).text().replace(',', '.')))
        .get();
    console.log(reservoirLevelsCopasa);
    return reservoirLevelsCopasa;
}

main().catch(reportAndExit)
Output
[ 83.4, 88.8, 85.9 ]