admin管理员组文章数量:1391850
I'm coding a Puppeteer script, which starts up an instance of Google Chrome for Testing. Even if this browser is not headless, and even if I control it manually without any automation, whenever I try to do a Google search, I get blocked as a bot immediately. Going directly to website URLs works; it's Google that's blocking me from doing any searches. This instant block has persisted for days. However, I'm not using any proxies, and I'm still allowed to do searches (without CAPTCHAs) with my IP on my normal Google Chrome browser.
Puppeteer is using the Stealth plugin, params to avoid bot detection, a fake user agent, etc. Yet, Chrome instantly detects me as a bot. How can I avoid this, and why is this happening? How is Chrome tracking/detecting me?
Here is my code to define the stealth settings:
const os = require("os");
/**
 * Applies a set of fingerprint-related overrides to a Puppeteer page:
 * a Chrome user agent matching the host OS, extra HTTP headers consistent
 * with that user agent, a faked `window.history`, a 1920x1080 viewport and
 * JavaScript enabled.
 *
 * @param {object} page - Puppeteer Page (or compatible object) exposing
 *   `setUserAgent`, `setExtraHTTPHeaders`, `evaluate`, `setViewport` and
 *   `setJavaScriptEnabled`.
 * @returns {Promise<void>} Resolves once every setting has been applied.
 * @throws Re-throws (after logging) any error raised while applying the
 *   settings. Failures of the history injection are only logged.
 */
async function initializeStealthBrowser(page) {
  /**
   * Builds a desktop Chrome user-agent string for the host operating system,
   * picking one of the known Chrome builds for that platform at random.
   *
   * @returns {string} The user-agent string.
   */
  function generateChromeUserAgent() {
    // Chrome builds to choose from, per platform
    const CHROME_VERSIONS = {
      win: ["134.0.6998.35", "134.0.6998.36"],
      mac: ["134.0.6998.44", "134.0.6998.45"],
      linux: ["134.0.6998.35"],
    };
    // Platform token placed inside the parentheses of the user agent
    const PLATFORM_TOKENS = {
      win: "Windows NT 10.0; Win64; x64",
      mac: "Macintosh; Intel Mac OS X 10_15_7",
      linux: "X11; Linux x86_64",
    };

    // Anything that is neither Windows nor macOS is treated as Linux.
    const hostPlatform = os.platform();
    let platform = "linux";
    if (hostPlatform === "win32") {
      platform = "win";
    } else if (hostPlatform === "darwin") {
      platform = "mac";
    }

    const versions = CHROME_VERSIONS[platform];
    const version = versions[Math.floor(Math.random() * versions.length)];
    return `Mozilla/5.0 (${PLATFORM_TOKENS[platform]}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
  }

  /**
   * Builds the extra HTTP headers to send with the given user agent.
   * The client-hint headers (`sec-ch-ua`, `sec-ch-ua-platform`) are derived
   * from the user agent so that they never contradict it.
   *
   * NOTE(review): `page.setExtraHTTPHeaders` applies these headers to every
   * request the page initiates, so the navigation-specific values
   * (`sec-fetch-*`, `upgrade-insecure-requests`, `accept`) would also be
   * sent with sub-resource requests - confirm that this is intended.
   *
   * @param {string} userAgent - User-agent string the headers must match.
   * @returns {Object<string, string>} Plain header-name to value map.
   */
  function generateRealisticHTTPHeaders(userAgent) {
    // Major Chrome version from the UA; "119" is kept as the fallback value.
    const versionMatch = /Chrome\/(\d+)/.exec(userAgent);
    const majorVersion = versionMatch ? versionMatch[1] : "119";

    // Platform client hint from the UA; "macOS" is kept as the fallback value.
    let clientHintPlatform = '"macOS"';
    if (userAgent.includes("Windows")) {
      clientHintPlatform = '"Windows"';
    } else if (userAgent.includes("Linux")) {
      clientHintPlatform = '"Linux"';
    }

    // Return the object directly: assigning it to an undeclared `headers`
    // variable would create a global (or throw in strict mode).
    return {
      accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "accept-language": "en-US,en;q=0.9",
      "accept-encoding": "gzip, deflate, br",
      "cache-control": "max-age=0",
      "sec-ch-ua": `"Google Chrome";v="${majorVersion}", "Chromium";v="${majorVersion}", "Not?A_Brand";v="24"`,
      "sec-ch-ua-mobile": "?0",
      "sec-ch-ua-platform": clientHintPlatform,
      "sec-fetch-dest": "document",
      "sec-fetch-mode": "navigate",
      "sec-fetch-site": "none",
      "sec-fetch-user": "?1",
      "upgrade-insecure-requests": "1",
      "user-agent": userAgent,
    };
  }

  /**
   * Replaces `window.history` members in the page's current document so the
   * tab appears to have 3-8 same-origin entries. Errors are logged, never
   * thrown.
   *
   * NOTE(review): `page.evaluate` runs in the document that is loaded when
   * it is called; these overrides presumably do not survive a navigation.
   * If they must apply to every document, `page.evaluateOnNewDocument` is
   * the documented mechanism - verify before switching.
   *
   * @param {object} page - Puppeteer Page exposing `evaluate`.
   * @returns {Promise<void>}
   */
  async function injectFakeHistory(page) {
    try {
      await page.evaluate(() => {
        // Bookkeeping array that backs the faked `history.length`
        const fakeHistoryEntries = [];

        // All fake entries live on the same origin as the current page
        const currentOrigin = window.location.origin;
        const pathOptions = [
          "/search",
          "/news",
          "/maps",
          "/images",
          "/mail",
          "/drive",
          "/calendar",
          "/shopping",
        ];

        // Add 3-8 fake history entries on the same origin
        const entryCount = Math.floor(Math.random() * 6) + 3;
        for (let i = 0; i < entryCount; i++) {
          const randomPath =
            pathOptions[Math.floor(Math.random() * pathOptions.length)];
          fakeHistoryEntries.push({
            url: `${currentOrigin}${randomPath}`,
            title: `Page ${randomPath.substring(1)}`,
            state: {},
          });
        }

        // Keep the native methods so the wrappers can delegate to them
        const originalPushState = window.history.pushState;
        const originalReplaceState = window.history.replaceState;
        const originalBack = window.history.back;

        // Report the size of the fake entry list as the history length
        Object.defineProperty(window.history, "length", {
          get: function () {
            return fakeHistoryEntries.length;
          },
        });

        window.history.pushState = function (state, title, url) {
          // Delegate to the native method; any error it throws is ignored
          try {
            originalPushState.call(window.history, state, title, url);
          } catch (e) {
            // Deliberately best-effort
          }
          // Record the entry regardless of whether the native call worked
          fakeHistoryEntries.push({
            url: url || window.location.href,
            title: title,
            state: state,
          });
        };

        window.history.replaceState = function (state, title, url) {
          // Delegate to the native method; any error it throws is ignored
          try {
            originalReplaceState.call(window.history, state, title, url);
          } catch (e) {
            // Deliberately best-effort
          }
          // Mirror the replacement on the last fake entry
          if (fakeHistoryEntries.length > 0) {
            fakeHistoryEntries[fakeHistoryEntries.length - 1] = {
              url: url || window.location.href,
              title: title,
              state: state,
            };
          }
        };

        window.history.back = function () {
          originalBack.call(window.history);
          // Drop the newest fake entry, but always keep at least one
          if (fakeHistoryEntries.length > 1) {
            fakeHistoryEntries.pop();
          }
        };

        // Seed the real session history with the fake entries
        for (const entry of fakeHistoryEntries) {
          try {
            originalPushState.call(
              window.history,
              entry.state,
              entry.title,
              entry.url
            );
          } catch (e) {
            // Deliberately best-effort
          }
        }

        console.log(
          `Successfully set up fake browser history with ${fakeHistoryEntries.length} entries`
        );
      });
    } catch (error) {
      console.warn("Error setting up fake history:", error);
      // Continue execution even if history injection fails
    }
  }

  try {
    let userAgent = generateChromeUserAgent();
    // Guard against a non-string or NUL-containing user agent
    if (typeof userAgent !== "string" || userAgent.includes("\u0000")) {
      console.warn("Generated an invalid user agent, falling back to default");
      userAgent =
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36";
    }
    await page.setUserAgent(userAgent);
    // The header builder is synchronous, so a plain object (not a Promise)
    // is handed to Puppeteer here.
    await page.setExtraHTTPHeaders(generateRealisticHTTPHeaders(userAgent));
    await injectFakeHistory(page);
    await page.setViewport({
      width: 1920,
      height: 1080,
      deviceScaleFactor: 1,
    });
    await page.setJavaScriptEnabled(true);
  } catch (error) {
    console.error("Error initializing stealth browser:", error);
    throw error;
  }
}
// Public API of this module: only initializeStealthBrowser is exported.
module.exports = { initializeStealthBrowser };
And here's the code for the first initialization:
// Launch a headed browser with a persistent profile directory and open a tab.
// `puppeteerExtra` and `userDataDir` are defined outside this snippet.
const browser = await puppeteerExtra.launch({
  headless: false,
  userDataDir: userDataDir,
  // The string literal must be terminated, otherwise the script does not parse.
  // NOTE(review): `referer` does not appear among Puppeteer's documented
  // launch options (it is a `page.goto` option) - verify whether it has any
  // effect here.
  referer: "https://www.google",
  args: [
    "--disable-features=site-per-process",
    // NOTE(review): confirm that "--disable-advertisements" and
    // "--enable-javascript" are switches Chrome actually recognises.
    "--disable-advertisements",
    "--enable-javascript",
    "--disable-blink-features=AutomationControlled",
    "--no-sandbox",
    "--disable-gpu",
    "--enable-webgl",
  ],
});
let page = await browser.newPage();
I'm coding a Puppeteer script, which starts up an instance of Google Chrome for Testing. Even if this browser is not headless, and even if I control it manually without any automation, whenever I try to do a Google search, I get blocked as a bot immediately. Going directly to website URLs works; it's Google that's blocking me from doing any searches. This instant block has persisted for days. However, I'm not using any proxies, and I'm still allowed to do searches (without CAPTCHAs) with my IP on my normal Google Chrome browser.
Puppeteer is using the Stealth plugin, params to avoid bot detection, a fake user agent, etc. Yet, Chrome instantly detects me as a bot. How can I avoid this, and why is this happening? How is Chrome tracking/detecting me?
Here is my code to define the stealth settings:
const os = require("os");
/**
 * Applies a set of fingerprint-related overrides to a Puppeteer page:
 * a Chrome user agent matching the host OS, extra HTTP headers consistent
 * with that user agent, a faked `window.history`, a 1920x1080 viewport and
 * JavaScript enabled.
 *
 * @param {object} page - Puppeteer Page (or compatible object) exposing
 *   `setUserAgent`, `setExtraHTTPHeaders`, `evaluate`, `setViewport` and
 *   `setJavaScriptEnabled`.
 * @returns {Promise<void>} Resolves once every setting has been applied.
 * @throws Re-throws (after logging) any error raised while applying the
 *   settings. Failures of the history injection are only logged.
 */
async function initializeStealthBrowser(page) {
  /**
   * Builds a desktop Chrome user-agent string for the host operating system,
   * picking one of the known Chrome builds for that platform at random.
   *
   * @returns {string} The user-agent string.
   */
  function generateChromeUserAgent() {
    // Chrome builds to choose from, per platform
    const CHROME_VERSIONS = {
      win: ["134.0.6998.35", "134.0.6998.36"],
      mac: ["134.0.6998.44", "134.0.6998.45"],
      linux: ["134.0.6998.35"],
    };
    // Platform token placed inside the parentheses of the user agent
    const PLATFORM_TOKENS = {
      win: "Windows NT 10.0; Win64; x64",
      mac: "Macintosh; Intel Mac OS X 10_15_7",
      linux: "X11; Linux x86_64",
    };

    // Anything that is neither Windows nor macOS is treated as Linux.
    const hostPlatform = os.platform();
    let platform = "linux";
    if (hostPlatform === "win32") {
      platform = "win";
    } else if (hostPlatform === "darwin") {
      platform = "mac";
    }

    const versions = CHROME_VERSIONS[platform];
    const version = versions[Math.floor(Math.random() * versions.length)];
    return `Mozilla/5.0 (${PLATFORM_TOKENS[platform]}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
  }

  /**
   * Builds the extra HTTP headers to send with the given user agent.
   * The client-hint headers (`sec-ch-ua`, `sec-ch-ua-platform`) are derived
   * from the user agent so that they never contradict it.
   *
   * NOTE(review): `page.setExtraHTTPHeaders` applies these headers to every
   * request the page initiates, so the navigation-specific values
   * (`sec-fetch-*`, `upgrade-insecure-requests`, `accept`) would also be
   * sent with sub-resource requests - confirm that this is intended.
   *
   * @param {string} userAgent - User-agent string the headers must match.
   * @returns {Object<string, string>} Plain header-name to value map.
   */
  function generateRealisticHTTPHeaders(userAgent) {
    // Major Chrome version from the UA; "119" is kept as the fallback value.
    const versionMatch = /Chrome\/(\d+)/.exec(userAgent);
    const majorVersion = versionMatch ? versionMatch[1] : "119";

    // Platform client hint from the UA; "macOS" is kept as the fallback value.
    let clientHintPlatform = '"macOS"';
    if (userAgent.includes("Windows")) {
      clientHintPlatform = '"Windows"';
    } else if (userAgent.includes("Linux")) {
      clientHintPlatform = '"Linux"';
    }

    // Return the object directly: assigning it to an undeclared `headers`
    // variable would create a global (or throw in strict mode).
    return {
      accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "accept-language": "en-US,en;q=0.9",
      "accept-encoding": "gzip, deflate, br",
      "cache-control": "max-age=0",
      "sec-ch-ua": `"Google Chrome";v="${majorVersion}", "Chromium";v="${majorVersion}", "Not?A_Brand";v="24"`,
      "sec-ch-ua-mobile": "?0",
      "sec-ch-ua-platform": clientHintPlatform,
      "sec-fetch-dest": "document",
      "sec-fetch-mode": "navigate",
      "sec-fetch-site": "none",
      "sec-fetch-user": "?1",
      "upgrade-insecure-requests": "1",
      "user-agent": userAgent,
    };
  }

  /**
   * Replaces `window.history` members in the page's current document so the
   * tab appears to have 3-8 same-origin entries. Errors are logged, never
   * thrown.
   *
   * NOTE(review): `page.evaluate` runs in the document that is loaded when
   * it is called; these overrides presumably do not survive a navigation.
   * If they must apply to every document, `page.evaluateOnNewDocument` is
   * the documented mechanism - verify before switching.
   *
   * @param {object} page - Puppeteer Page exposing `evaluate`.
   * @returns {Promise<void>}
   */
  async function injectFakeHistory(page) {
    try {
      await page.evaluate(() => {
        // Bookkeeping array that backs the faked `history.length`
        const fakeHistoryEntries = [];

        // All fake entries live on the same origin as the current page
        const currentOrigin = window.location.origin;
        const pathOptions = [
          "/search",
          "/news",
          "/maps",
          "/images",
          "/mail",
          "/drive",
          "/calendar",
          "/shopping",
        ];

        // Add 3-8 fake history entries on the same origin
        const entryCount = Math.floor(Math.random() * 6) + 3;
        for (let i = 0; i < entryCount; i++) {
          const randomPath =
            pathOptions[Math.floor(Math.random() * pathOptions.length)];
          fakeHistoryEntries.push({
            url: `${currentOrigin}${randomPath}`,
            title: `Page ${randomPath.substring(1)}`,
            state: {},
          });
        }

        // Keep the native methods so the wrappers can delegate to them
        const originalPushState = window.history.pushState;
        const originalReplaceState = window.history.replaceState;
        const originalBack = window.history.back;

        // Report the size of the fake entry list as the history length
        Object.defineProperty(window.history, "length", {
          get: function () {
            return fakeHistoryEntries.length;
          },
        });

        window.history.pushState = function (state, title, url) {
          // Delegate to the native method; any error it throws is ignored
          try {
            originalPushState.call(window.history, state, title, url);
          } catch (e) {
            // Deliberately best-effort
          }
          // Record the entry regardless of whether the native call worked
          fakeHistoryEntries.push({
            url: url || window.location.href,
            title: title,
            state: state,
          });
        };

        window.history.replaceState = function (state, title, url) {
          // Delegate to the native method; any error it throws is ignored
          try {
            originalReplaceState.call(window.history, state, title, url);
          } catch (e) {
            // Deliberately best-effort
          }
          // Mirror the replacement on the last fake entry
          if (fakeHistoryEntries.length > 0) {
            fakeHistoryEntries[fakeHistoryEntries.length - 1] = {
              url: url || window.location.href,
              title: title,
              state: state,
            };
          }
        };

        window.history.back = function () {
          originalBack.call(window.history);
          // Drop the newest fake entry, but always keep at least one
          if (fakeHistoryEntries.length > 1) {
            fakeHistoryEntries.pop();
          }
        };

        // Seed the real session history with the fake entries
        for (const entry of fakeHistoryEntries) {
          try {
            originalPushState.call(
              window.history,
              entry.state,
              entry.title,
              entry.url
            );
          } catch (e) {
            // Deliberately best-effort
          }
        }

        console.log(
          `Successfully set up fake browser history with ${fakeHistoryEntries.length} entries`
        );
      });
    } catch (error) {
      console.warn("Error setting up fake history:", error);
      // Continue execution even if history injection fails
    }
  }

  try {
    let userAgent = generateChromeUserAgent();
    // Guard against a non-string or NUL-containing user agent
    if (typeof userAgent !== "string" || userAgent.includes("\u0000")) {
      console.warn("Generated an invalid user agent, falling back to default");
      userAgent =
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36";
    }
    await page.setUserAgent(userAgent);
    // The header builder is synchronous, so a plain object (not a Promise)
    // is handed to Puppeteer here.
    await page.setExtraHTTPHeaders(generateRealisticHTTPHeaders(userAgent));
    await injectFakeHistory(page);
    await page.setViewport({
      width: 1920,
      height: 1080,
      deviceScaleFactor: 1,
    });
    await page.setJavaScriptEnabled(true);
  } catch (error) {
    console.error("Error initializing stealth browser:", error);
    throw error;
  }
}
// Public API of this module: only initializeStealthBrowser is exported.
module.exports = { initializeStealthBrowser };
And here's the code for the first initialization:
// Launch a headed browser with a persistent profile directory and open a tab.
// `puppeteerExtra` and `userDataDir` are defined outside this snippet.
const browser = await puppeteerExtra.launch({
headless: false,
userDataDir: userDataDir,
// NOTE(review): `referer` does not appear among Puppeteer's documented
// launch options (it is a `page.goto` option) - verify whether it has any
// effect here.
referer: "https://www.google",
args: [
"--disable-features=site-per-process",
// NOTE(review): confirm that "--disable-advertisements" and
// "--enable-javascript" are switches Chrome actually recognises.
"--disable-advertisements",
"--enable-javascript",
"--disable-blink-features=AutomationControlled",
"--no-sandbox",
"--disable-gpu",
"--enable-webgl",
],
});
let page = await browser.newPage();
Share
Improve this question
edited Mar 13 at 3:54
ggorlen
58k8 gold badges114 silver badges157 bronze badges
asked Mar 12 at 2:56
NateNate
135 bronze badges
3
- This is rather broad. Different sites use different techniques for blocking bots, so I suggest sharing the exact site, as well as your minimal code and your high-level goal you want to achieve on the site. I've had no problem automating google searches--the bot blocking is quite non-aggressive as far as I've seen. There are also search APIs you can use. Thanks. – ggorlen Commented Mar 12 at 13:20
- Hi, I edited the question to attach the code. The site I'm trying to bypass right now is google for searches. My goal is to not get captchas in the first place, but for google and other websites (including tripadvisor), I get a captcha immediately, even if I don't automate anything -- just by starting the browser. There are other sites I can use, but I want to fix this block so it doesn't happen anywhere. – Nate Commented Mar 12 at 21:58
- Your IP is blacklisted at this point, so you would need to log in or solve a CAPTCHA – pguardiario Commented Mar 16 at 23:27
1 Answer
Reset to default 0
If it's for personal use, you could export the cookies from your everyday browser and load them into Puppeteer.
If it's for commercial use, then look into antidetect browsers; they offer pre-made cookies.
本文标签: web scrapingAvoid instant block by Chrome when using PuppeteerStack Overflow
版权声明:本文标题:web scraping - Avoid instant block by Chrome when using Puppeteer - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1744771384a2624359.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论