admin管理员组文章数量:1289732
I am trying to scrape a product detail page,
but I am not able to find the tag when the code runs. When I print the parent tag out, I can see the h2
tag I want, and when I enter the debugger I can also get what I want.
import time
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
def playwright_get_soup(url, selector_to_wait_for=None, wait_after_page_load=None):
    """Load ``url`` in Chromium via Playwright and return the page as a BeautifulSoup tree.

    Args:
        url: Address passed to ``page.goto``.
        selector_to_wait_for: Optional selector; when given, it is passed to
            ``page.wait_for_selector`` (``timeout=15000``) before the HTML is
            captured.
        wait_after_page_load: Optional delay passed to ``time.sleep`` once the
            "load" state has been awaited.

    Returns:
        ``page.content()`` parsed with ``html.parser``.
    """
    with sync_playwright() as this_playwright:
        browser = this_playwright.chromium.launch()
        page = browser.new_page()
        page.goto(url)
        try:
            page.wait_for_load_state("load")
            if wait_after_page_load:
                time.sleep(wait_after_page_load)
        except:
            # Best effort: any error raised while waiting is silently ignored.
            pass
        if selector_to_wait_for:
            page.wait_for_selector(selector_to_wait_for, timeout=15000)
        soup = BeautifulSoup(page.content(), "html.parser")
        browser.close()
        return soup
def parse_product_detail_page(soup):
    """Look up the name/ID ``<h2>`` inside the ``primary_block`` div and split it on "#".

    Args:
        soup: Parsed product detail page.

    As written, the function has no ``return`` statement.
    """
    parent_block = soup.find("div", class_="primary_block")
    name_and_id_box = parent_block.find("div", class_="item-box")
    print(name_and_id_box) # the h2 tag is visible here
    # NOTE(review): the class string ends with a space ("col-xs-6 "); this
    # looks like the reason the lookup comes back as None -- confirm.
    name_and_id_header = name_and_id_box.find("h2", class_="col-xs-6 ")
    # import ipdb; ipdb.set_trace() # the h2 tag is also visible here
    # NOTE(review): split() is called on the find() result itself; presumably
    # the tag's text (e.g. get_text()) is what should be split -- confirm.
    id_and_raw_name = name_and_id_header.split("#", maxsplit=1) # this is where the program errors out
def scrape_product_detail_page(product_detail_url):
    """Fetch a product detail page and pass the soup to ``parse_product_detail_page``.

    Returns ``None`` if fetching raises; otherwise whatever the parser returns.
    """
    try:
        # NOTE(review): playwright_url_to_soup is not defined in the code
        # shown (the helper above is playwright_get_soup); the bare except
        # below would hide the resulting error -- confirm.
        soup = playwright_url_to_soup(product_detail_url, selector_to_wait_for=".item-box")
    except:
        return None
    parsed_data = parse_product_detail_page(soup)
    return parsed_data


# Run the scraper once against the product page.
result = scrape_product_detail_page(".html")
I would appreciate some help determining why name_and_id_header keeps showing up as None. Thank you.
I am trying to scrape a product detail page,
but I am not able to find the tag when the code runs. When I print the parent tag out, I can see the h2
tag I want, and when I enter the debugger I can also get what I want.
import time
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
def playwright_get_soup(url, selector_to_wait_for=None, wait_after_page_load=None):
    """Load ``url`` in Chromium via Playwright and return the page as a BeautifulSoup tree.

    Args:
        url: Address passed to ``page.goto``.
        selector_to_wait_for: Optional selector; when given, it is passed to
            ``page.wait_for_selector`` (``timeout=15000``) before the HTML is
            captured.
        wait_after_page_load: Optional delay passed to ``time.sleep`` once the
            "load" state has been awaited.

    Returns:
        ``page.content()`` parsed with ``html.parser``.
    """
    with sync_playwright() as this_playwright:
        browser = this_playwright.chromium.launch()
        page = browser.new_page()
        page.goto(url)
        try:
            page.wait_for_load_state("load")
            if wait_after_page_load:
                time.sleep(wait_after_page_load)
        except:
            # Best effort: any error raised while waiting is silently ignored.
            pass
        if selector_to_wait_for:
            page.wait_for_selector(selector_to_wait_for, timeout=15000)
        soup = BeautifulSoup(page.content(), "html.parser")
        browser.close()
        return soup
def parse_product_detail_page(soup):
    """Look up the name/ID ``<h2>`` inside the ``primary_block`` div and split it on "#".

    Args:
        soup: Parsed product detail page.

    As written, the function has no ``return`` statement.
    """
    parent_block = soup.find("div", class_="primary_block")
    name_and_id_box = parent_block.find("div", class_="item-box")
    print(name_and_id_box) # the h2 tag is visible here
    # NOTE(review): the class string ends with a space ("col-xs-6 "); this
    # looks like the reason the lookup comes back as None -- confirm.
    name_and_id_header = name_and_id_box.find("h2", class_="col-xs-6 ")
    # import ipdb; ipdb.set_trace() # the h2 tag is also visible here
    # NOTE(review): split() is called on the find() result itself; presumably
    # the tag's text (e.g. get_text()) is what should be split -- confirm.
    id_and_raw_name = name_and_id_header.split("#", maxsplit=1) # this is where the program errors out
def scrape_product_detail_page(product_detail_url):
    """Fetch a product detail page and pass the soup to ``parse_product_detail_page``.

    Returns ``None`` if fetching raises; otherwise whatever the parser returns.
    """
    try:
        # NOTE(review): playwright_url_to_soup is not defined in the code
        # shown (the helper above is playwright_get_soup); the bare except
        # below would hide the resulting error -- confirm.
        soup = playwright_url_to_soup(product_detail_url, selector_to_wait_for=".item-box")
    except:
        return None
    parsed_data = parse_product_detail_page(soup)
    return parsed_data


# Run the scraper once against the product page.
result = scrape_product_detail_page("https://www.innovation-line/four-color-photoimage-products/ventoux-210d-polyester-drawstring-cinch-pack-backpack-907.html")
I would appreciate some help determining why name_and_id_header keeps showing up as None. Thank you.
2 Answers
There is a trailing whitespace character in your BeautifulSoup class selection:
name_and_id_box.find("h2", class_="col-xs-6 ")
should be "col-xs-6":
name_and_id_box.find("h2", class_="col-xs-6").get_text()
or simply, because it is the only <h2> there:
name_and_id_box.h2.get_text()
The code seems overengineered. I suggest not using BeautifulSoup with Playwright, because BeautifulSoup requires that you dump the entire page to string and re-parse the string before you can select elements, adding another layer of indirection and confusion between you and your goal, not to mention another dependency.
Simply use Playwright directly, with an auto-waiting locator:
# Read the heading with Playwright alone, without re-parsing the HTML in BeautifulSoup.
from playwright.sync_api import sync_playwright # 1.48.0
url = "<Your URL>"
with sync_playwright() as pw:
    browser = pw.chromium.launch()
    page = browser.new_page()
    page.goto(url, wait_until="domcontentloaded")
    # Print the text content of the element matched by the "h2" locator.
    print(page.locator("h2").text_content())
Output:
Item #907 - 14-1/2" W x 17-1/2" H - "VENTOUX" 210D Polyester Drawstring Cinch Pack Backpack
This locator is strict, so you'll get a nice error if the page changes and adds another <h2>, along with a prompt giving you the selectors you can use to fix the problem--very nice DX.
You can also use page.get_by_role("heading", level=2) if you want to avoid CSS selectors entirely.
Only once it's working should you worry about breaking out functions. First order of business is correctness.
Another approach is to recognize that the data you want is in the static HTML, so you can skip Playwright and simply use requests and BeautifulSoup:
# Fetch the HTML with requests and select the <h2> with BeautifulSoup.
import requests # 2.31.0
from bs4 import BeautifulSoup # 4.11.2
# NOTE(review): `url` is not defined in this snippet; presumably it is the
# product page URL from the previous example -- confirm.
soup = BeautifulSoup(requests.get(url).text, "lxml")
print(soup.select_one("h2").text)
本文标签:
版权声明:本文标题:web scraping - Playwright Python can't find HTML tag which shows up in debugger and in a print statement - Stack Overflo 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1741480893a2381164.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论