admin管理员组

文章数量:1289732

I am trying to scrape a product detail page

but I am not able to find the tag when the code runs. I print the parent tag out, and I see the h2 tag I want, and also when I enter the debugger I can get what I want.

import time

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright


def playwright_get_soup(url, selector_to_wait_for=None, wait_after_page_load=None):
    """Render ``url`` in Chromium via Playwright and return the page as soup.

    Args:
        url: Address of the page to open.
        selector_to_wait_for: Optional CSS selector. When given, the call
            blocks for up to 15 seconds (15000 ms) until a matching element
            is present.
        wait_after_page_load: Optional number of seconds to sleep after the
            page's "load" state has been reached.

    Returns:
        A ``BeautifulSoup`` tree built with ``html.parser`` from the rendered
        page content.

    Raises:
        playwright.sync_api.Error: If navigation fails or the requested
            selector does not appear in time (Playwright's ``TimeoutError``
            is a subclass of ``Error``).
    """
    # Local import keeps this block self-contained; the file already depends
    # on playwright.sync_api.
    from playwright.sync_api import Error as PlaywrightError

    with sync_playwright() as this_playwright:
        browser = this_playwright.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(url)
            try:
                page.wait_for_load_state("load")
            except PlaywrightError:
                # Best effort: a page that never reaches "load" may still
                # contain the markup we need, so carry on without the extra
                # sleep (same as before, but no longer hiding unrelated
                # errors such as KeyboardInterrupt).
                pass
            else:
                if wait_after_page_load:
                    time.sleep(wait_after_page_load)

            if selector_to_wait_for:
                page.wait_for_selector(selector_to_wait_for, timeout=15000)

            soup = BeautifulSoup(page.content(), "html.parser")
        finally:
            # Close the browser even when goto/wait_for_selector raises.
            browser.close()
    return soup


def parse_product_detail_page(soup):
    """Locate the product name/ID header in a parsed product detail page.

    Walks from the ``div.primary_block`` element to its ``div.item-box``
    child, prints that box for debugging, then looks up the ``h2`` inside it
    and tries to split it on ``"#"``.

    Args:
        soup: Parsed page on which ``find`` is called (the caller passes the
            BeautifulSoup object produced by the fetch helper).

    Returns:
        Nothing is returned explicitly, so callers receive ``None``.
    """
    parent_block = soup.find("div", class_="primary_block")
    name_and_id_box = parent_block.find("div", class_="item-box")

    print(name_and_id_box) # the h2 tag is visible here

    # NOTE(review): the class string below ends with a space ("col-xs-6 ").
    # find() returns None when nothing matches, so this trailing space is the
    # likely reason the header comes back as None -- verify against the page.
    name_and_id_header = name_and_id_box.find("h2", class_="col-xs-6 ")

    # import ipdb; ipdb.set_trace() # the h2 tag is also visible here

    # NOTE(review): split() is a str method, but the value here is whatever
    # find() returned (an element or None); presumably the element's text
    # (e.g. via get_text()) is what should be split -- confirm.
    id_and_raw_name = name_and_id_header.split("#", maxsplit=1) # this is where the program errors out


def scrape_product_detail_page(product_detail_url):
    """Fetch a product detail page and pass its parsed HTML to the parser.

    Args:
        product_detail_url: URL of the product detail page to scrape.

    Returns:
        Whatever ``parse_product_detail_page`` returns, or ``None`` when
        Playwright could not load the page or the ``.item-box`` element did
        not appear in time.
    """
    # Local import keeps this block self-contained; the file already depends
    # on playwright.sync_api.
    from playwright.sync_api import Error as PlaywrightError

    try:
        # The fetch helper defined in this file is playwright_get_soup. The
        # previously used name (playwright_url_to_soup) is not defined in the
        # visible file, so the resulting NameError was swallowed by a bare
        # except and None was returned without ever fetching the page.
        soup = playwright_get_soup(product_detail_url, selector_to_wait_for=".item-box")
    except PlaywrightError:
        # Only navigation/timeout failures are treated as "no data";
        # programming errors now surface instead of being hidden.
        return None
    return parse_product_detail_page(soup)


# Script entry: scrape one product detail page and keep the parsed result.
# NOTE(review): the URL literal is only ".html" -- presumably the rest of the
# address was lost; confirm the intended URL before running.
result = scrape_product_detail_page(".html")

I would appreciate some help determining why name_and_id_header keeps showing up as None. Thank you.

I am trying to scrape a product detail page

but I am not able to find the tag when the code runs. I print the parent tag out, and I see the h2 tag I want, and also when I enter the debugger I can get what I want.

import time

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright


def playwright_get_soup(url, selector_to_wait_for=None, wait_after_page_load=None):
    """Render ``url`` in Chromium via Playwright and return the page as soup.

    Args:
        url: Address of the page to open.
        selector_to_wait_for: Optional CSS selector. When given, the call
            blocks for up to 15 seconds (15000 ms) until a matching element
            is present.
        wait_after_page_load: Optional number of seconds to sleep after the
            page's "load" state has been reached.

    Returns:
        A ``BeautifulSoup`` tree built with ``html.parser`` from the rendered
        page content.

    Raises:
        playwright.sync_api.Error: If navigation fails or the requested
            selector does not appear in time (Playwright's ``TimeoutError``
            is a subclass of ``Error``).
    """
    # Local import keeps this block self-contained; the file already depends
    # on playwright.sync_api.
    from playwright.sync_api import Error as PlaywrightError

    with sync_playwright() as this_playwright:
        browser = this_playwright.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(url)
            try:
                page.wait_for_load_state("load")
            except PlaywrightError:
                # Best effort: a page that never reaches "load" may still
                # contain the markup we need, so carry on without the extra
                # sleep (same as before, but no longer hiding unrelated
                # errors such as KeyboardInterrupt).
                pass
            else:
                if wait_after_page_load:
                    time.sleep(wait_after_page_load)

            if selector_to_wait_for:
                page.wait_for_selector(selector_to_wait_for, timeout=15000)

            soup = BeautifulSoup(page.content(), "html.parser")
        finally:
            # Close the browser even when goto/wait_for_selector raises.
            browser.close()
    return soup


def parse_product_detail_page(soup):
    """Locate the product name/ID header in a parsed product detail page.

    Walks from the ``div.primary_block`` element to its ``div.item-box``
    child, prints that box for debugging, then looks up the ``h2`` inside it
    and tries to split it on ``"#"``.

    Args:
        soup: Parsed page on which ``find`` is called (the caller passes the
            BeautifulSoup object produced by the fetch helper).

    Returns:
        Nothing is returned explicitly, so callers receive ``None``.
    """
    parent_block = soup.find("div", class_="primary_block")
    name_and_id_box = parent_block.find("div", class_="item-box")

    print(name_and_id_box) # the h2 tag is visible here

    # NOTE(review): the class string below ends with a space ("col-xs-6 ").
    # find() returns None when nothing matches, so this trailing space is the
    # likely reason the header comes back as None -- verify against the page.
    name_and_id_header = name_and_id_box.find("h2", class_="col-xs-6 ")

    # import ipdb; ipdb.set_trace() # the h2 tag is also visible here

    # NOTE(review): split() is a str method, but the value here is whatever
    # find() returned (an element or None); presumably the element's text
    # (e.g. via get_text()) is what should be split -- confirm.
    id_and_raw_name = name_and_id_header.split("#", maxsplit=1) # this is where the program errors out


def scrape_product_detail_page(product_detail_url):
    """Fetch a product detail page and pass its parsed HTML to the parser.

    Args:
        product_detail_url: URL of the product detail page to scrape.

    Returns:
        Whatever ``parse_product_detail_page`` returns, or ``None`` when
        Playwright could not load the page or the ``.item-box`` element did
        not appear in time.
    """
    # Local import keeps this block self-contained; the file already depends
    # on playwright.sync_api.
    from playwright.sync_api import Error as PlaywrightError

    try:
        # The fetch helper defined in this file is playwright_get_soup. The
        # previously used name (playwright_url_to_soup) is not defined in the
        # visible file, so the resulting NameError was swallowed by a bare
        # except and None was returned without ever fetching the page.
        soup = playwright_get_soup(product_detail_url, selector_to_wait_for=".item-box")
    except PlaywrightError:
        # Only navigation/timeout failures are treated as "no data";
        # programming errors now surface instead of being hidden.
        return None
    return parse_product_detail_page(soup)


# Script entry: scrape one product detail page and keep the parsed result.
# NOTE(review): the host "www.innovation-line" has no top-level domain --
# presumably a suffix such as ".com" was dropped; verify the URL before running.
result = scrape_product_detail_page("https://www.innovation-line/four-color-photoimage-products/ventoux-210d-polyester-drawstring-cinch-pack-backpack-907.html")

I would appreciate some help determining why name_and_id_header keeps showing up as None. Thank you.

Share Improve this question edited Feb 21 at 22:43 Cody Childers asked Feb 19 at 17:27 Cody ChildersCody Childers 251 silver badge4 bronze badges 0
Add a comment  | 

2 Answers 2

Reset to default 3

There is a whitespace in your BeautifulSoup class selection:

name_and_id_box.find("h2", class_="col-xs-6 ")

should be "col-xs-6":

name_and_id_box.find("h2", class_="col-xs-6").get_text()

or simply, because it is the only <h2> there:

name_and_id_box.h2.get_text()

The code seems overengineered. I suggest not using BeautifulSoup with Playwright, because BeautifulSoup requires that you dump the entire page to string and re-parse the string before you can select elements, adding another layer of indirection and confusion between you and your goal, not to mention another dependency.

Simply use Playwright directly, with an auto-waiting locator:

from playwright.sync_api import sync_playwright  # 1.48.0

url = "<Your URL>"

# Launch Chromium, open `url`, and print the text of the page's <h2>.
with sync_playwright() as pw:
    browser = pw.chromium.launch()
    page = browser.new_page()
    # Return from goto once the "domcontentloaded" state is reached.
    page.goto(url, wait_until="domcontentloaded")
    # The locator auto-waits for the element and is strict: it raises if more
    # than one <h2> matches.
    print(page.locator("h2").text_content())

Output:

Item #907 - 14-1/2" W x 17-1/2" H - "VENTOUX" 210D Polyester Drawstring Cinch Pack Backpack

This locator is strict, so you'll get a nice error if the page changes and adds another <h2>, along with a prompt giving you the selectors you can use to fix the problem--very nice DX.

You can also use page.get_by_role("heading", level=2) if you want to avoid CSS selectors entirely.

Only once it's working should you worry about breaking out functions. First order of business is correctness.


Another approach is to recognize that the data you want is in the static HTML, so you can skip Playwright and simply use requests and BS:

import requests  # 2.31.0
from bs4 import BeautifulSoup  # 4.11.2

# Download the static HTML with requests and parse it with the "lxml" parser;
# `url` is the variable defined in the earlier snippet.
# NOTE(review): no timeout is passed to requests.get and the HTTP status is
# not checked -- consider adding both for real use.
soup = BeautifulSoup(requests.get(url).text, "lxml")
# select_one returns the first element matching the CSS selector.
print(soup.select_one("h2").text)

本文标签: