admin管理员组文章数量:1123706
I am trying to scrape reviews from Glassdoor. I can scrape the text reviews, but I am having trouble scraping the recommendation (Yes/No). An example URL is https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm. Here is a screenshot of what I am working on.
Here, I scraped pros and cons, and now want to get the recommendation. Inspecting the recommendation check marks, the "Yes" icon's d attribute contains 8.835 and the "No" icon's contains 18.299. I have no problem with the other parts, only with the "# 3. Scrape Recommendation" part. An example of the target XPath is
//*[@id="empReview_9142916"]/div[2]/div[2]/div[1]/svg/path
10 reviews are gathered from each page, but 0 for recommendations in
print(svg_elements)
It shows an empty list. Below is my current code, removing my ID and password. Thank you in advance for your help.
import csv
import time
from seleniumbase import SB
# NOTE(review): the scraped copy of this snippet read "selenium.webdrivermon.by";
# restored to the real module path (confirmed by the intact listing of the same
# code further down the page).
from selenium.webdriver.common.by import By
def scrape_stackoverflow_cloudflare_and_save_csv(csv_filename="cloudflare_questions.csv"):
    """Scrape pros, cons, and recommendations from Glassdoor review pages 1-5,
    then save the collected rows to a CSV file.

    csv_filename: path of the CSV file to write; it gets three columns:
    pros_text, cons_text, recommendation.
    """
    try:
        with SB(uc=True) as sb:
            # NOTE(review): the scraped copy of this snippet had these URL
            # literals stripped (leaving bare '";' and '".htm"', a syntax
            # error); restored from the intact listing of the same code below.
            base_url = "https://www.glassdoor.com"
            start_url = "https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm"
            all_pros = []
            all_cons = []
            all_recommendations = []
            # Loop through pages 1 to 5
            for page_num in range(1, 6):
                print(f"Scraping page {page_num}...")
                if page_num == 1:
                    # First page: open with UC-mode reconnect to get past bot detection.
                    sb.uc_open_with_reconnect(start_url, 6)
                else:
                    next_page_link = f"/Reviews/Amazon-Reviews-E6036_P{page_num}.htm"
                    sb.open(base_url + next_page_link)
                if page_num == 2:
                    # Glassdoor asks for a login after the first page; fill in the
                    # email/password form (credentials redacted by the author).
                    email_input = sb.find_element('input[data-test="emailInput-input"]')
                    email_input.send_keys("my id")
                    sb.sleep(2)  # Wait for the email to be entered
                    continue_button = sb.find_element('button[data-test="email-form-button"]')
                    continue_button.click()
                    sb.sleep(2)  # Wait for the next page to load
                    password_input = sb.find_element('input[data-test="passwordInput-input"]')
                    password_input.send_keys("my password")
                    sb.sleep(2)  # Wait for the password to be entered
                    sign_in_button = sb.find_element('button[data-role-variant="primary"][type="submit"]')
                    sign_in_button.click()
                    sb.sleep(2)  # Wait for the sign-in process to complete
                sb.uc_gui_click_captcha()
                sb.sleep(4)  # Wait for the page to load
                # 1. Scrape PROS
                pros_elements = sb.find_elements('span[data-test="review-text-PROS"]')
                pros_texts = [elem.text.strip() for elem in pros_elements if elem.text.strip()]
                # 2. Scrape CONS
                cons_elements = sb.find_elements('span[data-test="review-text-CONS"]')
                cons_texts = [elem.text.strip() for elem in cons_elements if elem.text.strip()]
                # 3. Scrape Recommendations (Yes/No)
                svg_elements = sb.find_elements(By.XPATH, '//div[contains(@id, "empReview")]/div[2]/div[2]/div[1]/svg/path')
                recommendations = []
                for svg in svg_elements:
                    d_attribute = svg.get_attribute('d')
                    if d_attribute:
                        if '8.835 17.64' in d_attribute:  # Unique part of the "Yes" SVG
                            recommendations.append('Yes')
                        elif '18.299 5.327' in d_attribute:  # Unique part of the "No" SVG
                            recommendations.append('No')
                # Collect data from this page
                all_pros.extend(pros_texts)
                all_cons.extend(cons_texts)
                all_recommendations.extend(recommendations)
                # Debug: Print collected data for this page
                print(f"Page {page_num} - Pros: {len(pros_texts)}, Cons: {len(cons_texts)}, Recommendations: {len(recommendations)}")
            # Save all collected data to CSV
            print("Saving data to CSV...")
            with open(csv_filename, mode="w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(["pros_text", "cons_text", "recommendation"])  # Add "recommendation" here
                for pros, cons, rec in zip(all_pros, all_cons, all_recommendations):
                    writer.writerow([pros, cons, rec])
            print("Scraping completed successfully!")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        print("Exiting function (finally block).")


# Example usage:
if __name__ == "__main__":
    scrape_stackoverflow_cloudflare_and_save_csv()
I am trying to scrape reviews from Glassdoor. I can scrape the text reviews, but I am having trouble scraping the recommendation (Yes/No). An example URL is https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm. Here is a screenshot of what I am working on.
Here, I scraped pros and cons, and want to get recommendation. Checked for recommendation, their d attribute has 8.835 and no has 18.299. I have no problem with the other part but only for # 3. Scrape Recommendation part. Target XPath example is
//*[@id="empReview_9142916"]/div[2]/div[2]/div[1]/svg/path
10 reviews are gathered from each page, but 0 for recommendations in
print(svg_elements)
It shows an empty list. Below is my current code, removing my ID and password. Thank you in advance for your help.
import csv
import time
from seleniumbase import SB
from selenium.webdriver.common.by import By
def scrape_stackoverflow_cloudflare_and_save_csv(csv_filename="cloudflare_questions.csv"):
    """Scrape pros, cons, and recommendations from Glassdoor review pages 1-5,
    then save the collected rows to a CSV file.

    csv_filename: path of the CSV file to write; it gets three columns:
    pros_text, cons_text, recommendation.
    """
    from itertools import zip_longest  # local import: stdlib, used only here

    try:
        with SB(uc=True) as sb:
            base_url = "https://www.glassdoor.com"
            start_url = "https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm"
            all_pros = []
            all_cons = []
            all_recommendations = []
            # Loop through pages 1 to 5
            for page_num in range(1, 6):
                print(f"Scraping page {page_num}...")
                if page_num == 1:
                    # First page: open with UC-mode reconnect to get past bot detection.
                    sb.uc_open_with_reconnect(start_url, 6)
                else:
                    next_page_link = f"/Reviews/Amazon-Reviews-E6036_P{page_num}.htm"
                    sb.open(base_url + next_page_link)
                if page_num == 2:
                    # Glassdoor asks for a login after the first page; fill in the
                    # email/password form (credentials redacted by the author).
                    email_input = sb.find_element('input[data-test="emailInput-input"]')
                    email_input.send_keys("my id")
                    sb.sleep(2)  # Wait for the email to be entered
                    continue_button = sb.find_element('button[data-test="email-form-button"]')
                    continue_button.click()
                    sb.sleep(2)  # Wait for the next page to load
                    password_input = sb.find_element('input[data-test="passwordInput-input"]')
                    password_input.send_keys("my password")
                    sb.sleep(2)  # Wait for the password to be entered
                    sign_in_button = sb.find_element('button[data-role-variant="primary"][type="submit"]')
                    sign_in_button.click()
                    sb.sleep(2)  # Wait for the sign-in process to complete
                sb.uc_gui_click_captcha()
                sb.sleep(4)  # Wait for the page to load
                # 1. Scrape PROS
                pros_elements = sb.find_elements('span[data-test="review-text-PROS"]')
                pros_texts = [elem.text.strip() for elem in pros_elements if elem.text.strip()]
                # 2. Scrape CONS
                cons_elements = sb.find_elements('span[data-test="review-text-CONS"]')
                cons_texts = [elem.text.strip() for elem in cons_elements if elem.text.strip()]
                # 3. Scrape Recommendations (Yes/No).
                # BUG FIX: the original XPath ended in '.../svg/path' and always
                # returned an empty list — SVG elements live in the SVG namespace,
                # and an un-prefixed XPath name test such as 'svg' does not match
                # namespaced elements in the HTML DOM.  Instead, locate the rating
                # row labelled "Recommend" and read its class attribute, which
                # encodes the positive/negative/neutral styling.
                rec_divs = sb.find_elements(By.XPATH, '//span[text()="Recommend"]/parent::div')
                recommendations = []
                for rec_div in rec_divs:
                    css_classes = rec_div.get_attribute('class') or ''
                    if 'positiveStyles' in css_classes:
                        recommendations.append('Yes')
                    elif 'negativeStyles' in css_classes:
                        recommendations.append('No')
                    else:
                        # Neutral / no-data ratings: record a placeholder so the
                        # per-review lists stay aligned row-for-row in the CSV.
                        recommendations.append('N/A')
                # Collect data from this page
                all_pros.extend(pros_texts)
                all_cons.extend(cons_texts)
                all_recommendations.extend(recommendations)
                # Debug: Print collected data for this page
                print(f"Page {page_num} - Pros: {len(pros_texts)}, Cons: {len(cons_texts)}, Recommendations: {len(recommendations)}")
            # Save all collected data to CSV.  zip_longest (not zip) so a
            # shorter list pads with "" instead of silently dropping rows.
            print("Saving data to CSV...")
            with open(csv_filename, mode="w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(["pros_text", "cons_text", "recommendation"])
                for pros, cons, rec in zip_longest(all_pros, all_cons, all_recommendations, fillvalue=""):
                    writer.writerow([pros, cons, rec])
            print("Scraping completed successfully!")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        print("Exiting function (finally block).")


# Example usage:
if __name__ == "__main__":
    scrape_stackoverflow_cloudflare_and_save_csv()
Share
Improve this question
edited yesterday
Shuo
2,4782 gold badges4 silver badges16 bronze badges
asked yesterday
Simon ShinSimon Shin
152 bronze badges
1 Answer
Reset to default 0
Instead of using the SVG path values to check whether the review is recommended, check the class of the div, which clearly states whether the rating is positive, negative, or neutral. There are neutral and no-data ratings as well.
I have modified the recommendation checking part only. You have to check for neutral and no data.
Try this:
# Locate the "Recommend" rating row of each review by its label text and
# classify it from the styling class on the row's container div (rather than
# from SVG path data, which an un-prefixed XPath name test cannot match).
elements = sb.find_elements(By.XPATH, '//span[text()="Recommend"]/parent::div')
recommendations = []
for elem in elements:
    # The class attribute encodes the rating; only positive/negative are
    # handled here — neutral and no-data styles exist too and fall through.
    attribute = elem.get_attribute('class')
    if 'positiveStyles' in attribute:
        recommendations.append('Yes')
    elif 'negativeStyles' in attribute:
        recommendations.append('No')
It should give you the recommended and not recommended reviews.
本文标签: pythonCannot scrap xpath using SeleniumStack Overflow
版权声明:本文标题:python - Cannot scrap xpath using Selenium - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1736590873a1945075.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论