admin管理员组文章数量:1123706
I am trying to scrape reviews from Glassdoor. I can scrape the text reviews, but I am having trouble scraping the recommendation (Yes/No). An example URL is https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm. Here is a screenshot of what I am working on.
Here, I scraped pros and cons, and now want to get the recommendation. Inspecting the recommendation check marks, the "Yes" icon's d attribute contains 8.835 and the "No" icon's contains 18.299. I have no problem with the other parts, only with the "# 3. Scrape Recommendation" part. An example of the target XPath is
//*[@id="empReview_9142916"]/div[2]/div[2]/div[1]/svg/path
10 reviews are gathered from each page, but 0 for recommendations in
print(svg_elements)
It shows an empty list. Below is my current code, removing my ID and password. Thank you in advance for your help.
import csv
import time
from seleniumbase import SB
# NOTE(review): the scraped copy of this snippet read "selenium.webdrivermon.by";
# restored to the real module path (confirmed by the intact listing of the same
# code further down the page).
from selenium.webdriver.common.by import By
def scrape_stackoverflow_cloudflare_and_save_csv(csv_filename="cloudflare_questions.csv"):
    """Scrape pros, cons, and recommendations from Glassdoor review pages 1-5,
    then save the collected rows to a CSV file.

    csv_filename: path of the CSV file to write; it gets three columns:
    pros_text, cons_text, recommendation.
    """
    try:
        with SB(uc=True) as sb:
            # NOTE(review): the scraped copy of this snippet had these URL
            # literals stripped (leaving bare '";' and '".htm"', a syntax
            # error); restored from the intact listing of the same code below.
            base_url = "https://www.glassdoor.com"
            start_url = "https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm"
            all_pros = []
            all_cons = []
            all_recommendations = []
            # Loop through pages 1 to 5
            for page_num in range(1, 6):
                print(f"Scraping page {page_num}...")
                if page_num == 1:
                    # First page: open with UC-mode reconnect to get past bot detection.
                    sb.uc_open_with_reconnect(start_url, 6)
                else:
                    next_page_link = f"/Reviews/Amazon-Reviews-E6036_P{page_num}.htm"
                    sb.open(base_url + next_page_link)
                if page_num == 2:
                    # Glassdoor asks for a login after the first page; fill in the
                    # email/password form (credentials redacted by the author).
                    email_input = sb.find_element('input[data-test="emailInput-input"]')
                    email_input.send_keys("my id")
                    sb.sleep(2)  # Wait for the email to be entered
                    continue_button = sb.find_element('button[data-test="email-form-button"]')
                    continue_button.click()
                    sb.sleep(2)  # Wait for the next page to load
                    password_input = sb.find_element('input[data-test="passwordInput-input"]')
                    password_input.send_keys("my password")
                    sb.sleep(2)  # Wait for the password to be entered
                    sign_in_button = sb.find_element('button[data-role-variant="primary"][type="submit"]')
                    sign_in_button.click()
                    sb.sleep(2)  # Wait for the sign-in process to complete
                sb.uc_gui_click_captcha()
                sb.sleep(4)  # Wait for the page to load
                # 1. Scrape PROS
                pros_elements = sb.find_elements('span[data-test="review-text-PROS"]')
                pros_texts = [elem.text.strip() for elem in pros_elements if elem.text.strip()]
                # 2. Scrape CONS
                cons_elements = sb.find_elements('span[data-test="review-text-CONS"]')
                cons_texts = [elem.text.strip() for elem in cons_elements if elem.text.strip()]
                # 3. Scrape Recommendations (Yes/No)
                svg_elements = sb.find_elements(By.XPATH, '//div[contains(@id, "empReview")]/div[2]/div[2]/div[1]/svg/path')
                recommendations = []
                for svg in svg_elements:
                    d_attribute = svg.get_attribute('d')
                    if d_attribute:
                        if '8.835 17.64' in d_attribute:  # Unique part of the "Yes" SVG
                            recommendations.append('Yes')
                        elif '18.299 5.327' in d_attribute:  # Unique part of the "No" SVG
                            recommendations.append('No')
                # Collect data from this page
                all_pros.extend(pros_texts)
                all_cons.extend(cons_texts)
                all_recommendations.extend(recommendations)
                # Debug: Print collected data for this page
                print(f"Page {page_num} - Pros: {len(pros_texts)}, Cons: {len(cons_texts)}, Recommendations: {len(recommendations)}")
            # Save all collected data to CSV
            print("Saving data to CSV...")
            with open(csv_filename, mode="w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(["pros_text", "cons_text", "recommendation"])  # Add "recommendation" here
                for pros, cons, rec in zip(all_pros, all_cons, all_recommendations):
                    writer.writerow([pros, cons, rec])
            print("Scraping completed successfully!")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        print("Exiting function (finally block).")


# Example usage:
if __name__ == "__main__":
    scrape_stackoverflow_cloudflare_and_save_csv()
I am trying to scrape reviews from Glassdoor. I can scrape the text reviews, but I am having trouble scraping the recommendation (Yes/No). An example URL is https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm. Here is a screenshot of what I am working on.
Here, I scraped pros and cons, and want to get recommendation. Checked for recommendation, their d attribute has 8.835 and no has 18.299. I have no problem with the other part but only for # 3. Scrape Recommendation part. Target XPath example is
//*[@id="empReview_9142916"]/div[2]/div[2]/div[1]/svg/path
10 reviews are gathered from each page, but 0 for recommendations in
print(svg_elements)
It shows an empty list. Below is my current code, removing my ID and password. Thank you in advance for your help.
import csv
import time
from seleniumbase import SB
from selenium.webdriver.common.by import By
def scrape_stackoverflow_cloudflare_and_save_csv(csv_filename="cloudflare_questions.csv"):
    """Scrape pros, cons, and recommendations from Glassdoor review pages 1-5,
    then save the collected rows to a CSV file.

    csv_filename: path of the CSV file to write; it gets three columns:
    pros_text, cons_text, recommendation.
    """
    from itertools import zip_longest  # local import: stdlib, used only here

    try:
        with SB(uc=True) as sb:
            base_url = "https://www.glassdoor.com"
            start_url = "https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm"
            all_pros = []
            all_cons = []
            all_recommendations = []
            # Loop through pages 1 to 5
            for page_num in range(1, 6):
                print(f"Scraping page {page_num}...")
                if page_num == 1:
                    # First page: open with UC-mode reconnect to get past bot detection.
                    sb.uc_open_with_reconnect(start_url, 6)
                else:
                    next_page_link = f"/Reviews/Amazon-Reviews-E6036_P{page_num}.htm"
                    sb.open(base_url + next_page_link)
                if page_num == 2:
                    # Glassdoor asks for a login after the first page; fill in the
                    # email/password form (credentials redacted by the author).
                    email_input = sb.find_element('input[data-test="emailInput-input"]')
                    email_input.send_keys("my id")
                    sb.sleep(2)  # Wait for the email to be entered
                    continue_button = sb.find_element('button[data-test="email-form-button"]')
                    continue_button.click()
                    sb.sleep(2)  # Wait for the next page to load
                    password_input = sb.find_element('input[data-test="passwordInput-input"]')
                    password_input.send_keys("my password")
                    sb.sleep(2)  # Wait for the password to be entered
                    sign_in_button = sb.find_element('button[data-role-variant="primary"][type="submit"]')
                    sign_in_button.click()
                    sb.sleep(2)  # Wait for the sign-in process to complete
                sb.uc_gui_click_captcha()
                sb.sleep(4)  # Wait for the page to load
                # 1. Scrape PROS
                pros_elements = sb.find_elements('span[data-test="review-text-PROS"]')
                pros_texts = [elem.text.strip() for elem in pros_elements if elem.text.strip()]
                # 2. Scrape CONS
                cons_elements = sb.find_elements('span[data-test="review-text-CONS"]')
                cons_texts = [elem.text.strip() for elem in cons_elements if elem.text.strip()]
                # 3. Scrape Recommendations (Yes/No).
                # BUG FIX: the original XPath ended in '.../svg/path' and always
                # returned an empty list — SVG elements live in the SVG namespace,
                # and an un-prefixed XPath name test such as 'svg' does not match
                # namespaced elements in the HTML DOM.  Instead, locate the rating
                # row labelled "Recommend" and read its class attribute, which
                # encodes the positive/negative/neutral styling.
                rec_divs = sb.find_elements(By.XPATH, '//span[text()="Recommend"]/parent::div')
                recommendations = []
                for rec_div in rec_divs:
                    css_classes = rec_div.get_attribute('class') or ''
                    if 'positiveStyles' in css_classes:
                        recommendations.append('Yes')
                    elif 'negativeStyles' in css_classes:
                        recommendations.append('No')
                    else:
                        # Neutral / no-data ratings: record a placeholder so the
                        # per-review lists stay aligned row-for-row in the CSV.
                        recommendations.append('N/A')
                # Collect data from this page
                all_pros.extend(pros_texts)
                all_cons.extend(cons_texts)
                all_recommendations.extend(recommendations)
                # Debug: Print collected data for this page
                print(f"Page {page_num} - Pros: {len(pros_texts)}, Cons: {len(cons_texts)}, Recommendations: {len(recommendations)}")
            # Save all collected data to CSV.  zip_longest (not zip) so a
            # shorter list pads with "" instead of silently dropping rows.
            print("Saving data to CSV...")
            with open(csv_filename, mode="w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(["pros_text", "cons_text", "recommendation"])
                for pros, cons, rec in zip_longest(all_pros, all_cons, all_recommendations, fillvalue=""):
                    writer.writerow([pros, cons, rec])
            print("Scraping completed successfully!")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        print("Exiting function (finally block).")


# Example usage:
if __name__ == "__main__":
    scrape_stackoverflow_cloudflare_and_save_csv()
Share
Improve this question
edited yesterday
Shuo
2,4782 gold badges4 silver badges16 bronze badges
asked yesterday
Simon ShinSimon Shin
152 bronze badges
1 Answer
Reset to default 0
Instead of using the SVG path values to check whether the review is recommended, check the class of the div, which clearly states whether the rating is positive, negative, or neutral. There are neutral and no-data ratings as well.
I have modified the recommendation checking part only. You have to check for neutral and no data.
Try this:
# Locate the "Recommend" rating row of each review by its label text and
# classify it from the styling class on the row's container div (rather than
# from SVG path data, which an un-prefixed XPath name test cannot match).
elements = sb.find_elements(By.XPATH, '//span[text()="Recommend"]/parent::div')
recommendations = []
for elem in elements:
    # The class attribute encodes the rating; only positive/negative are
    # handled here — neutral and no-data styles exist too and fall through.
    attribute = elem.get_attribute('class')
    if 'positiveStyles' in attribute:
        recommendations.append('Yes')
    elif 'negativeStyles' in attribute:
        recommendations.append('No')
It should give you the recommended and not recommended reviews.
本文标签: pythonCannot scrap xpath using SeleniumStack Overflow
版权声明:本文标题:python - Cannot scrap xpath using Selenium - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1736590873a1945075.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论