admin管理员组

文章数量:1290947

I'm trying to work on a project, and I need to get the links off google image results. Here is my code:

from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from tqdm import tqdm
from modules.download_file import download_file
from modules.lsDir import lsDir
import time
from colorama import Fore, Style, init
import os
import base64
from urllib.parse import urlparse


def google_images(driver, name, intAmount):
    """Scrape Google Images results for ``name`` and download the thumbnails.

    Args:
        driver: an already-initialized Selenium WebDriver on which the
            search page will be opened.
        name: the search query string (URL-encoded before use).
        intAmount: number of scroll iterations used to load more results.

    Side effects: empties and repopulates the ``gg_downloads/`` directory.
    """
    from urllib.parse import quote

    # Start from a clean download directory (create it if it does not exist
    # yet, so the cleanup loop below cannot fail on a fresh checkout).
    os.makedirs("gg_downloads", exist_ok=True)
    for x in lsDir("gg_downloads"):
        os.remove(os.path.join("gg_downloads", x))

    print("[*] Opening Google...")
    # udm=2 selects the Images vertical of Google Search. The scraped copy of
    # this code had the domain stripped; restored to www.google.com.
    driver.get(f"https://www.google.com/search?sclient=img&udm=2&q={quote(name)}")
    time.sleep(3)

    print("[*] Scrolling down...")
    # Each scroll loads another batch of lazy-loaded results.
    for _ in tqdm(range(intAmount)):
        ActionChains(driver).scroll_by_amount(0, 10000).perform()
        time.sleep(1)

    print("[*] Gathering profile photos...")
    image_elements = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc img")
    images = [el.get_attribute("src")
              for el in tqdm(image_elements, desc="Gathering Photos")]
    # NOTE(review): every other <img> is kept — presumably each result card
    # contains two <img> elements; confirm against the live markup.
    images = images[::2]

    print("[*] Gathering links...")
    # NOTE(review): this absolute XPath is extremely brittle; Google's markup
    # changes frequently and per-user. See the hover-based approach instead.
    links = driver.find_elements(By.XPATH, "/html/body/div[3]/div/div[14]/div/div[2]/div[2]/div/div/div/div/div[1]/div/div/div/div[2]/h3/a")
    image_links = [link.get_attribute("href")
                   for link in tqdm(links, desc="Gathering Links")]
    print(image_links)

    print("[*] Downloading profile photos...")
    for i, image_url in tqdm(enumerate(images), desc="Downloading Photos", total=len(images)):
        try:
            if image_url is None:
                # Lazy-loaded thumbnails may not have a src yet; skip them
                # instead of crashing on None.startswith().
                continue
            if image_url.startswith('data:'):
                # Inline data URL: decode the base64 payload directly.
                header, encoded = image_url.split(",", 1)
                content_type = header.split(":")[1].split(";")[0]
                file_extension = content_type.split("/")[-1]
                image_data = base64.b64decode(encoded)
                with open(os.path.join("gg_downloads", f"{i}.{file_extension}"), "wb") as f:
                    f.write(image_data)
            else:
                # Regular URL: infer the extension from the path, defaulting
                # to .jpg when the URL carries none.
                parsed_url = urlparse(image_url)
                file_extension = os.path.splitext(parsed_url.path)[1] or '.jpg'
                download_file(image_url, os.path.join("gg_downloads", f"{i}{file_extension}"))
        except Exception as e:
            print(f"Error downloading image {i}: {e}")

if __name__ == "__main__":
    # Make sure you use pip to install stuff.
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.firefox.service import Service
    from webdriver_manager.firefox import GeckoDriverManager
    #...
    options = Options()
    if settings.headless:
        options.add_argument('-headless')
    service = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(options=options, service=service)
    google_images(driver, "John Smith", 10)

When printing image_links, the array is all None. I tried the VS Code debugger, and also tried CSS selectors, but it's still not working. I am using Firefox for the driver, and here is the code to initialize the driver:

# Imports
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
#...
options = Options()
if settings.headless:
    options.add_argument('-headless')
service = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(options=options, service=service)

Help would be greatly appreciated! Thank you!

I'm trying to work on a project, and I need to get the links off google image results. Here is my code:

from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from tqdm import tqdm
from modules.download_file import download_file
from modules.lsDir import lsDir
import time
from colorama import Fore, Style, init
import os
import base64
from urllib.parse import urlparse


def google_images(driver, name, intAmount):
    """Scrape Google Images results for ``name`` and download the thumbnails.

    Args:
        driver: an already-initialized Selenium WebDriver on which the
            search page will be opened.
        name: the search query string (URL-encoded before use).
        intAmount: number of scroll iterations used to load more results.

    Side effects: empties and repopulates the ``gg_downloads/`` directory.
    """
    from urllib.parse import quote

    # Start from a clean download directory (create it if it does not exist
    # yet, so the cleanup loop below cannot fail on a fresh checkout).
    os.makedirs("gg_downloads", exist_ok=True)
    for x in lsDir("gg_downloads"):
        os.remove(os.path.join("gg_downloads", x))

    print("[*] Opening Google...")
    # udm=2 selects the Images vertical of Google Search. The scraped copy of
    # this code lost the ".com" from the domain; restored here.
    driver.get(f"https://www.google.com/search?sclient=img&udm=2&q={quote(name)}")
    time.sleep(3)

    print("[*] Scrolling down...")
    # Each scroll loads another batch of lazy-loaded results.
    for _ in tqdm(range(intAmount)):
        ActionChains(driver).scroll_by_amount(0, 10000).perform()
        time.sleep(1)

    print("[*] Gathering profile photos...")
    image_elements = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc img")
    images = [el.get_attribute("src")
              for el in tqdm(image_elements, desc="Gathering Photos")]
    # NOTE(review): every other <img> is kept — presumably each result card
    # contains two <img> elements; confirm against the live markup.
    images = images[::2]

    print("[*] Gathering links...")
    # NOTE(review): this absolute XPath is extremely brittle; Google's markup
    # changes frequently and per-user. See the hover-based approach instead.
    links = driver.find_elements(By.XPATH, "/html/body/div[3]/div/div[14]/div/div[2]/div[2]/div/div/div/div/div[1]/div/div/div/div[2]/h3/a")
    image_links = [link.get_attribute("href")
                   for link in tqdm(links, desc="Gathering Links")]
    print(image_links)

    print("[*] Downloading profile photos...")
    for i, image_url in tqdm(enumerate(images), desc="Downloading Photos", total=len(images)):
        try:
            if image_url is None:
                # Lazy-loaded thumbnails may not have a src yet; skip them
                # instead of crashing on None.startswith().
                continue
            if image_url.startswith('data:'):
                # Inline data URL: decode the base64 payload directly.
                header, encoded = image_url.split(",", 1)
                content_type = header.split(":")[1].split(";")[0]
                file_extension = content_type.split("/")[-1]
                image_data = base64.b64decode(encoded)
                with open(os.path.join("gg_downloads", f"{i}.{file_extension}"), "wb") as f:
                    f.write(image_data)
            else:
                # Regular URL: infer the extension from the path, defaulting
                # to .jpg when the URL carries none.
                parsed_url = urlparse(image_url)
                file_extension = os.path.splitext(parsed_url.path)[1] or '.jpg'
                download_file(image_url, os.path.join("gg_downloads", f"{i}{file_extension}"))
        except Exception as e:
            print(f"Error downloading image {i}: {e}")

if __name__ == "__main__":
    # Make sure you use pip to install stuff.
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.firefox.service import Service
    from webdriver_manager.firefox import GeckoDriverManager
    #...
    options = Options()
    if settings.headless:
        options.add_argument('-headless')
    service = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(options=options, service=service)
    google_images(driver, "John Smith", 10)

When printing image_links, the array is all None. I tried the VS Code debugger, and also tried CSS selectors, but it's still not working. I am using Firefox for the driver, and here is the code to initialize the driver:

# Imports
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
#...
options = Options()
if settings.headless:
    options.add_argument('-headless')
service = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(options=options, service=service)

Help would be greatly appreciated! Thank you!

Share Improve this question asked Feb 13 at 22:52 Thomas HaddadThomas Haddad 11 bronze badge 2
  • first you should check driver.page_source to see what you get from server. Often problem is that server sends different HTML than you expect. Try it without -headless so you could use DevTools in Firefox to see what HTML you have in browser. – furas Commented Feb 20 at 21:38
  • other problem can be that server may send different HTML (especially different/random classes) to different users, different browsers, different devices - to stop scrapers. Other problem can be that Google has a lot of money to build complex system to detect scripts/scrapers/hackers (it can even use AI for this). Selenium uses some variables in Javascript and some portals can detect this - so there is a special version of the driver, undetected_chromedriver, which uses random names for variables. – furas Commented Feb 20 at 21:46
Add a comment  | 

1 Answer 1

Reset to default 0

I have found the behavior of the page that causes no result for scraping links.
On this page, anchor elements are loaded without any attributes. Once mouse touches an element, 'href' attribute is added to the element.

The following test code can get both links and 'src's.
#1 After moving mouse to a card element, get 'href' attribute.
#2 When reached to a card out of the view area, scroll the view up.
#3 Since continual scrolling-up also refreshes 'src's of img elements, precedent whole page scrolling is not necessary.
#4 When the access reaches near the last card, new 99(if exists) cards are loaded. If the number of them is increased we can continue.
#5 For the test convenience, iteration can be limited.

I hope it helps.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep

def link(cards, begin):
    """Hover over each result card starting at index ``begin`` and print its
    source, title, link URL and thumbnail src.

    Relies on module-level globals ``driver`` (the results page) and
    ``driver2`` (a second browser used to spot-check every other link).

    On this page the anchor elements are loaded without attributes; the
    'href' attribute only appears once the mouse touches the element, so
    each card is hovered before its link is read.
    """
    for i, card in enumerate(cards[begin:]):
        print('\n{:0=3}'.format(begin + i + 1))

        # Try to hover the card; when it sits outside the viewport the move
        # raises, so scroll down and retry (up to 3 attempts).
        for j in range(3):
            try:
                ActionChains(driver).move_to_element(card).perform()  # 1
            except Exception:  # was a bare except: — never swallow SystemExit/KeyboardInterrupt
                print('== scroll ==')
                ActionChains(driver).scroll_by_amount(0, 800).perform()  # 2
                sleep(2)
            else:
                break
        sleep(1)
        ancs = card.find_elements(By.XPATH, ".//a")
        url = ancs[0].get_attribute('href')  # 1
        if url is None:  # element sat on the view border; scroll and re-hover
            ActionChains(driver).scroll_by_amount(0, 800).perform()  # 2
            sleep(2)
            ActionChains(driver).move_to_element(card).perform()
            print('== scrolled because of "url=None" ==')
            sleep(1)
            ancs = card.find_elements(By.XPATH, ".//a")
            url = ancs[0].get_attribute('href')
        if i % 2 == 0:  # link check sampling
            driver2.get(url)
        # NOTE(review): ".ptes9bspan" looks scrape-garbled — possibly
        # ".ptes9b span" in the original answer; confirm against the live page.
        source = ancs[1].find_element(By.CSS_SELECTOR, ".ptes9bspan").get_attribute('innerHTML')
        title = ancs[1].find_element(By.CSS_SELECTOR, 'div[class^="toI8R"]').get_attribute('innerHTML')
        print('source:', source)
        print('title:', title)
        print('url: ', url)
        img_el = card.find_elements(By.XPATH, ".//img")[0]
        src = img_el.get_attribute('src')  # 3
        print('\nsrc: ', src)

if __name__ == "__main__":

    driver = webdriver.Firefox()
    driver.get('https://www.google/search?sclient=img&udm=2&q=John%20Smith')
    sleep(5)
    driver2 = webdriver.Firefox()# for link check

    cards = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc")
    cardsnum = len(cards)
    pre_len = 0
    begin = 0
    end = cardsnum
    i = 0
    while cardsnum > pre_len:#4
        if i==0:
            print('\n\n==== Initialy loaded cards {} ===\n\n'.format(cardsnum - pre_len))
        else:
            print('\n\n==== Newly loaded cards {} ===\n\n'.format(cardsnum - pre_len))
        pre_len = cardsnum
        link(cards,begin) 
        cards = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc")
        cardsnum = len(cards)
        begin = end
        end = cardsnum
        i += 1
        if i > 2:#5
            break  

本文标签: How to scrape links off Google images result with seleniumpythonStack Overflow