admin管理员组

文章数量:1290947

I'm trying to work on a project, and I need to get the links off google image results. Here is my code:

from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from tqdm import tqdm
from modules.download_file import download_file
from modules.lsDir import lsDir
import time
from colorama import Fore, Style, init
import os
import base64
from urllib.parse import urlparse


def google_images(driver, name, intAmount):
    """Scrape Google Images results for ``name`` and download the thumbnails.

    Args:
        driver: an already-initialized Selenium WebDriver on which the
            search page will be opened.
        name: the search query string (URL-encoded before use).
        intAmount: number of scroll iterations used to load more results.

    Side effects: empties and repopulates the ``gg_downloads/`` directory.
    """
    from urllib.parse import quote

    # Start from a clean download directory (create it if it does not exist
    # yet, so the cleanup loop below cannot fail on a fresh checkout).
    os.makedirs("gg_downloads", exist_ok=True)
    for x in lsDir("gg_downloads"):
        os.remove(os.path.join("gg_downloads", x))

    print("[*] Opening Google...")
    # udm=2 selects the Images vertical of Google Search. The scraped copy of
    # this code had the domain stripped; restored to www.google.com.
    driver.get(f"https://www.google.com/search?sclient=img&udm=2&q={quote(name)}")
    time.sleep(3)

    print("[*] Scrolling down...")
    # Each scroll loads another batch of lazy-loaded results.
    for _ in tqdm(range(intAmount)):
        ActionChains(driver).scroll_by_amount(0, 10000).perform()
        time.sleep(1)

    print("[*] Gathering profile photos...")
    image_elements = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc img")
    images = [el.get_attribute("src")
              for el in tqdm(image_elements, desc="Gathering Photos")]
    # NOTE(review): every other <img> is kept — presumably each result card
    # contains two <img> elements; confirm against the live markup.
    images = images[::2]

    print("[*] Gathering links...")
    # NOTE(review): this absolute XPath is extremely brittle; Google's markup
    # changes frequently and per-user. See the hover-based approach instead.
    links = driver.find_elements(By.XPATH, "/html/body/div[3]/div/div[14]/div/div[2]/div[2]/div/div/div/div/div[1]/div/div/div/div[2]/h3/a")
    image_links = [link.get_attribute("href")
                   for link in tqdm(links, desc="Gathering Links")]
    print(image_links)

    print("[*] Downloading profile photos...")
    for i, image_url in tqdm(enumerate(images), desc="Downloading Photos", total=len(images)):
        try:
            if image_url is None:
                # Lazy-loaded thumbnails may not have a src yet; skip them
                # instead of crashing on None.startswith().
                continue
            if image_url.startswith('data:'):
                # Inline data URL: decode the base64 payload directly.
                header, encoded = image_url.split(",", 1)
                content_type = header.split(":")[1].split(";")[0]
                file_extension = content_type.split("/")[-1]
                image_data = base64.b64decode(encoded)
                with open(os.path.join("gg_downloads", f"{i}.{file_extension}"), "wb") as f:
                    f.write(image_data)
            else:
                # Regular URL: infer the extension from the path, defaulting
                # to .jpg when the URL carries none.
                parsed_url = urlparse(image_url)
                file_extension = os.path.splitext(parsed_url.path)[1] or '.jpg'
                download_file(image_url, os.path.join("gg_downloads", f"{i}{file_extension}"))
        except Exception as e:
            print(f"Error downloading image {i}: {e}")

if __name__ == "__main__":
    # Make sure you use pip to install stuff.
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.firefox.service import Service
    from webdriver_manager.firefox import GeckoDriverManager
    #...
    options = Options()
    if settings.headless:
        options.add_argument('-headless')
    service = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(options=options, service=service)
    google_images(driver, "John Smith", 10)

When printing image_links, the array is all None. I tried the VS Code debugger, and also tried CSS selectors, but it's still not working. I am using Firefox for the driver, and here is the code to initialize the driver:

# Imports
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
#...
options = Options()
if settings.headless:
    options.add_argument('-headless')
service = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(options=options, service=service)

Help would be greatly appreciated! Thank you!

I'm trying to work on a project, and I need to get the links off google image results. Here is my code:

from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from tqdm import tqdm
from modules.download_file import download_file
from modules.lsDir import lsDir
import time
from colorama import Fore, Style, init
import os
import base64
from urllib.parse import urlparse


def google_images(driver, name, intAmount):
    """Scrape Google Images results for ``name`` and download the thumbnails.

    Args:
        driver: an already-initialized Selenium WebDriver on which the
            search page will be opened.
        name: the search query string (URL-encoded before use).
        intAmount: number of scroll iterations used to load more results.

    Side effects: empties and repopulates the ``gg_downloads/`` directory.
    """
    from urllib.parse import quote

    # Start from a clean download directory (create it if it does not exist
    # yet, so the cleanup loop below cannot fail on a fresh checkout).
    os.makedirs("gg_downloads", exist_ok=True)
    for x in lsDir("gg_downloads"):
        os.remove(os.path.join("gg_downloads", x))

    print("[*] Opening Google...")
    # udm=2 selects the Images vertical of Google Search. The scraped copy of
    # this code lost the ".com" from the domain; restored here.
    driver.get(f"https://www.google.com/search?sclient=img&udm=2&q={quote(name)}")
    time.sleep(3)

    print("[*] Scrolling down...")
    # Each scroll loads another batch of lazy-loaded results.
    for _ in tqdm(range(intAmount)):
        ActionChains(driver).scroll_by_amount(0, 10000).perform()
        time.sleep(1)

    print("[*] Gathering profile photos...")
    image_elements = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc img")
    images = [el.get_attribute("src")
              for el in tqdm(image_elements, desc="Gathering Photos")]
    # NOTE(review): every other <img> is kept — presumably each result card
    # contains two <img> elements; confirm against the live markup.
    images = images[::2]

    print("[*] Gathering links...")
    # NOTE(review): this absolute XPath is extremely brittle; Google's markup
    # changes frequently and per-user. See the hover-based approach instead.
    links = driver.find_elements(By.XPATH, "/html/body/div[3]/div/div[14]/div/div[2]/div[2]/div/div/div/div/div[1]/div/div/div/div[2]/h3/a")
    image_links = [link.get_attribute("href")
                   for link in tqdm(links, desc="Gathering Links")]
    print(image_links)

    print("[*] Downloading profile photos...")
    for i, image_url in tqdm(enumerate(images), desc="Downloading Photos", total=len(images)):
        try:
            if image_url is None:
                # Lazy-loaded thumbnails may not have a src yet; skip them
                # instead of crashing on None.startswith().
                continue
            if image_url.startswith('data:'):
                # Inline data URL: decode the base64 payload directly.
                header, encoded = image_url.split(",", 1)
                content_type = header.split(":")[1].split(";")[0]
                file_extension = content_type.split("/")[-1]
                image_data = base64.b64decode(encoded)
                with open(os.path.join("gg_downloads", f"{i}.{file_extension}"), "wb") as f:
                    f.write(image_data)
            else:
                # Regular URL: infer the extension from the path, defaulting
                # to .jpg when the URL carries none.
                parsed_url = urlparse(image_url)
                file_extension = os.path.splitext(parsed_url.path)[1] or '.jpg'
                download_file(image_url, os.path.join("gg_downloads", f"{i}{file_extension}"))
        except Exception as e:
            print(f"Error downloading image {i}: {e}")

if __name__ == "__main__":
    # Make sure you use pip to install stuff.
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.firefox.service import Service
    from webdriver_manager.firefox import GeckoDriverManager
    #...
    options = Options()
    if settings.headless:
        options.add_argument('-headless')
    service = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(options=options, service=service)
    google_images(driver, "John Smith", 10)

When printing image_links, the array is all None. I tried the VS Code debugger, and also tried CSS selectors, but it's still not working. I am using Firefox for the driver, and here is the code to initialize the driver:

# Imports
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
#...
options = Options()
if settings.headless:
    options.add_argument('-headless')
service = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(options=options, service=service)

Help would be greatly appreciated! Thank you!

Share Improve this question asked Feb 13 at 22:52 Thomas HaddadThomas Haddad 11 bronze badge 2
  • first you should check driver.page_source to see what you get from server. Often problem is that server sends different HTML than you expect. Try it without -headless so you could use DevTools in Firefox to see what HTML you have in browser. – furas Commented Feb 20 at 21:38
  • other problem can be that server may send different HTML (especially different/random classes) to different users, different browsers, different devices - to stop scrapers. Other problem can be that Google has a lot of money to build complex system to detect scripts/scrapers/hackers (it can even use AI for this). Selenium uses some variables in Javascript and some portals can detect this - so there is a special version of the driver, undetected_chromedriver, which uses random names for variables. – furas Commented Feb 20 at 21:46
Add a comment  | 

1 Answer 1

Reset to default 0

I have found the behavior of the page that causes no result for scraping links.
On this page, anchor elements are loaded without any attributes. Once mouse touches an element, 'href' attribute is added to the element.

The following test code can get both links and 'src's.
#1 After moving mouse to a card element, get 'href' attribute.
#2 When reached to a card out of the view area, scroll the view up.
#3 Since continual scrolling-up also refreshes 'src's of img elements, precedent whole page scrolling is not necessary.
#4 When the access reaches near the last card, new 99(if exists) cards are loaded. If the number of them is increased we can continue.
#5 For the test convenience, iteration can be limited.

I hope it helps.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep

def link(cards, begin):
    """Hover over each result card starting at index ``begin`` and print its
    source, title, link URL and thumbnail src.

    Relies on module-level globals ``driver`` (the results page) and
    ``driver2`` (a second browser used to spot-check every other link).

    On this page the anchor elements are loaded without attributes; the
    'href' attribute only appears once the mouse touches the element, so
    each card is hovered before its link is read.
    """
    for i, card in enumerate(cards[begin:]):
        print('\n{:0=3}'.format(begin + i + 1))

        # Try to hover the card; when it sits outside the viewport the move
        # raises, so scroll down and retry (up to 3 attempts).
        for j in range(3):
            try:
                ActionChains(driver).move_to_element(card).perform()  # 1
            except Exception:  # was a bare except: — never swallow SystemExit/KeyboardInterrupt
                print('== scroll ==')
                ActionChains(driver).scroll_by_amount(0, 800).perform()  # 2
                sleep(2)
            else:
                break
        sleep(1)
        ancs = card.find_elements(By.XPATH, ".//a")
        url = ancs[0].get_attribute('href')  # 1
        if url is None:  # element sat on the view border; scroll and re-hover
            ActionChains(driver).scroll_by_amount(0, 800).perform()  # 2
            sleep(2)
            ActionChains(driver).move_to_element(card).perform()
            print('== scrolled because of "url=None" ==')
            sleep(1)
            ancs = card.find_elements(By.XPATH, ".//a")
            url = ancs[0].get_attribute('href')
        if i % 2 == 0:  # link check sampling
            driver2.get(url)
        # NOTE(review): ".ptes9bspan" looks scrape-garbled — possibly
        # ".ptes9b span" in the original answer; confirm against the live page.
        source = ancs[1].find_element(By.CSS_SELECTOR, ".ptes9bspan").get_attribute('innerHTML')
        title = ancs[1].find_element(By.CSS_SELECTOR, 'div[class^="toI8R"]').get_attribute('innerHTML')
        print('source:', source)
        print('title:', title)
        print('url: ', url)
        img_el = card.find_elements(By.XPATH, ".//img")[0]
        src = img_el.get_attribute('src')  # 3
        print('\nsrc: ', src)

if __name__ == "__main__":

    driver = webdriver.Firefox()
    driver.get('https://www.google/search?sclient=img&udm=2&q=John%20Smith')
    sleep(5)
    driver2 = webdriver.Firefox()# for link check

    cards = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc")
    cardsnum = len(cards)
    pre_len = 0
    begin = 0
    end = cardsnum
    i = 0
    while cardsnum > pre_len:#4
        if i==0:
            print('\n\n==== Initialy loaded cards {} ===\n\n'.format(cardsnum - pre_len))
        else:
            print('\n\n==== Newly loaded cards {} ===\n\n'.format(cardsnum - pre_len))
        pre_len = cardsnum
        link(cards,begin) 
        cards = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc")
        cardsnum = len(cards)
        begin = end
        end = cardsnum
        i += 1
        if i > 2:#5
            break  

本文标签: How to scrape links off Google images result with seleniumpythonStack Overflow