admin管理员组文章数量:1290947
I'm trying to work on a project, and I need to get the links off google image results. Here is my code:
# NOTE(review): "selenium.webdrivermon" in the scraped original is a garbling
# of "selenium.webdriver.common" (the ".com" substring was stripped) — restored.
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from tqdm import tqdm
from modules.download_file import download_file
from modules.lsDir import lsDir
import time
from colorama import Fore, Style, init
import os
import base64
from urllib.parse import urlparse


def google_images(driver, name, intAmount):
    """Search Google Images for *name*, scroll to load results, and download
    every result thumbnail into the ``gg_downloads/`` directory.

    Args:
        driver: a live Selenium WebDriver instance.
        name: the search query string.
        intAmount: number of scroll iterations used to load more results.
    """
    # Clear out any files left over from a previous run.
    for x in lsDir("gg_downloads"):
        os.remove(os.path.join("gg_downloads", x))

    print("[*] Opening Google...")
    # NOTE(review): the host was lost during scraping; reconstructed from the
    # second copy of this snippet. udm=2 selects the Images vertical.
    driver.get(f"https://www.google.com/search?sclient=img&udm=2&q={name}")
    time.sleep(3)

    print("[*] Scrolling down...")
    for _ in tqdm(range(intAmount)):
        ActionChains(driver).scroll_by_amount(0, 10000).perform()
        time.sleep(1)

    print("[*] Gathering profile photos...")
    image_elements = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc img")
    images = []
    for profile in tqdm(image_elements, desc="Gathering Photos"):
        images.append(profile.get_attribute("src"))
    # Keep every other entry — presumably the skipped ones are duplicate or
    # overlay <img> elements; TODO confirm against the live page.
    images = images[::2]

    print("[*] Gathering links...")
    # NOTE(review): this absolute XPath is extremely brittle and the anchors
    # have no href until hovered (see accepted answer) — that is why the
    # collected hrefs are all None. Prefer hovering each card and reading
    # ".//a" relative to it.
    links = driver.find_elements(By.XPATH, "/html/body/div[3]/div/div[14]/div/div[2]/div[2]/div/div/div/div/div[1]/div/div/div/div[2]/h3/a")
    image_links = []
    for link in tqdm(links, desc="Gathering Links"):
        image_links.append(link.get_attribute("href"))
    print(image_links)

    print("[*] Downloading profile photos...")
    for i, image_url in tqdm(enumerate(images), desc="Downloading Photos", total=len(images)):
        try:
            if image_url.startswith('data:'):
                # Handle data URL of the form data:<mime>;base64,<payload>.
                header, encoded = image_url.split(",", 1)
                content_type = header.split(":")[1].split(";")[0]
                file_extension = content_type.split("/")[-1]
                # Decode base64 content.
                image_data = base64.b64decode(encoded)
                # Save the file.
                with open(f"gg_downloads/{i}.{file_extension}", "wb") as f:
                    f.write(image_data)
            else:
                # Handle a regular URL: derive the extension from the path.
                parsed_url = urlparse(image_url)
                file_extension = os.path.splitext(parsed_url.path)[1]
                if not file_extension:
                    file_extension = '.jpg'  # Default to .jpg if no extension found
                download_file(image_url, f"gg_downloads/{i}{file_extension}")
        except Exception as e:
            # Best-effort: report and continue with the next image.
            print(f"Error downloading image {i}: {str(e)}")
if __name__ == "__main__":
    # Make sure you use pip to install stuff.
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.firefox.service import Service
    from webdriver_manager.firefox import GeckoDriverManager

    # BUG FIX: the original referenced `settings.headless`, but `settings`
    # is never defined anywhere in this file (NameError at runtime).
    # Read the headless preference from the environment instead.
    headless = os.environ.get("HEADLESS", "").lower() in ("1", "true", "yes")

    options = Options()
    if headless:
        options.add_argument('-headless')
    service = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(options=options, service=service)
    google_images(driver, "John Smith", 10)
When printing `image_links`, the array is all `None`. I tried the VS Code debugger, and also tried CSS selectors, but it is still not working. I am using Firefox for the driver, and here is the code I use to initialize it:
# Imports
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
#...
options = Options()
# NOTE(review): `settings` is not defined in this snippet; presumably it is
# a project configuration module — confirm before running.
if settings.headless:
    options.add_argument('-headless')
service = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(options=options, service=service)
Help would be greatly appreciated! Thank you!
I'm trying to work on a project, and I need to get the links off google image results. Here is my code:
# NOTE(review): "selenium.webdrivermon" in the scraped original is a garbling
# of "selenium.webdriver.common" (the ".com" substring was stripped) — restored.
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from tqdm import tqdm
from modules.download_file import download_file
from modules.lsDir import lsDir
import time
from colorama import Fore, Style, init
import os
import base64
from urllib.parse import urlparse


def google_images(driver, name, intAmount):
    """Search Google Images for *name*, scroll to load results, and download
    every result thumbnail into the ``gg_downloads/`` directory.

    Args:
        driver: a live Selenium WebDriver instance.
        name: the search query string.
        intAmount: number of scroll iterations used to load more results.
    """
    # Clear out any files left over from a previous run.
    for x in lsDir("gg_downloads"):
        os.remove(os.path.join("gg_downloads", x))

    print("[*] Opening Google...")
    # BUG FIX: the original URL read "https://www.google/search" — the ".com"
    # was stripped during scraping; restored here. udm=2 = Images vertical.
    driver.get(f"https://www.google.com/search?sclient=img&udm=2&q={name}")
    time.sleep(3)

    print("[*] Scrolling down...")
    for _ in tqdm(range(intAmount)):
        ActionChains(driver).scroll_by_amount(0, 10000).perform()
        time.sleep(1)

    print("[*] Gathering profile photos...")
    image_elements = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc img")
    images = []
    for profile in tqdm(image_elements, desc="Gathering Photos"):
        images.append(profile.get_attribute("src"))
    # Keep every other entry — presumably the skipped ones are duplicate or
    # overlay <img> elements; TODO confirm against the live page.
    images = images[::2]

    print("[*] Gathering links...")
    # NOTE(review): this absolute XPath is extremely brittle and the anchors
    # have no href until hovered (see accepted answer) — that is why the
    # collected hrefs are all None. Prefer hovering each card and reading
    # ".//a" relative to it.
    links = driver.find_elements(By.XPATH, "/html/body/div[3]/div/div[14]/div/div[2]/div[2]/div/div/div/div/div[1]/div/div/div/div[2]/h3/a")
    image_links = []
    for link in tqdm(links, desc="Gathering Links"):
        image_links.append(link.get_attribute("href"))
    print(image_links)

    print("[*] Downloading profile photos...")
    for i, image_url in tqdm(enumerate(images), desc="Downloading Photos", total=len(images)):
        try:
            if image_url.startswith('data:'):
                # Handle data URL of the form data:<mime>;base64,<payload>.
                header, encoded = image_url.split(",", 1)
                content_type = header.split(":")[1].split(";")[0]
                file_extension = content_type.split("/")[-1]
                # Decode base64 content.
                image_data = base64.b64decode(encoded)
                # Save the file.
                with open(f"gg_downloads/{i}.{file_extension}", "wb") as f:
                    f.write(image_data)
            else:
                # Handle a regular URL: derive the extension from the path.
                parsed_url = urlparse(image_url)
                file_extension = os.path.splitext(parsed_url.path)[1]
                if not file_extension:
                    file_extension = '.jpg'  # Default to .jpg if no extension found
                download_file(image_url, f"gg_downloads/{i}{file_extension}")
        except Exception as e:
            # Best-effort: report and continue with the next image.
            print(f"Error downloading image {i}: {str(e)}")
if __name__ == "__main__":
    # Make sure you use pip to install stuff.
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.firefox.service import Service
    from webdriver_manager.firefox import GeckoDriverManager

    # BUG FIX: the original referenced `settings.headless`, but `settings`
    # is never defined anywhere in this file (NameError at runtime).
    # Read the headless preference from the environment instead.
    headless = os.environ.get("HEADLESS", "").lower() in ("1", "true", "yes")

    options = Options()
    if headless:
        options.add_argument('-headless')
    service = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(options=options, service=service)
    google_images(driver, "John Smith", 10)
When printing `image_links`, the array is all `None`. I tried the VS Code debugger, and also tried CSS selectors, but it is still not working. I am using Firefox for the driver, and here is the code I use to initialize it:
# Imports
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
#...
options = Options()
# NOTE(review): `settings` is not defined in this snippet; presumably it is
# a project configuration module — confirm before running.
if settings.headless:
    options.add_argument('-headless')
service = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(options=options, service=service)
Help would be greatly appreciated! Thank you!
Share Improve this question asked Feb 13 at 22:52 Thomas HaddadThomas Haddad 11 bronze badge 2 |1 Answer
Reset to default 0 I have found the page behavior that causes no results when scraping the links.
On this page, anchor elements are loaded without any attributes. Once the mouse touches an element, the 'href' attribute is added to that element.
The following test code can get both links and 'src's.
#1 After moving mouse to a card element, get 'href' attribute.
#2 When reached to a card out of the view area, scroll the view up.
#3 Since continual scrolling-up also refreshes 'src's of img elements, precedent whole page scrolling is not necessary.
#4 When the access reaches near the last card, new 99(if exists) cards are loaded. If the number of them is increased we can continue.
#5 For the test convenience, iteration can be limited.
I hope it helps.
from selenium import webdriver
# NOTE(review): "selenium.webdrivermon" in the scraped original is a garbling
# of "selenium.webdriver.common" (the ".com" substring was stripped) — restored.
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep


def link(cards, begin):
    """Visit each Google Images result card starting at index *begin*: hover
    it so Google populates the anchor's href, then print the outbound URL,
    the source/title text, and the thumbnail's img src.

    Relies on module-level globals: `driver` (the results page) and
    `driver2` (a second browser used to spot-check every other link).

    Args:
        cards: list of WebElements matching "div.eA0Zlc".
        begin: index of the first card to process.
    """
    for i, card in enumerate(cards[begin:]):
        print('\n{:0=3}'.format(begin + i + 1))
        # 1/2: hover the card; if it is outside the viewport the move raises,
        # so scroll down and retry (up to 3 attempts).
        for j in range(3):
            try:
                ActionChains(driver).move_to_element(card).perform()  # 1
            except Exception:
                print('== scroll ==')
                ActionChains(driver).scroll_by_amount(0, 800).perform()  # 2
                sleep(2)
            else:
                break
        sleep(1)
        ancs = card.find_elements(By.XPATH, ".//a")
        url = ancs[0].get_attribute('href')  # 1
        if url is None:  # when element sits on the view border
            ActionChains(driver).scroll_by_amount(0, 800).perform()  # 2
            sleep(2)
            ActionChains(driver).move_to_element(card).perform()
            print('== scrolled because of "url=None" ==')
            sleep(1)
            ancs = card.find_elements(By.XPATH, ".//a")
            url = ancs[0].get_attribute('href')
        if i % 2 == 0:  # link check sampling: open every other url in driver2
            driver2.get(url)
        # NOTE(review): ".ptes9bspan" may have lost a space during scraping —
        # possibly ".ptes9b span"; confirm against the live page.
        source = ancs[1].find_element(By.CSS_SELECTOR, ".ptes9bspan").get_attribute('innerHTML')
        title = ancs[1].find_element(By.CSS_SELECTOR, 'div[class^="toI8R"]').get_attribute('innerHTML')
        print('source:', source)
        print('title:', title)
        print('url: ', url)
        # 3: hovering/scrolling also refreshes the thumbnail src, so no
        # whole-page pre-scroll is needed.
        img_el = card.find_elements(By.XPATH, ".//img")[0]
        src = img_el.get_attribute('src')  # 3
        print('\nsrc: ', src)
if __name__ == "__main__":
    driver = webdriver.Firefox()
    # BUG FIX: ".com" was lost during scraping — restored full search URL
    # (udm=2 selects the Images vertical).
    driver.get('https://www.google.com/search?sclient=img&udm=2&q=John%20Smith')
    sleep(5)
    driver2 = webdriver.Firefox()  # for link check
    cards = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc")
    cardsnum = len(cards)
    pre_len = 0
    begin = 0
    end = cardsnum
    i = 0
    # 4: when the access reaches near the last card, a new batch of cards is
    # loaded; keep going while the card count still grows.
    while cardsnum > pre_len:  # 4
        if i == 0:
            print('\n\n==== Initially loaded cards {} ===\n\n'.format(cardsnum - pre_len))
        else:
            print('\n\n==== Newly loaded cards {} ===\n\n'.format(cardsnum - pre_len))
        pre_len = cardsnum
        link(cards, begin)
        cards = driver.find_elements(By.CSS_SELECTOR, "div.eA0Zlc")
        cardsnum = len(cards)
        begin = end
        end = cardsnum
        i += 1
        if i > 2:  # 5: iteration limited for test convenience
            break
本文标签: How to scrape links off Google images result with seleniumpythonStack Overflow
版权声明:本文标题:How to scrape links off Google images result with selenium, python? - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1741503199a2382167.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
driver.page_source
to see what you get from the server. Often the problem is that the server sends different HTML than you expect. Try it without -headless
so you could use DevTools
in Firefox to see what HTML you have in the browser. – furas Commented Feb 20 at 21:38 undetected_chromium
which uses random names for variables. – furas Commented Feb 20 at 21:46