如果不需要说明,请查看联机 IDE 中的完整代码示例。
import json
import time

from parsel import Selector
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Module-level accumulator for everything the scraper extracts.
# 'Top charts' is pre-populated with its three tabs; every other section
# found on the page is added under its own title by scrape_all_sections().
google_play_apps = {
    'Top charts': {
        'Top free': [],
        'Top grossing': [],
        'Top paid': [],
    },
}

# WebDriverWait takes its timeout in seconds.
WAIT_TIMEOUT_SECONDS = 10

BASE_URL = 'https://play.google.com'


def _wait_for_body(driver):
    """Block until the <body> element is visible (or the wait times out)."""
    WebDriverWait(driver, WAIT_TIMEOUT_SECONDS).until(
        EC.visibility_of_element_located((By.TAG_NAME, 'body'))
    )


def _absolute_link(href):
    """Prefix a relative href with the Play Store origin; pass None through."""
    return BASE_URL + href if href else href


def _strip_2x(srcset):
    """Remove the ' 2x' density descriptor from a srcset; pass None through."""
    return srcset.replace(' 2x', '') if srcset else srcset


def _parse_rating(text):
    """Convert a rating string to float; return falsy input unchanged."""
    return float(text) if text else text


def scroll_page(url, max_scroll_seconds=120):
    """Open *url* in headless Chrome, load the full page and return its HTML.

    The page is scrolled to the bottom repeatedly until the '.snByac'
    element can be clicked. The three "Top charts" tabs are then scraped
    into ``google_play_apps`` while the browser is still open.

    Args:
        url: Page to open.
        max_scroll_seconds: Upper bound on the scroll-and-click phase. If the
            '.snByac' element never becomes clickable within this time, the
            function stops scrolling and continues with whatever is loaded
            instead of looping forever.

    Returns:
        A parsel ``Selector`` built from the final page source.
    """
    # Imported locally so the module's top-level import list stays unchanged.
    from selenium.common.exceptions import WebDriverException

    service = Service(ChromeDriverManager().install())

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--lang=en")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")
    options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(service=service, options=options)

    # try/finally guarantees the browser process is closed even when a
    # selector is missing or a wait times out.
    try:
        driver.get(url)

        deadline = time.monotonic() + max_scroll_seconds
        while time.monotonic() < deadline:
            try:
                driver.execute_script("document.querySelector('.snByac').click();")
                _wait_for_body(driver)
                break
            except WebDriverException:
                # The element is not in the DOM yet (the click script failed):
                # scroll further down so more content loads, then retry.
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                _wait_for_body(driver)

        # Raw strings: the backslash escapes '|' for the CSS engine and must
        # reach it verbatim.
        scrape_top_charts(driver=driver, chart='Top free', button_selector=r'#ct\|apps_topselling_free .ypTNYd')
        scrape_top_charts(driver=driver, chart='Top grossing', button_selector=r'#ct\|apps_topgrossing .ypTNYd')
        scrape_top_charts(driver=driver, chart='Top paid', button_selector=r'#ct\|apps_topselling_paid .ypTNYd')

        return Selector(driver.page_source)
    finally:
        driver.quit()


def scrape_top_charts(driver, chart, button_selector):
    """Click one "Top charts" tab and append its apps to ``google_play_apps``.

    Args:
        driver: Active Selenium WebDriver positioned on the apps page.
        chart: Key under ``google_play_apps['Top charts']`` to append to.
        button_selector: CSS selector of the tab button to click.
    """
    button = driver.find_element(By.CSS_SELECTOR, button_selector)
    driver.execute_script("arguments[0].click();", button)
    time.sleep(2)  # pause before re-reading the page source after the click

    selector = Selector(driver.page_source)

    for result in selector.css('.itIJzb'):
        # Any of these nodes may be absent; the helpers pass None through
        # instead of raising, matching how scrape_all_sections treats rating.
        google_play_apps['Top charts'][chart].append({
            'title': result.css('.OnEJge::text').get(),
            'link': _absolute_link(result.css('::attr(href)').get()),
            'category': result.css('.ubGTjb .sT93pb.w2kbF:not(.K4Wkre)::text').get(),
            'rating': _parse_rating(result.css('.CKzsaf .w2kbF::text').get()),
            'thumbnail': _strip_2x(result.css('.stzEZd::attr(srcset)').get()),
        })


def scrape_all_sections(selector):
    """Extract every ``<section>`` of the page and print the collected data.

    Each section becomes a key of ``google_play_apps`` (named by the section
    title) holding a list of app dicts. The whole dictionary, including the
    "Top charts" entries gathered earlier, is printed as JSON at the end.

    Args:
        selector: parsel ``Selector`` for the fully loaded page.
    """
    for section in selector.css('section'):
        section_title = section.css('.kcen6d span::text').get()
        google_play_apps[section_title] = []

        for app in section.css('.UVEnyf'):
            google_play_apps[section_title].append({
                'title': app.css('.Epkrse::text').get(),
                'link': _absolute_link(app.css('.Si6A0c::attr(href)').get()),
                'rating': _parse_rating(app.css('.LrNMN::text').get()),
                'thumbnail': _strip_2x(app.css('.Q8CSx::attr(srcset)').get()),
            })

    print(json.dumps(google_play_apps, indent=2, ensure_ascii=False))


def scrape_google_play_apps():
    """Build the Google Play apps URL, load the page and scrape all sections."""
    params = {
        'device': 'phone',
        'hl': 'en_GB',  # language
        'gl': 'US',     # country of the search
    }

    URL = f"https://play.google.com/store/apps?device={params['device']}&hl={params['hl']}&gl={params['gl']}"

    result = scroll_page(URL)
    scrape_all_sections(result)


if __name__ == "__main__":
    scrape_google_play_apps()
安装库:
pip install parsel selenium webdriver_manager
减少被阻止的机会
请确保在请求标头中设置 `user-agent`,使请求看起来像“真实”用户的访问。因为 `requests` 库默认的 `user-agent` 是 `python-requests`,网站很容易判断出这很可能是由脚本发送的请求。请检查您的 `user-agent` 是什么。
有一篇关于如何减少网络抓取时被阻止几率的博客文章,可以帮助您熟悉基本方法和更高级的方法。
导入库:
import json
import time

from parsel import Selector
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
| 库 | 用途 |
|---|---|
| time | 在 Python 中处理时间。 |
| json | 将提取的数据转换为 JSON 对象。 |
| webdriver | 像用户一样驱动浏览器,既可以在本地运行,也可以通过 Selenium 服务器在远程计算机上运行。 |
| Service | 管理 ChromeDriver 的启动和停止。 |
| By | 一组受支持的定位器策略(By.ID、By.TAG_NAME、By.XPATH 等)。 |
| WebDriverWait | 只在需要时等待。 |
| expected_conditions | 包含一组用于 WebDriverWait 的预定义条件。 |
| Selector | 具有完整 XPath 和 CSS 选择器支持的 XML/HTML 解析器。 |
定义字典结构:
# Result container: the three "Top charts" tabs are known up front;
# other sections are added at runtime under their own titles.
google_play_apps = {
    'Top charts': {
        'Top free': [],
        'Top grossing': [],
        'Top paid': [],
    },
}
在函数的开头,定义了用于生成 `URL` 的参数。如果要将其他参数传递给 URL,可以使用 `params` 字典来实现。这些参数会影响输出结果:
# Query parameters used to build the Google Play URL.
params = {
    'device': 'phone',
    'hl': 'en_GB',  # language
    'gl': 'US',     # country of the search
}
接下来,将 URL 传递给 `scroll_page(URL)` 函数以滚动页面并获取所有数据。此函数返回的结果将传递给 `scrape_all_sections(result)` 函数以提取必要的数据。这些函数的说明见下文相应的标题。
此代码遵循普遍接受的惯例,使用 `__name__ == "__main__"` 结构:
def scrape_google_play_apps():
    """Build the Google Play apps URL, load the page and scrape all sections."""
    params = {
        'device': 'phone',
        'hl': 'en_GB',  # language
        'gl': 'US',     # country of the search
    }

    URL = f"https://play.google.com/store/apps?device={params['device']}&hl={params['hl']}&gl={params['gl']}"

    # scroll_page returns the parsed page; scrape_all_sections extracts and
    # prints the data.
    result = scroll_page(URL)
    scrape_all_sections(result)


if __name__ == "__main__":
    scrape_google_play_apps()
仅当用户直接运行此文件时,该条件才成立,并调用 `scrape_google_play_apps()`。如果此文件被导入到另一个文件中,则条件不成立,函数不会自动执行。