selenium爬虫

本文主要是介绍selenium爬虫，对大家解决编程问题具有一定的参考价值，需要的程序猿们随着小编来一起学习吧！

介绍

selenium最初是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题

selenium本质是通过驱动浏览器，完全模拟浏览器的操作，比如跳转、输入、点击、下拉等，来拿到网页渲染之后的结果，可支持多种浏览器

from selenium import webdriver
browser=webdriver.Chrome()  # 推荐使用
browser=webdriver.Firefox()
browser=webdriver.PhantomJS()
browser=webdriver.Safari()
browser=webdriver.Edge()

安装

有界面的浏览器

#安装：selenium+chromedriver
pip3 install selenium
下载chromedriver.exe放到python安装路径的scripts目录中即可
国内镜像网站地址：http://npm.taobao.org/mirrors/chromedriver
最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads
        
 
# 注意，下载的驱动要和本机浏览器版本对应
# 下载谷歌浏览器驱动：http://npm.taobao.org/mirrors/chromedriver/
#安装使用
from selenium import webdriver

bro = webdriver.Chrome(executable_path='./chromedriver')  # 弹出浏览器，要给浏览器驱动的地址
bro.get('https://www.baidu.com')
print(bro.page_source)  # 获取页面返回的html代码
bro.close()  # 关闭浏览器

#注意：
selenium3默认支持的webdriver是Firefox，而Firefox需要安装geckodriver
下载链接：https://github.com/mozilla/geckodriver/releases
selenium+chromedriver

无界面浏览器

# 5. Headless browser (shown for Chrome; other browsers have their own options)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
# Chrome documents the --window-size value as "width,height" (comma-separated).
chrome_options.add_argument('--window-size=1920,3000')  # browser resolution
chrome_options.add_argument('--disable-gpu')  # Google's docs suggest this flag to work around a bug
chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip images to load faster
# No visible window; on Linux without a display the browser fails to start without this.
chrome_options.add_argument('--headless')
bro = webdriver.Chrome(executable_path='./chromedriver', options=chrome_options)
try:
    bro.get("https://www.baidu.com")
    print(bro.get_cookies())
finally:
    # quit() ends the whole driver session, while close() only closes the
    # current window; run it even if the page load above raised.
    bro.quit()

开发者模式

options = webdriver.ChromeOptions()
# 开发者模式
options.add_experimental_option('excludeSwitches', ['enable-automation'])
bro = webdriver.Chrome(executable_path=chromedriver_path, options=options)


def login():
    """Open Taobao's login form for each configured account until one works.

    Iterates over ``setting.user``; each entry is indexed as ``[0]`` for the
    username and ``[1]`` for the password. For every entry a Chrome session is
    started with the ``enable-automation`` switch excluded, the login form is
    filled in, and the function then blocks on ``input()`` so a person can
    complete the login by hand.

    Returns:
        The live ``webdriver.Chrome`` instance of the first attempt that
        finished without raising, or ``None`` when every attempt failed
        (or ``setting.user`` is empty).
    """
    for res in setting.user:
        # Bound before the try so the handler can tell whether a browser was
        # actually started for this attempt.
        bro = None
        try:
            username = res[0]
            password = res[1]
            options = webdriver.ChromeOptions()
            # "Developer mode": drop the enable-automation switch
            options.add_experimental_option('excludeSwitches', ['enable-automation'])
            bro = webdriver.Chrome(executable_path=chromedriver_path, options=options)

            bro.implicitly_wait(10)
            bro.get('https://www.taobao.com/')
            # Login link in the site navigation
            # (alternative selector seen on other pages: '#login-info > a.sn-login')
            bro.find_element_by_css_selector('#J_SiteNavLogin > div.site-nav-menu-hd > div.site-nav-sign > a.h').click()
            input_username = bro.find_element_by_css_selector('#fm-login-id')
            input_username.send_keys(username)
            input_password = bro.find_element_by_css_selector('#fm-login-password')
            input_password.send_keys(password)
            # Hand over to a person for the remaining manual step
            input("人工操作")
            return bro
        except Exception as e:
            # Best effort: report the failure, release this attempt's browser,
            # then fall through to the next account. Credentials are
            # deliberately not printed.
            print('login attempt failed: %s' % e)
            if bro is not None:
                try:
                    bro.quit()
                except Exception as quit_error:
                    print('failed to close browser: %s' % quit_error)
    return None

window.navigator.webdriver为true的情况

window.navigator.webdriver为true

def selenium(js):
    """Load toutiao.com with ``navigator.webdriver`` masked, then run *js*.

    Starts Chrome with the automation extension disabled and the
    ``enable-automation`` switch excluded, registers a script that redefines
    ``navigator.webdriver`` to return ``undefined`` on every new document,
    opens the page, prints its HTML, executes *js*, and finally waits on
    ``input()`` so the window stays open until Enter is pressed.

    Args:
        js: JavaScript source passed to ``execute_script`` after the page
            has loaded.
    """
    import time  # local import so the function does not rely on a later file-level import

    option = webdriver.ChromeOptions()
    # To run without a window, also add: option.add_argument('--headless')
    option.add_experimental_option('useAutomationExtension', False)
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Launches the browser; the driver binary path must be supplied
    bro = webdriver.Chrome(executable_path='./chromedriver', options=option)
    try:
        # Script that runs before any page script on each new document
        bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
            Object.defineProperty(navigator, 'webdriver', {
              get: () => undefined
            })
          """
        })

        bro.implicitly_wait(10)
        bro.get('https://www.toutiao.com/')
        time.sleep(5)
        print(bro.page_source)  # HTML of the rendered page
        bro.execute_script(js)
        input()
    finally:
        # Release the browser and driver even when a step above raised.
        bro.quit()

selenium高级用法

.send_keys('')  # 写入
.click()  # 点击
.clear()  # 清空

selenium选择器选择

# 1、find_element_by_css_selector    # css选择器找
# 2、find_element_by_xpath           #xpath选择器找
# 3、find_element_by_partial_link_text # a标签上的文字模糊
# 4、find_element_by_tag_name        # 根据标签名字找
# 5、find_element_by_id  # id找
# 6、find_element_by_name            # name='xx' 根据name属性找
# 7、find_element_by_class_name      # 根据类名字找
# 8、find_element_by_link_text   # a标签上的文字找

常用用法

# Common usage: type "美女" into Baidu's search box and run the search
import time  # needed for the sleeps below

from selenium import webdriver

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get("https://www.baidu.com")
bro.implicitly_wait(10)  # implicit wait: wait at most 10 seconds


# Part 1: enter the query using the driver's built-in element lookups
# 1. Locate the search box. Both lookups target the same id ("kw"); they are
#    shown together to demonstrate two selector styles, and the second
#    assignment simply overwrites the first.
input_search = bro.find_element_by_xpath('//*[@id="kw"]')  # XPath selector
input_search = bro.find_element_by_css_selector('#kw')  # CSS selector
# 2. Type the text
input_search.send_keys("美女")
# 3. Locate the search button
enter = bro.find_element_by_id('su')
time.sleep(3)
# 4. Click the button
enter.click()
time.sleep(5)
bro.close()

模拟百度登录

# 二、模拟百度登录
import time
bro=webdriver.Chrome(executable_path='./chromedriver')
bro.get("https://www.baidu.com")
# 隐式等待(最多等待10s)
# 只有控件没有加载出来，才会等，控件一旦加载出来，直接就取到
bro.implicitly_wait(10)

# 1.找到登录标签
submit_button=bro.find_element_by_link_text('登录')
# 2.点击登录
submit_button.click()  
# 3.找到用户名登录
user_button=bro.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
# 4.点击登录
user_button.click()
# 5.找到用户名框的标签
user_input=bro.find_element_by_id('TANGRAM__PSP_10__userName')
# 6.输入用户名
user_input.send_keys("jeff@qq.com")  
# 7.找到密码输入框
pwd_input=bro.find_element_by_id('TANGRAM__PSP_10__password')
# 8.输入密码
pwd_input.send_keys("123456")
# 9.找到登录按钮标签
submit_input=bro.find_element_by_id('TANGRAM__PSP_10__submit')
# 10.点击登录
submit_input.click()
time.sleep(5)
bro.close()

获取cookie

#搭建cookie池和代理池的作用是什么？应对网站封ip、封账号（准备一批小号，以及它们各自的cookie）

# 三 获取cookie
# 登录之后，拿到cookie：就可以自己搭建cookie池（requests模块发请求，携带着cookie）
import time
bro=webdriver.Chrome(executable_path='./chromedriver')
bro.get("https://www.baidu.com")
print(bro.get_cookies())
bro.close()

获取标签属性、获取文本、标签ID、位置、大小

# 6 获取标签属性
# (重点：获取属性)
print(tag.get_attribute('src'))  # 获取属性
print(tag.get_attribute('href'))  # 获取属性
print(tag.text)  # 获取文本


# #获取标签ID，位置，名称，大小（了解）
print(tag.id)  # 标签ID
print(tag.location)  # 位置      #{'x': 312, 'y': 213}
print(tag.tag_name)  # 标签名称  #input
print(tag.size)  # 大小

显式等待、隐式等待

# 7 显式等待和隐式等待
# 隐式等待(最多等待10s)
# 只有控件没有加载出来，才会等，控件一旦加载出来，直接就取到
bro.implicitly_wait(10)
# 显式等待（每个控件，都要写等待），不要使用

执行JS代码

简单使用

from selenium import webdriver
import time
bro=webdriver.Chrome(executable_path='./chromedriver')
bro.get("https://www.baidu.com")

# 执行js代码
bro.execute_script('alert(1)')
time.sleep(5)
bro.close()

js屏幕上下滚动

# js
window.scrollTo(0,100)  # 向下滑动100
window.scrollTo(0,500)  # 向下滑动500
window.scrollTo(0,document.body.scrollHeight)  # 滑到底部
window.scrollTo(0,document.body.scrollHeight-500) # 滑到-500

# 执行js
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')

# 完整代码
from selenium import webdriver
bro=webdriver.Chrome(executable_path='./chromedriver')
bro.get("https://www.cnblogs.com")
# 执行js代码
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')

模拟浏览器前进后退

# bro.back()   # 后退
# bro.forward()  # 前进


from selenium import webdriver
import time
bro=webdriver.Chrome(executable_path='./chromedriver')


bro.get("https://www.cnblogs.com")  # 第一个页面
time.sleep(1)
bro.get("https://www.baidu.com")  # 第二个页面
time.sleep(1)
bro.get("https://www.jd.com")  # 第三个页面
time.sleep(1)
bro.back()  # 后退
time.sleep(1)
bro.forward()  # 前进

选项卡管理(新窗口跳转)

原理：都是js在操作，执行Js代码

import time

from selenium import webdriver

browser = webdriver.Chrome(executable_path='./chromedriver')

browser.get('https://www.baidu.com')
browser.execute_script('window.open()')  # open a new tab via JavaScript

print(browser.window_handles)  # handles of all open tabs
# switch_to.window() replaces the deprecated switch_to_window() helper.
browser.switch_to.window(browser.window_handles[1])  # switch to the tab at index 1
browser.get('https://www.taobao.com')  # navigate in that tab
time.sleep(2)
browser.switch_to.window(browser.window_handles[0])  # back to the tab at index 0
browser.get('https://www.sina.com.cn')  # navigate in that tab
# The browser is intentionally left open here; call browser.quit() when finished.

异常处理

from selenium import webdriver

# Bind the name first: if Chrome itself fails to start, the finally clause
# must not hit an unbound variable and mask the original error.
browser = None
try:
    browser = webdriver.Chrome(executable_path='./chromedriver')
    browser.get('http://www.baidu.com')
    browser.find_element_by_id("xxx")  # placeholder id, presumably absent, to provoke an error

except Exception as e:
    print(e)
finally:  # runs whether or not an exception occurred
    if browser is not None:
        # quit() ends the driver session as well as closing the window
        browser.quit()

模拟键盘操作

from selenium.webdriver.common.keys import Keys
#模拟键盘操作(模拟键盘敲回车)
input_search.send_keys(Keys.ENTER) 


# 案例，打开百度，输入美女，键盘敲回车
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
bro=webdriver.Chrome(executable_path='./chromedriver')
bro.get("https://www.baidu.com")
bro.implicitly_wait(10)   # 隐式等待
 
input_search=bro.find_element_by_css_selector('#kw')  # 找到输入框
input_search.send_keys("美女")  # 输入美女

#模拟键盘操作(模拟键盘敲回车)
input_search.send_keys(Keys.ENTER)

这篇关于selenium爬虫的文章就介绍到这儿，希望我们推荐的文章对大家有所帮助，也希望大家多多支持为之网！

Java教程

selenium爬虫

介绍

安装

有界面的浏览器

无界面浏览器

开发者模式

window.navigator.webdriver为true的情况

selenium高级用法

selenium选择器选择

常用用法

模拟百度登录

获取cookie

获取标签属性、获取文本、标签ID、位置、大小

显式等待、隐式等待

执行JS代码

简单使用

js屏幕上下滚动

模拟浏览器前进后退

选项卡管理(新窗口跳转)

异常处理

模拟键盘操作

前端开发

后端开发

移动端开发

数据库

服务器运维

人工智能

区块链

游戏开发

网站运营

大数据/云计算

软件工程

软件/开发工具使用

资讯