selenium+chromedriver+python 实现淘宝的信息爬取

本文主要介绍如何用 selenium+chromedriver+python 实现淘宝的信息爬取，对大家解决编程问题具有一定的参考价值，需要的程序猿们随着小编来一起学习吧！

因为我是个爬虫新手，所以对爬虫还不熟练，这几天想着自己做一个淘宝信息的自动爬取，一开始感觉比较简单，但做到了登录界面，发现一直被网站检测出来，不能滑动滑块。接下来从网上翻遍了资料，整理了一下自己的代码，完成了这个艰难的工程（嘻嘻，对我来说）。下面先把代码放上来，想做这个项目的小伙伴可以复制。

from selenium import webdriver
from selenium.webdriver import ChromeOptions
#导入显性等待库
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import  expected_conditions as ec
from selenium.webdriver.common.by import By
import time
import pymysql

import time

class Information:
    """Search Taobao for a product, log in, and store the results in MySQL.

    The public methods form a chain: ``search()`` submits the product name on
    the home page and calls ``login()``; ``login()`` fills in the credentials
    and calls ``collect()``; ``collect()`` reads the name, price and place of
    every listed item and inserts them into a table named after the product.
    """

    # Connection settings used when ``db_config`` is not passed to __init__.
    # NOTE(review): credentials are kept in source only as a default; pass
    # ``db_config`` to override them.
    DEFAULT_DB_CONFIG = {
        'host': 'localhost',
        'user': 'root',
        'password': 'asdjkl456',
        'database': 'xiaole',
    }

    # Upper bound (seconds) for each explicit wait. The wait returns as soon
    # as the awaited element shows up, so a generous value costs nothing on a
    # fast page and leaves time to solve a slider captcha by hand.
    WAIT_SECONDS = 30

    # Locators of the result list fields and of the login submit button.
    NAME_XPATH = '//*[@id="mainsrp-itemlist"]/div/div/div[1]/div[*]/div[2]/div[2]/a'
    PRICE_XPATH = '//*[@id="mainsrp-itemlist"]/div/div/div[1]/div[*]/div[2]/div[1]/div/strong'
    PLACE_XPATH = '//*[@id="mainsrp-itemlist"]/div/div/div[1]/div[*]/div[2]/div[3]/div[1]/a/span[2]'
    LOGIN_BUTTON_XPATH = '/html/body/div/div[2]/div[3]/div/div/div/div[2]/div/form/div[4]/button'

    def __init__(self, shop_name, use_name, passward, db_config=None):
        """Start Chrome with anti-detection tweaks and open the Taobao home page.

        Args:
            shop_name: Product keyword to search for; also used as the name
                of the MySQL table the results are written to.
            use_name: Taobao account name typed into the login form.
            passward: Taobao password typed into the login form.
            db_config: Optional mapping of keyword arguments for
                ``pymysql.connect``. Defaults to ``DEFAULT_DB_CONFIG``.
        """
        self.user_name = use_name
        self.passward = passward
        self.shop_name = shop_name   # product keyword to search for
        # Copy so later changes to the caller's mapping do not affect us.
        self.db_config = dict(self.DEFAULT_DB_CONFIG if db_config is None else db_config)

        self.option_ = webdriver.ChromeOptions()
        # Drop the "enable-automation" switch Chrome is normally started with.
        self.option_.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.driver_ = webdriver.Chrome(options=self.option_)
        # Anti-detection: before any page script runs on each new document,
        # make navigator.webdriver evaluate to undefined.
        self.driver_.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
            Object.defineProperty(navigator, 'webdriver', {
              get: () => undefined
            })
          """
        })
        # Request the home page.
        self.driver_.get('https://www.taobao.com/')

    @staticmethod
    def _quote_identifier(name):
        """Return *name* as a backtick-quoted MySQL identifier.

        Embedded backticks are doubled so the value cannot terminate the
        identifier early.

        Raises:
            ValueError: If *name* is empty.
        """
        text = str(name)
        if not text:
            raise ValueError('table name must not be empty')
        return '`' + text.replace('`', '``') + '`'

    def search(self):
        """Type the product name into the search box, submit, then log in."""
        # ``find_element(By.XPATH, ...)`` replaces ``find_element_by_xpath``,
        # which newer Selenium releases no longer provide.
        self.input = self.driver_.find_element(By.XPATH, '//*[@id="q"]')  # search input box
        self.input.send_keys(self.shop_name)
        self.driver_.find_element(By.XPATH, '//*[@id="J_TSearchForm"]/div[1]/button').click()
        self.login()

    def login(self):
        """Fill in the login form, wait for the result list, then collect it.

        Raises:
            selenium.common.exceptions.TimeoutException: If the login form or
                the result list does not appear within ``WAIT_SECONDS``.
        """
        print(f'正在登录{self.user_name}的账户')
        wait = WebDriverWait(self.driver_, self.WAIT_SECONDS)
        # Account field: wait until it can be typed into instead of assuming
        # the login page has already finished loading.
        account_box = wait.until(
            ec.element_to_be_clickable((By.XPATH, '//*[@id="fm-login-id"]'))
        )
        account_box.send_keys(str(self.user_name))
        # Password field.
        self.driver_.find_element(By.XPATH, '//*[@id="fm-login-password"]').send_keys(str(self.passward))
        # Login button.
        self.driver_.find_element(By.XPATH, self.LOGIN_BUTTON_XPATH).click()
        # Explicit wait instead of a fixed sleep: proceed as soon as at least
        # one product entry is present in the result list.
        wait.until(ec.presence_of_all_elements_located((By.XPATH, self.NAME_XPATH)))
        self.collect()

    def collect(self):
        """Read the listed items from the result page and save them to MySQL."""
        find_all = self.driver_.find_elements
        # Product names
        self.all_names = find_all(By.XPATH, self.NAME_XPATH)
        # Product prices
        self.all_prices = find_all(By.XPATH, self.PRICE_XPATH)
        # Product places
        self.all_places = find_all(By.XPATH, self.PLACE_XPATH)
        # Product links: an empty XPath is not a valid selector, so the link
        # is read from the name anchors instead.
        # NOTE(review): assumes the <a> matched by NAME_XPATH carries the
        # product URL in its href - confirm against the live page.
        self.all_htmls = [anchor.get_attribute('href') for anchor in self.all_names]
        print(len(self.all_names))
        print(len(self.all_prices))
        print(len(self.all_places))
        print('获取完成，开始录入')
        print('链接数据库中...')
        print(f'正在录入{self.shop_name}信息....')
        # zip() stops at the shortest list, so an item missing a price or a
        # place cannot cause an IndexError.
        rows = [
            (name.text, price.text, place.text)
            for name, price, place in zip(self.all_names, self.all_prices, self.all_places)
        ]
        self._save_rows(rows)

    def _save_rows(self, rows):
        """Create the product table if needed and insert *rows* into it.

        Args:
            rows: Iterable of ``(name, price, place)`` string tuples.
        """
        table = self._quote_identifier(self.shop_name)
        create_sql = f'''
          CREATE TABLE IF NOT EXISTS {table} (
            id int primary key auto_increment,
            商品名称 varchar(100),
            商品价格 varchar(10),
            商品产地 varchar(50)
        )charset=utf8;
        '''
        # Values go through driver-side parameter binding, so quotes inside a
        # product title cannot break or alter the statement.
        insert_sql = f'insert into {table}(商品名称,商品价格,商品产地) values(%s,%s,%s)'

        # Connect to the database.
        self.con = pymysql.connect(**self.db_config)
        try:
            # Create the cursor object.
            self.cur = self.con.cursor()
            try:
                print(f'创建{self.shop_name}表...')
                self.cur.execute(create_sql)
                print(f'{self.shop_name}表创建完成')
                for index, row in enumerate(rows):
                    print(f'第{index}条数据正在录入...')
                    self.cur.execute(insert_sql, row)
                # Commit once after all rows were accepted; if an insert
                # fails, the connection is closed without committing.
                self.con.commit()
            finally:
                self.cur.close()
        finally:
            self.con.close()
#
#
#
# Script entry: ask for the search keyword and the Taobao credentials, then
# run the scraper.
from getpass import getpass  # only this entry-point section needs it

shop_name = input('请输入你想查找的商品：')
user_name = input('请输入你的淘宝账号:')
# Read the password without echoing it to the terminal.
passward = getpass('请输入你的淘宝密码:')
# The constructor opens the Taobao home page; search() then runs the
# search -> login -> collect chain.
information = Information(shop_name,user_name,passward)
information.search()


#问题一：无法直接进入搜索   ==》  先进入登录界面，点击登录后会出现滑块
#问题二：无法滑动滑块  ==》 ？？？  添加等待，人工滑完再进入下一步  显性等待
#问题三：手动滑完滑块会被检测出来是爬虫在滑，会报错  ==》 添加反检测
        #self.driver_.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                #     "source": """
                #     Object.defineProperty(navigator, 'webdriver', {
                #       get: () => undefined
                #     })
                #   """
                # })

接下来我来讲一下我的项目步骤

step 1.导包(毕竟python的优点就在这里，库多)

from selenium import webdriver   #selenium库
from selenium.webdriver import ChromeOptions  
import time   #设置时间
import pymysql   #数据库

step 2.访问淘宝首页

# Create the browser object (a Chrome session driven by Selenium).
self.driver_ = webdriver.Chrome()
# Send a request to the Taobao home page.
self.driver_.get('https://www.taobao.com/')

step 3.在淘宝首页用selenium进行自动化输入搜索的商品

    def search(self):
        """Type the product name into the home-page search box and submit it."""
        # ``find_element(By.XPATH, ...)`` replaces ``find_element_by_xpath``,
        # which newer Selenium releases no longer provide.
        self.input = self.driver_.find_element(By.XPATH, '//*[@id="q"]')  # search input box
        self.input.send_keys(self.shop_name)  # self.shop_name is the product to search for
        # Click the search button. The XPath needs the ``*`` node test:
        # ``//[@id=...]`` is not a valid expression.
        self.driver_.find_element(By.XPATH, '//*[@id="J_TSearchForm"]/div[1]/button').click()

鼠标右键检查进入 Elements 界面，点击左上角的鼠标形状的图案，再点击输入框，就可以找到输入框在 Elements 中的位置。把鼠标放到元素所在的位置，右键 ==》 Copy ==》 Copy XPath，复制输入框元素的 XPath 值，利用 selenium 的定位方法（find_element_by_xpath，在 Selenium 4 中写作 find_element(By.XPATH, ...)，或其他定位方法）选中输入框，再利用 send_keys() 函数进行传值。利用相同方法定位到搜索按钮并使用 click() 函数进行点击，进入下一界面 ==》 登录界面。

step 4.模拟登录（重点）

先将代码奉上

    def login(self):
        """Fill in the login form, wait for the result list, then collect it.

        Raises:
            selenium.common.exceptions.TimeoutException: If the login form or
                the result list does not appear within 30 seconds.
        """
        print(f'正在登录{self.user_name}的账户')
        # Each wait returns as soon as its condition holds; 30 s is only the
        # upper bound, which also leaves time to solve a slider by hand.
        wait = WebDriverWait(self.driver_, 30)
        # Account field: wait until it can be typed into instead of assuming
        # the login page has already finished loading.
        account_box = wait.until(
            ec.element_to_be_clickable((By.XPATH, '//*[@id="fm-login-id"]'))
        )
        account_box.send_keys(str(self.user_name))
        # Password field.
        self.driver_.find_element(By.XPATH, '//*[@id="fm-login-password"]').send_keys(str(self.passward))
        # Login button.
        self.driver_.find_element(By.XPATH, '/html/body/div/div[2]/div[3]/div/div/div/div[2]/div/form/div[4]/button').click()
        # Explicit wait instead of a fixed sleep: continue once at least one
        # product entry is present in the result list.
        wait.until(ec.presence_of_all_elements_located(
            (By.XPATH, '//*[@id="mainsrp-itemlist"]/div/div/div[1]/div[*]/div[2]/div[2]/a')
        ))
        self.collect()

定位账号，密码两个输入框和上一步定位物品搜索框步骤一致

输入完之后点击登录，发现多了滑块滑动验证码，即使你是用滑块滑动方法（那个手机号我是滑完滑块后删除的（个人隐私不方便透露）），仍会给你报错

到了这里你就要考虑是不是已经被淘宝检测出来了，

可以使用window.navigator.webdriver来进行一个自我查看，如果结果为true，就代表着被检测了出来

接下来要说的就是selenium的反检测

我们都知道使用 selenium 进行网站登录，只是模拟用户登录，而并不是变成真实用户。有一些网站因为保护措施较严，selenium 被检测出来的几率很大，我们可以使用反检测代码进行 selenium 的伪装。

    def __init__(self,shop_name,use_name,passward):
        """Start Chrome with the anti-detection tweaks applied.

        The "enable-automation" switch is excluded from Chrome's start-up
        switches, and a script is registered that makes
        ``navigator.webdriver`` evaluate to ``undefined`` on each new document.
        """
        # JavaScript registered to run on every new document.
        hide_webdriver_js = """
            Object.defineProperty(navigator, 'webdriver', {
              get: () => undefined
            })
          """
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.option_ = chrome_options
        self.driver_ = webdriver.Chrome(options=chrome_options)
        # Add the anti-detection script through the DevTools protocol.
        self.driver_.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": hide_webdriver_js},
        )

添加完之后，再次运行代码，会发现点击登录后直接进入商品搜索界面

进入到这里之后就可以任性发挥了，想获取商品名称，商品价格，生产厂商的数据都可以，需要的话链接数据库，把爬下来的数据放进数据库

这篇关于 selenium+chromedriver+python 实现淘宝的信息爬取的文章就介绍到这儿，希望我们推荐的文章对大家有所帮助，也希望大家多多支持为之网！

Python教程

selenium+chromedriver+python 实现淘宝的信息爬取

前端开发

后端开发

移动端开发

数据库

服务器运维

人工智能

区块链

游戏开发

网站运营

大数据/云计算

软件工程

软件/开发工具使用

资讯