1 SSL client certificates (sites that require this are rare)
import requests
response = requests.get('https://www.12306.cn', verify=False)  # skip certificate verification: prints a warning but returns 200
print(response.status_code)

import requests
response = requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))
print(response.status_code)

2 Timeout settings
import requests
response = requests.get('https://www.baidu.com', timeout=0.0001)

3 Authentication
import requests
from requests.auth import HTTPBasicAuth
r = requests.get('xxx', auth=HTTPBasicAuth('user', 'password'))
print(r.status_code)

4 Exception handling
import requests
from requests.exceptions import *  # browse requests.exceptions to see the available exception types
try:
    r = requests.get('http://www.baidu.com', timeout=0.00001)
except ReadTimeout:
    print('read timeout')
except ConnectionError:  # network unreachable
    print('connection error')
except Timeout:
    print('timeout')
except RequestException:
    print('request error')
except Exception as e:
    print('unknown error')

5 Using proxies
# Proxies: free ones found online (unstable, fine for practice) vs. paid ones (stable, what companies buy)
# Anonymity: a high-anonymity proxy hides the visitor's IP;
# a transparent proxy does not -- the real IP travels in the X-Forwarded-For request header (in Django, read it from request.META)
# You can also self-host a free proxy pool with this open-source project (Python + Flask): https://github.com/jhao104/proxy_pool
import requests
ip = requests.get('http://118.24.52.95:5010/get/').json()['proxy']
print(ip)
proxies = {
    'http': ip
}
response = requests.get('http://101.133.225.166:8088/test_ip/', proxies=proxies)
print(response.text)

6 Uploading files
import requests
response = requests.post('http://101.133.225.166:8088/upload_file/',
                         files={'myfile': open('1 requests高级用法.py', 'rb')})
print(response.text)
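Putting the pieces above together, here is a minimal sketch of a "robust" GET that combines the proxy pool, a timeout, and exception handling. The pool address and test URL are the ones from the notes; the retry count, the timeouts, and the helper name robust_get are arbitrary choices for illustration.

# Sketch: combine proxy pool + timeout + exception handling (helper name and retry count are made up).
import requests
from requests.exceptions import RequestException

def robust_get(url, retries=3):
    for attempt in range(retries):
        try:
            # fetch a fresh proxy from the self-hosted pool
            proxy = requests.get('http://118.24.52.95:5010/get/', timeout=3).json()['proxy']
            res = requests.get(url, proxies={'http': proxy}, timeout=5)
            return res.text
        except RequestException as e:
            print('attempt %s failed: %s' % (attempt + 1, e))
    return None

print(robust_get('http://101.133.225.166:8088/test_ip/'))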
# Post a comment (or an upvote) on dig.chouti.com.
# The site checks Referer, User-Agent and the logged-in Cookie, so all three headers are required.
import requests

# Payload for the upvote endpoint:
# data = {
#     'linkId': '31009758',
# }
data = {
    'content': '其实一般',  # the comment text to post
    'linkId': '31008563',
    'parentId': '0',
    'pictureUrl': ''
}
header = {
    'Referer': 'https://dig.chouti.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'Cookie': 'deviceId=web.eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJqaWQiOiIwZmVlMjk5OS1iMDgzLTRmYzctOTM4MC03YjIzZmVmY2U5YmYiLCJleHBpcmUiOiIxNjIzOTA0ODk5MzM5In0.7cadtBYznS6OgnLwEF8aH0AmtDOoYB1WKDgdU4eYYS0; __snaker__id=VbChmBUEZIVY3FPa; _9755xjdesxxd_=32; YD00000980905869%3AWM_TID=%2FazmF9%2FrClJFEVFBVRN70z7msH6De39Y; YD00000980905869%3AWM_NI=fmln0UTLoOM0bJxRYMet9SoHoQFrKUG7angbfEmftGxseQnkMmbwsdEPNwgtVpQ9K0fqli5fhP6nKsZ15bIt%2BQYBdpjdM8x19UJqjf6LSi%2FmhSgQW%2F3SYGNWEwJPPlYGRWM%3D; YD00000980905869%3AWM_NIKE=9ca17ae2e6ffcda170e2e6eeb5d567838e8fa6f94dbaef8eb7d54a938e8b85f83bf88a97a2e464a98689afaa2af0fea7c3b92aa6b3a48fb35f9894a1b0d03ca296b8b3dc47a7acf7b4ee44ad8f8a93ca5f85e9af8fe66aa69ba387f74dbcadabb2ed618fb3ae98f27087908298e68096b09fdaca3ca6afa48ab86eac90fa8fca799aeffb83cc80e98f97a3e77caabc83d9fb3bfb8b8692e96ef6949d8aae67ac8da9b2d625f18d97a8cd5d87a986b1d3689b999eb8d037e2a3; Hm_lvt_03b2668f8e8699e91d479d62bc7630f1=1621312902,1621392225; gdxidpyhxdE=weRAWhzVrJfrCGllI4mwY8LxZOiO4D79t%2Fkf8j8qcJUsTDrjyVh05GQiaf6uL8dwsXpkShI%2B2uGHa9Vj5b1QilxdgI%2BoDUr%5C0VN4kMrnVLUmzGb56lwmZRoAmUq%2FToGtCRjYKAaANejzA%5CQcWg4LwkrdXzwqNISMTfwQUaMw4puru4fM%3A1621393127138; token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJqaWQiOiJjZHVfNTMyMDcwNzg0NjAiLCJleHBpcmUiOiIxNjIzOTg0MjcwNDA0In0.4Q4uQAd4LkbVVcu37t0SjRFE4CSIidduRspeQ08-iYE; Hm_lpvt_03b2668f8e8699e91d479d62bc7630f1=1621392374'
}
# Upvote endpoint:
# res = requests.post('https://dig.chouti.com/link/vote', data=data, headers=header)
res = requests.post('https://dig.chouti.com/comments/create', data=data, headers=header)
print(res.text)
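The Cookie above is a session-specific value copied from a logged-in browser, so the script only works for that one account. A small refactor keeps the per-session parts as arguments; this is only a sketch, and the helper name post_comment is made up, while the endpoint and header layout come from the snippet above.

# Sketch: parameterize the copied cookie and the comment payload (post_comment is a hypothetical helper).
import requests

def post_comment(cookie, link_id, content):
    header = {
        'Referer': 'https://dig.chouti.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Cookie': cookie,  # paste the Cookie header from a logged-in browser session
    }
    data = {'content': content, 'linkId': link_id, 'parentId': '0', 'pictureUrl': ''}
    return requests.post('https://dig.chouti.com/comments/create', data=data, headers=header)

# Usage:
# print(post_comment('<cookie copied from browser>', '31008563', 'test').text)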
# Crawl the Autohome news list and store each article in MySQL.
import requests
from bs4 import BeautifulSoup
import pymysql

res = requests.get('https://www.autohome.com.cn/news/1/#liststart')
# print(res.text)
soup = BeautifulSoup(res.text, 'html.parser')
ul_list = soup.find_all(name='ul', class_='article')
li_list = []
for ul in ul_list:
    li_list += ul.find_all(name='li')

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='111111', database='Test', charset='utf8')
cursor = conn.cursor()
for li in li_list:
    h3 = li.find(name='h3')
    if h3:
        title = h3.text
        url = 'http:' + li.find(name='a')['href']
        desc = li.find(name='p').text
        img = 'http:' + li.find(name='img')['src']
        # print('''
        # Article title:   %s
        # Article URL:     %s
        # Article image:   %s
        # Article summary: %s
        # ''' % (title, url, img, desc))
        # Use a parameterized query instead of string formatting, so quotes in the data
        # cannot break the statement (and to avoid SQL injection).
        sql = "INSERT INTO journalism(title, url, img, dc) VALUES (%s, %s, %s, %s);"
        cursor.execute(sql, (title, url, img, desc))
conn.commit()
cursor.close()
conn.close()
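The INSERT above assumes a journalism table with columns title, url, img and dc already exists in the Test database. A possible definition is sketched below; the column lengths and the id column are guesses, only the table/column names come from the script.

# Sketch: create the table the INSERT above expects (column sizes are assumptions).
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='111111', database='Test', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS journalism (
        id INT PRIMARY KEY AUTO_INCREMENT,
        title VARCHAR(255),
        url VARCHAR(512),
        img VARCHAR(512),
        dc VARCHAR(1024)
    ) CHARSET=utf8
""")
conn.commit()
cursor.close()
conn.close()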
1 Beautiful Soup is a Python library for extracting data from HTML or XML documents
2 It ships with a default parser, html.parser
3 The faster lxml parser can be installed separately (pip3 install lxml)
4 When searching for data in the HTML there are three options:
-CSS selectors (general, work across libraries)
-XPath selectors (general, work across libraries; see the lxml sketch below)
-the module's own search methods (find, find_all)
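BeautifulSoup itself only provides CSS selectors and find/find_all; the XPath route goes through the lxml library that also powers the lxml parser. A minimal sketch, using a tiny made-up HTML string:

# Sketch: the XPath option via lxml (bs4 does not expose XPath directly).
from lxml import etree

html_doc = "<html><body><p class='title'>hello<a href='http://example.com'>link</a></p></body></html>"
tree = etree.HTML(html_doc)
print(tree.xpath('//p[@class="title"]/a/@href'))  # -> ['http://example.com']
print(tree.xpath('//a/text()'))                   # -> ['link']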
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_p'>p content<b>The Dormouse's story<span>grandchild</span></b><span>lqz</span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')
# Pretty-print
# print(soup.prettify())

# Traversing the document tree
from bs4.element import Tag
soup.html.p.text

# 1、Attribute-style traversal: each node is a bs4.element.Tag object and can be used just like the soup object itself
print(soup.html.head)
print(soup.html.body.p)

# 2、Getting a tag's name: every bs4.element.Tag has a name attribute
print(soup.html.body.name)

# 3、Getting a tag's attributes (class values come back as a list)
print(soup.html.body.p)
print(soup.html.body.p.attrs)
print(soup.html.body.p.attrs.get('class'))
print(soup.html.body.p.attrs['id'])
print(soup.html.body.p['class'])  # class is returned as a list
print(soup.html.body.p['id'])     # id is a single value

# 4、Getting a tag's text
print(soup.html.body.p)
print(soup.html.body.p.text)           # text of the tag and all of its descendants
print(soup.html.body.p.string)         # only returns text when the tag has no nested children, otherwise None
print(list(soup.html.body.p.strings))  # generator over all descendant strings

# 5、Nested selection
print(soup.p.b.string)

# The items below are optional background:

# 6、Children and descendants
print(soup.p.contents)     # all direct children of p, as a list
print(soup.p.children)     # an iterator over p's direct children
for i, child in enumerate(soup.p.children):
    print(i, child)
print(soup.p.descendants)  # all descendants of p: every nested tag and string
for i, child in enumerate(soup.p.descendants):
    print(i, child)

# 7、Parents and ancestors
print(soup.b.parent)          # the direct parent of the b tag
print(list(soup.b.parents))   # all ancestors of b: its parent, the parent's parent, and so on
print(len(list(soup.b.parents)))

# 8、Siblings
print(soup.a.next_sibling)         # the immediately following sibling (may be plain whitespace)
print(soup.a.previous_sibling)     # the immediately preceding sibling
print(list(soup.a.next_siblings))  # generator over all following siblings
print(soup.a.previous_siblings)    # generator over all preceding siblings
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_p'>p content<b>The Dormouse's story<span>grandchild</span></b><span>lqz</span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" xx='xx'><span>lqz</span>Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3" name='lqz'>Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# Searching the document tree: find / find_all
# 1、Five kinds of filters: string, regular expression, list, True, function
soup.find()      # return the first match
soup.find_all()  # return all matches

# String filters
res = soup.find(name='body')
# Find the a tag whose id is link1
res = soup.find(name='a', id='link1')
res = soup.find_all(name='a', class_='sister')
res = soup.find_all(name='a', href="http://example.com/elsie")
res = soup.find_all(name='a', xx='xx')
res = soup.find_all(name='a', attrs={'class': 'sister'})  # attributes can also be passed via attrs
res = soup.find_all(attrs={'id': 'link1'})
res = soup.find_all(attrs={'xx': 'xx'})
res = soup.find_all(name='a', attrs={'name': 'lqz'})  # the HTML 'name' attribute must go through attrs, since name= means tag name
print(res)

# Regular expressions
import re
# res = soup.find_all(name=re.compile('^b'))
# res = soup.find_all(class_=re.compile('^s'))
res = soup.find_all(attrs={'name': 'lqz'}, id=re.compile('^l'))
print(res)

# Lists
res = soup.find_all(name=['b', ])
res = soup.find_all(id=['link1', 'link2'])
print(res)

# True
res = soup.find_all(class_=True)  # every tag that has a class attribute
res = soup.find_all(href=True)    # every tag that has an href attribute
print(res)

# Function filters (optional)
# Find tags that have a class attribute but no id
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')
res = soup.find_all(name=has_class_but_no_id)
print(res)

# Traversal and search can be combined
res = soup.find(name='a').span.text
res = soup.html.body.find('a')
print(res)

# limit: cap the number of results
# soup.findChild()
res = soup.find_all(name='a', limit=1)
print(res)

# recursive: whether to search descendants; False searches direct children only
res = soup.body.find_all(name='p', recursive=False)
res = soup.find_all(name='p', recursive=False)
res = soup.find_all(name='p', recursive=True)
print(res)
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_p'>p content<b>The Dormouse's story<span>grandchild</span></b><span>lqz</span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" xx='xx'><span>lqz</span>Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3" name='lqz'>Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# select() takes a CSS selector
'''
tag       plain tag name
.class    by class
#id       by id
div>a     a tags that are direct children of div
div a     a tags anywhere inside div
'''
res = soup.select('.sister')
res = soup.select('#link2')
res = soup.select('p>b')
res = soup.select('p b')
print(res)

# bs4 can also modify XML-style documents, which is handy later for config files stored as XML
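A minimal sketch of that modification idea: the XML snippet below is made up, and the same approach works on HTML. (With lxml installed you can pass 'xml' as the parser instead of 'html.parser'.)

# Sketch: editing an XML-style config snippet with bs4 (the config content is invented).
from bs4 import BeautifulSoup

xml_doc = "<config><server host='127.0.0.1' port='8000'></server></config>"
soup = BeautifulSoup(xml_doc, 'html.parser')
server = soup.find('server')
server['port'] = '9000'   # change an attribute in place
print(str(soup))          # serialize the modified document back to text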
1 The requests module cannot execute JavaScript
2 selenium can drive a real browser and simulate what a human user does
3 In essence, selenium controls a browser through a driver and reproduces browser actions -- navigating, typing, clicking, scrolling and so on -- so you get the page as it looks after rendering; multiple browsers are supported
# selenium drives a browser: Chrome (used here), Firefox, IE, ...
# Download the matching driver (chromedriver for Chrome)
# The local Chrome version must match the chromedriver version
# Chinese mirror: http://npm.taobao.org/mirrors/chromedriver/
# pip3 install selenium
from selenium import webdriver
import time

# Instantiate the driver and point it at the chromedriver executable;
# this is the same as opening a browser window by hand
driver = webdriver.Chrome(executable_path='chromedriver.exe')

# Type the URL into the address bar
driver.get('https://www.baidu.com')
time.sleep(3)
print(driver.page_source)  # the rendered HTML of the current page

# Close the browser
driver.close()
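Point 3 above also mentions typing and clicking. A minimal sketch against Baidu's search page, written in the same Selenium 3 style as the snippet above; the element ids 'kw' (search box) and 'su' (search button) are Baidu's at the time of writing and may change, and newer Selenium 4 code would use find_element(By.ID, ...) instead.

# Sketch: typing into an input and clicking a button (Selenium 3 style API).
from selenium import webdriver
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')

inp = driver.find_element_by_id('kw')    # locate the search input (Baidu-specific id)
inp.send_keys('selenium')                # type a query
driver.find_element_by_id('su').click()  # click the search button (Baidu-specific id)

time.sleep(3)
print(driver.page_source)  # HTML of the results page, after JS rendering
driver.close()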