本文主要介绍 Python 简单爬虫,对大家解决编程问题具有一定的参考价值,需要的程序员们随着小编一起来学习吧!
Python 简单爬虫:爬取某网站的简历模板
from lxml import etree
import requests
def main():
    """Fetch the free-resume listing page and pass its detail links to ``main2``.

    Downloads the listing page, prints the HTTP status code, then collects the
    ``./a/@href`` values of every ``div`` directly under the element with
    ``id="container"``. Each collected entry is the list returned by XPath
    (``main2`` reads its first element). Entries with no link are dropped so
    the next stage never indexes into an empty list.
    """
    url = "https://sc.chinaz.com/jianli/free.html"
    header = {"User-Agent": "hahhaha"}
    # Without a timeout requests waits indefinitely on a stalled connection.
    res = requests.get(url=url, headers=header, timeout=10)
    print(res.status_code)

    # Parse the page source so it can be queried with XPath.
    tree = etree.HTML(res.text)
    # Many divs match, so XPath returns a list of elements.
    div_list = tree.xpath('//div[@id="container"]/div')

    jl_list = []
    for div in div_list:
        # Link(s) of the anchor directly under this div (a list, possibly empty).
        href = div.xpath('./a/@href')
        if href:
            jl_list.append(href)

    main2(jl_list)
def main2(jl_list):
    """Visit each detail page and pass the first download link of each to ``main3``.

    Args:
        jl_list: iterable of XPath result lists; the first element of each is
            the detail-page href (protocol-relative, path-only or absolute).

    For every detail page the first ``li`` inside the
    ``div`` with class ``"clearfix mt20 downlist"`` is located and the
    ``./a/@href`` list of that item is collected. Pages without such an item,
    and empty input entries, are skipped instead of raising ``IndexError``.
    """
    from urllib.parse import urljoin

    header = {"User-Agent": "haha"}
    xz_list = []
    for xz in jl_list:
        if not xz:
            continue
        # urljoin turns "//host/path" into "http://host/path" (same result as
        # the old "http:" + href) while leaving absolute URLs untouched and
        # resolving path-only hrefs against the site root.
        url = urljoin("http://sc.chinaz.com/", xz[0])
        res = requests.get(url=url, headers=header, timeout=10)
        tree = etree.HTML(res.text)

        # Locate the download list; it may be absent on some pages.
        items = tree.xpath('//div[@class="clearfix mt20 downlist"]//li')
        if not items:
            continue

        # Download address(es) of the first list item.
        href = items[0].xpath('./a/@href')
        if href:
            xz_list.append(href)

    main3(xz_list)
def main3(xz_list):
    """Download every archive in ``xz_list`` into the ``./jx模板/`` directory.

    Args:
        xz_list: iterable of XPath result lists; the first element of each is
            the download URL of one archive.

    Files are named ``<index>.rar`` where ``<index>`` is the entry's position
    in ``xz_list``. The output directory is created when missing. Empty
    entries and non-successful HTTP responses are skipped, so an error page is
    never saved as an archive.
    """
    import os

    out_dir = "./jx模板/"
    # open() does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)

    header = {"User-Agent": "haha"}
    for t, rarxz in enumerate(xz_list):
        if not rarxz:
            continue
        url = rarxz[0]
        res = requests.get(url=url, headers=header, timeout=30)
        if not res.ok:
            print("download failed:", url, res.status_code)
            continue
        # Binary write mode; the with-block closes the file, no explicit close needed.
        with open(out_dir + str(t) + ".rar", "wb") as f:
            f.write(res.content)
        print("is ok ")
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
这篇关于 Python 简单爬虫的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!