打开网站
http://bang.dangdang.com/books/fivestars/1-1
一共25页,翻页时 URL 最后面的页码会随之改变
我们打算获得:排名、书名、图片地址、作者、推荐指数、五星评分次数、价格
主要思路:
def main(page):
    """Download one page of the five-star book ranking and store its entries.

    Args:
        page: Page number appended to the end of the ranking URL.
    """
    url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(page)
    html = request_dandan(url)
    if html is None:
        # request_dandan() yields None when the request fails or the status
        # is not 200; passing None on to the regex parser raises TypeError.
        return
    # Parse the page and keep only the fields we are interested in.
    for item in parse_result(html):
        write_item_to_file(item)
请求当当网:请求成功之后,拿到网页源代码
def request_dandan(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page source when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status or any ``requests.RequestException``).
    """
    try:
        # Without a timeout requests.get() may block indefinitely on a
        # stalled connection; a timeout surfaces as a RequestException.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
拿到源代码之后就要对其进行解析:使用正则表达式获取我们想要的关键信息
获取到了之后我们封装一下数据
# One match per <li> ranking entry. The seven capture groups are consumed
# positionally by parse_result(). Raw strings keep \d and \s as regex escapes
# instead of (invalid) Python string escapes.
_BOOK_PATTERN = re.compile(
    r'<li>.*?list_num.*?(\d+).</div>.*?<img src="(.*?)".*?'
    r'class="name".*?title="(.*?)">.*?class="star">.*?'
    r'class="tuijian">(.*?)</span>.*?class="publisher_info">.*?'
    r'target="_blank">(.*?)</a>.*?class="biaosheng">.*?<span>(.*?)</span></div>.*?'
    r'<p><span\sclass="price_n">¥(.*?)</span>.*?</li>',
    re.S,
)


def parse_result(html):
    """Yield one dict per book entry matched in the ranking page source.

    Args:
        html: Page source as a string. ``None`` or an empty string yields
            nothing instead of raising.

    Yields:
        Dicts with the string-valued keys ``range``, ``image``, ``title``,
        ``recommend``, ``author``, ``items`` and ``price``, filled from the
        pattern's capture groups in that order.
    """
    if not html:
        # A failed download hands us None; re.findall() would raise TypeError.
        return
    for rank, image, title, recommend, author, count, price in _BOOK_PATTERN.findall(html):
        yield {
            "range": rank,
            "image": image,
            "title": title,
            "recommend": recommend,
            "author": author,
            "items": count,
            "price": price,
        }
写到文件里面
def write_item_to_file(item, path='book.txt'):
    """Append *item* to a text file as a single JSON line.

    Args:
        item: JSON-serialisable object (the scraper passes one book dict).
        path: File to append to; created if it does not exist.
    """
    print('开始写入数据 ====> ' + str(item))
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(path, 'a', encoding='UTF-8') as f:
        # ensure_ascii=False writes non-ASCII text as-is rather than \uXXXX.
        f.write(json.dumps(item, ensure_ascii=False) + '\n')
import json
import re

import requests

# One match per <li> ranking entry. The seven capture groups are consumed
# positionally by parse_result(). Raw strings keep \d and \s as regex escapes
# instead of (invalid) Python string escapes.
_BOOK_PATTERN = re.compile(
    r'<li>.*?list_num.*?(\d+).</div>.*?<img src="(.*?)".*?'
    r'class="name".*?title="(.*?)">.*?class="star">.*?'
    r'class="tuijian">(.*?)</span>.*?class="publisher_info">.*?'
    r'target="_blank">(.*?)</a>.*?class="biaosheng">.*?<span>(.*?)</span></div>.*?'
    r'<p><span\sclass="price_n">¥(.*?)</span>.*?</li>',
    re.S,
)


def main(page):
    """Download one page of the five-star book ranking and store its entries.

    Args:
        page: Page number appended to the end of the ranking URL.
    """
    url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(page)
    html = request_dandan(url)
    if html is None:
        # The download failed or was not a 200 reply; nothing to parse.
        return
    # Parse the page and keep only the fields we are interested in.
    for item in parse_result(html):
        write_item_to_file(item)


def request_dandan(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page source when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status or any ``requests.RequestException``).
    """
    try:
        # Without a timeout requests.get() may block indefinitely on a
        # stalled connection; a timeout surfaces as a RequestException.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_result(html):
    """Yield one dict per book entry matched in the ranking page source.

    Args:
        html: Page source as a string. ``None`` or an empty string yields
            nothing instead of raising.

    Yields:
        Dicts with the string-valued keys ``range``, ``image``, ``title``,
        ``recommend``, ``author``, ``items`` and ``price``, filled from the
        pattern's capture groups in that order.
    """
    if not html:
        return
    for rank, image, title, recommend, author, count, price in _BOOK_PATTERN.findall(html):
        yield {
            "range": rank,
            "image": image,
            "title": title,
            "recommend": recommend,
            "author": author,
            "items": count,
            "price": price,
        }


def write_item_to_file(item, path='book.txt'):
    """Append *item* to a text file as a single JSON line.

    Args:
        item: JSON-serialisable object (the scraper passes one book dict).
        path: File to append to; created if it does not exist.
    """
    print('开始写入数据 ====> ' + str(item))
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(path, 'a', encoding='UTF-8') as f:
        # ensure_ascii=False writes non-ASCII text as-is rather than \uXXXX.
        f.write(json.dumps(item, ensure_ascii=False) + '\n')


if __name__ == "__main__":
    # Walk ranking pages 1 through 25.
    for i in range(1, 26):
        main(i)
最终的效果如下: