tmp/blog_spider.py
import requests

# Front-page URLs for pages 1..50 of cnblogs.
# NOTE(review): "#p{page}" is a URL fragment, which is not sent to the
# server in an HTTP request, so every entry likely fetches the same
# document -- confirm whether a real pagination path was intended.
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 50 + 1)
]


def craw(url, timeout=10):
    """Fetch *url* and print the URL together with the response length.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the caller (and
            any ``Thread.join`` waiting on it) indefinitely.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    r = requests.get(url, timeout=timeout)
    print(url, len(r.text))


# Run the smoke test only when executed as a script, so that importing
# this module does not trigger a network request.
if __name__ == "__main__":
    craw(urls[0])
01.multi_thread_craw.py
import threading
import time

import blog_spider


def single_thread():
    """Crawl every URL in ``blog_spider.urls`` one after another."""
    print("single_thread begin")
    for url in blog_spider.urls:
        blog_spider.craw(url)
    print("single_thread end")


def multi_thread():
    """Crawl every URL in ``blog_spider.urls`` using one thread per URL.

    All threads are started first and only then joined, so the requests
    can overlap; the function returns after every thread has finished.
    """
    # Fixed: these messages previously said "single_thread" (copy-paste).
    print("multi_thread begin")
    threads = [
        threading.Thread(target=blog_spider.craw, args=(url,))
        for url in blog_spider.urls
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    print("multi_thread end")


if __name__ == '__main__':
    # perf_counter() is monotonic, so the measured interval cannot be
    # distorted by system clock adjustments.
    start = time.perf_counter()
    single_thread()
    end = time.perf_counter()
    print("single_thread cost: ", end - start, "seconds")

    start = time.perf_counter()
    multi_thread()
    end = time.perf_counter()
    print("multi_thread cost: ", end - start, "seconds")
爬取速度对比
可以看到，多线程的爬取速度足足提高了约 8 倍；当然，具体倍数与网络状况和电脑的性能也有关系。
单线程
多线程