Python 的 queue 模块(Python 3;Python 2 中名为 Queue)提供了同步的、线程安全的队列类,包括 FIFO(先入先出)队列 Queue、LIFO(后入先出)队列 LifoQueue 和优先级队列 PriorityQueue。这些队列都实现了锁原语,能够在多线程中直接使用,因此可以用队列来实现线程间的同步。
Queue.qsize() 返回队列的大小
Queue.empty() 如果队列为空,返回True,反之False
Queue.full() 如果队列满了,返回True,反之False
Queue.full() 的判断依据是构造队列时传入的 maxsize:元素个数达到 maxsize 时返回 True;maxsize 为 0 或负数时队列容量无上限,full() 恒为 False
Queue.get([block[, timeout]]) 从队列中取出并返回一个元素;block 控制队列为空时是否阻塞等待,timeout 为最长等待时间(秒),超时仍取不到元素则抛出 queue.Empty 异常
Queue.get_nowait() 相当Queue.get(False)
Queue.put(item[, block[, timeout]]) 将 item 写入队列;队列已满时 block 控制是否阻塞等待,timeout 为最长等待时间(秒),超时仍写不进则抛出 queue.Full 异常
Queue.put_nowait(item) 相当Queue.put(item, False)
Queue.task_done() 在完成一项工作之后,Queue.task_done()函数向任务已经完成的队列发送一个信号
Queue.join() 阻塞直到队列中所有元素都被取出、且每个元素都已调用过 task_done(),即所有任务真正处理完毕——而不仅仅是队列变空
from queue import Queue, Empty
from time import sleep
from threading import Thread


class Mythread(Thread):
    """Worker thread that drains items from a shared queue, printing each one.

    Parameters:
        queue_url: the queue.Queue shared by all workers.
        delay: seconds to pause after handling each item (default 3,
            matching the original behavior; set to 0 in tests).
    """

    def __init__(self, queue_url, delay=3):
        # Initialize the Thread base class so start()/run() work.
        Thread.__init__(self)
        self.queue_url = queue_url
        self.delay = delay

    def run(self):
        # BUG FIX: checking empty() and then calling get() is racy when
        # several workers share the queue — another thread may take the
        # last item in between, leaving get() blocked forever.
        # get_nowait() + Empty makes "take or stop" atomic.
        while True:
            try:
                i = self.queue_url.get_nowait()
            except Empty:
                break
            print(i)
            sleep(self.delay)


if __name__ == '__main__':
    url_queue = Queue()
    for i in range(1, 11):
        url_queue.put(i)
    # Four workers share one queue; each exits once the queue is drained.
    for i in range(4):
        t = Mythread(url_queue)
        t.start()
from queue import Queue, Empty
from threading import Thread
from time import sleep


def spider(url_queue, delay=3):
    """Drain items from *url_queue*, printing each one.

    Parameters:
        url_queue: the queue.Queue shared by all worker threads.
        delay: seconds to pause after each item (default 3, matching the
            original behavior; set to 0 in tests).
    """
    # BUG FIX: empty() followed by get() is racy across threads — another
    # worker can take the last item in between, leaving get() blocked
    # forever. get_nowait() + Empty makes the exit condition atomic.
    while True:
        try:
            url = url_queue.get_nowait()
        except Empty:
            return
        print(url)
        sleep(delay)


if __name__ == '__main__':
    url_queue = Queue()
    for i in range(1, 11):
        url_queue.put(i)
    # Four workers share one queue; each exits once the queue is drained.
    for i in range(4):
        t = Thread(target=spider, args=(url_queue,))
        t.start()
案例:多线程爬取糗事百科段子并写入文件
import requests
from lxml import etree
from queue import Queue, Empty
from threading import Thread
from fake_useragent import UserAgent
import os


class Spider(Thread):
    """Worker thread: fetch each URL from a shared queue, extract the text
    of every ``div.content`` element, and append it to ``duanzi.txt``.

    Parameters:
        queue_url: queue.Queue of page URLs shared by all workers.
    """

    def __init__(self, queue_url):
        # Initialize the Thread base class first.
        Thread.__init__(self)
        self.queue_url = queue_url

    def run(self):
        # BUG FIX: empty() then get() is racy between workers — a thread
        # could block forever on get() after another drained the last URL.
        # get_nowait() + Empty makes "take or stop" atomic.
        while True:
            try:
                url = self.queue_url.get_nowait()
            except Empty:
                break
            print(url)
            headers = {'User-Agent': 'aaaaa'}
            reps = requests.get(url, headers=headers)
            e = etree.HTML(reps.text)
            contents = [div.xpath('string(.)').strip()
                        for div in e.xpath('//div[@class="content"]')]
            with open('duanzi.txt', 'a', encoding='utf-8') as f:
                for content in contents:
                    # BUG FIX: original wrote '/n' (a literal slash + n);
                    # '\n' is the actual newline separator intended here.
                    f.write(content + '\n')
            # No explicit f.close(): the with-statement closes the file.


if __name__ == '__main__':
    base_url = 'https://www.qiushibaike.com/text/page/{}/'
    queue_url = Queue()
    for num in range(1, 6):
        queue_url.put(base_url.format(num))
    # Three workers share the URL queue.
    for num in range(3):
        s = Spider(queue_url)
        s.start()