import csv

import requests
from pyquery import PyQuery as pq

url = 'https://www.zhihu.com/explore'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36'
}
# Without a timeout requests waits forever on an unresponsive server.
html = requests.get(url, headers=headers, timeout=10).text

# pyquery approach 01
doc = pq(html)
# NOTE: items() yields the matched cards lazily, so this iterable can only be
# walked once; a second save_json() call will find nothing left to write.
items = doc('.ExploreCollectionCard-contentItem').items()


def save_json():
    """Append one CSV row per scraped card to ``data.csv``.

    Despite the name, this variant writes CSV, not JSON. Each row holds the
    title link's ``href``, the excerpt text and the text of the count-tag
    span, in that order. Rows are appended, so repeated runs accumulate.

    The file is opened once for all rows instead of once per row, and it is
    not touched at all when no card matched the selectors.
    """
    rows = []
    for item in items:
        link = item.find('.ExploreCollectionCard-contentTitle').attr('href')
        excerpt = item.find('.ExploreCollectionCard-contentExcerpt').text()
        count_tag = (
            item.find('.ExploreCollectionCard-contentTags')
            .find('span')
            .filter('.ExploreCollectionCard-contentCountTag')
            .text()
        )
        rows.append([link, excerpt, count_tag])

    if not rows:
        # Nothing scraped: do not create an empty data.csv.
        return

    # newline='' lets the csv module control line endings itself.
    with open('data.csv', 'a', encoding='utf-8', newline='') as file:
        csv.writer(file).writerows(rows)


if __name__ == '__main__':
    save_json()
import json

import requests
from pyquery import PyQuery as pq

url = 'https://www.zhihu.com/explore'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36'
}
# Without a timeout requests waits forever on an unresponsive server.
html = requests.get(url, headers=headers, timeout=10).text

# pyquery approach 01
doc = pq(html)
# NOTE: items() yields the matched cards lazily, so this iterable can only be
# walked once; a second save_json() call adds no new records.
items = doc('.ExploreCollectionCard-contentItem').items()
# Module-level accumulator for the scraped records; save_json() appends to it.
objs = []


def save_json():
    """Scrape the explore cards and write them to ``data.json`` as one array.

    Each record is a dict with the keys ``url``, ``contentExcerpt`` and
    ``span_txt``. Records are appended to the module-level ``objs`` list,
    which is printed and then serialized as a single indented JSON array.

    The file is opened in write mode: appending a second top-level array to
    an existing file would leave ``data.json`` unparseable.
    """
    for item in items:
        link = item.find('.ExploreCollectionCard-contentTitle').attr('href')
        excerpt = item.find('.ExploreCollectionCard-contentExcerpt').text()
        count_tag = (
            item.find('.ExploreCollectionCard-contentTags')
            .find('span')
            .filter('.ExploreCollectionCard-contentCountTag')
            .text()
        )
        objs.append({
            "url": link,
            "contentExcerpt": excerpt,
            "span_txt": count_tag,
        })

    print(objs)

    # Write the extracted content to a JSON file. ensure_ascii=False keeps
    # non-ASCII text readable instead of emitting \uXXXX escapes.
    with open('data.json', 'w', encoding='utf-8') as file:
        file.write(json.dumps(objs, ensure_ascii=False, indent=2))


if __name__ == '__main__':
    save_json()
代码示例:
import csv

import requests
from pyquery import PyQuery as pq

url = 'https://www.zhihu.com/explore'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36'
}
# Without a timeout requests waits forever on an unresponsive server.
html = requests.get(url, headers=headers, timeout=10).text

# pyquery approach 01
doc = pq(html)
# NOTE: items() yields the matched cards lazily, so this iterable can only be
# walked once; a second save_json() call will find nothing left to write.
items = doc('.ExploreCollectionCard-contentItem').items()


def save_json():
    """Append one CSV row per scraped card to ``data.csv``.

    Despite the name, this variant writes CSV, not JSON. Each row holds the
    title link's ``href``, the excerpt text and the text of the count-tag
    span, in that order. Rows are appended, so repeated runs accumulate.

    The file is opened once for all rows instead of once per row, and it is
    not touched at all when no card matched the selectors.
    """
    rows = []
    for item in items:
        link = item.find('.ExploreCollectionCard-contentTitle').attr('href')
        excerpt = item.find('.ExploreCollectionCard-contentExcerpt').text()
        count_tag = (
            item.find('.ExploreCollectionCard-contentTags')
            .find('span')
            .filter('.ExploreCollectionCard-contentCountTag')
            .text()
        )
        rows.append([link, excerpt, count_tag])

    if not rows:
        # Nothing scraped: do not create an empty data.csv.
        return

    # newline='' lets the csv module control line endings itself.
    with open('data.csv', 'a', encoding='utf-8', newline='') as file:
        csv.writer(file).writerows(rows)


if __name__ == '__main__':
    save_json()