一、观察网页规律
url:https://www.qcc.com/elib_ipo_p_1.html
URL 中的数字 1 表示页码,修改该数字即可访问对应的页面。
获取页面信息
上代码:
# -*- codeing=utf-8 -*- from bs4 import BeautifulSoup import urllib.request, urllib.error import json import os
主方法:
def main():
    """Scrape the company list and save it to ``./company.json``.

    The list returned by ``getCompanies()`` is serialized as JSON with
    non-ASCII characters left unescaped and a one-space indent. If the
    output file already exists it is deleted before being rewritten.
    """
    output_path = './company.json'

    # Fetch the complete list of companies from qcc.com.
    records = getCompanies()

    # Remove any output left over from a previous run.
    if os.path.exists(output_path):
        os.remove(output_path)

    with open(output_path, 'w', encoding='utf-8') as out:
        # indent=1 places every JSON item on its own line.
        payload = json.dumps(records, ensure_ascii=False, indent=1)
        out.write(payload)
页面内容爬取
def _clean_cell(cell):
    """Return the text of a table cell without newlines, tabs or outer whitespace."""
    return cell.getText().replace('\n', '').replace('\t', '').strip()


def getCompanies(first_page=1, last_page=450):
    """Scrape the listed-company table from qcc.com, one page at a time.

    Pages ``first_page`` through ``last_page`` (inclusive) of
    ``https://www.qcc.com/elib_ipo_p_<page>.html`` are downloaded with
    ``askURL`` and the rows of the ``<table class="ntable">`` element are
    converted to dictionaries.

    Args:
        first_page: Number of the first page to fetch (default 1).
        last_page: Number of the last page to fetch (default 450).

    Returns:
        A list of dicts, one per table row, keyed by the column labels
        "公司名称:", "代码:", "企业名称:", "交易所名称:" and "上市日期:".
        Pages that could not be downloaded or that contain no matching
        table contribute no entries.
    """
    baseurl = "https://www.qcc.com/elib_ipo_p_"
    companies = []
    for page in range(first_page, last_page + 1):
        html = askURL(baseurl + str(page) + ".html")
        if not html:
            # askURL returns "" when the request failed; skip this page
            # instead of losing everything collected so far.
            continue

        soup = BeautifulSoup(html, "html.parser")
        table = soup.find('table', attrs={'class': 'ntable'})
        if table is None:
            # The page has no result table, so there is nothing to parse.
            continue

        for row in table.find_all('tr'):
            cells = row.find_all('td')
            # Rows without <td> cells, and rows too short to hold all the
            # columns read below, are skipped.
            if len(cells) < 6:
                continue
            companies.append({
                # Company name
                "公司名称:": _clean_cell(cells[1]),
                # Code (stored exactly as it appears in the cell)
                "代码:": cells[2].getText(),
                # Enterprise name
                "企业名称:": _clean_cell(cells[3]),
                # Exchange name
                "交易所名称:": _clean_cell(cells[4]),
                # Listing date (stored exactly as it appears in the cell)
                "上市日期:": cells[5].getText(),
            })
    return companies


def askURL(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    The request is sent with a desktop Chrome ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded body, or an empty string if the request raised
        ``urllib.error.URLError`` (the error code and/or reason are printed
        in that case).
    """
    # Browser-like request headers.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if reading or
        # decoding fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
最后,添加程序入口:
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
生成的文件