首先进行网页分析,具体操作:省略。
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 10 16:25:59 2021

@author: Hider
"""

# 爬虫学习:8684公交路线
# 网站:https://www.8684.cn/
# 公交站点、地铁站点、违章、资讯等等数据

'''
--------- 网页分析 ----------
广州公交:https://guangzhou.8684.cn/
div class="bus-layer depth w120"
第3个 div class="pl10"

市区编码线路:https://guangzhou.8684.cn/line1
div class="list clearfix"
a标签 href title

广州1路公交车路线:https://guangzhou.8684.cn/x_322e21c5
'''
上代码!!!
import random
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Pool of desktop browser User-Agent strings. Built once at import time
# instead of on every get_ua() call; entries carry no leading/trailing
# whitespace so the header value is sent exactly as a browser would send it.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Opera/8.0 (Windows NT 5.1; U; en)',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
)


def get_ua():
    """Return one User-Agent string picked at random from the built-in pool.

    Returns:
        str: a browser User-Agent value suitable for the ``User-Agent``
        request header. A different value may be returned on each call.
    """
    return random.choice(_USER_AGENTS)


# Request target: landing page of the Guangzhou section of 8684.cn.
url = 'https://guangzhou.8684.cn/'
# ---------------------------------------------------------------------------
# Crawl pipeline for the Guangzhou section of 8684.cn:
#   1. landing page   -> route categories
#   2. category pages -> individual routes
#   3. route pages    -> schedule, fare, operator, update note and stops
#   4. results        -> CSV file
# ---------------------------------------------------------------------------

# Site root; every href collected below is appended to it.
BASE_URL = 'https://guangzhou.8684.cn'

# Destination of the exported table.
OUTPUT_PATH = r'C:\Users\Hider\Desktop\bus.csv'

# Column headers of the exported table, in row order.
CSV_COLUMNS = [
    '线路分类',
    '线路分类网址',
    '线路',
    '线路名称',
    '线路网址',
    '运行时间',
    '参考票价',
    '公交公司',
    '最后更新',
    '站点信息',
]


def _fetch_soup(page_url):
    """Download ``page_url`` and return it parsed with the lxml parser.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    response = requests.get(
        url=page_url,
        headers={'User-Agent': get_ua()},
        timeout=10,
    )
    # Fail loudly on error pages instead of parsing them as if they were data.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def _try_fetch_soup(page_url):
    """Like ``_fetch_soup`` but report the failure and return ``None``.

    Used inside the long-running loops so that a single bad page does not
    throw away everything collected so far.
    """
    try:
        return _fetch_soup(page_url)
    except requests.RequestException as exc:
        print(f'请求失败,已跳过: {page_url} ({exc})')
        return None


def _collect_categories():
    """Return ``{category name: category URL}`` read from the landing page.

    Only the links inside the section whose caption contains '线路分类' are
    collected.

    Raises:
        requests.RequestException: if the landing page cannot be fetched.
        RuntimeError: if the expected container is missing from the page.
    """
    soup = _fetch_soup(BASE_URL + '/')
    layer = soup.find('div', class_='bus-layer depth w120')
    if layer is None:
        raise RuntimeError('bus-layer container not found on the landing page')

    categories = {}
    for section in layer.find_all('div', class_='pl10'):
        caption = section.find('span', class_='kt')
        if caption is None or '线路分类' not in caption.get_text():
            continue
        link_box = section.find('div', class_='list')
        if link_box is None:
            continue
        for anchor in link_box.find_all('a'):
            href = anchor.get('href')
            if href:
                categories[anchor.get_text()] = BASE_URL + href
    return categories


def _collect_routes(categories):
    """Return one ``[category, category URL, title, text, route URL]`` per route.

    Categories whose page cannot be fetched, or that lack the route list
    container, are skipped.
    """
    routes = []
    for key, value in categories.items():
        print('Key is:', key)
        print('Value is:', value)
        soup = _try_fetch_soup(value)
        if soup is None:
            continue
        route_box = soup.find('div', class_='list clearfix')
        if route_box is None:
            continue
        for anchor in route_box.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue
            routes.append([
                key,
                value,
                anchor.get('title'),
                anchor.get_text(),
                BASE_URL + href,
            ])
    return routes


def _parse_route_page(soup):
    """Extract the detail fields from a parsed route page.

    Returns:
        tuple | None: ``(run_time, price, company, update_time, stations)``
        where ``stations`` maps stop text to stop URL, or ``None`` when the
        page does not have the expected description list / stop list.
    """
    description = soup.find('ul', class_='bus-desc')
    stop_lists = soup.find_all('div', class_='bus-lzlist mb15')
    if description is None or not stop_lists:
        return None

    items = description.find_all('li')
    if len(items) < 4:
        return None

    run_time = items[0].get_text()
    price = items[1].get_text()

    # The operator is normally a link; fall back to the item's own text
    # when the link is absent.
    company_link = items[2].find('a')
    company_node = company_link if company_link is not None else items[2]
    company = company_node.get_text()

    # TODO: keep only the text content here and drop the nested div.
    update_time = items[3].get_text()

    # Only the first stop list on the page is read.
    # NOTE(review): keyed by stop text, so repeated stop names overwrite
    # each other -- confirm whether that matters for loop lines.
    stations = {}
    for item in stop_lists[0].find_all('li'):
        link = item.find('a')
        href = link.get('href') if link is not None else None
        if href:
            stations[item.get_text()] = BASE_URL + href

    return run_time, price, company, update_time, stations


def _collect_route_details(routes):
    """Visit every route page and return one result row per parsed route.

    Pauses for a random 5-10 seconds after every 100 routes. Routes whose
    page fails to download or parse are reported and skipped.
    """
    rows = []
    for index, route in enumerate(routes, start=1):
        category, category_url, title, name, route_url = route
        print(f'正在爬取{title}...')
        if index % 100 == 0:
            print('休息一下吧!~ZzzZ~ ')
            time.sleep(random.randint(5, 10))  # random pause between batches
        print(index)

        soup = _try_fetch_soup(route_url)
        if soup is None:
            continue
        details = _parse_route_page(soup)
        if details is None:
            print(f'页面结构异常,已跳过: {route_url}')
            continue
        rows.append([category, category_url, title, name, route_url, *details])
    return rows


if __name__ == '__main__':
    # Step 1: route categories.
    dict_result = _collect_categories()
    print(dict_result)

    # Step 2: routes of every category.
    bus = _collect_routes(dict_result)

    # Step 3: details and stops of every route.
    final_bus_result = _collect_route_details(bus)

    # Step 4: export. Passing the columns up front keeps the header row
    # even when nothing was collected.
    df = pd.DataFrame(final_bus_result, columns=CSV_COLUMNS)
    df.to_csv(OUTPUT_PATH, index=False, encoding='utf-8-sig')
参考链接:手把手教学,正式开始!