# Python 爬虫 + 数据分析 + 数据可视化 (web scraping + data analysis + data visualization)
import asyncio
import csv

import aiohttp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lxml import etree
from pandas import Series, DataFrame

# Request headers sent with every page fetch (browser-like User-Agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.4071 SLBChan/30 '
}


def _chunk_text(text, size):
    """Split ``text`` into consecutive complete chunks of ``size`` characters."""
    usable = len(text) - len(text) % size
    return [text[start:start + size] for start in range(0, usable, size)]


def _join_groups(parts, size, sep):
    """Join each complete run of ``size`` items from ``parts`` with ``sep``."""
    usable = len(parts) - len(parts) % size
    return [sep.join(parts[start:start + size]) for start in range(0, usable, size)]


def _parse_listings(page_text):
    """Extract one CSV row per listing from the HTML of a result page.

    Returns a list of 13-item lists in the column order of the CSV header
    written by the ``__main__`` block.
    """
    tree = etree.HTML(page_text)
    info = '//div[@class="property-content-info"]'
    comm = '//div[@class="property-content-info property-content-info-comm"]'
    extra = '//div[@class="property-extra"]'

    titles = tree.xpath('//div[@class="property-content-title"]/h3//text()')
    values = tree.xpath('//p[@class="property-price-total"]/span[1]/text()')

    # The layout arrives as many small text nodes; drop the single-space
    # separators, concatenate, and cut into 6-character pieces.
    # NOTE(review): assumes every layout string is exactly 6 characters long
    # (e.g. "3室2厅1卫") -- confirm against the live page.
    layouts = tree.xpath(info + '/p[1]//text()')
    layout = _chunk_text(''.join(node for node in layouts if node != ' '), 6)

    mi = tree.xpath(info + '/p[2]//text()')
    location = tree.xpath(info + '/p[3]//text()')
    high = tree.xpath(info + '/p[4]//text()')
    build_times = tree.xpath(info + '/p[5]//text()')
    address = tree.xpath(comm + '/p[1]//text()')

    # Every three consecutive text nodes are joined with '-' into one
    # detailed address.  The original appended the dashes with
    # ``insert(len(specific_address), '-')``, which only behaves like an
    # append while the list is still shorter than that index; later dashes
    # landed in the middle of the list and garbled the remaining addresses.
    # NOTE(review): assumes three text nodes per listing -- confirm.
    specific_address = tree.xpath(comm + '/p[2]//text()')
    new_specific_address = _join_groups(specific_address, 3, '-')

    name = tree.xpath(extra + '/span[1]/text()')
    grade = tree.xpath(extra + '/span[2]/text()')
    website = tree.xpath(extra + '/span[3]/text()')
    urls = tree.xpath('//div[@class="property"]/a[1]/@href')

    # Diagnostic output kept from the original: makes ragged columns visible.
    print(len(build_times))
    print(len(titles))

    # zip() stops at the shortest column, so a missing field can no longer
    # raise IndexError, and the last listing of a page is no longer dropped
    # (the original looped over ``range(len(titles) - 1)``).
    columns = zip(titles, values, layout, mi, location, high, build_times,
                  address, new_specific_address, name, grade, website, urls)
    return [
        [title, value + '万', rooms, area, facing, floor, built,
         community, detail, owner, score, company, link]
        for (title, value, rooms, area, facing, floor, built,
             community, detail, owner, score, company, link) in columns
    ]


async def get_page(url):
    """Download one listing page and write its rows to the CSV.

    Args:
        url: Address of the result page to fetch.

    Rows are written through the module-level ``writer`` object, which the
    ``__main__`` block creates before the event loop starts.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers) as response:
            # text() yields the body as str (read() gives bytes, json() a
            # parsed object); it must be awaited before the response closes.
            page_text = await response.text()

    # There is no await between rows, so the rows of one page are written
    # without interleaving with other tasks.
    writer.writerows(_parse_listings(page_text))


async def main():
    """Scrape result pages 1-8 concurrently and report pages that failed."""
    url = 'https://bj.58.com/ershoufang/p%d/'
    urls = [url % page_num for page_num in range(1, 9)]

    # asyncio.wait() silently discards task exceptions; gather() with
    # return_exceptions=True keeps the best-effort behaviour (one bad page
    # does not stop the others) while making failures visible.
    results = await asyncio.gather(*(get_page(page_url) for page_url in urls),
                                   return_exceptions=True)
    for page_url, result in zip(urls, results):
        if isinstance(result, Exception):
            print(f'failed to scrape {page_url}: {result!r}')


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# SimHei is used so the Chinese labels in the plots render correctly.
plt.rcParams["font.sans-serif"] = [u"SimHei"]
plt.rcParams["axes.unicode_minus"] = False

# NOTE(review): the analysis reads 'room01.csv' at import time, while the
# scraper below appends to 'room02.csv' -- confirm this is intentional.
data = pd.read_csv('room01.csv', encoding='gbk')
print(data.shape)
print(data.dtypes)
print(data.columns)

# Drop rows whose build-time field has no year at character offsets 29-33.
index01 = data[data["建造时间"].str[29:33] == ''].index
data.drop(index01, inplace=True)
# Area as float in a new column 'mi' (fixed-offset slice removes the padding).
data['mi'] = data["房子面积"].str[29:-26].astype('double')
# Price as float in a new column 'price' (drops the trailing unit character).
data['price'] = data["¥价格"].str[:-1].astype('double')
# Build year as int in a new column 'year'.
data['year'] = data["建造时间"].str[29:33].astype('int')
# Age in months in a new column 'months'.
# NOTE(review): the reference date is hard-coded (2021, plus 6 months).
data['months'] = (2021 - data['year']) * 12 + 6
# Drop rows whose rating does not have '分' as its 4th character.
index02 = data[data['评分'].str[3:4] != '分'].index
data.drop(index02, inplace=True)
# Rating as float in a new column 'grade'.
data['grade'] = data['评分'].str[:-1].astype('double')


def plot01():
    """Show a bar chart of how many listings fall into each price range."""
    prices = data['price']
    # NOTE(review): pd.cut requires increasing bin edges, so this assumes
    # min price < 100 and max price > 1000.
    bins = [prices.min(), 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000,
            prices.max()]
    # include_lowest=True keeps the cheapest listing, which the default
    # left-open first interval would silently drop.
    price_cut = pd.cut(prices, bins=bins, include_lowest=True)
    # sort=False keeps the bars in ascending price order rather than
    # ordering them by count.
    price_count = price_cut.value_counts(sort=False)

    shares = price_count / price_count.sum()
    for share in shares:
        print(share)
    print(price_count.index)

    X = np.arange(len(price_count))
    print(X)
    Y = price_count
    print(Y)

    plt.figure(figsize=(8, 6))
    plt.bar(X, Y, color='b', alpha=0.5)
    plt.title("二手房价格分布图")
    plt.xlabel("价格区间")
    plt.ylabel("数量")
    plt.xticks(X, [str(interval) for interval in price_count.index], rotation=30)
    plt.ylim([0, price_count.max() + 100])
    # Annotate every bar with its share of all listings.
    percents = [str(round(share * 100, 2)) + '%' for share in shares]
    for x, y, z in zip(X, Y, percents):
        plt.text(x - 0.3, y + 5, z)
    plt.show()


def plot02():
    """Show the average area of the listings in each price range."""
    prices = data['price']
    low, top = prices.min(), prices.max()
    edges = [low, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, top]
    last = len(edges) - 2

    means = []
    for pos, (lower, upper) in enumerate(zip(edges, edges[1:])):
        # Buckets are half-open [lower, upper); the last one is closed so
        # the most expensive listing is counted (the original excluded it).
        below = prices <= upper if pos == last else prices < upper
        average = data[(prices >= lower) & below]['mi'].mean()
        # An empty bucket has a NaN mean; int(NaN) would raise ValueError,
        # so keep NaN and let the line show a gap there.
        means.append(float('nan') if pd.isna(average) else int(average))

    x = [f"[{low},100)", "[100,200)", "[200,300)", "[300,400)", "[400,500)",
         "[500,600)", "[600,700)", "[700,800)", "[800,900)", "[900,1000)",
         f"[1000,{top}]"]
    X = np.arange(len(x))
    Y = means

    plt.figure(figsize=(8, 10))
    plt.plot(X, Y, '-..', color='b')
    plt.title('房子价格和面积之间的关系')
    plt.xlabel('价格区间')
    plt.ylabel('平均面积')
    plt.xticks(X, x, rotation=30)
    ax = plt.gca()
    for i, j in zip(X, Y):
        if pd.isna(j):
            continue  # nothing to annotate for an empty bucket
        ax.text(i + 0.2, j + 4, str(j), bbox=dict(facecolor='red', alpha=0.3))
    plt.grid(True)
    plt.show()


def _scatter_against_area(column, ylabel):
    """Scatter ``data[column]`` against area, marker size scaled by price."""
    plt.figure(figsize=(10, 8))
    plt.scatter(data['mi'], data[column], s=data['price'] / 10, c='r')
    plt.xlabel("面积")
    plt.ylabel(ylabel)
    plt.show()


def plot03():
    """Show age in months against area, marker size scaled by price."""
    _scatter_against_area('months', "使用月份")


def plot04():
    """Show rating against area, marker size scaled by price."""
    _scatter_against_area('grade', "评分")


if __name__ == '__main__':
    head = ['房子描述', '¥价格', '房子构造', '房子面积', '房子朝向', '楼房层数', '建造时间', '地址', '详细地址', '户主姓名', '评分',
            '发布公司', '网站地址']
    with open('room02.csv', 'a', encoding='gbk', newline='') as f:
        # ``writer`` is a module-level name on purpose: get_page() uses it.
        writer = csv.writer(f)
        # The file is opened in append mode, so only write the header when
        # it is still empty; otherwise every run adds another header row.
        if f.tell() == 0:
            writer.writerow(head)
        # asyncio.run() replaces the deprecated
        # get_event_loop().run_until_complete() pattern; the with-block
        # closes the file, so no explicit f.close() is needed.
        asyncio.run(main())
    plot04()