数据集来自数据网站Kaggle的美国新冠肺炎疫情数据集(可从百度网盘下载,提取码:t7tu)。该数据集以数据表us-counties.csv的形式组织。
import pandas as pd

# Convert the CSV dataset into a tab-separated .txt file (.csv -> .txt).
data = pd.read_csv('/home/hadoop/data/us-counties.csv')

# Open with 'w' (not 'a+') so that re-running the conversion overwrites the
# output instead of appending a second copy of every row.
with open('/home/hadoop/data/us-counties.txt', 'w', encoding='utf-8') as f:
    for line in data.values:
        # Only the first five columns of each row are written, one row per line.
        f.write('\t'.join(str(field) for field in line[:5]) + '\n')
# Create the HDFS target directory, upload the converted dataset, then list
# the directory to confirm the upload.
./bin/hdfs dfs -mkdir -p /user/hadoop
./bin/hdfs dfs -put /home/hadoop/data/us-counties.txt /user/hadoop
./bin/hdfs dfs -ls /user/hadoop
import re
from datetime import datetime

from pyspark import SparkConf,SparkContext
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import pyspark.sql.functions as func


def toDate(inputStr):
    """Parse a year-month-day date string into a ``datetime``.

    The three numeric components may be separated by any non-digit
    characters and need not be zero-padded, so both ``2020/1/21`` and
    ``2020-01-21`` are accepted.

    Args:
        inputStr: Date text in year, month, day order.

    Returns:
        A ``datetime`` at midnight of the given day.

    Raises:
        ValueError: If the text does not contain exactly three numeric
            components or they do not form a valid calendar date.
    """
    # Splitting on separators (rather than slicing fixed positions) keeps
    # two-digit months and already zero-padded dates working.
    year, month, day = (int(part) for part in re.split(r"\D+", inputStr.strip()))
    return datetime(year, month, day)


# Main program:
spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()

# Schema of the tab-separated input: date, county, state, cases, deaths.
fields = [
    StructField("date", DateType(), False),
    StructField("county", StringType(), False),
    StructField("state", StringType(), False),
    StructField("cases", IntegerType(), False),
    StructField("deaths", IntegerType(), False),
]
schema = StructType(fields)

rdd0 = spark.sparkContext.textFile("/user/hadoop/us-counties.txt")
rdd1 = rdd0.map(lambda x: x.split("\t")).map(
    lambda p: Row(toDate(p[0]), p[1], p[2], int(p[3]), int(p[4]))
)
shemaUsInfo = spark.createDataFrame(rdd1, schema)
shemaUsInfo.createOrReplaceTempView("usInfo")

# 1. Compute the cumulative confirmed cases and deaths for each day
df = (
    shemaUsInfo.groupBy("date")
    .agg(func.sum("cases"), func.sum("deaths"))
    .sort(shemaUsInfo["date"].asc())
)

# Rename the aggregate columns
df1 = df.withColumnRenamed("sum(cases)", "cases").withColumnRenamed("sum(deaths)", "deaths")
# coalesce(1) merges partitions without a shuffle, so the sorted row order is
# kept in the single output file (repartition(1) shuffles and may reorder rows).
df1.coalesce(1).write.json("result1.json")  # written to HDFS

# Register as a temporary view for the next step
df1.createOrReplaceTempView("ustotal")

# 2. Compute the day-over-day increase in confirmed cases and deaths
df2 = spark.sql("select t1.date,t1.cases-t2.cases as caseIncrease,t1.deaths-t2.deaths as deathIncrease from ustotal t1,ustotal t2 where t1.date = date_add(t2.date,1)")
df2.sort(df2["date"].asc()).coalesce(1).write.json("result2.json")  # written to HDFS

# 3. Total confirmed cases, deaths and death rate per state as of 2020-05-19
df3 = spark.sql("select date,state,sum(cases) as totalCases,sum(deaths) as totalDeaths,round(sum(deaths)/sum(cases),4) as deathRate from usInfo where date = to_date('2020-05-19','yyyy-MM-dd') group by date,state")
# Save the per-state totals ordered by confirmed cases (written to HDFS).
# coalesce(1) is used instead of repartition(1) throughout: it merges the
# partitions without a shuffle, so the ordering produced by sort / order by
# is kept in the single output file.
df3.sort(df3["totalCases"].desc()).coalesce(1).write.json("result3.json")

df3.createOrReplaceTempView("eachStateInfo")

# 4. The 10 states with the most confirmed cases
df4 = spark.sql("select date,state,totalCases from eachStateInfo order by totalCases desc limit 10")
df4.coalesce(1).write.json("result4.json")

# 5. The 10 states with the most deaths
df5 = spark.sql("select date,state,totalDeaths from eachStateInfo order by totalDeaths desc limit 10")
df5.coalesce(1).write.json("result5.json")

# 6. The 10 states with the fewest confirmed cases
df6 = spark.sql("select date,state,totalCases from eachStateInfo order by totalCases asc limit 10")
df6.coalesce(1).write.json("result6.json")

# 7. The 10 states with the fewest deaths
df7 = spark.sql("select date,state,totalDeaths from eachStateInfo order by totalDeaths asc limit 10")
df7.coalesce(1).write.json("result7.json")

# 8. Death rate for the whole USA (sign = 1) and for each state (sign = 2)
#    as of 2020-05-19
df8 = spark.sql("select 1 as sign,date,'USA' as state,round(sum(totalDeaths)/sum(totalCases),4) as deathRate from eachStateInfo group by date union select 2 as sign,date,state,deathRate from eachStateInfo").cache()
df8.sort(df8["sign"].asc(), df8["deathRate"].desc()).coalesce(1).write.json("result8.json")
# Create ~/result with one sub-directory per analysis result.
# Start from the home directory: ~/result does not exist yet, and the later
# steps expect the sub-directories directly under ~/result.
cd ~
mkdir result
cd result
mkdir result1 result2 result3 result4 result5 result6 result7 result8
# Download the JSON output of every result (1-8) from HDFS into its matching
# local directory; the same command is repeated with only the result number
# changed.
for i in 1 2 3 4 5 6 7 8; do
    ./bin/hdfs dfs -get /user/hadoop/result${i}.json/*.json ~/result/result${i}/
done
# Install pyecharts, the charting library imported by the showdata.py script
pip install pyecharts
2. 创建showdata.py文件,具体代码如下:
import glob
import json
import os

from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.charts import Line
from pyecharts.components import Table
from pyecharts.charts import WordCloud
from pyecharts.charts import Pie
from pyecharts.charts import Funnel
from pyecharts.charts import Scatter
from pyecharts.charts import PictorialBar
from pyecharts.options import ComponentTitleOpts
from pyecharts.globals import SymbolType


def _load_records(index):
    """Read the JSON-lines output of result ``index`` into a list of dicts.

    The file ``/home/hadoop/result/result<index>/part-00000.json`` is read
    when it exists. Otherwise every ``part-*.json`` file in that directory is
    read in sorted name order, because Spark usually appends a unique suffix
    to its part-file names. Blank lines are skipped.

    Raises:
        FileNotFoundError: If no matching file exists.
    """
    result_dir = "/home/hadoop/result/result" + str(index)
    default_path = os.path.join(result_dir, "part-00000.json")
    paths = [default_path]
    if not os.path.exists(default_path):
        # Keep the default path when nothing matches so open() reports it.
        paths = sorted(glob.glob(os.path.join(result_dir, "part-*.json"))) or paths

    records = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            records.extend(json.loads(line) for line in f if line.strip())
    return records


# 1. Daily cumulative confirmed cases and deaths -> bar chart
def drawChart_1(index):
    """Render result ``index`` (date, cases, deaths) as a stacked bar chart."""
    records = _load_records(index)
    date = [str(js['date']) for js in records]
    cases = [int(js['cases']) for js in records]
    deaths = [int(js['deaths']) for js in records]

    (
        Bar()
        .add_xaxis(date)
        .add_yaxis("累计确诊人数", cases, stack="stack1")
        .add_yaxis("累计死亡人数", deaths, stack="stack1")
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(title_opts=opts.TitleOpts(title="美国每日累计确诊和死亡人数"))
        .render("/home/hadoop/result/result1/result1.html")
    )


# 2. Daily new confirmed cases and deaths -> line charts
def drawChart_2(index):
    """Render result ``index`` as two line charts: new cases and new deaths."""
    records = _load_records(index)
    date = [str(js['date']) for js in records]
    cases = [int(js['caseIncrease']) for js in records]
    deaths = [int(js['deathIncrease']) for js in records]

    (
        Line(init_opts=opts.InitOpts(width="1600px", height="800px"))
        .add_xaxis(xaxis_data=date)
        .add_yaxis(
            series_name="新增确诊",
            y_axis=cases,
            markpoint_opts=opts.MarkPointOpts(
                data=[
                    opts.MarkPointItem(type_="max", name="最大值")
                ]
            ),
            markline_opts=opts.MarkLineOpts(
                data=[opts.MarkLineItem(type_="average", name="平均值")]
            ),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title="美国每日新增确诊折线图", subtitle=""),
            tooltip_opts=opts.TooltipOpts(trigger="axis"),
            toolbox_opts=opts.ToolboxOpts(is_show=True),
            xaxis_opts=opts.AxisOpts(type_="category", boundary_gap=False),
        )
        .render("/home/hadoop/result/result2/result1.html")
    )

    (
        Line(init_opts=opts.InitOpts(width="1600px", height="800px"))
        .add_xaxis(xaxis_data=date)
        .add_yaxis(
            series_name="新增死亡",
            y_axis=deaths,
            markpoint_opts=opts.MarkPointOpts(
                data=[opts.MarkPointItem(type_="max", name="最大值")]
            ),
            markline_opts=opts.MarkLineOpts(
                data=[
                    opts.MarkLineItem(type_="average", name="平均值"),
                    opts.MarkLineItem(symbol="none", x="90%", y="max"),
                    opts.MarkLineItem(symbol="circle", type_="max", name="最高点"),
                ]
            ),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title="美国每日新增死亡折线图", subtitle=""),
            tooltip_opts=opts.TooltipOpts(trigger="axis"),
            toolbox_opts=opts.ToolboxOpts(is_show=True),
            xaxis_opts=opts.AxisOpts(type_="category", boundary_gap=False),
        )
        .render("/home/hadoop/result/result2/result2.html")
    )


# 3. Per-state cumulative cases, deaths and death rate as of 5.19 -> table
def drawChart_3(index):
    """Render result ``index`` (per-state totals) as an HTML table."""
    allState = [
        [
            str(js['state']),
            int(js['totalCases']),
            int(js['totalDeaths']),
            float(js['deathRate']),
        ]
        for js in _load_records(index)
    ]

    table = Table()
    headers = ["State name", "Total cases", "Total deaths", "Death rate"]
    table.add(headers, allState)
    table.set_global_opts(
        title_opts=ComponentTitleOpts(title="美国各州疫情一览", subtitle="")
    )
    table.render("/home/hadoop/result/result3/result1.html")


# 4. The 10 states with the most confirmed cases -> word cloud
def drawChart_4(index):
    """Render result ``index`` (state, totalCases) as a word cloud."""
    data = [(str(js['state']), int(js['totalCases'])) for js in _load_records(index)]

    (
        WordCloud()
        .add("", data, word_size_range=[20, 100], shape=SymbolType.DIAMOND)
        .set_global_opts(title_opts=opts.TitleOpts(title="美国各州确诊Top10"))
        .render("/home/hadoop/result/result4/result1.html")
    )


# 5. The 10 states with the most deaths -> pictorial bar chart
def drawChart_5(index):
    """Render result ``index`` (state, totalDeaths) as a pictorial bar chart."""
    # The chart receives the rows in the reverse of their file order.
    records = list(reversed(_load_records(index)))
    state = [str(js['state']) for js in records]
    totalDeath = [int(js['totalDeaths']) for js in records]

    (
        PictorialBar()
        .add_xaxis(state)
        .add_yaxis(
            "",
            totalDeath,
            label_opts=opts.LabelOpts(is_show=False),
            symbol_size=18,
            symbol_repeat="fixed",
            symbol_offset=[0, 0],
            is_symbol_clip=True,
            symbol=SymbolType.ROUND_RECT,
        )
        .reversal_axis()
        .set_global_opts(
            title_opts=opts.TitleOpts(title="PictorialBar-美国各州死亡人数Top10"),
            xaxis_opts=opts.AxisOpts(is_show=False),
            yaxis_opts=opts.AxisOpts(
                axistick_opts=opts.AxisTickOpts(is_show=False),
                axisline_opts=opts.AxisLineOpts(
                    linestyle_opts=opts.LineStyleOpts(opacity=0)
                ),
            ),
        )
        .render("/home/hadoop/result/result5/result1.html")
    )


# 6. The 10 states with the fewest confirmed cases -> word cloud
def drawChart_6(index):
    """Render result ``index`` (state, totalCases) as a word cloud."""
    data = [(str(js['state']), int(js['totalCases'])) for js in _load_records(index)]

    (
        WordCloud()
        .add("", data, word_size_range=[100, 20], shape=SymbolType.DIAMOND)
        .set_global_opts(title_opts=opts.TitleOpts(title="美国各州确诊最少的10个州"))
        .render("/home/hadoop/result/result6/result1.html")
    )


# 7. The 10 states with the fewest deaths -> funnel chart
def drawChart_7(index):
    """Render result ``index`` (state, totalDeaths) as a funnel chart."""
    # The chart receives the rows in the reverse of their file order.
    data = [
        [str(js['state']), int(js['totalDeaths'])]
        for js in reversed(_load_records(index))
    ]

    (
        Funnel()
        .add(
            "State",
            data,
            sort_="ascending",
            label_opts=opts.LabelOpts(position="inside"),
        )
        .set_global_opts(title_opts=opts.TitleOpts(title=""))
        .render("/home/hadoop/result/result7/result1.html")
    )


# 8. Death rate of the whole USA -> pie chart
def drawChart_8(index):
    """Render the "USA" row of result ``index`` as a death / no-death pie chart."""
    values = []
    for js in _load_records(index):
        if str(js['state']) == "USA":
            death_pct = round(float(js['deathRate']) * 100, 2)
            values.append(["Death(%)", death_pct])
            # Round the remainder too, so float subtraction cannot leave
            # artefacts such as 94.07000000000001 in the label.
            values.append(["No-Death(%)", round(100 - death_pct, 2)])

    (
        Pie()
        .add("", values)
        .set_colors(["black", "orange"])
        .set_global_opts(title_opts=opts.TitleOpts(title="全美的病死率"))
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
        .render("/home/hadoop/result/result8/result1.html")
    )


def main():
    """Draw charts 1 through 8, passing each function its result index."""
    # An explicit list replaces looking the functions up by name with eval().
    charts = [
        drawChart_1,
        drawChart_2,
        drawChart_3,
        drawChart_4,
        drawChart_5,
        drawChart_6,
        drawChart_7,
        drawChart_8,
    ]
    for index, draw in enumerate(charts, start=1):
        draw(index)


# Visualization entry point
if __name__ == "__main__":
    main()
http://dblab.xmu.edu.cn/blog/2636-2/
注:本篇文章是基于林子雨老师博客的文章,经本人实操后发表。