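# Exploratory data analysis case study: ending hunger, ending poverty, and the sustainable (circular) use of natural resources.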
# Import the commonly used libraries.
import gzip
import os
import sys
import warnings

import folium
import matplotlib as mpl
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import scipy
import scipy.stats  # `import scipy` alone does not guarantee `scipy.stats` is loaded
import seaborn as sns

warnings.filterwarnings('ignore')
sns.set_context("poster", font_scale=1.3)

# Read the (gzip-compressed) data file.
data = pd.read_csv('aquastat.csv.gzip', compression='gzip')
print(data.head())
print(data.shape)
# DataFrame.info() writes its summary to stdout itself; it must be called,
# not passed to print() as a bound method.
data.info()

# List the distinct indicators (short name, full description).
print(data[['variable', 'variable_full']].drop_duplicates())

# How many countries are covered in total?
print(data.country.nunique())
countries = data.country.unique()

# How many time periods are there?
print(data.time_period.nunique())
time_periods = data.time_period.unique()
print(time_periods)
mid_periods = range(1960, 2017, 5)

# Check whether the total-area indicator is complete (count of missing values).
print(data[data.variable == 'total_area'].value.isnull().sum())


# Slicing -- cross-section: all countries and indicators within one period.
def time_slice(df, time_period):
    """Return a country x variable table of `value` for one time period.

    Args:
        df: Long-format frame with `time_period`, `country`, `variable`
            and `value` columns.
        time_period: Value of the `time_period` column to keep.

    Returns:
        The pivoted frame, indexed by country, one column per variable;
        its columns axis is named after `time_period`.
    """
    df = df[df.time_period == time_period]
    df = df.pivot(index='country', columns='variable', values='value')
    df.columns.name = time_period
    return df


print(time_slice(data, time_periods[0]).head())


# Slicing -- time series: one country, every indicator across the periods.
def country_slice(df, country):
    """Return a variable x time_period table of `value` for one country.

    Args:
        df: Long-format frame with `country`, `variable`, `time_period`
            and `value` columns.
        country: Value of the `country` column to keep.

    Returns:
        The pivoted frame, indexed by variable, one column per time period;
        its index is named after `country`.
    """
    df = df[df.country == country]
    df = df.pivot(index='variable', columns='time_period', values='value')
    df.index.name = country
    return df


print(country_slice(data, countries[40]).head())


# Slicing -- panel data: one indicator for all countries over time.
def variable_slice(df, variable):
    """Return a country x time_period table of `value` for one variable.

    Args:
        df: Long-format frame with `variable`, `country`, `time_period`
            and `value` columns.
        variable: Value of the `variable` column to keep.

    Returns:
        The pivoted frame, indexed by country, one column per time period.
    """
    df = df[df.variable == variable]
    df = df.pivot(index='country', columns='time_period', values='value')
    return df


print(variable_slice(data, 'total_pop').head())


# Slicing -- single series: one indicator of one country by measurement year.
def time_series(df, country, variable):
    """Return one country's values of one variable, indexed by year.

    Args:
        df: Long-format frame with `country`, `variable`, `year_measured`
            and `value` columns.
        country: Value of the `country` column to keep.
        variable: Value of the `variable` column to keep.

    Returns:
        A single-column frame named `variable`, indexed by the integer
        `year_measured`. Rows with a missing value in any column of `df`
        are dropped before the two columns are selected.
    """
    selected = df[(df.country == country) & (df.variable == variable)]
    # Copy so the in-place edits below act on an independent frame rather
    # than on a derived view of `df`.
    series = selected.dropna()[['year_measured', 'value']].copy()
    series.year_measured = series.year_measured.astype(int)
    series.set_index('year_measured', inplace=True)
    series.columns = [variable]
    return series


print(time_series(data, 'Belarus', 'total_pop'))
print(data.region.unique())

# The regions are too fine-grained; merge them into coarser groups.
# (Disabled in the original script; kept for reference.)
# simple_regions = {
#     'World | Asia': 'Asia',
#     'Americas | Central America and Caribbean | Central America': 'North America',
#     'Americas | Central America and Caribbean | Greater Antilles': 'North America',
#     'Americas | Central America and Caribbean | Lesser Antilles and Bahamas': 'North America',
#     'Americas | Northern America | Northern America': 'North America',
#     'Americas | Northern America | Mexico': 'North America',
#     'Americas | Southern America | Guyana': 'South America',
#     'Americas | Southern America | Andean': 'South America',
#     'Americas | Southern America | Brazil': 'South America',
#     'Americas | Southern America | Southern America': 'South America',
#     'World | Africa': 'Africa',
#     'World | Europe': 'Europe',
#     'World | Oceania': 'Oceania',
# }
#
# data.region = data.region.apply(lambda x: simple_regions[x])
# print(data.region.unique())


def subregion(data, region):
    """Return the rows of `data` whose `region` column equals `region`."""
    return data[data.region == region]


# Data-quality assessment.
recent = time_slice(data, '2013-2017')
msno.matrix(recent, labels=True)

# Total exploitable water resources.
# NOTE(review): the `inline` keyword may not be accepted by newer missingno
# releases -- confirm against the installed version.
msno.matrix(variable_slice(data, 'exploitable_total'),
            inline=False, sort='descending')
plt.xlabel('Time period')
plt.ylabel('Country')
plt.title('Missing total exploitable water resources data cross counties and time jperiods \n \n \n \n')
plt.show()

# Drop the heavily-missing 'exploitab*' variables.
data = data.loc[~data.variable.str.contains('exploitab'), :]

# Missing-data overview for the national rainfall index.
msno.matrix(variable_slice(data, 'national_rainfall_index'),
            inline=False, sort='descending')
plt.xlabel('Time period')
plt.ylabel('Country')
plt.title('Missing national rainfall index data across coutries and time periods \n \n \n')
plt.show()

print('************************************')

# Earlier inline version of the map below (disabled in the original script).
# null_data = recent['agg_to_gdp'].notnull() * 1
# map = folium.Map(location=[48, -102], zoom_start=2)
# map.choropleth(geo_data=r'world.json',
#                data=null_data,
#                columns=['country', 'agg_to_gdp'],
#                key_on='feature.properties.name', reset=True,
#                fill_color='GnBu', fill_opacity=1, line_opacity=0.2,
#                legend_name='Missing agricultural contribution to GDP data 2013-2017')
# print(map)
# plt.show()


# Show data availability on a world map.
def plot_null_map(df, time_period, variable, legend_name=None):
    """Build a folium choropleth of where `variable` is present or missing.

    Args:
        df: Long-format frame accepted by `time_slice`.
        time_period: Time period to inspect.
        variable: Indicator column to test for missing values.
        legend_name: Legend caption; falls back to `variable` when falsy.

    Returns:
        The `folium.Map` with the choropleth layer added. Countries are
        coloured by 1 (value present) or 0 (value missing).
    """
    geo = r'world.json'
    ts = time_slice(df, time_period).reset_index().copy()
    ts[variable] = ts[variable].notnull() * 1
    # Named `null_map` rather than `map` to avoid shadowing the builtin.
    null_map = folium.Map(location=[48, -102], zoom_start=2)
    # The keyword is `key_on` (the original `key_no` was a typo that makes
    # the call fail with an unexpected-keyword TypeError).
    # NOTE(review): newer folium releases replace `Map.choropleth` with
    # `folium.Choropleth(...).add_to(m)` and have no `reset` argument --
    # confirm against the installed folium version.
    null_map.choropleth(geo_data=geo,
                        data=ts,
                        columns=['country', variable],
                        key_on='feature.properties.name', reset=True,
                        fill_color='GnBu', fill_opacity=1, line_opacity=0.2,
                        legend_name=legend_name if legend_name else variable)
    return null_map


save_map = plot_null_map(data, '2013-2017', 'number_undernourished',
                         'Number undernourished is missing')
save_map.save('save_map.html')

# Per time period and indicator: how many values were collected, i.e. which
# variables were reported in which periods.
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(
    data.groupby(['time_period', 'variable']).value.count().unstack().T,
    ax=ax)
plt.xticks(rotation=45)
plt.xlabel('Time period')
plt.ylabel('Variable')
plt.title('Number of countries with data reportes')
plt.show()

# recent[['total_pop','urban_pop','rural_pop']].describe().astype(int)

# Sort by rural population and show the first rows.
recent_sort = recent.sort_values('rural_pop')[['total_pop', 'urban_pop', 'rural_pop']].head()
print(recent_sort)

# Skewness and kurtosis of the population columns.
print(recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.skew))
print(recent[['total_pop', 'urban_pop', 'rural_pop']].apply(scipy.stats.kurtosis))

# Look at the current distribution of the data.
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(recent.total_pop.values, bins=50)
ax.set_xlabel('Total population')
ax.set_ylabel('Number of contries')
ax.set_title('Distrbution of population of countries 2013-2017')
plt.show()

# Original author's rationale: when a distribution's standard deviation is
# linearly related to its mean, log-transform the data. Recompute the
# skewness on the log scale.
recent_log = recent[['total_pop']].apply(np.log).apply(scipy.stats.skew)
print(recent_log)

# Summary: learning case study.