Source: 15 Python Snippets to Optimize your Data Science Pipeline
Translated by: RankFan
In my day-to-day work I keep running into the same situations, mostly on the way from loading CSV files to visualizing the data. To streamline that process, I like to keep code snippets around that help in these different situations, from loading CSV files all the way to data visualization.
In this short article, I will share 15 Python snippets to simplify your data analysis pipelines.
import glob
import pandas as pd

# Collect every CSV file in the folder and load each one into a DataFrame
csv_files = glob.glob("path/to/folder/with/csvs/*.csv")
dfs = [pd.read_csv(filename) for filename in csv_files]
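The snippet above leaves you with a list of DataFrames. A common follow-up, which is my addition rather than part of the original article, is to stack them into a single frame, assuming all files share the same column layout:

import glob
import pandas as pd

# Load every CSV as above, then concatenate the list into one DataFrame
# (assumes the files share the same columns)
csv_files = glob.glob("path/to/folder/with/csvs/*.csv")
dfs = [pd.read_csv(filename) for filename in csv_files]
combined_df = pd.concat(dfs, ignore_index=True)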
import pandas as pd

# Read a CSV and list the unique values of one column
df = pd.read_csv("path/to/csv/file.csv")
df["Item_Identifier"].unique()
# array(['FDA15', 'DRC01', 'FDN15', ..., 'NCF55', 'NCW30', 'NCW05'], dtype=object)
Displaying two Pandas DataFrames side by side

This helper builds a single HTML string and relies on IPython's display_html, so it is meant to be run inside a Jupyter notebook.

import pandas as pd
from IPython.display import display_html
from itertools import chain, cycle

def display_side_by_side(*args, titles=cycle([''])):
    # Render each DataFrame as HTML in its own cell, with an optional title above it
    html_str = ""
    for df, title in zip(args, chain(titles, cycle(['</br>']))):
        html_str += '<th style="text-align:center"><td style="vertical-align:top">'
        html_str += f'<h2>{title}</h2>'
        html_str += df.to_html().replace('table', 'table style="display:inline"')
        html_str += '</td></th>'
    display_html(html_str, raw=True)

df1 = pd.read_csv("file.csv")
df2 = pd.read_csv("file2.csv")
display_side_by_side(df1.head(), df2.head(), titles=["Sales", "Advertising"])
Removing missing values in a Pandas DataFrame

df = pd.DataFrame(dict(a=[1, 2, 3, None]))
df
# Drop every row that contains a NaN, modifying the DataFrame in place
df.dropna(inplace=True)
df
def FindNanCol(df):
    # Print how many NaN entries each column contains
    for col in df:
        print(f"Column : {col}")
        num_Nans = df[col].isnull().sum()
        print(f"Number of Nans : {num_Nans}")

df = pd.DataFrame(dict(a=[1, 2, 3, None], b=[None, None, 5, 6]))
FindNanCol(df)
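The same per-column count can also be obtained in a single line; this one-liner is my addition rather than the article's:

import pandas as pd

df = pd.DataFrame(dict(a=[1, 2, 3, None], b=[None, None, 5, 6]))
# isnull() flags the missing entries; summing per column gives each column's NaN count
print(df.isnull().sum())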
Transforming columns with .apply and lambda functions

df = pd.DataFrame(dict(a=[10, 20, 30, 40, 50]))
# Square every value in column "a"
square = lambda x: x**2
df["a"] = df["a"].apply(square)
df
Converting two DataFrame columns into a dictionary

df = pd.DataFrame(dict(a=["a", "b", "c"], b=[1, 2, 3]))
# Zip the two columns together: {"a": 1, "b": 2, "c": 3}
df_dictionary = dict(zip(df["a"], df["b"]))
df_dictionary
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

sns.set()

df = pd.DataFrame(dict(a=np.random.randint(0, 100, 100), b=np.arange(0, 100, 1)))

plt.figure(figsize=(15, 7))
# Left panel: distribution of "b" where "a" is greater than 50
plt.subplot(1, 2, 1)
df["b"][df["a"] > 50].hist(color="green", label="bigger than 50")
plt.legend()
# Right panel: distribution of "b" where "a" is smaller than 50
plt.subplot(1, 2, 2)
df["b"][df["a"] < 50].hist(color="orange", label="smaller than 50")
plt.legend()
plt.show()
Running a t-test on different columns in pandas

from scipy.stats import ttest_rel

# Two related samples: the original data and the same data with Gaussian noise added
data = np.arange(0, 1000, 1)
data_plus_noise = np.arange(0, 1000, 1) + np.random.normal(0, 1, 1000)
df = pd.DataFrame(dict(data=data, data_plus_noise=data_plus_noise))
# Paired t-test between the two columns
print(ttest_rel(df["data"], df["data_plus_noise"]))
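ttest_rel returns the test statistic together with the p-value; the unpacking below is my sketch of how one might read the result, assuming the conventional 0.05 threshold:

from scipy.stats import ttest_rel
import numpy as np

data = np.arange(0, 1000, 1)
data_plus_noise = data + np.random.normal(0, 1, 1000)
statistic, pvalue = ttest_rel(data, data_plus_noise)
# A p-value below 0.05 would suggest the paired means differ;
# since the added noise has mean 0, we usually do not reject here.
print(f"t = {statistic:.3f}, p = {pvalue:.3f}")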
df1 = pd.DataFrame(dict(a=[1, 2, 3], b=[10, 20, 30], col_to_merge=["a", "b", "c"]))
df2 = pd.DataFrame(dict(d=[10, 20, 30], col_to_merge=["a", "b", "c"]))
# Merge the two DataFrames on their shared column
df_merged = df1.merge(df2, on="col_to_merge")
Basic normalization with sklearn

from sklearn.preprocessing import MinMaxScaler

# Scale the values of column "a" to the [0, 1] range
scaler = MinMaxScaler()
scores = scaler.fit_transform(df["a"].values.reshape(-1, 1))
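fit_transform returns a NumPy array rather than a DataFrame column; the sketch below, which is my addition, shows one way to attach the scaled values back to the frame and recover the original scale:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.DataFrame(dict(a=[10, 20, 30, 40, 50]))
scaler = MinMaxScaler()
# fit_transform expects a 2-D input, so pass a one-column DataFrame
df["a_scaled"] = scaler.fit_transform(df[["a"]]).ravel()
# inverse_transform maps the scaled values back to the original range
original = scaler.inverse_transform(df[["a_scaled"]])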
# Drop only the rows that have NaN in one specific column
df.dropna(subset=["col_to_remove_NaNs_from"], inplace=True)
Conditionally querying a subset of a DataFrame

df = pd.DataFrame(dict(result=["pass", "Fail", "pass", "Fail", "Distinction", "Distinction"]))
# Boolean mask selecting the rows that passed or got a distinction
pass_index = (df["result"] == "pass") | (df["result"] == "Distinction")
df_pass = df[pass_index]
df_pass
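The same selection can also be written with DataFrame.query; this alternative is my addition, not part of the original article:

import pandas as pd

df = pd.DataFrame(dict(result=["pass", "Fail", "pass", "Fail", "Distinction", "Distinction"]))
# Express the boolean filter as a query string
df_pass = df.query('result == "pass" or result == "Distinction"')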
import matplotlib.pyplot as plt

df = pd.DataFrame(dict(a=[10, 20, 50, 10, 10], b=["A", "B", "C", "D", "E"]))
labels = df["b"]
sizes = df["a"]
# Pie chart with a percentage label on every slice
plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')
plt.show()
def change_to_numerical(x):
    # Strip the "%" sign and keep the leading digits as an integer
    try:
        x = int(x.strip("%")[:2])
    except:
        x = int(x.strip("%")[:1])
    return x

df = pd.DataFrame(dict(a=["A", "B", "C"], col_with_percentage=["10%", "20%", "70%"]))
df["col_with_percentage"] = df["col_with_percentage"].apply(change_to_numerical)
df
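For simple percentage strings like these, a vectorized alternative (my suggestion, not from the article) is pandas' string accessor:

import pandas as pd

df = pd.DataFrame(dict(a=["A", "B", "C"], col_with_percentage=["10%", "20%", "70%"]))
# Strip the trailing "%" and cast the remaining digits to integers in one step
df["col_with_percentage"] = df["col_with_percentage"].str.rstrip("%").astype(int)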
I find snippets like these extremely useful, and rewriting them from scratch every time is a waste of time. Having a complete toolkit like this to streamline your data analysis pipelines is therefore very helpful.