一、导入库
import pandas as pd import numpy as np import matplotlib.pyplot as plt import os import openpyxl import datetime as dt
df=pd.read_excel(os.getcwd() + os.path.sep + "data.xlsx")
df.sort_values('memberid').head(5) def retention(data,n,dur=False): result = {} dates = sorted(set(data['购买日期'])) new_member = data[['memberid','购买日期']].groupby('memberid').min() for date in dates: date_pay_p = set(new_member[new_member['购买日期']==date].index) # 当日首次下单用户 daten = date + dt.timedelta(days=n) if dur == True: daten_pay_p = set(df[(df['购买日期'] > date) & (df['购买日期']<=daten)]['memberid']) # 首次下单后n日内(含第n日)再次下单用户 else: daten_pay_p = set(df[df['购买日期']==daten]['memberid']) # 首次下单第n日再次下单用户 day_count = len(date_pay_p) nday_count = len(daten_pay_p) retention_count = len(date_pay_p.intersection(daten_pay_p)) if len(date_pay_p) == 0: retention_rate = np.nan else: retention_rate = retention_count/day_count result[date] = [day_count,nday_count,retention_count,retention_rate] result = pd.DataFrame.from_dict(result).T result.columns=['day_count','nday_count','retention_count','retention_rate'] return(result)
retention_table = retention(df,3) retention_table['Month'] = pd.to_datetime(retention_table.index).month retention_table.groupby(retention_table['Month'])['retention_rate'].mean().plot()
阿里云盘:https://www.aliyundrive.com/s/R1y61iU6zdT