# This notebook analyses the customer data with K-Means clustering.
import warnings

import matplotlib
import numpy as npp
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

warnings.filterwarnings('ignore')
# Show every row and column when a frame is printed, and render floats with
# six digits after the decimal point.
def _format_float(value):
    """Return *value* formatted with six decimal places."""
    return '%.6f' % value


pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option('display.float_format', _format_float)
# Read the data and rename the columns to their Chinese display names.
df = pd.read_csv('Mall_Customers.csv')
df = df.rename(columns={
    'CustomerID': '顾客编号',
    'Genre': '性别',
    'Age': '年纪',
    'Annual Income (k$)': '年收入',
    'Spending Score (1-100)': '消费分数',
})

# Encode gender numerically: Male -> 1, Female -> 0.
# The result is assigned back to the column instead of calling
# `df.性别.replace(..., inplace=True)`: an in-place method on a column
# accessor is a chained assignment, which pandas deprecates and which does
# not modify `df` under Copy-on-Write.
# NOTE: any value other than 'Male'/'Female' becomes NaN with `map`.
df['性别'] = df['性别'].map({'Male': 1, 'Female': 0})
# Data cleaning.
# Author's note: the data set is small (200 rows) and shows no missing values
# at a glance; the per-column null count below confirms it.
df.isna().sum()
# Per-column mean and standard deviation, stored as the rows 'mean' and 'std'
# so the centroids can be mapped back to the original units later.
dfms = pd.DataFrame({'mean': df.mean(), 'std': df.std()}).T

# Standardise the data: z-score every column in one vectorised step, then put
# the gender flag back unchanged because it is a category code, not a
# measurement.
df_scaled = (df - dfms.loc['mean']) / dfms.loc['std']
df_scaled['性别'] = df['性别']
df_scaled
# Split the standardised rows by gender (0 = female, 1 = male) and drop the
# first column (presumably the customer ID - confirm against the CSV layout).
# The gender column itself is kept in both subsets.
dff = df_scaled[df_scaled['性别'].eq(0)].iloc[:, 1:]
dfm = df_scaled[df_scaled['性别'].eq(1)].iloc[:, 1:]
# Find the best number of centroids.
def numbers_of_clusters(df, max_clusters=19):
    """Compute the K-Means inertia for every cluster count from 1 to max_clusters.

    The inertia (within-cluster sum of squares) is used to judge how many
    clusters are appropriate: it shrinks as clusters are added, and the
    "elbow" where the improvement levels off is the usual choice.

    Args:
        df: Feature table to cluster; passed to ``KMeans.fit`` as is.
        max_clusters: Largest cluster count to try (inclusive). Defaults to
            19, the range the original code evaluated.

    Returns:
        A DataFrame with one row per fitted model and two columns:
        ``n_clusters`` (the cluster count actually used for the fit) and
        ``within_cluster_sum_of_square`` (that model's inertia).
    """
    cluster_counts = list(range(1, max_clusters + 1))
    inertias = []
    for k in cluster_counts:
        km = KMeans(n_clusters=k, random_state=158)
        km.fit(df)
        inertias.append(km.inertia_)
    # Record the real k next to each inertia. The previous version took the
    # labels from reset_index(), i.e. 0..18, so every inertia was reported
    # against k - 1.
    return pd.DataFrame({
        'n_clusters': cluster_counts,
        'within_cluster_sum_of_square': inertias,
    })
# Build one elbow curve per gender. The two results are kept in separate
# variables; previously the female curve was overwritten by the male one, so
# both subplots showed the male data.
df_female = numbers_of_clusters(dff)
df_male = numbers_of_clusters(dfm)
# `df_final` used to end up holding the male curve; keep that binding for any
# later cell that still reads it.
df_final = df_male

# Configure matplotlib before the figure is created so the settings apply.
matplotlib.rcParams['font.family'] = 'SimHei'  # font able to render Chinese labels
matplotlib.rcParams['figure.figsize'] = (16, 6)
matplotlib.rcParams['font.size'] = 12

# Cluster count picked from the elbow of the curves (the author settled on 5).
CHOSEN_K = 5


def _plot_elbow(axis, elbow, title, chosen_k):
    """Draw one inertia-vs-cluster-count curve and star the chosen count."""
    axis.plot(elbow.n_clusters, elbow.within_cluster_sum_of_square)
    # Take the ticks from the data instead of a hard-coded range.
    axis.set_xticks(elbow.n_clusters)
    axis.set_title(title)
    chosen = elbow[elbow.n_clusters == chosen_k]
    axis.scatter(
        x=chosen.n_clusters,
        y=chosen.within_cluster_sum_of_square,
        color='black',
        marker='*',
    )


# Plot the two line charts side by side, female on the left, male on the right.
fig, (ax_female, ax_male) = plt.subplots(1, 2)
_plot_elbow(ax_female, df_female, 'Female', CHOSEN_K)
_plot_elbow(ax_male, df_male, 'Male', CHOSEN_K)
# Customer segmentation.
def k_means(n_clusters, df, gender):
    """Cluster *df* with K-Means and summarise each cluster by its centroid.

    Args:
        n_clusters: Number of clusters to fit.
        df: Feature table to cluster; its column names label the centroid
            coordinates.
        gender: Value written into the '性别' column of every returned row.

    Returns:
        A DataFrame with one row per cluster: the centroid coordinates, the
        '性别' column set to *gender*, and 'count' with the number of samples
        assigned to that cluster.
    """
    model = KMeans(n_clusters=n_clusters, random_state=0).fit(df)
    summary = pd.DataFrame(model.cluster_centers_, columns=df.columns)
    summary['性别'] = gender
    # value_counts() is indexed by cluster label, so the assignment lines each
    # size up with the centroid row of the same number.
    cluster_sizes = pd.Series(model.labels_).value_counts()
    summary['count'] = cluster_sizes
    return summary


# Five clusters per gender, stacked into one table (male rows first).
df1 = k_means(5, dfm, 'Male')
df2 = k_means(5, dff, 'Female')
dfc_scaled = pd.concat([df1, df2], axis=0)
dfc_scaled
# Undo the standardisation so the centroids read in the original units.
# Start from a copy so the gender and count columns carry over untouched,
# then overwrite each remaining column with value * std + mean, truncated to
# an integer.
dfc = dfc_scaled.copy()
for column in dfc_scaled.columns:
    if column in ('性别', 'count'):
        continue
    restored = dfc_scaled[column] * dfms.loc['std', column] + dfms.loc['mean', column]
    dfc[column] = restored.astype(int)
dfc
# Label each cluster with a customer type.
# Placeholder value; the per-gender tables below replace it with a description.
dfc['type'] = 1

# Overall mean annual income and spending score. Not used further in this
# cell - presumably the reference levels behind the hand-written labels.
a_i = dfms.loc['mean']['年收入']
s_s = dfms.loc['mean']['消费分数']

# Take explicit copies: assigning a column to a boolean-filtered selection is
# the chained-assignment pattern pandas warns about, and a copy makes it
# unambiguous that only the per-gender tables are modified.
dfcm = dfc[dfc['性别'] == 'Male'].copy()
dfcf = dfc[dfc['性别'] == 'Female'].copy()

# The descriptions are matched to rows by index label (0..4), i.e. by cluster
# number. NOTE(review): they are tied to the cluster numbering of this
# particular fit - re-check them if the data or the random seed changes.
remark = ['年长/有孩子的收入一般的潜在男性客户','中年/有孩子的收入较高的优质男客户','年轻的收入一般的潜力男客户','年长/有孩子的收入较低的男客户','中年/有孩子的收入较高的潜在男客户']
dfcm['type'] = pd.Series(remark)
remark = ['年长/有孩子的收入一般的潜在女性客户','年轻的收入一般的潜力女客户','中年/有孩子的收入较高的优质女客户','年轻的收入较低的可发展女客户','中年/有孩子的收入较高的一般女客户']
dfcf['type'] = pd.Series(remark)
dfcm
# Bare expression: in an interactive notebook cell this displays the `dfcf`
# table; run as a plain script it has no visible effect.
dfcf