仅仅记录一下简单的特征分析法:
数据集仍是:Pima印第安人糖尿病数据集
主要内容包括:数据分布(直方图)、散点图矩阵、相关性矩阵(热力图)分析等。
代码如下:
# Simple exploratory feature analysis of the Pima Indians Diabetes dataset:
# per-column histograms, a pairwise scatter-plot matrix colored by the class
# label, and an annotated correlation heatmap.

# NOTE(review): `index`, `sort` and `pyplot` are not used in this snippet;
# they are retained in case other parts of the file rely on them.
from operator import index

from numpy import loadtxt
from numpy import sort
from matplotlib import pyplot
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ---------------------- Load data ---------------------------------------------
# Location of the CSV file; adjust to wherever the file lives locally.
DATA_PATH = r'C:\Users\Administrator\Desktop\pima-indians-diabetes.csv'

# One name per CSV column, in file order. 'Target' is the column used to
# color the scatter-plot matrix below.
COLUMN_NAMES = [
    'pregnants',
    'Plasma_glucose_concentration',
    'blood_pressure',
    'Triceps_skin_fold_thickness',
    'serum_insulin',
    'BMI',
    'Diabetes_pedigree_function',
    'Age',
    'Target',
]

# skiprows=1 drops the first line of the file (presumably a header row --
# TODO confirm against the CSV). ndmin=2 keeps the result two-dimensional even
# when the file holds a single data row, so the DataFrame construction below
# does not fail on a squeezed 1-D array.
dataset = loadtxt(DATA_PATH, delimiter=",", skiprows=1, ndmin=2)

# ---------------------- Convert the array to a DataFrame with column names ----
# loadtxt already returns an ndarray, so no copy or re-slicing is needed.
data = np.asarray(dataset)
df = pd.DataFrame(data=data, columns=COLUMN_NAMES)

# ---------------------- Plot the distribution of each column ------------------
df.hist(figsize=(16, 14))

# ---------------------- Scatter-plot analysis ---------------------------------
sns.pairplot(df, hue="Target")

# ---------------------- Correlation matrix ------------------------------------
figure, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(df.corr(), square=True, annot=True, ax=ax)

# Display every figure created above.
plt.show()
最后结果: