# Load a CSV file into a DataFrame.
pd.read_csv(filepath_or_buffer='/home/xxx/xxxx_0818.csv')
# Alternative source (requires Spark to be configured):
#   spark.sql(''' select * from tbl where member_id = '001' ''').toPandas()
# Empty frame with three columns named feature, ex, pvalue:
#   pd.DataFrame({'feature':[], 'ex':[], 'pvalue':[]})
# The same kind of frame populated with data:
#   pd.DataFrame({'feature':index, 'ex':example, 'pvalue':p, 'chi':chi}, index = [1])
# Keep only the rows whose 'col1' value is not null.
# notnull() is the direct form of `isnull() == False` and yields the same boolean mask.
df_new[df_new['col1'].notnull()]
# Select the rows whose 'col1' value is strictly below 0.05.
dataframe.loc[dataframe['col1'].lt(0.05)]
# Append the rows of df_new below df_orig and renumber the index from 0.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
df_orig = pd.concat([df_orig, df_new], ignore_index=True)
# Two separate join examples; they were fused onto one line, which is a
# syntax error. Note that the second assignment rebinds `cols`.
cols = gender.join(is_married)
cols = basic.join(age_segment1)
# Left-join m_info onto member_level using the shared 'member_id' key.
member_level_merge = pd.merge(
    left=member_level,
    right=m_info,
    how='left',
    on='member_id',
)
# Sort rows by 'col1', largest values first (returns a new object).
dataframe.sort_values(by='col1', ascending=False)
# Drop rows that repeat an already-seen 'member_id' (returns a new object).
dataframe.drop_duplicates(subset='member_id')
# Two ways to count how often each 'date_difference' value occurs.
# The original fused both statements on one line and left the second
# string literal without its closing quote.
table['date_difference'].value_counts()
table.groupby('date_difference').size()
# (Editor placeholder left in the original notes: "insert code snippet here".)
# Positional slice: every row, first through third columns.
table.iloc[:, :3]
# Print the name of every column.
# DataFrame.iteritems() was removed in pandas 2.0; items() is the replacement.
# It yields (column name, column Series) pairs, so `row` here holds a column.
for index, row in dataframe.items():
    print(index)
# Or, alternatively:
# Print the name of every column.
# The original printed `index`, which is not this loop's variable.
for col in df.columns:
    print(col)
# Build the feature matrix by dropping columns, and pick the label column.
# The `y` assignment used to sit inside the preceding inline comment and
# therefore never executed.
X = df.drop(['col1', 'col2', 'col3'], axis=1)  # drop columns
y = df.label  # select a single column
# Contingency table of label (rows) against feature (columns),
# including the row/column totals.
pd.crosstab(index=label, columns=feature, margins=True)
# Chi-square test of independence on a contingency table.
# Both calls were previously swallowed into one comment line.
# Element [1] of the result is the p-value.
scipy.stats.chi2_contingency(cross_table)[1]
# Element [0] of the result is the chi^2 statistic.
scipy.stats.chi2_contingency(cross_table)[0]
# Chi-square feature selection keeping the 2 best-scoring features.
# The five statements were fused onto one line, which is a syntax error.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

skb = SelectKBest(chi2, k=2)
skb = skb.fit(cols, feat)
# Boolean mask marking which input features were selected.
skb.get_support()
# Both calls were previously swallowed into one comment line.
# Show all rows when printing (None removes the limit).
pd.set_option('display.max_rows', None)
# Show all columns when printing (None removes the limit).
pd.set_option('display.max_columns', None)