This article documents how to implement statistical tests of association between variables in Python.
It covers four cases:
1. Continuous variable vs continuous variable: Pearson correlation coefficient.
2. Categorical variable vs. categorical variable: chi-square test.
3. Continuous variable vs. multi-category variable: one-way ANOVA F-test.
4. Continuous variable vs. binary variable: if the continuous variable is normally distributed within both groups, a t-test is used (Welch's variant when Levene's test rejects equal variances); otherwise, the Mann-Whitney U test is applied.
The main Python scripts are as follows:
import numpy as np
from scipy import stats
from scipy.stats import levene, ttest_ind
from sklearn.feature_selection import SelectKBest, chi2
def corr_continue_continue(series_1, series_2):
    """Return the Pearson correlation coefficient of two continuous samples.

    Args:
        series_1: First sample (array-like of numbers).
        series_2: Second sample, same length as ``series_1``.

    Returns:
        The Pearson correlation coefficient r.
    """
    # pearsonr returns (r, p-value); index 0 is the coefficient itself.
    # It is reached through the already-imported ``stats`` module because the
    # bare name ``pearsonr`` is not imported in this file.
    corr = stats.pearsonr(series_1, series_2)[0]
    return corr
def corr_cate_cate(series_1, series_2):
    """Chi-square test between a categorical feature and a categorical target.

    Args:
        series_1: Feature values (1-D array-like); reshaped to one column.
        series_2: Target class labels, same length as ``series_1``.

    Returns:
        Tuple ``(corr, p_value)``: the chi-square score and its p-value for
        the single feature column.
    """
    # NOTE(review): sklearn's chi2 expects non-negative numeric feature values
    # (counts / booleans). For arbitrarily coded categories a contingency-table
    # test such as scipy.stats.chi2_contingency may be more appropriate --
    # confirm against how callers encode series_1.

    # The scorer wants a 2-D (n_samples, n_features) matrix, hence one column.
    feature_column = np.asarray(series_1).reshape(-1, 1)
    # Call the scorer directly: SelectKBest(chi2, k=1) on a single column only
    # forwarded to chi2 and stored the same scores / p-values.
    scores, p_values = chi2(feature_column, series_2)
    return scores[0], p_values[0]
def corr_continue_multicate(fenlei_series_1, lianxu_series_2):
    """One-way ANOVA F-test of a continuous variable across categories.

    The continuous values are split into one sample per distinct category
    label and those samples are compared with ``scipy.stats.f_oneway``.

    Args:
        fenlei_series_1: Category label of each observation.
        lianxu_series_2: Continuous value of each observation, paired
            positionally with ``fenlei_series_1``.

    Returns:
        Tuple ``(f, p)``: the F statistic and its p-value.

    Raises:
        ValueError: If the inputs differ in length or contain fewer than two
            distinct categories.
    """
    labels = list(fenlei_series_1)
    values = list(lianxu_series_2)
    if len(labels) != len(values):
        raise ValueError(
            "category and value series must have the same length "
            f"(got {len(labels)} and {len(values)})"
        )

    # f_oneway compares sample groups, so bucket the continuous values by
    # label instead of passing the label column itself as a sample.
    groups = {}
    for label, value in zip(labels, values):
        groups.setdefault(label, []).append(value)
    if len(groups) < 2:
        raise ValueError("at least two distinct categories are required")

    f, p = stats.f_oneway(*groups.values())
    return f, p
def normal_test(series):
    """Kolmogorov-Smirnov test of ``series`` against a fitted normal law.

    The reference normal distribution uses the sample's own ``mean()`` and
    ``std()`` as location and scale, so ``series`` must expose both methods.

    Args:
        series: Sample to test.

    Returns:
        Tuple ``(statistic, pvalue)`` from ``scipy.stats.kstest``.
    """
    # NOTE(review): the parameters are estimated from the same data that is
    # tested, which presumably makes the p-value conservative -- confirm
    # whether a Lilliefors-type correction is wanted.
    location = series.mean()
    scale = series.std()
    ks_result = stats.kstest(series, 'norm', args=(location, scale))
    return ks_result[0], ks_result[1]
def corr_continue_twocate(data, fenlei_var, lianxu_var, alpha=0.05):
    """Compare a continuous variable between the two levels of a binary one.

    Each group is first checked with ``normal_test``. If either group
    rejects normality, a two-sided Mann-Whitney U test is used. Otherwise
    Levene's test picks between Student's t-test (equal variances) and
    Welch's t-test (unequal variances).

    Args:
        data: Table indexable by column name and by boolean mask
            (presumably a pandas DataFrame -- verify against callers).
        fenlei_var: Name of the binary categorical column.
        lianxu_var: Name of the continuous column.
        alpha: Significance level for the normality and Levene checks.

    Returns:
        Tuple ``(t_value, p_value, pvalue_1, pvalue_2)``: the statistic and
        p-value of the chosen test, followed by the normality p-values of
        the first and second group.

    Raises:
        ValueError: If ``fenlei_var`` does not hold exactly two categories.
    """
    # dict.fromkeys keeps first-appearance order, so the group order (and
    # with it the sign of the statistic) is reproducible; set() order is not.
    categories = list(dict.fromkeys(data[fenlei_var]))
    if len(categories) != 2:
        raise ValueError(
            f"{fenlei_var!r} must contain exactly two categories, "
            f"found {len(categories)}: {categories!r}"
        )
    first, second = categories

    series_1 = data[lianxu_var][data[fenlei_var] == first]
    series_2 = data[lianxu_var][data[fenlei_var] == second]

    _, pvalue_1 = normal_test(series_1)
    _, pvalue_2 = normal_test(series_2)

    if pvalue_1 < alpha or pvalue_2 < alpha:
        # At least one group is not normal: use the rank-based test.
        t_value, p_value = stats.mannwhitneyu(
            series_1, series_2, alternative='two-sided'
        )
    else:
        # Both groups look normal: pool variances only when Levene's test
        # does not reject equal variances; otherwise use Welch's t-test.
        _, levene_p_value = levene(series_1, series_2)
        t_value, p_value = ttest_ind(
            series_1, series_2, equal_var=bool(levene_p_value > alpha)
        )
    return t_value, p_value, pvalue_1, pvalue_2