# plotnine: ggplot2-style plotting for Python
import plotnine as pn
from plotnine import data
import numpy as np
import pandas as pd
#statistical analysis
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols, glm, poisson
import copy
#prevent pandas from emitting warnings (e.g. SettingWithCopyWarning about whether an operation returns a view or a copy of a DataFrame)
import warnings
warnings.filterwarnings("ignore")
# let Jupyter display every expression result in a cell automatically, instead of printing each one manually
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
Use the mtcars car dataset bundled with the plotnine library.
Select a subset df of mtcars with 32 records and 6 variables.
# Take a 6-variable subset of the 32-row mtcars dataset.
# .copy() makes df an independent DataFrame, so the later column
# assignments (astype, new "count" column) do not operate on a view of
# data.mtcars and do not rely on warnings being suppressed to hide
# SettingWithCopyWarning.
df = data.mtcars[["wt", "mpg", "cyl", "vs", "am", "gear"]].copy()
df.shape
df.dtypes
print(df.head())
(32, 6)
wt float64
mpg float64
cyl int64
vs int64
am int64
gear int64
dtype: object
wt mpg cyl vs am gear
0 2.620 21.0 6 0 1 4
1 2.875 21.0 6 0 1 4
2 2.320 22.8 4 1 1 4
3 3.215 21.4 6 1 0 3
4 3.440 18.7 8 0 0 3
Convert variables vs, am, and gear from numerical continuous variables to character categorical variables.
# Recode vs, am and gear from numeric codes to string-typed
# categorical variables, then show the resulting dtypes.
for cat_col in ["vs", "am", "gear"]:
    df[cat_col] = df[cat_col].astype(str)
df.dtypes
wt float64
mpg float64
cyl int64
vs object
am object
gear object
dtype: object
Variable distribution.
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
print(df.describe())
# Level counts of each categorical column (each bare expression is
# auto-displayed thanks to the InteractiveShell setting above).
df["vs"].value_counts()
df["am"].value_counts()
df["gear"].value_counts()
wt mpg cyl
count 32.000000 32.000000 32.000000
mean 3.217250 20.090625 6.187500
std 0.978457 6.026948 1.785922
min 1.513000 10.400000 4.000000
25% 2.581250 15.425000 4.000000
50% 3.325000 19.200000 6.000000
75% 3.610000 22.800000 8.000000
max 5.424000 33.900000 8.000000
0 18
1 14
Name: vs, dtype: int64
0 19
1 13
Name: am, dtype: int64
3 15
4 12
5 5
Name: gear, dtype: int64
Plotnine drawing
Plotnine is a Python drawing library that mimics the syntax and drawing styles of ggplot2. If familiar with R's ggplot2, this library can be quickly mastered
Official documents.
There are two main differences between the use of this library and R's ggplot2:
- The mapping parameter of ggplot() is written as aes(x = mpg, y = wt) in R, but as aes(x="mpg", y="wt") (quoted column names) in plotnine.
- Plotnine requires the whole drawing expression to be a single statement. If line breaks are needed, wrap the entire expression in parentheses (or use backslash continuations).
Here are a few simple examples.
Scatter plot; Regression line
# Scatter plot of wt vs mpg with an OLS regression line.
# The chained "+" expression is wrapped in parentheses: a trailing "+" at
# end-of-line is a SyntaxError in Python, so the multi-line plotnine
# statement must be a single parenthesized expression.
(
    pn.ggplot(data = df, mapping=pn.aes(x="mpg", y="wt")) +
    pn.geom_point() +
    pn.geom_smooth(method="lm") +
    pn.theme_classic()
)

<ggplot: (-9223371901789411352)>
Grouping
Divide by vs and plot scatter plots, regression lines, and their 95% confidence intervals separately.
# Same scatter + regression plot, but colored (and fitted) separately by vs;
# geom_smooth also draws each group's 95% confidence band.
# Parentheses make the multi-line "+" chain a single valid expression.
(
    pn.ggplot(data = df, mapping=pn.aes(x="mpg", y="wt", color="vs")) +
    pn.geom_point() +
    pn.geom_smooth(method="lm") +
    pn.theme_classic()
)

<ggplot: (-9223371901789391480)>
Splitting (xkcd theme)
Splitting according to variables vs and gear.
# Facet the scatter + regression plot into a grid: rows by vs, columns by gear,
# rendered with the xkcd comic theme.
# Parentheses make the multi-line "+" chain a single valid expression.
(
    pn.ggplot(data = df, mapping=pn.aes(x="mpg", y="wt")) +
    pn.facet_grid("vs ~ gear") +
    pn.geom_point() +
    pn.geom_smooth(method="lm") +
    pn.theme_xkcd()
)

<ggplot: (-9223371901789166508)>
Descriptive statistics
Missing values of each variable.
# Number of missing values per column, smallest first.
# df.isnull().sum() is the idiomatic (and vectorized) equivalent of
# apply(lambda x: sum(x.isnull())).
df.isnull().sum().sort_values()
wt 0
mpg 0
cyl 0
vs 0
am 0
gear 0
count 0
dtype: int64
Distribution information of continuous and categorical variables.
# Numeric columns: count, mean, std, min, quartiles, max.
print(df.describe())
# Categorical (object-dtype) columns:
#   count  - number of non-missing values
#   unique - number of distinct values
#   top    - the most frequent value
#   freq   - how many times the most frequent value occurs
print(df.describe(include="object"))
wt mpg cyl
count 32.000000 32.000000 32.000000
mean 3.217250 20.090625 6.187500
std 0.978457 6.026948 1.785922
min 1.513000 10.400000 4.000000
25% 2.581250 15.425000 4.000000
50% 3.325000 19.200000 6.000000
75% 3.610000 22.800000 8.000000
max 5.424000 33.900000 8.000000
vs am gear
count 32 32 32
unique 2 2 3
top 0 0 3
freq 18 19 15
# Absolute frequency of each vs level.
df["vs"].value_counts()
# Relative frequency (proportion) of each vs level.
df["vs"].value_counts(normalize=True)
0 18
1 14
Name: vs, dtype: int64
0 0.5625
1 0.4375
Name: vs, dtype: float64
Other descriptive statistics
# Variance of wt. NOTE: np.var/np.std default to ddof=0 (population
# formula), whereas pandas' describe()/std() use ddof=1 (sample formula) --
# which is why the std printed here (0.963) differs from describe()'s 0.978.
np.var(df["wt"])
# Standard deviation of wt (population, ddof=0).
np.std(df["wt"])
0.927460875
0.9630477013107918
# Mode of wt: scipy.stats.mode returns the modal value and its count.
stats.mode(df["wt"])
ModeResult(mode=array([3.44]), count=array([3]))
# Skewness of wt (0 for a perfectly symmetric distribution).
stats.skew(df["wt"])
0.44378553550607736
# Kurtosis of wt (scipy's default is excess kurtosis: 0 for a normal distribution).
stats.kurtosis(df["wt"])
0.1724705401587343
The standard error of the mean (the standard deviation of the sampling distribution of the mean) for a normally distributed sample, and its 95% confidence interval.
# Standard error of the mean. The SE conventionally uses the *sample*
# standard deviation, so pass ddof=1 (np.std defaults to the population
# formula, ddof=0, which slightly understates the SE).
se = np.std(df["wt"], ddof=1) / np.sqrt(len(df["wt"]))
se
# Normal-approximation 95% confidence interval for the mean: mean +/- 1.96*SE.
np.mean(df["wt"]) - 1.96 * se
np.mean(df["wt"]) + 1.96 * se
0.17024439005074438
2.883570995500541
3.550929004499459
Statistical testing
Normality test
P = 0.09 > 0.05, so we fail to reject the null hypothesis at the 0.05 significance level; there is no evidence that wt deviates from a normal distribution.
# Shapiro-Wilk normality test for wt: returns the W statistic and p-value.
stats.shapiro(df["wt"])
ShapiroResult(statistic=0.9432578682899475, pvalue=0.09265592694282532)
Two independent sample mean t-test
P = 0.38, so no evidence of unequal variances was found.
# Bartlett's test for equal variances of wt between the two vs groups.
stats.bartlett(df.loc[df["vs"] == "0", "wt"], df.loc[df["vs"] == "1", "wt"])
BartlettResult(statistic=0.7611752294629192, pvalue=0.38296098768025166)
P= 0.001, there is a significant difference in the mean values between the two populations.
# Bartlett's test above showed equal variances, so equal_var=True (Student's t);
# with unequal variances use Welch's t (equal_var=False) or a rank-sum test.
stats.ttest_ind(df.loc[df["vs"] == "0", "wt"], df.loc[df["vs"] == "1", "wt"], equal_var=True)
Ttest_indResult(statistic=3.653533152238974, pvalue=0.0009798492309250216)
T-test for the mean of two correlated samples
# Build two equal artificial "groups" of 16 rows each for a paired t-test demo.
df_copy = copy.deepcopy(df)
# .loc slicing is inclusive on both ends, so use non-overlapping slices:
# rows 0-15 -> "0", rows 16-31 -> "1". (The original used :16 and 16:, which
# both cover row 16 and silently relied on the second assignment winning.)
df_copy.loc[:15, "group"] = "0"
df_copy.loc[16:, "group"] = "1"
df_copy["group"].value_counts()
0 16
1 16
Name: group, dtype: int64
# Paired-sample t-test of wt between the two artificial groups
# (equivalent to testing whether the mean pairwise difference is 0).
stats.ttest_rel(df_copy.loc[df_copy["group"] == "0", "wt"],
df_copy.loc[df_copy["group"] == "1", "wt"])
Ttest_relResult(statistic=2.0782957471723527, pvalue=0.05526298801644364)
Regarding independent samples or related samples, for example:
A randomized controlled trial (RCT) was conducted to investigate the antihypertensive effects of drug A and drug B on elderly individuals. The study was divided into two groups: Group A and Group B, with each group taking medication for 14 days. Blood pressure values were measured at baseline (at enrollment) and at 14 days, respectively. For internal groups A or B, to investigate whether there is a significant difference in blood pressure values before and after medication, two related sample t-tests can be used (equivalent to comparing whether the mean difference before and after is significantly different from 0); If you want to compare whether there is a significant difference in the antihypertensive effect between Group A and Group B, you can first obtain the changes in blood pressure before and after the A and B groups, and then perform independent sample t-tests on the changes in the two groups [other methods such as covariance analysis, baseline blood pressure as covariate].
Rank sum test for two independent sample distributions
# Wilcoxon rank-sum test comparing the wt distributions of the two vs groups.
stats.ranksums(df.loc[df["vs"] == "0", "wt"], df.loc[df["vs"] == "1", "wt"])
RanksumsResult(statistic=3.2668698585096214, pvalue=0.0010874365767340446)
Rank sum test for the distribution of two correlated samples
# Wilcoxon signed-rank test for the two paired (related) samples of wt.
stats.wilcoxon(df_copy.loc[df_copy["group"] == "0", "wt"],
df_copy.loc[df_copy["group"] == "1", "wt"])
WilcoxonResult(statistic=27.0, pvalue=0.033538818359375)
Analysis of variance
First fit the regression model, then perform an analysis of variance on the regression model to obtain the analysis of variance table P= 0.0003 indicates a significant difference in mean values among the three groups.
# One-way ANOVA of wt across gear levels: fit an OLS model with gear as a
# categorical factor, then produce the ANOVA table from the fitted model.
print(sm.stats.anova_lm(ols("wt ~ C(gear)", data = df).fit()))
df sum_sq mean_sq F PR(>F)
C(gear) 2.0 12.878947 6.439473 11.115889 0.000261
Residual 29.0 16.799801 0.579303 NaN NaN
# Tukey HSD post-hoc pairwise comparisons of mean wt between gear levels
# (controls the family-wise error rate at 0.05).
mc = sm.stats.multicomp.MultiComparison(df["wt"], df["gear"])
print(mc.tukeyhsd())
Multiple Comparison of Means - Tukey HSD, FWER=0.05
====================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------
3 4 -1.2759 0.001 -2.0038 -0.5481 True
3 5 -1.26 0.0089 -2.2305 -0.2895 True
4 5 0.0159 0.9 -0.9844 1.0163 False
----------------------------------------------------
Chi square test
Method selection: (n is the total number of samples, E is the theoretical frequency of each cell).
- When n >= 40 and every E >= 5: use the ordinary chi-square test; if the resulting p is close to the significance level alpha, use Fisher's exact test instead.
- When n >= 40 but some cell has 1 <= E < 5: use the continuity-corrected chi-square test or Fisher's exact test.
- When n < 40, or some E <= 1: use Fisher's exact test.
# 2x2 contingency table of vs by am.
print(pd.crosstab(df["vs"], df["am"]))
am 0 1
vs
0 12 6
1 7 7
Since n = 32 < 40, Fisher's exact test should be used directly; correction=True is shown here only to demonstrate its usage.
# Chi-square test with Yates' continuity correction on the vs x am table;
# returns (statistic, p-value, dof, expected frequencies).
stats.chi2_contingency(pd.crosstab(df["vs"], df["am"]), correction = True)
(0.34753550543024225,
0.5555115470131495,
1,
array([[10.6875, 7.3125],
[ 8.3125, 5.6875]]))
Fisher's exact probability method: p= 0.47, no significant correlation was found between the distribution of the two variables.
# Fisher's exact test on the 2x2 table: returns (odds ratio, two-sided p-value).
stats.fisher_exact(pd.crosstab(df["vs"], df["am"]))
(2.0, 0.4726974416017807)
Related analysis
Pearson related
# Pearson (linear) correlation between wt and mpg: (coefficient, p-value).
stats.pearsonr(df["wt"], df["mpg"])
(-0.8676593765172278, 1.2939587013505124e-10)
Spearman related
# Spearman (rank-based) correlation between wt and mpg.
stats.spearmanr(df["wt"], df["mpg"])
SpearmanrResult(correlation=-0.8864220332702978, pvalue=1.487594858127497e-11)
Regression analysis
Multiple linear regression
lm.summary() returns a large amount of information; the regression coefficients are in the coef column. The coefficient of mpg is -0.1191 with p < 0.05, meaning that for every 1-unit increase in mpg, holding cyl constant, wt decreases by 0.12 (95% CI: -0.18 to -0.06) units on average.
# Multiple linear regression of wt on mpg and cyl (formula interface).
lm = ols("wt ~ mpg + cyl", df).fit()
print(lm.summary())
OLS Regression Results
==============================================================================
Dep. Variable: wt R-squared: 0.760
Model: OLS Adj. R-squared: 0.743
Method: Least Squares F-statistic: 45.82
Date: Sun, 19 Jul 2020 Prob (F-statistic): 1.05e-09
Time: 17:06:33 Log-Likelihood: -21.393
No. Observations: 32 AIC: 48.79
Df Residuals: 29 BIC: 53.18
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 5.0760 1.117 4.544 0.000 2.791 7.361
mpg -0.1191 0.028 -4.216 0.000 -0.177 -0.061
cyl 0.0863 0.095 0.905 0.373 -0.109 0.281
==============================================================================
Omnibus: 5.603 Durbin-Watson: 0.987
Prob(Omnibus): 0.061 Jarque-Bera (JB): 4.488
Skew: 0.910 Prob(JB): 0.106
Kurtosis: 3.234 Cond. No. 277.
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Regression fitting value of wt.
# Fitted (predicted) wt values for the rows the model was trained on.
lm.predict(df)
0 3.092788
1 3.092788
2 2.705930
3 3.045155
4 3.539185
5 3.438123
6 4.063142
7 2.515401
8 2.705930
9 3.307134
10 3.473847
11 3.813072
12 3.705899
13 3.955969
14 4.527559
15 4.527559
16 4.015510
17 1.562751
18 1.800914
19 1.384130
20 2.860736
21 3.920245
22 3.955969
23 4.182224
24 3.479645
25 2.170065
26 2.324871
27 1.800914
28 3.884521
29 3.247593
30 3.979786
31 2.872644
dtype: float64
Contains qualitative variables
Qualitative variables are written into C(), with reference specifying the reference level
The coefficient 0.0738 means that, holding mpg constant, the mean wt of the vs=1 group is 0.07 units higher than that of the reference group vs=0; however, the p-value (p = 0.76) is not significant. (Usually one checks the p-value first: if it is clearly above the chosen significance level, there is no need to interpret the coefficient further.)
# Categorical predictor via C(); Treatment(reference=0) makes vs=0 the baseline level.
lm = ols("wt ~ mpg + C(vs, Treatment(reference=0))", df).fit()
print(lm.summary())
OLS Regression Results
==============================================================================
Dep. Variable: wt R-squared: 0.754
Model: OLS Adj. R-squared: 0.737
Method: Least Squares F-statistic: 44.36
Date: Sun, 19 Jul 2020 Prob (F-statistic): 1.51e-09
Time: 17:06:33 Log-Likelihood: -21.786
No. Observations: 32 AIC: 49.57
Df Residuals: 29 BIC: 53.97
Df Model: 2
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------------
Intercept 6.0973 0.353 17.274 0.000 5.375 6.819
C(vs, Treatment(reference=0))[T.1] 0.0738 0.239 0.308 0.760 -0.416 0.563
mpg -0.1450 0.020 -7.243 0.000 -0.186 -0.104
==============================================================================
Omnibus: 5.895 Durbin-Watson: 1.108
Prob(Omnibus): 0.052 Jarque-Bera (JB): 4.733
Skew: 0.932 Prob(JB): 0.0938
Kurtosis: 3.279 Cond. No. 89.3
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Add nonlinear transformation
Transforming the dependent variable: this is equivalent to first computing log_wt = log(wt) and then regressing log_wt on the other variables.
The coefficient of mpg, -0.0495, can be interpreted as: for every 1-unit increase in mpg, wt changes by (e^(-0.05) - 1) x 100% (a decrease of about 5%), with a significant p-value.
# Transform the response inside the formula: regress log(wt) on the predictors.
lm = ols("np.log(wt) ~ mpg + C(vs, Treatment(reference=0))", df).fit()
print(lm.summary())
OLS Regression Results
==============================================================================
Dep. Variable: np.log(wt) R-squared: 0.812
Model: OLS Adj. R-squared: 0.799
Method: Least Squares F-statistic: 62.66
Date: Sun, 19 Jul 2020 Prob (F-statistic): 2.97e-11
Time: 17:06:33 Log-Likelihood: 18.563
No. Observations: 32 AIC: -31.13
Df Residuals: 29 BIC: -26.73
Df Model: 2
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------------
Intercept 2.0995 0.100 20.988 0.000 1.895 2.304
C(vs, Treatment(reference=0))[T.1] 0.0371 0.068 0.547 0.588 -0.102 0.176
mpg -0.0495 0.006 -8.724 0.000 -0.061 -0.038
==============================================================================
Omnibus: 2.197 Durbin-Watson: 1.607
Prob(Omnibus): 0.333 Jarque-Bera (JB): 1.872
Skew: 0.471 Prob(JB): 0.392
Kurtosis: 2.282 Cond. No. 89.3
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Transforming the independent variable here is equivalent to taking the square of mpg to obtain variable mpg2, and then using mpg2 to participate in regression
The regression equation can be written as: predicted wt = 4.53 - 0.003 x mpg^2 - 0.10 x vs.
# Transform a predictor inside the formula: regress wt on mpg squared.
lm = ols("wt ~ np.square(mpg) + C(vs, Treatment(reference=0))", df).fit()
print(lm.summary())
OLS Regression Results
==============================================================================
Dep. Variable: wt R-squared: 0.680
Model: OLS Adj. R-squared: 0.658
Method: Least Squares F-statistic: 30.78
Date: Sun, 19 Jul 2020 Prob (F-statistic): 6.75e-08
Time: 17:06:33 Log-Likelihood: -25.982
No. Observations: 32 AIC: 57.96
Df Residuals: 29 BIC: 62.36
Df Model: 2
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------------
Intercept 4.5263 0.198 22.906 0.000 4.122 4.930
C(vs, Treatment(reference=0))[T.1] -0.0965 0.265 -0.364 0.718 -0.638 0.445
np.square(mpg) -0.0029 0.000 -5.803 0.000 -0.004 -0.002
==============================================================================
Omnibus: 6.860 Durbin-Watson: 0.883
Prob(Omnibus): 0.032 Jarque-Bera (JB): 5.754
Skew: 1.027 Prob(JB): 0.0563
Kurtosis: 3.307 Cond. No. 1.35e+03
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.35e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Logistic regression
Binomial specifies that the dependent variable follows a binomial distribution, and logit specifies the link function: logit(p) = ln(p / (1 - p)).
# Logistic regression via GLM: binomial family with a logit link.
# statsmodels expects a link *instance*; the lowercase class alias
# (links.logit) is deprecated and removed in recent versions, so
# instantiate links.Logit() explicitly.
logit = glm("vs ~ mpg + C(am, Treatment(reference=0))", df,
            family=sm.families.Binomial(link=sm.families.links.Logit())).fit()
print(logit.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: ['vs[0]', 'vs[1]'] No. Observations: 32
Model: GLM Df Residuals: 29
Model Family: Binomial Df Model: 2
Link Function: logit Scale: 1.0000
Method: IRLS Log-Likelihood: -10.323
Date: Sun, 19 Jul 2020 Deviance: 20.646
Time: 17:06:33 Pearson chi2: 20.2
No. Iterations: 6
Covariance Type: nonrobust
======================================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------------------
Intercept 12.7051 4.625 2.747 0.006 3.640 21.770
C(am, Treatment(reference=0))[T.1] 3.0073 1.599 1.880 0.060 -0.128 6.142
mpg -0.6809 0.252 -2.698 0.007 -1.176 -0.186
======================================================================================================
OR value
In logistic regression, exponentiating a regression coefficient gives the OR (odds ratio). When the outcome is rare (incidence < 10%), the OR approximates the RR and can be used to quantify the effect of the exposure variable. For example, the coefficient of am is 3.0073, so holding mpg constant the odds of the outcome in the am=1 group are e^3.01 = 20.23 times those of the am=0 group; the p-value is marginally non-significant (p = 0.06). [Since the incidence of the vs outcome exceeds 40%, the logistic-regression OR is not an appropriate effect estimate here; a log-binomial model could instead be used to estimate the prevalence ratio (PR).]
# Exponentiate the coefficients (skipping the intercept) to get odds ratios.
np.exp(logit.params[1:])
C(am, Treatment(reference=0))[T.1] 20.232169
mpg 0.506151
dtype: float64
Poisson regression
The dependent variable of Poisson regression is the number of events per unit time or unit space, which is a count variable (positive integer).
# Simulate a count outcome "count" from a Poisson distribution with mean 5.
# Seed the RNG so the simulated data -- and every downstream regression
# result -- is reproducible across runs.
np.random.seed(0)
df["count"] = np.random.poisson(lam=5, size=len(df))
df["count"].value_counts()
5 11
4 5
3 4
6 3
2 3
10 2
8 2
13 1
7 1
Name: count, dtype: int64
# Poisson regression of count on mpg and vs (vs=0 as the reference level).
poi = poisson("count ~ mpg + C(vs, Treatment(reference=0))", df).fit()
print(poi.summary())
Optimization terminated successfully.
Current function value: 2.174190
Iterations 5
Poisson Regression Results
==============================================================================
Dep. Variable: count No. Observations: 32
Model: Poisson Df Residuals: 29
Method: MLE Df Model: 2
Date: Sun, 19 Jul 2020 Pseudo R-squ.: 0.02153
Time: 17:06:33 Log-Likelihood: -69.574
converged: True LL-Null: -71.105
Covariance Type: nonrobust LLR p-value: 0.2163
======================================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------------------
Intercept 2.0939 0.313 6.691 0.000 1.481 2.707
C(vs, Treatment(reference=0))[T.1] -0.0290 0.211 -0.137 0.891 -0.443 0.385
mpg -0.0218 0.018 -1.199 0.230 -0.057 0.014
======================================================================================================
RR value
In Poisson regression, exponentiating a regression coefficient gives the RR (relative risk / rate ratio), which quantifies the effect of the exposure variable.
The coefficient of vs is -0.0290, meaning that, holding mpg constant, the event rate of the vs=1 group is e^(-0.03) = 0.97 times that of the vs=0 group; the p-value is not significant (p = 0.89).
# Exponentiate the coefficients (skipping the intercept) to get rate ratios (RR).
np.exp(poi.params[1:])
C(vs, Treatment(reference=0))[T.1] 0.971449
mpg 0.978414
dtype: float64