Statistical Analysis
SciPy's scipy.stats module is one of the most comprehensive statistical analysis tools in Python. It provides a large number of probability distributions, statistical functions, and hypothesis testing methods, making it an important tool for data science and statistical analysis. This chapter will provide a detailed introduction to using these functions for various statistical analyses.
scipy.stats Module Overview
The scipy.stats module includes the following main functions:
- 80+ continuous and discrete probability distributions
- Descriptive statistics functions
- Hypothesis testing methods
- Correlation and regression analysis
- Kernel density estimation
- Statistical distance computation
python
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
# Set plot style
plt.style.use('seaborn-v0_8')  # seaborn-like style bundled with matplotlib >= 3.6
sns.set_palette("husl")
# View main functions of stats module
print("scipy.stats Main Functions:")
# Public API only: skip names starting with an underscore
functions = [attr for attr in dir(stats) if not attr.startswith('_')]
print(f"Total {len(functions)} functions and classes")
print("Some functions:", functions[:15])

Probability Distributions
1. Continuous Distributions
Normal Distribution
python
# Normal distribution is the most important continuous distribution
mu, sigma = 0, 1  # Standard normal distribution
# Create a "frozen" normal distribution object (parameters fixed once)
norm_dist = stats.norm(loc=mu, scale=sigma)
# Probability Density Function (PDF)
x = np.linspace(-4, 4, 1000)
pdf_values = norm_dist.pdf(x)
# Cumulative Distribution Function (CDF)
cdf_values = norm_dist.cdf(x)
# Percent Point Function (PPF): the inverse of the CDF (quantile function)
quantiles = [0.025, 0.25, 0.5, 0.75, 0.975]
ppf_values = norm_dist.ppf(quantiles)
print(f"Normal Distribution N({mu}, {sigma}²):")
print(f"Quantiles: {dict(zip(quantiles, ppf_values))}")
# Random number generation (random_state fixed for reproducibility)
random_samples = norm_dist.rvs(size=1000, random_state=42)
# Visualize: 2x2 grid of PDF, CDF, sample histogram and Q-Q plot
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# PDF
axes[0, 0].plot(x, pdf_values, 'b-', linewidth=2, label='PDF')
axes[0, 0].fill_between(x, pdf_values, alpha=0.3)
axes[0, 0].set_title('Probability Density Function (PDF)')
axes[0, 0].set_xlabel('x')
axes[0, 0].set_ylabel('Density')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)
# CDF
axes[0, 1].plot(x, cdf_values, 'r-', linewidth=2, label='CDF')
axes[0, 1].set_title('Cumulative Distribution Function (CDF)')
axes[0, 1].set_xlabel('x')
axes[0, 1].set_ylabel('Probability')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)
# Random sample histogram (density=True so it is comparable with the PDF)
axes[1, 0].hist(random_samples, bins=50, density=True, alpha=0.7, color='green')
axes[1, 0].plot(x, pdf_values, 'b-', linewidth=2, label='Theoretical PDF')
axes[1, 0].set_title('Random Sample Distribution')
axes[1, 0].set_xlabel('x')
axes[1, 0].set_ylabel('Density')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
# Q-Q plot: points close to the reference line indicate normality
stats.probplot(random_samples, dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Q-Q Plot')
axes[1, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

Other Important Continuous Distributions
python
# t distribution
t_dist = stats.t(df=5)  # Degrees of freedom 5
# Chi-square distribution
chi2_dist = stats.chi2(df=3)  # Degrees of freedom 3
# F distribution
f_dist = stats.f(dfn=2, dfd=10)  # Numerator df 2, denominator df 10
# Exponential distribution
expon_dist = stats.expon(scale=2)  # Scale parameter 2 (mean 2, i.e. rate λ = 0.5)
# Gamma distribution
gamma_dist = stats.gamma(a=2, scale=1)  # Shape parameter 2, scale parameter 1
# Beta distribution
beta_dist = stats.beta(a=2, b=5)  # Parameters a=2, b=5
# Uniform distribution
uniform_dist = stats.uniform(loc=0, scale=1)  # Uniform distribution on [0, 1]
# Compare different distribution PDFs
x_range = np.linspace(0, 5, 1000)
plt.figure(figsize=(15, 10))
# Plot various distributions.
# FIX: beta_dist was created but never added to this dict, which left the
# `name.startswith('Beta')` branch below unreachable; it is now plotted too.
distributions = {
    'Normal N(1,0.5²)': stats.norm(1, 0.5),
    't Distribution (df=5)': t_dist,
    'Exponential (λ=0.5)': expon_dist,
    'Gamma (α=2,β=1)': gamma_dist,
    'Chi-square (df=3)': chi2_dist,
    'Beta (a=2,b=5)': beta_dist
}
for i, (name, dist) in enumerate(distributions.items()):
    plt.subplot(2, 3, i+1)
    if name.startswith('Beta'):
        # The Beta distribution is supported only on [0, 1]
        x_plot = np.linspace(0, 1, 1000)
    else:
        x_plot = x_range
    pdf_vals = dist.pdf(x_plot)
    plt.plot(x_plot, pdf_vals, linewidth=2, label=name)
    plt.fill_between(x_plot, pdf_vals, alpha=0.3)
    plt.title(name)
    plt.xlabel('x')
    plt.ylabel('Density')
    plt.grid(True, alpha=0.3)
    plt.legend()
plt.tight_layout()
plt.show()
# Distribution statistics
print("\nStatistics of Each Distribution:")
for name, dist in distributions.items():
    mean = dist.mean()
    var = dist.var()
    std = dist.std()
    # FIX: stats() returns 0-d ndarrays; convert to float so the f-string
    # float formatting below works regardless of the NumPy version
    skew = float(dist.stats(moments='s'))
    kurt = float(dist.stats(moments='k'))
    print(f"{name}:")
    print(f" Mean: {mean:.3f}, Variance: {var:.3f}, Std: {std:.3f}")
    print(f" Skewness: {skew:.3f}, Kurtosis: {kurt:.3f}")
    print()

2. Discrete Distributions
python
# Binomial distribution: successes in n=20 trials with success probability 0.3
binom_dist = stats.binom(n=20, p=0.3)
# Poisson distribution: event counts with rate mu=3
poisson_dist = stats.poisson(mu=3)
# Geometric distribution: trial number of the first success, p=0.2
geom_dist = stats.geom(p=0.2)
# Negative binomial distribution: failures before the 5th success, p=0.3
nbinom_dist = stats.nbinom(n=5, p=0.3)
# Hypergeometric distribution: M=50 population, n=10 marked, N=15 draws
hypergeom_dist = stats.hypergeom(M=50, n=10, N=15)
# Visualize discrete distributions (PMF bar charts)
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# Binomial distribution
k_binom = np.arange(0, 21)
pmf_binom = binom_dist.pmf(k_binom)
axes[0, 0].bar(k_binom, pmf_binom, alpha=0.7)
axes[0, 0].set_title('Binomial B(20, 0.3)')
axes[0, 0].set_xlabel('k')
axes[0, 0].set_ylabel('P(X=k)')
axes[0, 0].grid(True, alpha=0.3)
# Poisson distribution
k_poisson = np.arange(0, 15)
pmf_poisson = poisson_dist.pmf(k_poisson)
axes[0, 1].bar(k_poisson, pmf_poisson, alpha=0.7, color='orange')
axes[0, 1].set_title('Poisson Pois(3)')
axes[0, 1].set_xlabel('k')
axes[0, 1].set_ylabel('P(X=k)')
axes[0, 1].grid(True, alpha=0.3)
# Geometric distribution (support starts at k=1: first success on trial k)
k_geom = np.arange(1, 21)
pmf_geom = geom_dist.pmf(k_geom)
axes[0, 2].bar(k_geom, pmf_geom, alpha=0.7, color='green')
axes[0, 2].set_title('Geometric Geom(0.2)')
axes[0, 2].set_xlabel('k')
axes[0, 2].set_ylabel('P(X=k)')
axes[0, 2].grid(True, alpha=0.3)
# Negative binomial distribution
k_nbinom = np.arange(0, 30)
pmf_nbinom = nbinom_dist.pmf(k_nbinom)
axes[1, 0].bar(k_nbinom, pmf_nbinom, alpha=0.7, color='red')
axes[1, 0].set_title('Negative Binomial NB(5, 0.3)')
axes[1, 0].set_xlabel('k')
axes[1, 0].set_ylabel('P(X=k)')
axes[1, 0].grid(True, alpha=0.3)
# Hypergeometric distribution
k_hypergeom = np.arange(0, 11)
pmf_hypergeom = hypergeom_dist.pmf(k_hypergeom)
axes[1, 1].bar(k_hypergeom, pmf_hypergeom, alpha=0.7, color='purple')
axes[1, 1].set_title('Hypergeometric H(50,10,15)')
axes[1, 1].set_xlabel('k')
axes[1, 1].set_ylabel('P(X=k)')
axes[1, 1].grid(True, alpha=0.3)
# Compare binomial and Poisson distributions (when n large, p small,
# B(n, p) is well approximated by Pois(n*p))
n_large, p_small = 100, 0.03
binom_approx = stats.binom(n=n_large, p=p_small)
poisson_approx = stats.poisson(mu=n_large * p_small)
k_compare = np.arange(0, 15)
pmf_binom_approx = binom_approx.pmf(k_compare)
pmf_poisson_approx = poisson_approx.pmf(k_compare)
axes[1, 2].bar(k_compare - 0.2, pmf_binom_approx, width=0.4, alpha=0.7,
               label='Binomial B(100,0.03)')
axes[1, 2].bar(k_compare + 0.2, pmf_poisson_approx, width=0.4, alpha=0.7,
               label='Poisson Pois(3)')
axes[1, 2].set_title('Poisson Approximation of Binomial')
axes[1, 2].set_xlabel('k')
axes[1, 2].set_ylabel('P(X=k)')
axes[1, 2].legend()
axes[1, 2].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Statistics of discrete distributions
discrete_distributions = {
    'Binomial B(20,0.3)': binom_dist,
    'Poisson Pois(3)': poisson_dist,
    'Geometric Geom(0.2)': geom_dist,
    'Negative Binomial NB(5,0.3)': nbinom_dist
}
print("\nStatistics of Discrete Distributions:")
for name, dist in discrete_distributions.items():
    mean = dist.mean()
    var = dist.var()
    std = dist.std()
    print(f"{name}:")
    print(f" Mean: {mean:.3f}, Variance: {var:.3f}, Std: {std:.3f}")
    print()

Descriptive Statistics
1. Basic Statistics
python
# Generate example data (fixed seed for reproducibility)
np.random.seed(42)
data1 = np.random.normal(100, 15, 1000)  # Normal distribution data
data2 = np.random.exponential(2, 1000)  # Exponential distribution data
data3 = np.random.uniform(0, 10, 1000)  # Uniform distribution data
# Calculate various statistics
def describe_data(data, name):
    """Print a descriptive-statistics summary of *data* and return key stats.

    Prints sample size, location/spread/shape measures, selected
    percentiles, the IQR and the coefficient of variation, then returns a
    dict with the mean, median, sample std, skewness and excess kurtosis.
    """
    # Compute the reused quantities once
    sample_mean = np.mean(data)
    sample_median = np.median(data)
    sample_std = np.std(data, ddof=1)
    sample_skew = stats.skew(data)
    sample_kurt = stats.kurtosis(data)

    print(f"\nDescriptive Statistics for {name}:")
    print(f"Sample Size: {len(data)}")
    print(f"Mean: {sample_mean:.3f}")
    print(f"Median: {sample_median:.3f}")
    # Mode of a continuous sample is of limited use (ties are rare);
    # stats.mode reports the smallest most-frequent value
    print(f"Mode: {stats.mode(data, keepdims=True)[0][0]:.3f}")
    print(f"Std Dev: {sample_std:.3f}")
    print(f"Variance: {np.var(data, ddof=1):.3f}")
    print(f"Skewness: {sample_skew:.3f}")
    print(f"Kurtosis: {sample_kurt:.3f}")
    print(f"Min: {np.min(data):.3f}")
    print(f"Max: {np.max(data):.3f}")
    print(f"Range: {np.ptp(data):.3f}")
    # Percentiles
    percentiles = [5, 25, 50, 75, 95]
    quantile_values = np.percentile(data, percentiles)
    print(f"Percentiles: {dict(zip(percentiles, quantile_values))}")
    # Interquartile range
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    print(f"Interquartile Range (IQR): {iqr:.3f}")
    # Coefficient of variation (spread relative to the mean)
    cv = sample_std / sample_mean
    print(f"Coefficient of Variation: {cv:.3f}")
    return {
        'mean': sample_mean,
        'median': sample_median,
        'std': sample_std,
        'skew': sample_skew,
        'kurtosis': sample_kurt,
    }
# Analyze three groups of data
stats1 = describe_data(data1, "Normal Distribution Data")
stats2 = describe_data(data2, "Exponential Distribution Data")
stats3 = describe_data(data3, "Uniform Distribution Data")
# Use scipy.stats.describe function (one call returns the main statistics)
print("\nUsing scipy.stats.describe:")
for i, (data, name) in enumerate([(data1, "Normal"), (data2, "Exponential"), (data3, "Uniform")]):
    desc = stats.describe(data)
    print(f"\n{name}:")
    print(f" Sample Count: {desc.nobs}")
    print(f" Min-Max: ({desc.minmax[0]:.3f}, {desc.minmax[1]:.3f})")
    print(f" Mean: {desc.mean:.3f}")
    print(f" Variance: {desc.variance:.3f}")
    print(f" Skewness: {desc.skewness:.3f}")
    print(f" Kurtosis: {desc.kurtosis:.3f}")

2. Data Visualization
python
# Comprehensive visualization analysis: one row of four diagnostic plots
# per dataset
fig, axes = plt.subplots(3, 4, figsize=(16, 12))
datasets = [(data1, "Normal Distribution"), (data2, "Exponential Distribution"), (data3, "Uniform Distribution")]
for i, (data, name) in enumerate(datasets):
    # Histogram
    axes[i, 0].hist(data, bins=50, density=True, alpha=0.7, edgecolor='black')
    axes[i, 0].set_title(f'{name} - Histogram')
    axes[i, 0].set_ylabel('Density')
    axes[i, 0].grid(True, alpha=0.3)
    # Box plot
    axes[i, 1].boxplot(data, vert=True)
    axes[i, 1].set_title(f'{name} - Box Plot')
    axes[i, 1].grid(True, alpha=0.3)
    # Q-Q plot (compared to normal distribution)
    stats.probplot(data, dist="norm", plot=axes[i, 2])
    axes[i, 2].set_title(f'{name} - Q-Q Plot')
    axes[i, 2].grid(True, alpha=0.3)
    # Empirical cumulative distribution function (step of 1/n per sorted point)
    sorted_data = np.sort(data)
    y_values = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
    axes[i, 3].plot(sorted_data, y_values, linewidth=2)
    axes[i, 3].set_title(f'{name} - Empirical CDF')
    axes[i, 3].set_xlabel('Value')
    axes[i, 3].set_ylabel('Cumulative Probability')
    axes[i, 3].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Kernel density estimation
plt.figure(figsize=(12, 4))
for i, (data, name) in enumerate(datasets):
    plt.subplot(1, 3, i+1)
    # Histogram
    plt.hist(data, bins=50, density=True, alpha=0.5, label='Histogram')
    # Gaussian KDE: smooth, non-parametric density estimate
    kde = stats.gaussian_kde(data)
    x_range = np.linspace(data.min(), data.max(), 200)
    kde_values = kde(x_range)
    plt.plot(x_range, kde_values, linewidth=2, label='Kernel Density Estimation')
    plt.title(f'{name}')
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

Hypothesis Testing
1. One-Sample Tests
python
# One-sample t-test
# H0: μ = μ0 vs H1: μ ≠ μ0
np.random.seed(42)
sample = np.random.normal(105, 15, 50)  # True mean is 105
mu0 = 100  # Hypothesized mean
t_stat, p_value = stats.ttest_1samp(sample, mu0)
print("One-Sample t-Test:")
print(f"Sample Mean: {np.mean(sample):.3f}")
print(f"Hypothesized Mean: {mu0}")
print(f"t Statistic: {t_stat:.3f}")
print(f"p Value: {p_value:.6f}")
print(f"Conclusion: {'Reject' if p_value < 0.05 else 'Do not reject'} H0 (α=0.05)")
# One-sample Wilcoxon signed-rank test (non-parametric): tests whether the
# differences sample - mu0 are symmetric about zero
wilcoxon_stat, wilcoxon_p = stats.wilcoxon(sample - mu0)
print(f"\nWilcoxon Signed-Rank Test:")
print(f"Statistic: {wilcoxon_stat}")
print(f"p Value: {wilcoxon_p:.6f}")
print(f"Conclusion: {'Reject' if wilcoxon_p < 0.05 else 'Do not reject'} H0 (α=0.05)")
# Normality test
# Shapiro-Wilk test
shapiro_stat, shapiro_p = stats.shapiro(sample)
print(f"\nShapiro-Wilk Normality Test:")
print(f"Statistic: {shapiro_stat:.6f}")
print(f"p Value: {shapiro_p:.6f}")
print(f"Conclusion: Data {'does not' if shapiro_p < 0.05 else 'does'} follow normal distribution (α=0.05)")
# Kolmogorov-Smirnov test
# NOTE(review): the normal parameters are estimated from the same sample,
# which makes the standard KS p-value conservative (the Lilliefors
# correction would be more appropriate) -- kept as-is for illustration
ks_stat, ks_p = stats.kstest(sample, 'norm', args=(np.mean(sample), np.std(sample, ddof=1)))
print(f"\nKolmogorov-Smirnov Normality Test:")
print(f"Statistic: {ks_stat:.6f}")
print(f"p Value: {ks_p:.6f}")
print(f"Conclusion: Data {'does not' if ks_p < 0.05 else 'does'} follow normal distribution (α=0.05)")
# Anderson-Darling test: compare the statistic to the critical value at
# each tabulated significance level (no single p-value is returned)
ad_result = stats.anderson(sample, dist='norm')
print(f"\nAnderson-Darling Normality Test:")
print(f"Statistic: {ad_result.statistic:.6f}")
print(f"Critical Values: {ad_result.critical_values}")
print(f"Significance Level: {ad_result.significance_level}")

2. Two-Sample Tests
python
# Generate two samples
np.random.seed(42)
group1 = np.random.normal(100, 15, 50)
group2 = np.random.normal(105, 15, 45)
# Independent samples t-test
# Assuming equal variances
t_stat_equal, p_equal = stats.ttest_ind(group1, group2, equal_var=True)
# Not assuming equal variances (Welch's t-test)
t_stat_unequal, p_unequal = stats.ttest_ind(group1, group2, equal_var=False)
print("Independent Samples t-Test:")
print(f"Group 1 Mean: {np.mean(group1):.3f}, Group 2 Mean: {np.mean(group2):.3f}")
print(f"\nAssuming Equal Variances:")
print(f" t Statistic: {t_stat_equal:.3f}")
print(f" p Value: {p_equal:.6f}")
print(f"\nNot Assuming Equal Variances (Welch's t-test):")
print(f" t Statistic: {t_stat_unequal:.3f}")
print(f" p Value: {p_unequal:.6f}")
# Homogeneity of variance test
# Levene test (robust to departures from normality)
levene_stat, levene_p = stats.levene(group1, group2)
print(f"\nLevene Homogeneity of Variance Test:")
print(f"Statistic: {levene_stat:.3f}")
print(f"p Value: {levene_p:.6f}")
# FIX: the previous message printed "Variances are are equal" when H0 was
# not rejected
print(f"Conclusion: Variances {'are not' if levene_p < 0.05 else 'are'} equal (α=0.05)")
# Bartlett test (assumes normality)
bartlett_stat, bartlett_p = stats.bartlett(group1, group2)
print(f"\nBartlett Homogeneity of Variance Test:")
print(f"Statistic: {bartlett_stat:.3f}")
print(f"p Value: {bartlett_p:.6f}")
# Non-parametric tests
# Mann-Whitney U test
mw_stat, mw_p = stats.mannwhitneyu(group1, group2, alternative='two-sided')
print(f"\nMann-Whitney U Test:")
print(f"Statistic: {mw_stat}")
print(f"p Value: {mw_p:.6f}")
print(f"Conclusion: {'Reject' if mw_p < 0.05 else 'Do not reject'} H0 (α=0.05)")
# Kolmogorov-Smirnov two-sample test (compares the whole distributions)
ks2_stat, ks2_p = stats.ks_2samp(group1, group2)
print(f"\nKolmogorov-Smirnov Two-Sample Test:")
print(f"Statistic: {ks2_stat:.6f}")
print(f"p Value: {ks2_p:.6f}")

3. Multiple Sample Tests
python
# Generate multiple samples
np.random.seed(42)
group_a = np.random.normal(100, 15, 30)
group_b = np.random.normal(105, 15, 35)
group_c = np.random.normal(110, 15, 25)
group_d = np.random.normal(103, 15, 40)
# One-way ANOVA: H0 is that all four group means are equal
f_stat, anova_p = stats.f_oneway(group_a, group_b, group_c, group_d)
print("One-Way ANOVA:")
print(f"F Statistic: {f_stat:.3f}")
print(f"p Value: {anova_p:.6f}")
print(f"Conclusion: {'Reject' if anova_p < 0.05 else 'Do not reject'} H0 (α=0.05)")
# Non-parametric version: Kruskal-Wallis test
kw_stat, kw_p = stats.kruskal(group_a, group_b, group_c, group_d)
print(f"\nKruskal-Wallis Test:")
print(f"Statistic: {kw_stat:.3f}")
print(f"p Value: {kw_p:.6f}")
print(f"Conclusion: {'Reject' if kw_p < 0.05 else 'Do not reject'} H0 (α=0.05)")
# Post-hoc tests (if ANOVA is significant)
if anova_p < 0.05:
    print("\nPerforming Post-Hoc Tests...")
    # Combine all data
    # NOTE(review): all_data/groups are built but not used below -- the
    # pairwise tests operate directly on group_data
    all_data = np.concatenate([group_a, group_b, group_c, group_d])
    groups = (['A'] * len(group_a) + ['B'] * len(group_b) +
              ['C'] * len(group_c) + ['D'] * len(group_d))
    # Pairwise t-tests (Tukey HSD requires other libraries)
    from itertools import combinations
    group_data = {'A': group_a, 'B': group_b, 'C': group_c, 'D': group_d}
    group_names = list(group_data.keys())
    print("Pairwise t-Tests (Bonferroni Corrected):")
    # Bonferroni: divide α by the number of comparisons to control the
    # family-wise error rate
    n_comparisons = len(list(combinations(group_names, 2)))
    alpha_corrected = 0.05 / n_comparisons
    for name1, name2 in combinations(group_names, 2):
        t_stat, p_val = stats.ttest_ind(group_data[name1], group_data[name2])
        significant = p_val < alpha_corrected
        print(f" {name1} vs {name2}: t={t_stat:.3f}, p={p_val:.6f}, Significant: {significant}")
# Visualize multiple group comparisons
plt.figure(figsize=(12, 8))
# Box plot
plt.subplot(2, 2, 1)
data_for_box = [group_a, group_b, group_c, group_d]
labels = ['Group A', 'Group B', 'Group C', 'Group D']
# NOTE(review): the `labels` keyword was renamed to `tick_labels` in
# matplotlib 3.9 -- confirm against the pinned matplotlib version
plt.boxplot(data_for_box, labels=labels)
plt.title('Multiple Group Box Plot')
plt.ylabel('Value')
plt.grid(True, alpha=0.3)
# Violin plot
plt.subplot(2, 2, 2)
positions = [1, 2, 3, 4]
parts = plt.violinplot(data_for_box, positions=positions)
plt.xticks(positions, labels)
plt.title('Multiple Group Violin Plot')
plt.ylabel('Value')
plt.grid(True, alpha=0.3)
# Means and confidence intervals
plt.subplot(2, 2, 3)
means = [np.mean(data) for data in data_for_box]
stds = [np.std(data, ddof=1) for data in data_for_box]
ns = [len(data) for data in data_for_box]
ses = [std / np.sqrt(n) for std, n in zip(stds, ns)]
cis = [1.96 * se for se in ses]  # 95% confidence interval (normal approximation)
plt.errorbar(range(1, 5), means, yerr=cis, fmt='o-', capsize=5, capthick=2)
plt.xticks(range(1, 5), labels)
plt.title('Means and 95% Confidence Intervals')
plt.ylabel('Mean')
plt.grid(True, alpha=0.3)
# Density plot
plt.subplot(2, 2, 4)
for i, (data, label) in enumerate(zip(data_for_box, labels)):
    plt.hist(data, bins=20, alpha=0.5, label=label, density=True)
plt.title('Multiple Group Density Distribution')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

4. Chi-Square Test
python
# Goodness-of-fit test
# Test if a die is fair
observed_dice = np.array([18, 22, 16, 25, 12, 7])  # Observed frequencies (sum = 100)
expected_dice = np.array([100/6] * 6)  # Expected frequencies (fair die)
chi2_stat, chi2_p = stats.chisquare(observed_dice, expected_dice)
print("Chi-Square Goodness-of-Fit Test (Die Fairness):")
print(f"Observed Frequencies: {observed_dice}")
print(f"Expected Frequencies: {expected_dice}")
print(f"Chi-Square Statistic: {chi2_stat:.3f}")
print(f"p Value: {chi2_p:.6f}")
# FIX: the previous message printed the ungrammatical "Die is not" when
# H0 was rejected
print(f"Conclusion: Die is {'not fair' if chi2_p < 0.05 else 'fair'} (α=0.05)")
# Independence test
# Association between gender and product preference
contingency_table = np.array([[20, 30, 25],  # Males preferring products A, B, C
                              [25, 20, 30]])  # Females preferring products A, B, C
chi2_indep, p_indep, dof, expected = stats.chi2_contingency(contingency_table)
print(f"\nChi-Square Independence Test (Gender vs Product Preference):")
print(f"Observed Frequency Table:\n{contingency_table}")
print(f"Expected Frequency Table:\n{expected}")
print(f"Chi-Square Statistic: {chi2_indep:.3f}")
print(f"Degrees of Freedom: {dof}")
print(f"p Value: {p_indep:.6f}")
print(f"Conclusion: Gender {'is' if p_indep < 0.05 else 'is not'} associated with product preference (α=0.05)")
# Cramér's V (association strength: 0 = none, 1 = perfect)
n = np.sum(contingency_table)
cramers_v = np.sqrt(chi2_indep / (n * (min(contingency_table.shape) - 1)))
print(f"Cramér's V: {cramers_v:.3f}")
# Fisher's exact test (2x2 table) -- exact, suitable for small counts
table_2x2 = np.array([[10, 15], [5, 20]])
odds_ratio, fisher_p = stats.fisher_exact(table_2x2)
print(f"\nFisher's Exact Test:")
print(f"2x2 Table:\n{table_2x2}")
print(f"Odds Ratio: {odds_ratio:.3f}")
print(f"p Value: {fisher_p:.6f}")
print(f"Conclusion: {'Reject' if fisher_p < 0.05 else 'Do not reject'} independence assumption (α=0.05)")

Correlation Analysis
1. Linear Correlation
python
# Generate correlated data
np.random.seed(42)
n = 100
# Strong positive correlation
x1 = np.random.normal(0, 1, n)
y1 = 2 * x1 + np.random.normal(0, 0.5, n)
# Weak negative correlation
x2 = np.random.normal(0, 1, n)
y2 = -0.3 * x2 + np.random.normal(0, 2, n)
# No correlation
x3 = np.random.normal(0, 1, n)
y3 = np.random.normal(0, 1, n)
# Non-linear relationship (quadratic, so the linear r will be near zero)
x4 = np.linspace(-3, 3, n)
y4 = x4**2 + np.random.normal(0, 1, n)
# Pearson correlation coefficient (measures linear association only)
corr_datasets = [(x1, y1, "Strong Positive"), (x2, y2, "Weak Negative"),
                 (x3, y3, "No Correlation"), (x4, y4, "Non-linear Relationship")]
print("Pearson Correlation Coefficient:")
for x, y, name in corr_datasets:
    r, p_value = stats.pearsonr(x, y)
    print(f"{name}: r = {r:.3f}, p = {p_value:.6f}")
# Spearman rank correlation coefficient (rank-based, monotonic association)
print("\nSpearman Rank Correlation Coefficient:")
for x, y, name in corr_datasets:
    rho, p_value = stats.spearmanr(x, y)
    print(f"{name}: ρ = {rho:.3f}, p = {p_value:.6f}")
# Kendall's tau (rank-based, robust to outliers)
print("\nKendall's tau:")
for x, y, name in corr_datasets:
    tau, p_value = stats.kendalltau(x, y)
    print(f"{name}: τ = {tau:.3f}, p = {p_value:.6f}")
# Visualize correlations
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
for i, (x, y, name) in enumerate(corr_datasets):
    # Scatter plot
    axes[0, i].scatter(x, y, alpha=0.6)
    axes[0, i].set_title(f'{name}')
    axes[0, i].set_xlabel('X')
    axes[0, i].set_ylabel('Y')
    axes[0, i].grid(True, alpha=0.3)
    # Add regression line
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    line = slope * x + intercept
    axes[0, i].plot(x, line, 'r-', alpha=0.8)
    axes[0, i].text(0.05, 0.95, f'r = {r_value:.3f}',
                    transform=axes[0, i].transAxes, verticalalignment='top')
    # Residual plot (a visible pattern indicates non-linearity)
    residuals = y - line
    axes[1, i].scatter(x, residuals, alpha=0.6)
    axes[1, i].axhline(y=0, color='r', linestyle='--')
    axes[1, i].set_title(f'{name} - Residual Plot')
    axes[1, i].set_xlabel('X')
    axes[1, i].set_ylabel('Residuals')
    axes[1, i].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

2. Multivariate Correlation
python
# Generate multivariate data
np.random.seed(42)
n = 200
# Create correlation matrix (must be symmetric positive definite for the
# Cholesky factorization below to exist)
corr_matrix = np.array([[1.0, 0.8, 0.3, -0.2],
                        [0.8, 1.0, 0.1, -0.4],
                        [0.3, 0.1, 1.0, 0.6],
                        [-0.2, -0.4, 0.6, 1.0]])
# Use Cholesky decomposition to generate correlated data: if Z has iid
# N(0,1) columns, Z @ L.T has correlation matrix L @ L.T
L = np.linalg.cholesky(corr_matrix)
uncorr_data = np.random.normal(0, 1, (n, 4))
corr_data = uncorr_data @ L.T
# Calculate sample correlation matrix
sample_corr = np.corrcoef(corr_data.T)
print("Theoretical Correlation Matrix:")
print(corr_matrix)
print("\nSample Correlation Matrix:")
print(sample_corr)
# Significance testing of correlation matrix
from scipy.stats import pearsonr
n_vars = corr_data.shape[1]
corr_matrix_sample = np.zeros((n_vars, n_vars))
p_matrix = np.zeros((n_vars, n_vars))
for i in range(n_vars):
    for j in range(n_vars):
        if i != j:
            r, p = pearsonr(corr_data[:, i], corr_data[:, j])
            corr_matrix_sample[i, j] = r
            p_matrix[i, j] = p
        else:
            # Diagonal: a variable correlates perfectly with itself
            corr_matrix_sample[i, j] = 1.0
            p_matrix[i, j] = 0.0
print("\np-Value Matrix for Correlation Coefficients:")
print(p_matrix)
# Visualize correlation matrix
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# Theoretical correlation matrix
im1 = axes[0].imshow(corr_matrix, cmap='RdBu_r', vmin=-1, vmax=1)
axes[0].set_title('Theoretical Correlation Matrix')
for i in range(4):
    for j in range(4):
        axes[0].text(j, i, f'{corr_matrix[i, j]:.2f}',
                     ha='center', va='center')
# Sample correlation matrix
im2 = axes[1].imshow(sample_corr, cmap='RdBu_r', vmin=-1, vmax=1)
axes[1].set_title('Sample Correlation Matrix')
for i in range(4):
    for j in range(4):
        axes[1].text(j, i, f'{sample_corr[i, j]:.2f}',
                     ha='center', va='center')
# p-value matrix
im3 = axes[2].imshow(p_matrix, cmap='Reds_r', vmin=0, vmax=0.05)
axes[2].set_title('p-Value Matrix')
for i in range(4):
    for j in range(4):
        # White text on the darkest (most significant) cells for contrast
        color = 'white' if p_matrix[i, j] < 0.025 else 'black'
        axes[2].text(j, i, f'{p_matrix[i, j]:.3f}',
                     ha='center', va='center', color=color)
# Add colorbars
fig.colorbar(im1, ax=axes[0])
fig.colorbar(im2, ax=axes[1])
fig.colorbar(im3, ax=axes[2])
plt.tight_layout()
plt.show()
# Scatter plot matrix
fig, axes = plt.subplots(4, 4, figsize=(12, 12))
for i in range(4):
    for j in range(4):
        if i == j:
            # Diagonal: histogram
            axes[i, j].hist(corr_data[:, i], bins=20, alpha=0.7)
            axes[i, j].set_title(f'Variable {i+1}')
        else:
            # Off-diagonal: scatter plot
            axes[i, j].scatter(corr_data[:, j], corr_data[:, i], alpha=0.5)
            r, p = pearsonr(corr_data[:, j], corr_data[:, i])
            axes[i, j].set_title(f'r = {r:.3f}')
        axes[i, j].grid(True, alpha=0.3)
        # Only label the outer edge of the grid
        if i == 3:
            axes[i, j].set_xlabel(f'Variable {j+1}')
        if j == 0:
            axes[i, j].set_ylabel(f'Variable {i+1}')
plt.tight_layout()
plt.show()

Regression Analysis
1. Simple Linear Regression
python
# Generate regression data
np.random.seed(42)
n = 100
x = np.random.uniform(0, 10, n)
y = 2 + 3 * x + np.random.normal(0, 2, n)  # y = 2 + 3x + ε
# Linear regression
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print("Simple Linear Regression Results:")
print(f"Slope: {slope:.3f} ± {std_err:.3f}")
print(f"Intercept: {intercept:.3f}")
print(f"Correlation Coefficient: {r_value:.3f}")
print(f"Coefficient of Determination R²: {r_value**2:.3f}")
print(f"p Value: {p_value:.6f}")
print(f"Standard Error: {std_err:.3f}")
# Prediction
y_pred = slope * x + intercept
residuals = y - y_pred
# Calculate confidence intervals and prediction intervals
from scipy import stats as scipy_stats
# Calculate statistics
n = len(x)
mean_x = np.mean(x)
ss_xx = np.sum((x - mean_x)**2)  # Sum of squares of x about its mean
ss_res = np.sum(residuals**2)  # Residual sum of squares
mse = ss_res / (n - 2)  # Mean squared error; n-2 df (two fitted parameters)
# t value (95% confidence interval)
t_val = scipy_stats.t.ppf(0.975, n - 2)
# Confidence and prediction intervals on an evenly spaced grid
x_new = np.linspace(x.min(), x.max(), 100)
y_new = slope * x_new + intercept
# Confidence interval (mean confidence interval)
se_mean = np.sqrt(mse * (1/n + (x_new - mean_x)**2 / ss_xx))
ci_lower = y_new - t_val * se_mean
ci_upper = y_new + t_val * se_mean
# Prediction interval (individual value prediction interval; wider than the
# CI because it also includes the irreducible noise term)
se_pred = np.sqrt(mse * (1 + 1/n + (x_new - mean_x)**2 / ss_xx))
pi_lower = y_new - t_val * se_pred
pi_upper = y_new + t_val * se_pred
# Visualize: regression fit plus three residual diagnostics
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Regression plot
axes[0, 0].scatter(x, y, alpha=0.6, label='Data Points')
axes[0, 0].plot(x_new, y_new, 'r-', linewidth=2, label='Regression Line')
axes[0, 0].fill_between(x_new, ci_lower, ci_upper, alpha=0.3, label='95% Confidence Interval')
axes[0, 0].fill_between(x_new, pi_lower, pi_upper, alpha=0.2, label='95% Prediction Interval')
axes[0, 0].set_xlabel('X')
axes[0, 0].set_ylabel('Y')
axes[0, 0].set_title('Linear Regression')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)
# Residual plot (no visible pattern = linear model is adequate)
axes[0, 1].scatter(y_pred, residuals, alpha=0.6)
axes[0, 1].axhline(y=0, color='r', linestyle='--')
axes[0, 1].set_xlabel('Predicted Values')
axes[0, 1].set_ylabel('Residuals')
axes[0, 1].set_title('Residual Plot')
axes[0, 1].grid(True, alpha=0.3)
# Residual normality check: histogram with fitted normal curve overlaid
axes[1, 0].hist(residuals, bins=20, density=True, alpha=0.7, edgecolor='black')
# Overlay normal distribution
resid_mean, resid_std = np.mean(residuals), np.std(residuals)
x_norm = np.linspace(residuals.min(), residuals.max(), 100)
y_norm = scipy_stats.norm.pdf(x_norm, resid_mean, resid_std)
axes[1, 0].plot(x_norm, y_norm, 'r-', linewidth=2, label='Normal Distribution')
axes[1, 0].set_xlabel('Residuals')
axes[1, 0].set_ylabel('Density')
axes[1, 0].set_title('Residual Distribution')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
# Q-Q plot
scipy_stats.probplot(residuals, dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Residual Q-Q Plot')
axes[1, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Residual analysis
print(f"\nResidual Analysis:")
print(f"Residual Mean: {np.mean(residuals):.6f}")  # ~0 by construction of OLS
print(f"Residual Std Dev: {np.std(residuals):.3f}")
# Residual normality test
shapiro_stat, shapiro_p = scipy_stats.shapiro(residuals)
print(f"Shapiro-Wilk Test: statistic={shapiro_stat:.6f}, p-value={shapiro_p:.6f}")
# Durbin-Watson test (autocorrelation)
def durbin_watson(residuals):
diff = np.diff(residuals)
return np.sum(diff**2) / np.sum(residuals**2)
dw_stat = durbin_watson(residuals)
# Rule of thumb used below: DW inside [1.5, 2.5] is treated as "no autocorrelation"
print(f"Durbin-Watson Statistic: {dw_stat:.3f}")
print(f"Autocorrelation: {'Exists' if dw_stat < 1.5 or dw_stat > 2.5 else 'Does not exist'}")

2. Multiple Linear Regression
python
# Use sklearn for multiple regression (scipy.stats mainly for statistical tests)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
# Generate multiple regression data
np.random.seed(42)
n = 200
X = np.random.normal(0, 1, (n, 3))  # 3 predictor variables
beta = np.array([2, -1.5, 3])  # True coefficients
y = 5 + X @ beta + np.random.normal(0, 2, n)  # y = 5 + 2x1 - 1.5x2 + 3x3 + ε
# Fit model
model = LinearRegression()
model.fit(X, y)
# Prediction
y_pred = model.predict(X)
residuals = y - y_pred
print("Multiple Linear Regression Results:")
print(f"Intercept: {model.intercept_:.3f}")
print(f"Coefficients: {model.coef_}")
print(f"True Coefficients: {beta}")
print(f"R²: {r2_score(y, y_pred):.3f}")
print(f"Mean Squared Error: {mean_squared_error(y, y_pred):.3f}")
# Significance testing of coefficients (need to calculate manually)
# Standard errors come from the diagonal of (X'X)^-1 scaled by the MSE
X_with_intercept = np.column_stack([np.ones(n), X])
XTX_inv = np.linalg.inv(X_with_intercept.T @ X_with_intercept)
mse = np.sum(residuals**2) / (n - X.shape[1] - 1)
se_coeffs = np.sqrt(np.diag(XTX_inv) * mse)
# t statistics
coeffs_with_intercept = np.array([model.intercept_] + list(model.coef_))
t_stats = coeffs_with_intercept / se_coeffs
# FIX: use the survival function instead of 1 - cdf; sf stays accurate in
# the far tail where 1 - cdf rounds to 0
p_values = 2 * scipy_stats.t.sf(np.abs(t_stats), n - X.shape[1] - 1)
print(f"\nCoefficient Significance Testing:")
coeff_names = ['Intercept', 'X1', 'X2', 'X3']
for i, name in enumerate(coeff_names):
    print(f"{name}: Coefficient={coeffs_with_intercept[i]:.3f}, "
          f"Std Error={se_coeffs[i]:.3f}, t={t_stats[i]:.3f}, p={p_values[i]:.6f}")
# Model overall significance test (F test)
ss_res = np.sum(residuals**2)
ss_tot = np.sum((y - np.mean(y))**2)
ss_reg = ss_tot - ss_res
df_reg = X.shape[1]
df_res = n - X.shape[1] - 1
ms_reg = ss_reg / df_reg
ms_res = ss_res / df_res
f_stat = ms_reg / ms_res
# FIX: same tail-accuracy issue -- sf rather than 1 - cdf
f_p_value = scipy_stats.f.sf(f_stat, df_reg, df_res)
print(f"\nModel Overall Significance Test:")
print(f"F Statistic: {f_stat:.3f}")
print(f"p Value: {f_p_value:.6f}")
print(f"Conclusion: Model is {'significant' if f_p_value < 0.05 else 'not significant'} (α=0.05)")

Non-Parametric Statistics
1. Sign Test and Rank Test
python
# Sign test
np.random.seed(42)
data = np.random.normal(2, 1, 50)  # True median is 2
median_0 = 0  # Hypothesized median
# Calculate signs of the deviations from the hypothesized median
signs = np.sign(data - median_0)
positive_signs = np.sum(signs > 0)
negative_signs = np.sum(signs < 0)
zero_signs = np.sum(signs == 0)
print("Sign Test:")
print(f"Number of Positive Signs: {positive_signs}")
print(f"Number of Negative Signs: {negative_signs}")
print(f"Number of Zeros: {zero_signs}")
# Use binomial test (ties with the median are excluded)
n_nonzero = positive_signs + negative_signs
# FIX: stats.binom_test was removed in SciPy 1.12; binomtest is the
# supported replacement and returns a result object with .pvalue
binom_p = scipy_stats.binomtest(int(positive_signs), int(n_nonzero), p=0.5).pvalue
print(f"Binomial Test p Value: {binom_p:.6f}")
print(f"Conclusion: {'Reject' if binom_p < 0.05 else 'Do not reject'} H0 (α=0.05)")
# Wilcoxon signed-rank test (also uses the magnitudes, not just the signs)
wilcoxon_stat, wilcoxon_p = scipy_stats.wilcoxon(data - median_0)
print(f"\nWilcoxon Signed-Rank Test:")
print(f"Statistic: {wilcoxon_stat}")
print(f"p Value: {wilcoxon_p:.6f}")
print(f"Conclusion: {'Reject' if wilcoxon_p < 0.05 else 'Do not reject'} H0 (α=0.05)")

2. Runs Test
python
# Runs test (randomness test)
def runs_test(data, cutoff=None):
    """
    Wald–Wolfowitz runs test for randomness of a sequence.

    The sequence is dichotomized around ``cutoff`` (strictly above vs. not
    above) and the observed number of runs — maximal stretches of identical
    symbols — is compared with its expected value under randomness using a
    normal approximation.

    Parameters
    ----------
    data : array_like
        One-dimensional sequence of observations.
    cutoff : float, optional
        Dichotomization threshold. Defaults to the sample median.

    Returns
    -------
    n_runs : int
        Observed number of runs.
    expected_runs : float
        Expected number of runs under H0 (randomness).
    z_stat : float
        Normal-approximation test statistic.
    p_value : float
        Two-sided p-value.

    Raises
    ------
    ValueError
        If all observations fall on one side of the cutoff; the test is
        undefined there (original code crashed with ZeroDivisionError).
    """
    data = np.asarray(data)
    if cutoff is None:
        cutoff = np.median(data)
    # Convert to binary sequence: 1 = above cutoff, 0 = at/below cutoff
    binary_seq = (data > cutoff).astype(int)
    # Collect the length of each run
    runs = []
    current_run = 1
    for i in range(1, len(binary_seq)):
        if binary_seq[i] == binary_seq[i-1]:
            current_run += 1
        else:
            runs.append(current_run)
            current_run = 1
    runs.append(current_run)
    # Number of runs and counts of each symbol
    n_runs = len(runs)
    n1 = np.sum(binary_seq)
    n2 = len(binary_seq) - n1
    if n1 == 0 or n2 == 0:
        # Degenerate: constant side -> expected-runs variance is zero
        raise ValueError("runs test undefined: all observations on one side of cutoff")
    # Expected number of runs and variance under H0
    expected_runs = (2 * n1 * n2) / (n1 + n2) + 1
    var_runs = (2 * n1 * n2 * (2 * n1 * n2 - n1 - n2)) / ((n1 + n2)**2 * (n1 + n2 - 1))
    # Z statistic with a two-sided normal p-value
    z_stat = (n_runs - expected_runs) / np.sqrt(var_runs)
    p_value = 2 * (1 - scipy_stats.norm.cdf(abs(z_stat)))
    return n_runs, expected_runs, z_stat, p_value
# Test randomness: one i.i.d. series vs. one with a deterministic trend
np.random.seed(42)
random_data = np.random.normal(0, 1, 100)
trend_data = np.linspace(-2, 2, 100) + np.random.normal(0, 0.1, 100)
print("Runs Test (Randomness Test):")
for data, name in [(random_data, "Random Data"), (trend_data, "Trend Data")]:
    n_runs, expected, z_stat, p_val = runs_test(data)
    print(f"\n{name}:")
    print(f" Observed Runs: {n_runs}")
    print(f" Expected Runs: {expected:.2f}")
    print(f" Z Statistic: {z_stat:.3f}")
    print(f" p Value: {p_val:.6f}")
    # BUG FIX: original printed "Data is not" / "Data is random" — the
    # rejection branch must read "not random".
    print(f" Conclusion: Data is {'not random' if p_val < 0.05 else 'random'} (α=0.05)")
# Visualize: raw time series and the dichotomized "runs plot" for both series
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for i, (data, name) in enumerate([(random_data, "Random Data"), (trend_data, "Trend Data")]):
    # Time series plot with the median as reference line
    axes[i, 0].plot(data)
    axes[i, 0].axhline(y=np.median(data), color='r', linestyle='--', label='Median')
    axes[i, 0].set_title(f'{name} - Time Series')
    axes[i, 0].set_xlabel('Time')
    axes[i, 0].set_ylabel('Value')
    axes[i, 0].legend()
    axes[i, 0].grid(True, alpha=0.3)
    # Runs plot: 0/1 indicator of being above the median (same
    # dichotomization that runs_test uses by default)
    binary_seq = (data > np.median(data)).astype(int)
    axes[i, 1].plot(binary_seq, 'o-', markersize=3)
    axes[i, 1].set_title(f'{name} - Runs Plot')
    axes[i, 1].set_xlabel('Time')
    axes[i, 1].set_ylabel('Above Median (1) / Below Median (0)')
    axes[i, 1].set_ylim(-0.1, 1.1)
    axes[i, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

Kernel Density Estimation
python
# Kernel density estimation
np.random.seed(42)
# Generate bimodal mixture data: 300 pts ~ N(-2, 0.5) plus 200 pts ~ N(2, 0.8)
data1 = np.random.normal(-2, 0.5, 300)
data2 = np.random.normal(2, 0.8, 200)
mixed_data = np.concatenate([data1, data2])
# Kernel density estimation with different bandwidths
bandwidths = [0.1, 0.3, 0.5, 1.0]
x_range = np.linspace(mixed_data.min() - 1, mixed_data.max() + 1, 1000)
plt.figure(figsize=(15, 10))
for i, bw in enumerate(bandwidths):
    plt.subplot(2, 2, i+1)
    # Histogram (density-normalized for comparison with the KDE curves)
    plt.hist(mixed_data, bins=30, density=True, alpha=0.5, color='lightblue',
             edgecolor='black', label='Histogram')
    # Kernel density estimation (Gaussian kernel, fixed bandwidth factor)
    kde = scipy_stats.gaussian_kde(mixed_data, bw_method=bw)
    kde_values = kde(x_range)
    plt.plot(x_range, kde_values, linewidth=2, label=f'KDE (bandwidth={bw})')
    # True density (theoretical): 0.6/0.4 weights match the 300/200 sample split
    true_density = (0.6 * scipy_stats.norm.pdf(x_range, -2, 0.5) +
                    0.4 * scipy_stats.norm.pdf(x_range, 2, 0.8))
    plt.plot(x_range, true_density, 'r--', linewidth=2, label='True Density')
    plt.title(f'Kernel Density Estimation (Bandwidth = {bw})')
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Automatic bandwidth selection (gaussian_kde defaults to Scott's rule)
kde_auto = scipy_stats.gaussian_kde(mixed_data)
print(f"Automatically Selected Bandwidth: {kde_auto.factor:.4f}")
# Comparison of different kernel functions (need to use sklearn)
from sklearn.neighbors import KernelDensity
kernels = ['gaussian', 'tophat', 'epanechnikov', 'exponential']
plt.figure(figsize=(12, 8))
for i, kernel in enumerate(kernels):
    plt.subplot(2, 2, i+1)
    # Histogram
    plt.hist(mixed_data, bins=30, density=True, alpha=0.5, color='lightblue',
             edgecolor='black', label='Histogram')
    # Kernel density estimation; sklearn's score_samples returns log-density
    kde_sk = KernelDensity(kernel=kernel, bandwidth=0.5)
    kde_sk.fit(mixed_data.reshape(-1, 1))
    log_density = kde_sk.score_samples(x_range.reshape(-1, 1))
    density = np.exp(log_density)
    plt.plot(x_range, density, linewidth=2, label=f'{kernel} Kernel')
    plt.title(f'{kernel.capitalize()} Kernel Density Estimation')
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

Practical Application Cases
Case 1: Quality Control Analysis
python
# Quality control data analysis
np.random.seed(42)
# Simulate production data: in-control process plus a shifted "defective" run
control_data = np.random.normal(100, 2, 200)  # Normal production
defective_data = np.random.normal(95, 3, 50)  # Abnormal production
all_data = np.concatenate([control_data, defective_data])
labels = ['Normal'] * 200 + ['Defective'] * 50
print("Quality Control Statistical Analysis:")
print(f"Total Samples: {len(all_data)}")
print(f"Normal Samples: {len(control_data)}, Defective Samples: {len(defective_data)}")
# Descriptive statistics (ddof=1 gives the sample standard deviation)
print(f"\nNormal Production Data:")
print(f" Mean: {np.mean(control_data):.3f}")
print(f" Std Dev: {np.std(control_data, ddof=1):.3f}")
print(f" Range: [{np.min(control_data):.3f}, {np.max(control_data):.3f}]")
print(f"\nDefective Production Data:")
print(f" Mean: {np.mean(defective_data):.3f}")
print(f" Std Dev: {np.std(defective_data, ddof=1):.3f}")
print(f" Range: [{np.min(defective_data):.3f}, {np.max(defective_data):.3f}]")
# Hypothesis testing: two-sample t-test for a difference in means
t_stat, p_value = scipy_stats.ttest_ind(control_data, defective_data)
print(f"\nIndependent Samples t-Test:")
print(f"t Statistic: {t_stat:.3f}")
print(f"p Value: {p_value:.6f}")
print(f"Conclusion: There {'is' if p_value < 0.05 else 'is no'} significant difference between means (α=0.05)")
# Control chart limits: mean ± 3 sigma estimated from the in-control data only
control_mean = np.mean(control_data)
control_std = np.std(control_data, ddof=1)
ucl = control_mean + 3 * control_std  # Upper control limit
lcl = control_mean - 3 * control_std  # Lower control limit
plt.figure(figsize=(15, 10))
# Control chart
plt.subplot(2, 2, 1)
plt.plot(range(len(all_data)), all_data, 'o-', markersize=3, alpha=0.7)
plt.axhline(y=control_mean, color='g', linestyle='-', label='Center Line')
plt.axhline(y=ucl, color='r', linestyle='--', label='Upper Control Limit (UCL)')
plt.axhline(y=lcl, color='r', linestyle='--', label='Lower Control Limit (LCL)')
plt.axvline(x=200, color='orange', linestyle=':', label='Defective Start')
plt.title('Quality Control Chart')
plt.xlabel('Sample Number')
plt.ylabel('Quality Metric')
plt.legend()
plt.grid(True, alpha=0.3)
# Histogram comparison of the two regimes
plt.subplot(2, 2, 2)
plt.hist(control_data, bins=30, alpha=0.7, label='Normal', density=True)
plt.hist(defective_data, bins=20, alpha=0.7, label='Defective', density=True)
plt.title('Quality Distribution Comparison')
plt.xlabel('Quality Metric')
plt.ylabel('Density')
plt.legend()
plt.grid(True, alpha=0.3)
# Box plot
plt.subplot(2, 2, 3)
data_for_box = [control_data, defective_data]
plt.boxplot(data_for_box, labels=['Normal', 'Defective'])
plt.title('Quality Metric Box Plot')
plt.ylabel('Quality Metric')
plt.grid(True, alpha=0.3)
# Empirical cumulative distribution functions (step heights k/n)
plt.subplot(2, 2, 4)
sorted_control = np.sort(control_data)
sorted_defective = np.sort(defective_data)
y_control = np.arange(1, len(sorted_control) + 1) / len(sorted_control)
y_defective = np.arange(1, len(sorted_defective) + 1) / len(sorted_defective)
plt.plot(sorted_control, y_control, label='Normal', linewidth=2)
plt.plot(sorted_defective, y_defective, label='Defective', linewidth=2)
plt.title('Cumulative Distribution Function Comparison')
plt.xlabel('Quality Metric')
plt.ylabel('Cumulative Probability')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

Case 2: A/B Testing Analysis
python
# A/B testing data analysis
np.random.seed(42)
# Simulate A/B test data
n_a, n_b = 1000, 1200
conversion_rate_a = 0.12  # Group A conversion rate
conversion_rate_b = 0.15  # Group B conversion rate
# Generate conversion data: one Bernoulli(rate) draw per visitor
conversions_a = np.random.binomial(1, conversion_rate_a, n_a)
conversions_b = np.random.binomial(1, conversion_rate_b, n_b)
# Calculate observed conversion counts and rates
conv_a = np.sum(conversions_a)
conv_b = np.sum(conversions_b)
rate_a = conv_a / n_a
rate_b = conv_b / n_b
print("A/B Test Results Analysis:")
print(f"Group A: {conv_a}/{n_a} = {rate_a:.4f} ({rate_a*100:.2f}%)")
print(f"Group B: {conv_b}/{n_b} = {rate_b:.4f} ({rate_b*100:.2f}%)")
print(f"Improvement: {(rate_b - rate_a)/rate_a*100:.2f}%")
# Proportion test: two-sample z-test for equality of proportions
from statsmodels.stats.proportion import proportions_ztest
counts = np.array([conv_a, conv_b])
nobs = np.array([n_a, n_b])
z_stat, p_value = proportions_ztest(counts, nobs)
print(f"\nProportion Test:")
print(f"Z Statistic: {z_stat:.3f}")
print(f"p Value: {p_value:.6f}")
print(f"Conclusion: Group B conversion rate is {'significantly' if p_value < 0.05 else 'not significantly'} higher than Group A (α=0.05)")
# Confidence interval (normal-approximation interval for each proportion)
from statsmodels.stats.proportion import proportion_confint
ci_a = proportion_confint(conv_a, n_a, alpha=0.05)
ci_b = proportion_confint(conv_b, n_b, alpha=0.05)
print(f"\n95% Confidence Interval:")
print(f"Group A: [{ci_a[0]:.4f}, {ci_a[1]:.4f}]")
print(f"Group B: [{ci_b[0]:.4f}, {ci_b[1]:.4f}]")
# Effect size (Cohen's h)
def cohens_h(p1, p2):
    """Cohen's h effect size for the difference between two proportions.

    Applies the arcsine (variance-stabilizing) transform phi = arcsin(sqrt(p))
    to each proportion and returns twice the difference of the transformed
    values; positive when p1 > p2.
    """
    phi1 = np.arcsin(np.sqrt(p1))
    phi2 = np.arcsin(np.sqrt(p2))
    return 2 * (phi1 - phi2)
# Effect size of the observed lift (B over A)
effect_size = cohens_h(rate_b, rate_a)
print(f"\nEffect Size (Cohen's h): {effect_size:.4f}")
# Power analysis
# NOTE(review): ttest_power is a t-test power routine applied here with
# Cohen's h as the effect size; for two proportions statsmodels'
# NormalIndPower would be the closer match — treat this as an approximation.
from statsmodels.stats.power import ttest_power
power = ttest_power(effect_size, n_a, alpha=0.05)
print(f"Statistical Power: {power:.4f}")
# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Conversion rate comparison (error bar = upper CI half-width)
rates = [rate_a, rate_b]
errors = [ci_a[1] - rate_a, ci_b[1] - rate_b]
axes[0, 0].bar(['Group A', 'Group B'], rates, yerr=errors, capsize=5, alpha=0.7)
axes[0, 0].set_title('Conversion Rate Comparison')
axes[0, 0].set_ylabel('Conversion Rate')
axes[0, 0].grid(True, alpha=0.3)
# Conversion count comparison with value labels above each bar
axes[0, 1].bar(['Group A', 'Group B'], [conv_a, conv_b], alpha=0.7)
axes[0, 1].set_title('Conversion Count Comparison')
axes[0, 1].set_ylabel('Conversion Count')
for i, v in enumerate([conv_a, conv_b]):
    axes[0, 1].text(i, v + 5, str(v), ha='center')
axes[0, 1].grid(True, alpha=0.3)
# Confidence interval visualization (asymmetric lower/upper error bars)
x_pos = [0, 1]
axes[1, 0].errorbar(x_pos, rates,
                    yerr=[[rate_a - ci_a[0], rate_b - ci_b[0]],
                          [ci_a[1] - rate_a, ci_b[1] - rate_b]],
                    fmt='o', capsize=10, capthick=2, markersize=8)
axes[1, 0].set_xticks(x_pos)
axes[1, 0].set_xticklabels(['Group A', 'Group B'])
axes[1, 0].set_title('Conversion Rate and 95% Confidence Interval')
axes[1, 0].set_ylabel('Conversion Rate')
axes[1, 0].grid(True, alpha=0.3)
# Power curve: sample size vs. power at the observed effect size
sample_sizes = np.arange(100, 2000, 50)
powers = [ttest_power(effect_size, n, alpha=0.05) for n in sample_sizes]
axes[1, 1].plot(sample_sizes, powers, linewidth=2)
axes[1, 1].axhline(y=0.8, color='r', linestyle='--', label='Power=0.8')
axes[1, 1].axvline(x=n_a, color='g', linestyle=':', label=f'Current Sample Size={n_a}')
axes[1, 1].set_title('Sample Size and Statistical Power Relationship')
axes[1, 1].set_xlabel('Sample Size')
axes[1, 1].set_ylabel('Statistical Power')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

Summary
This chapter provided a detailed introduction to the core functions of SciPy's statistical analysis module:
- Probability Distributions: Mastered the use of continuous and discrete distributions
- Descriptive Statistics: Learned to calculate various statistics and data visualization
- Hypothesis Testing: Understood one-sample, two-sample, and multiple-sample testing methods
- Correlation Analysis: Mastered methods for measuring linear and non-linear correlations
- Regression Analysis: Learned simple and multiple linear regression implementation
- Non-Parametric Statistics: Learned statistical methods not dependent on distribution assumptions
- Kernel Density Estimation: Mastered non-parametric estimation of data distribution
- Practical Applications: Learned practical applications through quality control and A/B testing cases
Practice Exercises
- Generate 1000 samples from a normal distribution and perform Shapiro-Wilk normality test
- Compare the mean differences between two groups and select appropriate test methods
- Perform ANOVA on multiple groups and conduct post-hoc tests
- Calculate Pearson and Spearman correlation coefficients for two variables and explain the differences
- Use kernel density estimation to analyze characteristics of mixed distribution data
- Design an A/B testing analysis workflow including sample size calculation and result interpretation
- Implement a quality control chart to identify anomalous data points
- Use runs test to analyze randomness of time series data