Skip to content

Statistical Analysis

SciPy's scipy.stats module is one of the most comprehensive statistical analysis tools in Python. It provides a large number of probability distributions, statistical functions, and hypothesis testing methods, making it an important tool for data science and statistical analysis. This chapter will provide a detailed introduction to using these functions for various statistical analyses.

scipy.stats Module Overview

The scipy.stats module includes the following main functions:

  • 80+ continuous and discrete probability distributions
  • Descriptive statistics functions
  • Hypothesis testing methods
  • Correlation and regression analysis
  • Kernel density estimation
  • Statistical distance computation
python
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style
plt.style.use('seaborn-v0_8')  # matplotlib >= 3.6 name; the old 'seaborn' style alias was removed
sns.set_palette("husl")

# View main functions of stats module
print("scipy.stats Main Functions:")
# Keep only the public API: names starting with "_" are internal.
functions = [attr for attr in dir(stats) if not attr.startswith('_')]
print(f"Total {len(functions)} functions and classes")
print("Some functions:", functions[:15])

Probability Distributions

1. Continuous Distributions

Normal Distribution

python
# Normal distribution is the most important continuous distribution
mu, sigma = 0, 1  # Standard normal distribution

# Create normal distribution object
# Freezing the distribution (fixing loc/scale) lets us reuse pdf/cdf/ppf/rvs below.
norm_dist = stats.norm(loc=mu, scale=sigma)

# Probability Density Function (PDF)
x = np.linspace(-4, 4, 1000)
pdf_values = norm_dist.pdf(x)

# Cumulative Distribution Function (CDF)
cdf_values = norm_dist.cdf(x)

# Percent Point Function (PPF) — the inverse of the CDF (quantile function)
quantiles = [0.025, 0.25, 0.5, 0.75, 0.975]
ppf_values = norm_dist.ppf(quantiles)

print(f"Normal Distribution N({mu}, {sigma}²):")
print(f"Quantiles: {dict(zip(quantiles, ppf_values))}")

# Random number generation (random_state fixed for reproducibility)
random_samples = norm_dist.rvs(size=1000, random_state=42)

# Visualize
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# PDF
axes[0, 0].plot(x, pdf_values, 'b-', linewidth=2, label='PDF')
axes[0, 0].fill_between(x, pdf_values, alpha=0.3)
axes[0, 0].set_title('Probability Density Function (PDF)')
axes[0, 0].set_xlabel('x')
axes[0, 0].set_ylabel('Density')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# CDF
axes[0, 1].plot(x, cdf_values, 'r-', linewidth=2, label='CDF')
axes[0, 1].set_title('Cumulative Distribution Function (CDF)')
axes[0, 1].set_xlabel('x')
axes[0, 1].set_ylabel('Probability')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Random sample histogram overlaid with the theoretical density
axes[1, 0].hist(random_samples, bins=50, density=True, alpha=0.7, color='green')
axes[1, 0].plot(x, pdf_values, 'b-', linewidth=2, label='Theoretical PDF')
axes[1, 0].set_title('Random Sample Distribution')
axes[1, 0].set_xlabel('x')
axes[1, 0].set_ylabel('Density')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Q-Q plot — sample quantiles vs. normal quantiles; a straight line indicates normality
stats.probplot(random_samples, dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Q-Q Plot')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Other Important Continuous Distributions

python
# t distribution
t_dist = stats.t(df=5)  # Degrees of freedom 5

# Chi-square distribution
chi2_dist = stats.chi2(df=3)  # Degrees of freedom 3

# F distribution
f_dist = stats.f(dfn=2, dfd=10)  # Numerator df 2, denominator df 10

# Exponential distribution
expon_dist = stats.expon(scale=2)  # Scale parameter 2

# Gamma distribution
gamma_dist = stats.gamma(a=2, scale=1)  # Shape parameter 2, scale parameter 1

# Beta distribution
beta_dist = stats.beta(a=2, b=5)  # Parameters a=2, b=5

# Uniform distribution
uniform_dist = stats.uniform(loc=0, scale=1)  # Uniform distribution on [0, 1]

# Compare different distribution PDFs
x_range = np.linspace(0, 5, 1000)

plt.figure(figsize=(15, 10))

# Plot various distributions.
# Beta is included as the sixth entry so the 2x3 subplot grid is filled and
# the [0, 1]-support special case in the loop below is actually exercised
# (previously beta_dist was defined but never plotted).
distributions = {
    'Normal N(1,0.5²)': stats.norm(1, 0.5),
    't Distribution (df=5)': t_dist,
    'Exponential (λ=0.5)': expon_dist,
    'Gamma (α=2,β=1)': gamma_dist,
    'Chi-square (df=3)': chi2_dist,
    'Beta (a=2,b=5)': beta_dist
}

for i, (name, dist) in enumerate(distributions.items()):
    plt.subplot(2, 3, i+1)

    # The Beta distribution has support [0, 1]; everything else shares [0, 5].
    if name.startswith('Beta'):
        x_plot = np.linspace(0, 1, 1000)
    else:
        x_plot = x_range

    pdf_vals = dist.pdf(x_plot)
    plt.plot(x_plot, pdf_vals, linewidth=2, label=name)
    plt.fill_between(x_plot, pdf_vals, alpha=0.3)
    plt.title(name)
    plt.xlabel('x')
    plt.ylabel('Density')
    plt.grid(True, alpha=0.3)
    plt.legend()

plt.tight_layout()
plt.show()

# Distribution statistics
# moments='s' requests skewness, moments='k' requests (excess) kurtosis.
print("\nStatistics of Each Distribution:")
for name, dist in distributions.items():
    mean = dist.mean()
    var = dist.var()
    std = dist.std()
    skew = dist.stats(moments='s')
    kurt = dist.stats(moments='k')

    print(f"{name}:")
    print(f"  Mean: {mean:.3f}, Variance: {var:.3f}, Std: {std:.3f}")
    print(f"  Skewness: {skew:.3f}, Kurtosis: {kurt:.3f}")
    print()

2. Discrete Distributions

python
# Binomial distribution
binom_dist = stats.binom(n=20, p=0.3)

# Poisson distribution
poisson_dist = stats.poisson(mu=3)

# Geometric distribution (scipy convention: support starts at k=1)
geom_dist = stats.geom(p=0.2)

# Negative binomial distribution
nbinom_dist = stats.nbinom(n=5, p=0.3)

# Hypergeometric distribution
# scipy convention: M = population size, n = success states in the population,
# N = number of draws.
hypergeom_dist = stats.hypergeom(M=50, n=10, N=15)

# Visualize discrete distributions (pmf is evaluated on an integer grid and
# drawn as bars, since these distributions have no density)
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Binomial distribution
k_binom = np.arange(0, 21)
pmf_binom = binom_dist.pmf(k_binom)
axes[0, 0].bar(k_binom, pmf_binom, alpha=0.7)
axes[0, 0].set_title('Binomial B(20, 0.3)')
axes[0, 0].set_xlabel('k')
axes[0, 0].set_ylabel('P(X=k)')
axes[0, 0].grid(True, alpha=0.3)

# Poisson distribution
k_poisson = np.arange(0, 15)
pmf_poisson = poisson_dist.pmf(k_poisson)
axes[0, 1].bar(k_poisson, pmf_poisson, alpha=0.7, color='orange')
axes[0, 1].set_title('Poisson Pois(3)')
axes[0, 1].set_xlabel('k')
axes[0, 1].set_ylabel('P(X=k)')
axes[0, 1].grid(True, alpha=0.3)

# Geometric distribution
k_geom = np.arange(1, 21)
pmf_geom = geom_dist.pmf(k_geom)
axes[0, 2].bar(k_geom, pmf_geom, alpha=0.7, color='green')
axes[0, 2].set_title('Geometric Geom(0.2)')
axes[0, 2].set_xlabel('k')
axes[0, 2].set_ylabel('P(X=k)')
axes[0, 2].grid(True, alpha=0.3)

# Negative binomial distribution
k_nbinom = np.arange(0, 30)
pmf_nbinom = nbinom_dist.pmf(k_nbinom)
axes[1, 0].bar(k_nbinom, pmf_nbinom, alpha=0.7, color='red')
axes[1, 0].set_title('Negative Binomial NB(5, 0.3)')
axes[1, 0].set_xlabel('k')
axes[1, 0].set_ylabel('P(X=k)')
axes[1, 0].grid(True, alpha=0.3)

# Hypergeometric distribution
k_hypergeom = np.arange(0, 11)
pmf_hypergeom = hypergeom_dist.pmf(k_hypergeom)
axes[1, 1].bar(k_hypergeom, pmf_hypergeom, alpha=0.7, color='purple')
axes[1, 1].set_title('Hypergeometric H(50,10,15)')
axes[1, 1].set_xlabel('k')
axes[1, 1].set_ylabel('P(X=k)')
axes[1, 1].grid(True, alpha=0.3)

# Compare binomial and Poisson distributions (when n large, p small)
# B(n, p) ≈ Pois(n*p) in this regime — here B(100, 0.03) vs Pois(3).
n_large, p_small = 100, 0.03
binom_approx = stats.binom(n=n_large, p=p_small)
poisson_approx = stats.poisson(mu=n_large * p_small)

k_compare = np.arange(0, 15)
pmf_binom_approx = binom_approx.pmf(k_compare)
pmf_poisson_approx = poisson_approx.pmf(k_compare)

axes[1, 2].bar(k_compare - 0.2, pmf_binom_approx, width=0.4, alpha=0.7, 
               label='Binomial B(100,0.03)')
axes[1, 2].bar(k_compare + 0.2, pmf_poisson_approx, width=0.4, alpha=0.7, 
               label='Poisson Pois(3)')
axes[1, 2].set_title('Poisson Approximation of Binomial')
axes[1, 2].set_xlabel('k')
axes[1, 2].set_ylabel('P(X=k)')
axes[1, 2].legend()
axes[1, 2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Statistics of discrete distributions
discrete_distributions = {
    'Binomial B(20,0.3)': binom_dist,
    'Poisson Pois(3)': poisson_dist,
    'Geometric Geom(0.2)': geom_dist,
    'Negative Binomial NB(5,0.3)': nbinom_dist
}

print("\nStatistics of Discrete Distributions:")
for name, dist in discrete_distributions.items():
    mean = dist.mean()
    var = dist.var()
    std = dist.std()
    
    print(f"{name}:")
    print(f"  Mean: {mean:.3f}, Variance: {var:.3f}, Std: {std:.3f}")
    print()

Descriptive Statistics

1. Basic Statistics

python
# Generate example data (fixed seed so every run produces the same samples)
np.random.seed(42)
data1 = np.random.normal(100, 15, 1000)  # Normal distribution data
data2 = np.random.exponential(2, 1000)   # Exponential distribution data
data3 = np.random.uniform(0, 10, 1000)   # Uniform distribution data

# Calculate various statistics
def describe_data(data, name):
    """Print a descriptive-statistics report for *data* and return the key figures.

    Parameters
    ----------
    data : array-like of float
        Sample to summarize.
    name : str
        Label used in the printed report.

    Returns
    -------
    dict
        Keys ``'mean'``, ``'median'``, ``'std'`` (sample std, ddof=1),
        ``'skew'`` and ``'kurtosis'`` (Fisher / excess, via scipy.stats).
    """
    # Compute each quantity once and reuse it for printing and the return value.
    mean_val = np.mean(data)
    median_val = np.median(data)
    std_val = np.std(data, ddof=1)
    skew_val = stats.skew(data)
    kurt_val = stats.kurtosis(data)

    print(f"\nDescriptive Statistics for {name}:")
    print(f"Sample Size: {len(data)}")
    print(f"Mean: {mean_val:.3f}")
    print(f"Median: {median_val:.3f}")
    print(f"Mode: {stats.mode(data, keepdims=True)[0][0]:.3f}")
    print(f"Std Dev: {std_val:.3f}")
    print(f"Variance: {np.var(data, ddof=1):.3f}")
    print(f"Skewness: {skew_val:.3f}")
    print(f"Kurtosis: {kurt_val:.3f}")
    print(f"Min: {np.min(data):.3f}")
    print(f"Max: {np.max(data):.3f}")
    print(f"Range: {np.ptp(data):.3f}")

    # Percentiles
    percentiles = [5, 25, 50, 75, 95]
    quantile_values = np.percentile(data, percentiles)
    print(f"Percentiles: {dict(zip(percentiles, quantile_values))}")

    # Interquartile range
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    print(f"Interquartile Range (IQR): {iqr:.3f}")

    # Coefficient of variation (sample std relative to the mean)
    cv = std_val / mean_val
    print(f"Coefficient of Variation: {cv:.3f}")

    return {
        'mean': mean_val,
        'median': median_val,
        'std': std_val,
        'skew': skew_val,
        'kurtosis': kurt_val
    }

# Analyze three groups of data
stats1 = describe_data(data1, "Normal Distribution Data")
stats2 = describe_data(data2, "Exponential Distribution Data")
stats3 = describe_data(data3, "Uniform Distribution Data")

# Use scipy.stats.describe function
# describe() bundles the same basic summary into a single named tuple.
print("\nUsing scipy.stats.describe:")
for i, (data, name) in enumerate([(data1, "Normal"), (data2, "Exponential"), (data3, "Uniform")]):
    desc = stats.describe(data)
    print(f"\n{name}:")
    print(f"  Sample Count: {desc.nobs}")
    print(f"  Min-Max: ({desc.minmax[0]:.3f}, {desc.minmax[1]:.3f})")
    print(f"  Mean: {desc.mean:.3f}")
    print(f"  Variance: {desc.variance:.3f}")
    print(f"  Skewness: {desc.skewness:.3f}")
    print(f"  Kurtosis: {desc.kurtosis:.3f}")

2. Data Visualization

python
# Comprehensive visualization analysis of the three samples generated above
# (data1/data2/data3): histogram, box plot, normal Q-Q plot and empirical CDF.
fig, axes = plt.subplots(3, 4, figsize=(16, 12))

datasets = [(data1, "Normal Distribution"), (data2, "Exponential Distribution"), (data3, "Uniform Distribution")]

for i, (data, name) in enumerate(datasets):
    # Histogram
    axes[i, 0].hist(data, bins=50, density=True, alpha=0.7, edgecolor='black')
    axes[i, 0].set_title(f'{name} - Histogram')
    axes[i, 0].set_ylabel('Density')
    axes[i, 0].grid(True, alpha=0.3)
    
    # Box plot
    axes[i, 1].boxplot(data, vert=True)
    axes[i, 1].set_title(f'{name} - Box Plot')
    axes[i, 1].grid(True, alpha=0.3)
    
    # Q-Q plot (compared to normal distribution)
    stats.probplot(data, dist="norm", plot=axes[i, 2])
    axes[i, 2].set_title(f'{name} - Q-Q Plot')
    axes[i, 2].grid(True, alpha=0.3)
    
    # Empirical cumulative distribution function
    # ECDF: step function jumping by 1/n at each sorted observation.
    sorted_data = np.sort(data)
    y_values = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
    axes[i, 3].plot(sorted_data, y_values, linewidth=2)
    axes[i, 3].set_title(f'{name} - Empirical CDF')
    axes[i, 3].set_xlabel('Value')
    axes[i, 3].set_ylabel('Cumulative Probability')
    axes[i, 3].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Kernel density estimation
plt.figure(figsize=(12, 4))

for i, (data, name) in enumerate(datasets):
    plt.subplot(1, 3, i+1)
    
    # Histogram
    plt.hist(data, bins=50, density=True, alpha=0.5, label='Histogram')
    
    # Kernel density estimation (Gaussian kernel; default bandwidth is Scott's rule)
    kde = stats.gaussian_kde(data)
    x_range = np.linspace(data.min(), data.max(), 200)
    kde_values = kde(x_range)
    plt.plot(x_range, kde_values, linewidth=2, label='Kernel Density Estimation')
    
    plt.title(f'{name}')
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Hypothesis Testing

1. One-Sample Tests

python
# One-sample t-test
# H0: μ = μ0 vs H1: μ ≠ μ0
np.random.seed(42)
sample = np.random.normal(105, 15, 50)  # True mean is 105
mu0 = 100  # Hypothesized mean

t_stat, p_value = stats.ttest_1samp(sample, mu0)

print("One-Sample t-Test:")
print(f"Sample Mean: {np.mean(sample):.3f}")
print(f"Hypothesized Mean: {mu0}")
print(f"t Statistic: {t_stat:.3f}")
print(f"p Value: {p_value:.6f}")
print(f"Conclusion: {'Reject' if p_value < 0.05 else 'Do not reject'} H0 (α=0.05)")

# One-sample Wilcoxon signed-rank test (non-parametric)
# Applied to the differences sample - mu0; tests symmetry about zero.
wilcoxon_stat, wilcoxon_p = stats.wilcoxon(sample - mu0)
print(f"\nWilcoxon Signed-Rank Test:")
print(f"Statistic: {wilcoxon_stat}")
print(f"p Value: {wilcoxon_p:.6f}")
print(f"Conclusion: {'Reject' if wilcoxon_p < 0.05 else 'Do not reject'} H0 (α=0.05)")

# Normality test
# Shapiro-Wilk test
shapiro_stat, shapiro_p = stats.shapiro(sample)
print(f"\nShapiro-Wilk Normality Test:")
print(f"Statistic: {shapiro_stat:.6f}")
print(f"p Value: {shapiro_p:.6f}")
print(f"Conclusion: Data {'does not' if shapiro_p < 0.05 else 'does'} follow normal distribution (α=0.05)")

# Kolmogorov-Smirnov test
# NOTE(review): mean/std are estimated from the same sample, which makes the
# standard KS p-value conservative (a Lilliefors-type correction would be exact).
ks_stat, ks_p = stats.kstest(sample, 'norm', args=(np.mean(sample), np.std(sample, ddof=1)))
print(f"\nKolmogorov-Smirnov Normality Test:")
print(f"Statistic: {ks_stat:.6f}")
print(f"p Value: {ks_p:.6f}")
print(f"Conclusion: Data {'does not' if ks_p < 0.05 else 'does'} follow normal distribution (α=0.05)")

# Anderson-Darling test
# Returns a statistic plus critical values at several significance levels
# (compare the statistic against them instead of reading off a p-value).
ad_result = stats.anderson(sample, dist='norm')
print(f"\nAnderson-Darling Normality Test:")
print(f"Statistic: {ad_result.statistic:.6f}")
print(f"Critical Values: {ad_result.critical_values}")
print(f"Significance Level: {ad_result.significance_level}")

2. Two-Sample Tests

python
# Generate two samples (different sizes are fine for every test used below)
np.random.seed(42)
group1 = np.random.normal(100, 15, 50)
group2 = np.random.normal(105, 15, 45)

# Independent samples t-test
# Assuming equal variances (classic pooled-variance t-test)
t_stat_equal, p_equal = stats.ttest_ind(group1, group2, equal_var=True)

# Not assuming equal variances (Welch's t-test)
t_stat_unequal, p_unequal = stats.ttest_ind(group1, group2, equal_var=False)

print("Independent Samples t-Test:")
print(f"Group 1 Mean: {np.mean(group1):.3f}, Group 2 Mean: {np.mean(group2):.3f}")
print(f"\nAssuming Equal Variances:")
print(f"  t Statistic: {t_stat_equal:.3f}")
print(f"  p Value: {p_equal:.6f}")
print(f"\nNot Assuming Equal Variances (Welch's t-test):")
print(f"  t Statistic: {t_stat_unequal:.3f}")
print(f"  p Value: {p_unequal:.6f}")

# Homogeneity of variance test
# Levene test (robust to departures from normality)
levene_stat, levene_p = stats.levene(group1, group2)
print(f"\nLevene Homogeneity of Variance Test:")
print(f"Statistic: {levene_stat:.3f}")
print(f"p Value: {levene_p:.6f}")
# Bug fix: the original f-string produced "Variances are are equal" when the
# test was not significant and "Variances are not equal" otherwise.
print(f"Conclusion: Variances {'are not' if levene_p < 0.05 else 'are'} equal (α=0.05)")

# Bartlett test (assumes normality)
bartlett_stat, bartlett_p = stats.bartlett(group1, group2)
print(f"\nBartlett Homogeneity of Variance Test:")
print(f"Statistic: {bartlett_stat:.3f}")
print(f"p Value: {bartlett_p:.6f}")

# Non-parametric tests
# Mann-Whitney U test (rank-based comparison of the two samples)
mw_stat, mw_p = stats.mannwhitneyu(group1, group2, alternative='two-sided')
print(f"\nMann-Whitney U Test:")
print(f"Statistic: {mw_stat}")
print(f"p Value: {mw_p:.6f}")
print(f"Conclusion: {'Reject' if mw_p < 0.05 else 'Do not reject'} H0 (α=0.05)")

# Kolmogorov-Smirnov two-sample test (sensitive to any difference in distribution)
ks2_stat, ks2_p = stats.ks_2samp(group1, group2)
print(f"\nKolmogorov-Smirnov Two-Sample Test:")
print(f"Statistic: {ks2_stat:.6f}")
print(f"p Value: {ks2_p:.6f}")

3. Multiple Sample Tests

python
# Generate multiple samples (four groups, deliberately with different sizes)
np.random.seed(42)
group_a = np.random.normal(100, 15, 30)
group_b = np.random.normal(105, 15, 35)
group_c = np.random.normal(110, 15, 25)
group_d = np.random.normal(103, 15, 40)

# One-way ANOVA — H0: all group means are equal
f_stat, anova_p = stats.f_oneway(group_a, group_b, group_c, group_d)

print("One-Way ANOVA:")
print(f"F Statistic: {f_stat:.3f}")
print(f"p Value: {anova_p:.6f}")
print(f"Conclusion: {'Reject' if anova_p < 0.05 else 'Do not reject'} H0 (α=0.05)")

# Non-parametric version: Kruskal-Wallis test
kw_stat, kw_p = stats.kruskal(group_a, group_b, group_c, group_d)
print(f"\nKruskal-Wallis Test:")
print(f"Statistic: {kw_stat:.3f}")
print(f"p Value: {kw_p:.6f}")
print(f"Conclusion: {'Reject' if kw_p < 0.05 else 'Do not reject'} H0 (α=0.05)")

# Post-hoc tests (if ANOVA is significant)
if anova_p < 0.05:
    print("\nPerforming Post-Hoc Tests...")
    
    # Combine all data
    # NOTE(review): all_data/groups are built but not used below — they would
    # be needed by e.g. statsmodels' Tukey HSD; confirm before removing.
    all_data = np.concatenate([group_a, group_b, group_c, group_d])
    groups = (['A'] * len(group_a) + ['B'] * len(group_b) + 
              ['C'] * len(group_c) + ['D'] * len(group_d))
    
    # Pairwise t-tests (Tukey HSD requires other libraries)
    from itertools import combinations
    
    group_data = {'A': group_a, 'B': group_b, 'C': group_c, 'D': group_d}
    group_names = list(group_data.keys())
    
    print("Pairwise t-Tests (Bonferroni Corrected):")
    # Bonferroni: divide α by the number of pairwise comparisons (6 here).
    n_comparisons = len(list(combinations(group_names, 2)))
    alpha_corrected = 0.05 / n_comparisons
    
    for name1, name2 in combinations(group_names, 2):
        t_stat, p_val = stats.ttest_ind(group_data[name1], group_data[name2])
        significant = p_val < alpha_corrected
        print(f"  {name1} vs {name2}: t={t_stat:.3f}, p={p_val:.6f}, Significant: {significant}")

# Visualize multiple group comparisons
plt.figure(figsize=(12, 8))

# Box plot
plt.subplot(2, 2, 1)
data_for_box = [group_a, group_b, group_c, group_d]
labels = ['Group A', 'Group B', 'Group C', 'Group D']
plt.boxplot(data_for_box, labels=labels)
plt.title('Multiple Group Box Plot')
plt.ylabel('Value')
plt.grid(True, alpha=0.3)

# Violin plot
plt.subplot(2, 2, 2)
positions = [1, 2, 3, 4]
parts = plt.violinplot(data_for_box, positions=positions)
plt.xticks(positions, labels)
plt.title('Multiple Group Violin Plot')
plt.ylabel('Value')
plt.grid(True, alpha=0.3)

# Means and confidence intervals
plt.subplot(2, 2, 3)
means = [np.mean(data) for data in data_for_box]
stds = [np.std(data, ddof=1) for data in data_for_box]
ns = [len(data) for data in data_for_box]
ses = [std / np.sqrt(n) for std, n in zip(stds, ns)]
cis = [1.96 * se for se in ses]  # 95% confidence interval (normal approximation z=1.96)

plt.errorbar(range(1, 5), means, yerr=cis, fmt='o-', capsize=5, capthick=2)
plt.xticks(range(1, 5), labels)
plt.title('Means and 95% Confidence Intervals')
plt.ylabel('Mean')
plt.grid(True, alpha=0.3)

# Density plot
plt.subplot(2, 2, 4)
for i, (data, label) in enumerate(zip(data_for_box, labels)):
    plt.hist(data, bins=20, alpha=0.5, label=label, density=True)
plt.title('Multiple Group Density Distribution')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

4. Chi-Square Test

python
# Goodness-of-fit test
# Test if a die is fair — H0: all six faces are equally likely.
observed_dice = np.array([18, 22, 16, 25, 12, 7])  # Observed frequencies (100 rolls)
expected_dice = np.array([100/6] * 6)  # Expected frequencies (fair die)

chi2_stat, chi2_p = stats.chisquare(observed_dice, expected_dice)

print("Chi-Square Goodness-of-Fit Test (Die Fairness):")
print(f"Observed Frequencies: {observed_dice}")
print(f"Expected Frequencies: {expected_dice}")
print(f"Chi-Square Statistic: {chi2_stat:.3f}")
print(f"p Value: {chi2_p:.6f}")
# Bug fix: the original printed the incomplete sentence "Die is not" when the
# test was significant.
print(f"Conclusion: Die is {'not fair' if chi2_p < 0.05 else 'fair'} (α=0.05)")

# Independence test
# Association between gender and product preference
contingency_table = np.array([[20, 30, 25],   # Males preferring products A, B, C
                              [25, 20, 30]])  # Females preferring products A, B, C

chi2_indep, p_indep, dof, expected = stats.chi2_contingency(contingency_table)

print(f"\nChi-Square Independence Test (Gender vs Product Preference):")
print(f"Observed Frequency Table:\n{contingency_table}")
print(f"Expected Frequency Table:\n{expected}")
print(f"Chi-Square Statistic: {chi2_indep:.3f}")
print(f"Degrees of Freedom: {dof}")
print(f"p Value: {p_indep:.6f}")
print(f"Conclusion: Gender {'is' if p_indep < 0.05 else 'is not'} associated with product preference (α=0.05)")

# Cramér's V (association strength: 0 = none, 1 = perfect association)
n = np.sum(contingency_table)
cramers_v = np.sqrt(chi2_indep / (n * (min(contingency_table.shape) - 1)))
print(f"Cramér's V: {cramers_v:.3f}")

# Fisher's exact test (2x2 table; exact, hence valid even with small cell counts)
table_2x2 = np.array([[10, 15], [5, 20]])
odds_ratio, fisher_p = stats.fisher_exact(table_2x2)

print(f"\nFisher's Exact Test:")
print(f"2x2 Table:\n{table_2x2}")
print(f"Odds Ratio: {odds_ratio:.3f}")
print(f"p Value: {fisher_p:.6f}")
print(f"Conclusion: {'Reject' if fisher_p < 0.05 else 'Do not reject'} independence assumption (α=0.05)")

Correlation Analysis

1. Linear Correlation

python
# Generate correlated data — four relationships with known structure
np.random.seed(42)
n = 100

# Strong positive correlation
x1 = np.random.normal(0, 1, n)
y1 = 2 * x1 + np.random.normal(0, 0.5, n)

# Weak negative correlation
x2 = np.random.normal(0, 1, n)
y2 = -0.3 * x2 + np.random.normal(0, 2, n)

# No correlation
x3 = np.random.normal(0, 1, n)
y3 = np.random.normal(0, 1, n)

# Non-linear relationship (quadratic — linear correlation will miss it)
x4 = np.linspace(-3, 3, n)
y4 = x4**2 + np.random.normal(0, 1, n)

# Pearson correlation coefficient (measures linear association only)
corr_datasets = [(x1, y1, "Strong Positive"), (x2, y2, "Weak Negative"), 
                 (x3, y3, "No Correlation"), (x4, y4, "Non-linear Relationship")]

print("Pearson Correlation Coefficient:")
for x, y, name in corr_datasets:
    r, p_value = stats.pearsonr(x, y)
    print(f"{name}: r = {r:.3f}, p = {p_value:.6f}")

# Spearman rank correlation coefficient (monotone association, rank-based)
print("\nSpearman Rank Correlation Coefficient:")
for x, y, name in corr_datasets:
    rho, p_value = stats.spearmanr(x, y)
    print(f"{name}: ρ = {rho:.3f}, p = {p_value:.6f}")

# Kendall's tau (rank-based, concordant vs discordant pairs)
print("\nKendall's tau:")
for x, y, name in corr_datasets:
    tau, p_value = stats.kendalltau(x, y)
    print(f"{name}: τ = {tau:.3f}, p = {p_value:.6f}")

# Visualize correlations
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

for i, (x, y, name) in enumerate(corr_datasets):
    # Scatter plot
    axes[0, i].scatter(x, y, alpha=0.6)
    axes[0, i].set_title(f'{name}')
    axes[0, i].set_xlabel('X')
    axes[0, i].set_ylabel('Y')
    axes[0, i].grid(True, alpha=0.3)
    
    # Add regression line
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    line = slope * x + intercept
    axes[0, i].plot(x, line, 'r-', alpha=0.8)
    axes[0, i].text(0.05, 0.95, f'r = {r_value:.3f}', 
                    transform=axes[0, i].transAxes, verticalalignment='top')
    
    # Residual plot — curvature here reveals the non-linear case
    residuals = y - line
    axes[1, i].scatter(x, residuals, alpha=0.6)
    axes[1, i].axhline(y=0, color='r', linestyle='--')
    axes[1, i].set_title(f'{name} - Residual Plot')
    axes[1, i].set_xlabel('X')
    axes[1, i].set_ylabel('Residuals')
    axes[1, i].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

2. Multivariate Correlation

python
# Generate multivariate data with a prescribed correlation structure
np.random.seed(42)
n = 200

# Create correlation matrix (symmetric, positive definite)
corr_matrix = np.array([[1.0, 0.8, 0.3, -0.2],
                       [0.8, 1.0, 0.1, -0.4],
                       [0.3, 0.1, 1.0, 0.6],
                       [-0.2, -0.4, 0.6, 1.0]])

# Use Cholesky decomposition to generate correlated data:
# if Z has iid N(0,1) columns, Z @ L.T has correlation ≈ corr_matrix.
L = np.linalg.cholesky(corr_matrix)
uncorr_data = np.random.normal(0, 1, (n, 4))
corr_data = uncorr_data @ L.T

# Calculate sample correlation matrix
sample_corr = np.corrcoef(corr_data.T)

print("Theoretical Correlation Matrix:")
print(corr_matrix)
print("\nSample Correlation Matrix:")
print(sample_corr)

# Significance testing of correlation matrix
# (pearsonr is already available via `from scipy import stats`; this import
# just brings the function name into scope directly)
from scipy.stats import pearsonr

n_vars = corr_data.shape[1]
corr_matrix_sample = np.zeros((n_vars, n_vars))
p_matrix = np.zeros((n_vars, n_vars))

# Pairwise tests; the diagonal is fixed at r=1, p=0 by definition.
for i in range(n_vars):
    for j in range(n_vars):
        if i != j:
            r, p = pearsonr(corr_data[:, i], corr_data[:, j])
            corr_matrix_sample[i, j] = r
            p_matrix[i, j] = p
        else:
            corr_matrix_sample[i, j] = 1.0
            p_matrix[i, j] = 0.0

print("\np-Value Matrix for Correlation Coefficients:")
print(p_matrix)

# Visualize correlation matrix
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Theoretical correlation matrix
im1 = axes[0].imshow(corr_matrix, cmap='RdBu_r', vmin=-1, vmax=1)
axes[0].set_title('Theoretical Correlation Matrix')
for i in range(4):
    for j in range(4):
        axes[0].text(j, i, f'{corr_matrix[i, j]:.2f}', 
                    ha='center', va='center')

# Sample correlation matrix
im2 = axes[1].imshow(sample_corr, cmap='RdBu_r', vmin=-1, vmax=1)
axes[1].set_title('Sample Correlation Matrix')
for i in range(4):
    for j in range(4):
        axes[1].text(j, i, f'{sample_corr[i, j]:.2f}', 
                    ha='center', va='center')

# p-value matrix
im3 = axes[2].imshow(p_matrix, cmap='Reds_r', vmin=0, vmax=0.05)
axes[2].set_title('p-Value Matrix')
for i in range(4):
    for j in range(4):
        # White text on the dark cells (smallest p-values) for readability
        color = 'white' if p_matrix[i, j] < 0.025 else 'black'
        axes[2].text(j, i, f'{p_matrix[i, j]:.3f}', 
                    ha='center', va='center', color=color)

# Add colorbars
fig.colorbar(im1, ax=axes[0])
fig.colorbar(im2, ax=axes[1])
fig.colorbar(im3, ax=axes[2])

plt.tight_layout()
plt.show()

# Scatter plot matrix (pairs plot)
fig, axes = plt.subplots(4, 4, figsize=(12, 12))

for i in range(4):
    for j in range(4):
        if i == j:
            # Diagonal: histogram
            axes[i, j].hist(corr_data[:, i], bins=20, alpha=0.7)
            axes[i, j].set_title(f'Variable {i+1}')
        else:
            # Off-diagonal: scatter plot
            axes[i, j].scatter(corr_data[:, j], corr_data[:, i], alpha=0.5)
            r, p = pearsonr(corr_data[:, j], corr_data[:, i])
            axes[i, j].set_title(f'r = {r:.3f}')
        
        axes[i, j].grid(True, alpha=0.3)
        
        # Only label the outer edges of the grid
        if i == 3:
            axes[i, j].set_xlabel(f'Variable {j+1}')
        if j == 0:
            axes[i, j].set_ylabel(f'Variable {i+1}')

plt.tight_layout()
plt.show()

Regression Analysis

1. Simple Linear Regression

python
# Generate regression data from a known linear model
np.random.seed(42)
n = 100
x = np.random.uniform(0, 10, n)
y = 2 + 3 * x + np.random.normal(0, 2, n)  # y = 2 + 3x + ε

# Linear regression (ordinary least squares on one predictor)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

print("Simple Linear Regression Results:")
print(f"Slope: {slope:.3f} ± {std_err:.3f}")
print(f"Intercept: {intercept:.3f}")
print(f"Correlation Coefficient: {r_value:.3f}")
print(f"Coefficient of Determination R²: {r_value**2:.3f}")
print(f"p Value: {p_value:.6f}")
print(f"Standard Error: {std_err:.3f}")

# Prediction
y_pred = slope * x + intercept
residuals = y - y_pred

# Calculate confidence intervals and prediction intervals
# (alias of the already-imported scipy.stats; kept for the later sections
# that reference `scipy_stats`)
from scipy import stats as scipy_stats

# Calculate statistics for the interval formulas
n = len(x)
mean_x = np.mean(x)
ss_xx = np.sum((x - mean_x)**2)
ss_res = np.sum(residuals**2)
mse = ss_res / (n - 2)  # residual mean square, df = n - 2 for simple regression

# t value (95% confidence interval)
t_val = scipy_stats.t.ppf(0.975, n - 2)

# Confidence and prediction intervals evaluated on a uniform grid
x_new = np.linspace(x.min(), x.max(), 100)
y_new = slope * x_new + intercept

# Confidence interval (mean confidence interval)
se_mean = np.sqrt(mse * (1/n + (x_new - mean_x)**2 / ss_xx))
ci_lower = y_new - t_val * se_mean
ci_upper = y_new + t_val * se_mean

# Prediction interval (individual value prediction interval — wider because it
# includes the irreducible noise term, hence the extra "1 +")
se_pred = np.sqrt(mse * (1 + 1/n + (x_new - mean_x)**2 / ss_xx))
pi_lower = y_new - t_val * se_pred
pi_upper = y_new + t_val * se_pred

# Visualize
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Regression plot
axes[0, 0].scatter(x, y, alpha=0.6, label='Data Points')
axes[0, 0].plot(x_new, y_new, 'r-', linewidth=2, label='Regression Line')
axes[0, 0].fill_between(x_new, ci_lower, ci_upper, alpha=0.3, label='95% Confidence Interval')
axes[0, 0].fill_between(x_new, pi_lower, pi_upper, alpha=0.2, label='95% Prediction Interval')
axes[0, 0].set_xlabel('X')
axes[0, 0].set_ylabel('Y')
axes[0, 0].set_title('Linear Regression')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Residual plot — should show no structure if the linear model is adequate
axes[0, 1].scatter(y_pred, residuals, alpha=0.6)
axes[0, 1].axhline(y=0, color='r', linestyle='--')
axes[0, 1].set_xlabel('Predicted Values')
axes[0, 1].set_ylabel('Residuals')
axes[0, 1].set_title('Residual Plot')
axes[0, 1].grid(True, alpha=0.3)

# Residual normality test
axes[1, 0].hist(residuals, bins=20, density=True, alpha=0.7, edgecolor='black')
# Overlay normal distribution fitted to the residuals
resid_mean, resid_std = np.mean(residuals), np.std(residuals)
x_norm = np.linspace(residuals.min(), residuals.max(), 100)
y_norm = scipy_stats.norm.pdf(x_norm, resid_mean, resid_std)
axes[1, 0].plot(x_norm, y_norm, 'r-', linewidth=2, label='Normal Distribution')
axes[1, 0].set_xlabel('Residuals')
axes[1, 0].set_ylabel('Density')
axes[1, 0].set_title('Residual Distribution')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Q-Q plot
scipy_stats.probplot(residuals, dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Residual Q-Q Plot')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Residual analysis (mean should be ~0 by construction of OLS)
print(f"\nResidual Analysis:")
print(f"Residual Mean: {np.mean(residuals):.6f}")
print(f"Residual Std Dev: {np.std(residuals):.3f}")

# Residual normality test
shapiro_stat, shapiro_p = scipy_stats.shapiro(residuals)
print(f"Shapiro-Wilk Test: statistic={shapiro_stat:.6f}, p-value={shapiro_p:.6f}")

# Durbin-Watson test (autocorrelation)
def durbin_watson(residuals):
    """Return the Durbin-Watson statistic of a residual series.

    Values near 2 indicate no first-order autocorrelation; values toward 0
    suggest positive and toward 4 negative autocorrelation.
    """
    successive_diffs = np.diff(residuals)
    return np.sum(successive_diffs ** 2) / np.sum(residuals ** 2)

# Apply the test to the regression residuals computed above
dw_stat = durbin_watson(residuals)
print(f"Durbin-Watson Statistic: {dw_stat:.3f}")
# Rule of thumb used here: DW outside ~[1.5, 2.5] is flagged as autocorrelated
print(f"Autocorrelation: {'Exists' if dw_stat < 1.5 or dw_stat > 2.5 else 'Does not exist'}")

2. Multiple Linear Regression

python
# Use sklearn for multiple regression (scipy.stats mainly for statistical tests)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Generate multiple regression data from a known linear model
np.random.seed(42)
n = 200
X = np.random.normal(0, 1, (n, 3))  # 3 predictor variables
beta = np.array([2, -1.5, 3])  # True coefficients
y = 5 + X @ beta + np.random.normal(0, 2, n)  # y = 5 + 2x1 - 1.5x2 + 3x3 + ε

# Fit model
model = LinearRegression()
model.fit(X, y)

# Prediction
y_pred = model.predict(X)
residuals = y - y_pred

print("Multiple Linear Regression Results:")
print(f"Intercept: {model.intercept_:.3f}")
print(f"Coefficients: {model.coef_}")
print(f"True Coefficients: {beta}")
print(f"R²: {r2_score(y, y_pred):.3f}")
print(f"Mean Squared Error: {mean_squared_error(y, y_pred):.3f}")

# Significance testing of coefficients (need to calculate manually —
# sklearn does not report standard errors / p-values)
# Standard errors from the classical OLS formula: Var(b) = MSE * (XᵀX)⁻¹
X_with_intercept = np.column_stack([np.ones(n), X])
XTX_inv = np.linalg.inv(X_with_intercept.T @ X_with_intercept)
mse = np.sum(residuals**2) / (n - X.shape[1] - 1)
se_coeffs = np.sqrt(np.diag(XTX_inv) * mse)

# t statistics (two-sided p-values from the t distribution, df = n - p - 1)
coeffs_with_intercept = np.array([model.intercept_] + list(model.coef_))
t_stats = coeffs_with_intercept / se_coeffs
p_values = 2 * (1 - scipy_stats.t.cdf(np.abs(t_stats), n - X.shape[1] - 1))

print(f"\nCoefficient Significance Testing:")
coeff_names = ['Intercept', 'X1', 'X2', 'X3']
for i, name in enumerate(coeff_names):
    print(f"{name}: Coefficient={coeffs_with_intercept[i]:.3f}, "
          f"Std Error={se_coeffs[i]:.3f}, t={t_stats[i]:.3f}, p={p_values[i]:.6f}")

# Model overall significance test (F test)
# Decompose total sum of squares into regression + residual parts.
ss_res = np.sum(residuals**2)
ss_tot = np.sum((y - np.mean(y))**2)
ss_reg = ss_tot - ss_res

df_reg = X.shape[1]
df_res = n - X.shape[1] - 1

ms_reg = ss_reg / df_reg
ms_res = ss_res / df_res

f_stat = ms_reg / ms_res
f_p_value = 1 - scipy_stats.f.cdf(f_stat, df_reg, df_res)

print(f"\nModel Overall Significance Test:")
print(f"F Statistic: {f_stat:.3f}")
print(f"p Value: {f_p_value:.6f}")
print(f"Conclusion: Model is {'significant' if f_p_value < 0.05 else 'not significant'} (α=0.05)")

Non-Parametric Statistics

1. Sign Test and Rank Test

python
# Sign test — tests H0 that the population median equals median_0 using only
# the signs of the deviations (fully non-parametric, no symmetry assumption).
np.random.seed(42)
data = np.random.normal(2, 1, 50)  # True median is 2
median_0 = 0  # Hypothesized median

# Calculate signs
signs = np.sign(data - median_0)
positive_signs = np.sum(signs > 0)
negative_signs = np.sum(signs < 0)
zero_signs = np.sum(signs == 0)  # ties with the hypothesized median are dropped

print("Sign Test:")
print(f"Number of Positive Signs: {positive_signs}")
print(f"Number of Negative Signs: {negative_signs}")
print(f"Number of Zeros: {zero_signs}")

# Use binomial test: under H0 each non-zero sign is positive with p = 0.5.
# Fix: scipy.stats.binom_test was deprecated in SciPy 1.10 and removed in
# 1.12; binomtest is the supported replacement and returns a result object.
n_nonzero = positive_signs + negative_signs
binom_p = scipy_stats.binomtest(int(positive_signs), int(n_nonzero), p=0.5).pvalue
print(f"Binomial Test p Value: {binom_p:.6f}")
print(f"Conclusion: {'Reject' if binom_p < 0.05 else 'Do not reject'} H0 (α=0.05)")

# Wilcoxon signed-rank test — uses the ranks of |deviations| as well, so it is
# more powerful than the plain sign test when the distribution is symmetric.
wilcoxon_stat, wilcoxon_p = scipy_stats.wilcoxon(data - median_0)
print(f"\nWilcoxon Signed-Rank Test:")
print(f"Statistic: {wilcoxon_stat}")
print(f"p Value: {wilcoxon_p:.6f}")
print(f"Conclusion: {'Reject' if wilcoxon_p < 0.05 else 'Do not reject'} H0 (α=0.05)")

2. Runs Test

python
# Runs test (randomness test)
def runs_test(data, cutoff=None):
    """Wald-Wolfowitz runs test for randomness of a sequence.

    Parameters
    ----------
    data : array_like
        Observed sequence, in time order.
    cutoff : float, optional
        Threshold used to dichotomize the data. Defaults to the median.

    Returns
    -------
    tuple
        ``(n_runs, expected_runs, z_stat, p_value)`` where the p-value is
        two-sided, based on the large-sample normal approximation.

    Raises
    ------
    ValueError
        If every value falls on one side of the cutoff — the variance of
        the run count is then zero and the test is undefined (the original
        implementation silently produced NaN/inf here).
    """
    if cutoff is None:
        cutoff = np.median(data)

    # Dichotomize: 1 = above cutoff, 0 = at or below cutoff.
    binary_seq = (data > cutoff).astype(int)

    # Collect the lengths of maximal constant stretches ("runs").
    runs = []
    current_run = 1
    for i in range(1, len(binary_seq)):
        if binary_seq[i] == binary_seq[i - 1]:
            current_run += 1
        else:
            runs.append(current_run)
            current_run = 1
    runs.append(current_run)

    n_runs = len(runs)
    n1 = np.sum(binary_seq)        # observations above the cutoff
    n2 = len(binary_seq) - n1      # observations at or below the cutoff

    # Guard: with an empty group the variance below is zero (division by
    # zero); fail loudly instead of returning NaN.
    if n1 == 0 or n2 == 0:
        raise ValueError("runs test requires observations on both sides of the cutoff")

    # Mean and variance of the run count under the randomness hypothesis.
    expected_runs = (2 * n1 * n2) / (n1 + n2) + 1
    var_runs = (2 * n1 * n2 * (2 * n1 * n2 - n1 - n2)) / ((n1 + n2)**2 * (n1 + n2 - 1))

    # Standardize and compute the two-sided normal p-value.
    z_stat = (n_runs - expected_runs) / np.sqrt(var_runs)
    p_value = 2 * (1 - scipy_stats.norm.cdf(abs(z_stat)))

    return n_runs, expected_runs, z_stat, p_value

# Test randomness of a genuinely random series vs. a clearly trending one.
np.random.seed(42)
random_data = np.random.normal(0, 1, 100)
trend_data = np.linspace(-2, 2, 100) + np.random.normal(0, 0.1, 100)

print("Runs Test (Randomness Test):")
for data, name in [(random_data, "Random Data"), (trend_data, "Trend Data")]:
    n_runs, expected, z_stat, p_val = runs_test(data)
    print(f"\n{name}:")
    print(f"  Observed Runs: {n_runs}")
    print(f"  Expected Runs: {expected:.2f}")
    print(f"  Z Statistic: {z_stat:.3f}")
    print(f"  p Value: {p_val:.6f}")
    # Fixed: rejecting H0 previously printed the truncated phrase
    # "Data is not" — it should read "not random".
    print(f"  Conclusion: Data is {'not random' if p_val < 0.05 else 'random'} (α=0.05)")

# Visualize each series next to its above/below-median indicator so the
# runs structure is directly visible.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

series_pairs = [(random_data, "Random Data"), (trend_data, "Trend Data")]
for row, (series, label) in enumerate(series_pairs):
    ts_ax, runs_ax = axes[row, 0], axes[row, 1]

    # Raw time series with its median as a reference line.
    ts_ax.plot(series)
    ts_ax.axhline(y=np.median(series), color='r', linestyle='--', label='Median')
    ts_ax.set_title(f'{label} - Time Series')
    ts_ax.set_xlabel('Time')
    ts_ax.set_ylabel('Value')
    ts_ax.legend()
    ts_ax.grid(True, alpha=0.3)

    # Binary above-median indicator; consecutive equal values form runs.
    indicator = (series > np.median(series)).astype(int)
    runs_ax.plot(indicator, 'o-', markersize=3)
    runs_ax.set_title(f'{label} - Runs Plot')
    runs_ax.set_xlabel('Time')
    runs_ax.set_ylabel('Above Median (1) / Below Median (0)')
    runs_ax.set_ylim(-0.1, 1.1)
    runs_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Kernel Density Estimation

python
# Kernel density estimation
np.random.seed(42)

# Generate bimodal data by mixing two normal components
# (300 + 200 samples, i.e. mixture weights 0.6 and 0.4).
data1 = np.random.normal(-2, 0.5, 300)
data2 = np.random.normal(2, 0.8, 200)
mixed_data = np.concatenate([data1, data2])

# Kernel density estimation with different bandwidths: a scalar bw_method
# is used directly as the KDE factor — small values under-smooth (spiky),
# large values over-smooth (merges the two modes).
bandwidths = [0.1, 0.3, 0.5, 1.0]
x_range = np.linspace(mixed_data.min() - 1, mixed_data.max() + 1, 1000)

plt.figure(figsize=(15, 10))

for i, bw in enumerate(bandwidths):
    plt.subplot(2, 2, i+1)
    
    # Histogram
    plt.hist(mixed_data, bins=30, density=True, alpha=0.5, color='lightblue', 
             edgecolor='black', label='Histogram')
    
    # Kernel density estimation
    kde = scipy_stats.gaussian_kde(mixed_data, bw_method=bw)
    kde_values = kde(x_range)
    plt.plot(x_range, kde_values, linewidth=2, label=f'KDE (bandwidth={bw})')
    
    # True density (theoretical): mixture weights match the sample
    # proportions above (300/500 = 0.6 and 200/500 = 0.4).
    true_density = (0.6 * scipy_stats.norm.pdf(x_range, -2, 0.5) + 
                   0.4 * scipy_stats.norm.pdf(x_range, 2, 0.8))
    plt.plot(x_range, true_density, 'r--', linewidth=2, label='True Density')
    
    plt.title(f'Kernel Density Estimation (Bandwidth = {bw})')
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Automatic bandwidth selection (SciPy defaults to Scott's rule;
# .factor is the resulting scaling factor).
kde_auto = scipy_stats.gaussian_kde(mixed_data)
print(f"Automatically Selected Bandwidth: {kde_auto.factor:.4f}")

# Comparison of different kernel functions (need to use sklearn)
# NOTE(review): scikit-learn is an extra third-party dependency beyond
# SciPy; this section only runs when sklearn is installed.
from sklearn.neighbors import KernelDensity

kernels = ['gaussian', 'tophat', 'epanechnikov', 'exponential']

plt.figure(figsize=(12, 8))

for i, kernel in enumerate(kernels):
    plt.subplot(2, 2, i+1)
    
    # Histogram
    plt.hist(mixed_data, bins=30, density=True, alpha=0.5, color='lightblue', 
             edgecolor='black', label='Histogram')
    
    # Kernel density estimation: sklearn expects 2-D (n_samples, n_features)
    # input, hence the reshape; score_samples returns log-density, so
    # exponentiate to recover the density itself.
    kde_sk = KernelDensity(kernel=kernel, bandwidth=0.5)
    kde_sk.fit(mixed_data.reshape(-1, 1))
    log_density = kde_sk.score_samples(x_range.reshape(-1, 1))
    density = np.exp(log_density)
    
    plt.plot(x_range, density, linewidth=2, label=f'{kernel} Kernel')
    
    plt.title(f'{kernel.capitalize()} Kernel Density Estimation')
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Practical Application Cases

Case 1: Quality Control Analysis

python
# Quality control data analysis
np.random.seed(42)

# Simulate production data. The two draws must stay in this order so the
# RNG stream (and thus the data) is reproducible.
control_data = np.random.normal(100, 2, 200)  # Normal production
defective_data = np.random.normal(95, 3, 50)  # Abnormal production
all_data = np.concatenate([control_data, defective_data])
labels = ['Normal'] * 200 + ['Defective'] * 50

print("Quality Control Statistical Analysis:")
print(f"Total Samples: {len(all_data)}")
print(f"Normal Samples: {len(control_data)}, Defective Samples: {len(defective_data)}")

# Descriptive statistics, reported per production group.
for group_name, group in (("Normal", control_data), ("Defective", defective_data)):
    print(f"\n{group_name} Production Data:")
    print(f"  Mean: {np.mean(group):.3f}")
    print(f"  Std Dev: {np.std(group, ddof=1):.3f}")
    print(f"  Range: [{np.min(group):.3f}, {np.max(group):.3f}]")

# Hypothesis testing: do the two groups share a common mean?
t_stat, p_value = scipy_stats.ttest_ind(control_data, defective_data)
print(f"\nIndependent Samples t-Test:")
print(f"t Statistic: {t_stat:.3f}")
print(f"p Value: {p_value:.6f}")
print(f"Conclusion: There {'is' if p_value < 0.05 else 'is no'} significant difference between means (α=0.05)")

# Control chart limits: center line ± 3 sample standard deviations,
# estimated from the in-control data only.
control_mean = np.mean(control_data)
control_std = np.std(control_data, ddof=1)
ucl = control_mean + 3 * control_std  # Upper control limit
lcl = control_mean - 3 * control_std  # Lower control limit

plt.figure(figsize=(15, 10))

# Control chart
# Individuals chart: samples in production order against the ±3σ limits
# estimated from the in-control data; points beyond UCL/LCL signal an
# out-of-control process.
plt.subplot(2, 2, 1)
plt.plot(range(len(all_data)), all_data, 'o-', markersize=3, alpha=0.7)
plt.axhline(y=control_mean, color='g', linestyle='-', label='Center Line')
plt.axhline(y=ucl, color='r', linestyle='--', label='Upper Control Limit (UCL)')
plt.axhline(y=lcl, color='r', linestyle='--', label='Lower Control Limit (LCL)')
# Index 200 is where the simulated defective samples begin.
plt.axvline(x=200, color='orange', linestyle=':', label='Defective Start')
plt.title('Quality Control Chart')
plt.xlabel('Sample Number')
plt.ylabel('Quality Metric')
plt.legend()
plt.grid(True, alpha=0.3)

# Histogram comparison (density-normalized so the unequal group sizes
# remain comparable).
plt.subplot(2, 2, 2)
plt.hist(control_data, bins=30, alpha=0.7, label='Normal', density=True)
plt.hist(defective_data, bins=20, alpha=0.7, label='Defective', density=True)
plt.title('Quality Distribution Comparison')
plt.xlabel('Quality Metric')
plt.ylabel('Density')
plt.legend()
plt.grid(True, alpha=0.3)

# Box plot
plt.subplot(2, 2, 3)
data_for_box = [control_data, defective_data]
plt.boxplot(data_for_box, labels=['Normal', 'Defective'])
plt.title('Quality Metric Box Plot')
plt.ylabel('Quality Metric')
plt.grid(True, alpha=0.3)

# Cumulative distribution function
# Empirical CDFs: sort each sample and plot rank/n against the value.
plt.subplot(2, 2, 4)
sorted_control = np.sort(control_data)
sorted_defective = np.sort(defective_data)
y_control = np.arange(1, len(sorted_control) + 1) / len(sorted_control)
y_defective = np.arange(1, len(sorted_defective) + 1) / len(sorted_defective)

plt.plot(sorted_control, y_control, label='Normal', linewidth=2)
plt.plot(sorted_defective, y_defective, label='Defective', linewidth=2)
plt.title('Cumulative Distribution Function Comparison')
plt.xlabel('Quality Metric')
plt.ylabel('Cumulative Probability')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Case 2: A/B Testing Analysis

python
# A/B testing data analysis
np.random.seed(42)

# Simulate A/B test data: each visitor converts (1) or not (0).
n_a, n_b = 1000, 1200
conversion_rate_a = 0.12  # Group A conversion rate
conversion_rate_b = 0.15  # Group B conversion rate

# Draw group A before group B so the RNG stream stays reproducible.
conversions_a = np.random.binomial(1, conversion_rate_a, n_a)
conversions_b = np.random.binomial(1, conversion_rate_b, n_b)

# Observed conversion counts and rates per group.
conv_a = conversions_a.sum()
conv_b = conversions_b.sum()
rate_a, rate_b = conv_a / n_a, conv_b / n_b

print("A/B Test Results Analysis:")
print(f"Group A: {conv_a}/{n_a} = {rate_a:.4f} ({rate_a*100:.2f}%)")
print(f"Group B: {conv_b}/{n_b} = {rate_b:.4f} ({rate_b*100:.2f}%)")
print(f"Improvement: {(rate_b - rate_a)/rate_a*100:.2f}%")

# Proportion test
# Two-sample z-test for equality of the two conversion rates.
from statsmodels.stats.proportion import proportions_ztest

counts = np.array([conv_a, conv_b])
nobs = np.array([n_a, n_b])

z_stat, p_value = proportions_ztest(counts, nobs)
print(f"\nProportion Test:")
print(f"Z Statistic: {z_stat:.3f}")
print(f"p Value: {p_value:.6f}")
print(f"Conclusion: Group B conversion rate is {'significantly' if p_value < 0.05 else 'not significantly'} higher than Group A (α=0.05)")

# Confidence interval
# Per-group 95% confidence intervals for the conversion rate.
from statsmodels.stats.proportion import proportion_confint

ci_a = proportion_confint(conv_a, n_a, alpha=0.05)
ci_b = proportion_confint(conv_b, n_b, alpha=0.05)

print(f"\n95% Confidence Interval:")
print(f"Group A: [{ci_a[0]:.4f}, {ci_a[1]:.4f}]")
print(f"Group B: [{ci_b[0]:.4f}, {ci_b[1]:.4f}]")

# Effect size (Cohen's h)
def cohens_h(p1, p2):
    # Cohen's h: difference of arcsine-transformed proportions, a standard
    # effect-size measure for comparing two rates (positive when p1 > p2).
    return 2 * (np.arcsin(np.sqrt(p1)) - np.arcsin(np.sqrt(p2)))

effect_size = cohens_h(rate_b, rate_a)
print(f"\nEffect Size (Cohen's h): {effect_size:.4f}")

# Power analysis
# NOTE(review): ttest_power is a t-test power routine; feeding it Cohen's h
# for a proportion test is only an approximation — statsmodels'
# NormalIndPower would match the z-test above. Confirm intent.
from statsmodels.stats.power import ttest_power

power = ttest_power(effect_size, n_a, alpha=0.05)
print(f"Statistical Power: {power:.4f}")

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Conversion rate comparison
# Error bars show each group's upper 95% CI half-width.
rates = [rate_a, rate_b]
errors = [ci_a[1] - rate_a, ci_b[1] - rate_b]
axes[0, 0].bar(['Group A', 'Group B'], rates, yerr=errors, capsize=5, alpha=0.7)
axes[0, 0].set_title('Conversion Rate Comparison')
axes[0, 0].set_ylabel('Conversion Rate')
axes[0, 0].grid(True, alpha=0.3)

# Conversion count comparison, with the raw count annotated on each bar.
axes[0, 1].bar(['Group A', 'Group B'], [conv_a, conv_b], alpha=0.7)
axes[0, 1].set_title('Conversion Count Comparison')
axes[0, 1].set_ylabel('Conversion Count')
for i, v in enumerate([conv_a, conv_b]):
    axes[0, 1].text(i, v + 5, str(v), ha='center')
axes[0, 1].grid(True, alpha=0.3)

# Confidence interval visualization
# errorbar takes asymmetric errors as [lower offsets, upper offsets].
x_pos = [0, 1]
axes[1, 0].errorbar(x_pos, rates, 
                   yerr=[[rate_a - ci_a[0], rate_b - ci_b[0]], 
                         [ci_a[1] - rate_a, ci_b[1] - rate_b]], 
                   fmt='o', capsize=10, capthick=2, markersize=8)
axes[1, 0].set_xticks(x_pos)
axes[1, 0].set_xticklabels(['Group A', 'Group B'])
axes[1, 0].set_title('Conversion Rate and 95% Confidence Interval')
axes[1, 0].set_ylabel('Conversion Rate')
axes[1, 0].grid(True, alpha=0.3)

# Sample size and power relationship
# Power curve at the observed effect size; 0.8 is the conventional target.
sample_sizes = np.arange(100, 2000, 50)
powers = [ttest_power(effect_size, n, alpha=0.05) for n in sample_sizes]
axes[1, 1].plot(sample_sizes, powers, linewidth=2)
axes[1, 1].axhline(y=0.8, color='r', linestyle='--', label='Power=0.8')
axes[1, 1].axvline(x=n_a, color='g', linestyle=':', label=f'Current Sample Size={n_a}')
axes[1, 1].set_title('Sample Size and Statistical Power Relationship')
axes[1, 1].set_xlabel('Sample Size')
axes[1, 1].set_ylabel('Statistical Power')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Summary

This chapter provided a detailed introduction to the core functions of SciPy's statistical analysis module:

  1. Probability Distributions: Mastered the use of continuous and discrete distributions
  2. Descriptive Statistics: Learned to calculate various statistics and visualize data
  3. Hypothesis Testing: Understood one-sample, two-sample, and multiple-sample testing methods
  4. Correlation Analysis: Mastered methods for measuring linear and non-linear correlations
  5. Regression Analysis: Learned simple and multiple linear regression implementation
  6. Non-Parametric Statistics: Learned statistical methods not dependent on distribution assumptions
  7. Kernel Density Estimation: Mastered non-parametric estimation of data distribution
  8. Practical Applications: Learned practical applications through quality control and A/B testing cases

Practice Exercises

  1. Generate 1000 samples from a normal distribution and perform Shapiro-Wilk normality test
  2. Compare the mean differences between two groups and select appropriate test methods
  3. Perform ANOVA on multiple groups and conduct post-hoc tests
  4. Calculate Pearson and Spearman correlation coefficients for two variables and explain the differences
  5. Use kernel density estimation to analyze characteristics of mixed distribution data
  6. Design an A/B testing analysis workflow including sample size calculation and result interpretation
  7. Implement a quality control chart to identify anomalous data points
  8. Use runs test to analyze randomness of time series data

Content is for learning and research only.