SciPy Basic Concepts
In the previous chapter, we learned how to install and configure SciPy. This chapter will delve into the fundamental concepts of SciPy, including data structures, common functions, and programming patterns.
Core Principles of SciPy
1. Based on NumPy Arrays
All SciPy functionality is built on top of NumPy arrays. Understanding NumPy arrays is fundamental to using SciPy:
import numpy as np
import scipy as sp
# NumPy arrays are the foundation of SciPy
data = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
print(f"Data type: {type(data)}")
print(f"Array shape: {data.shape}")
print(f"Data dtype: {data.dtype}")
2. Modular Design
SciPy adopts a modular design, with each module focusing on a specific area of scientific computing:
# Import specific modules
from scipy import stats, optimize, integrate, linalg
# Or import the entire scipy package
import scipy
# View available submodules
print(dir(scipy))
3. Functional Programming Style
SciPy primarily uses a functional programming style, with most operations completed through function calls:
import numpy as np
from scipy import stats
# Functional calls
data = np.random.normal(0, 1, 100)
mean = np.mean(data)
std_dev = np.std(data)
t_stat, p_value = stats.ttest_1samp(data, 0)
print(f"Mean: {mean:.3f}")
print(f"Standard deviation: {std_dev:.3f}")
print(f"t-test p-value: {p_value:.3f}")
Common Data Types
1. Scalars
import numpy as np
# Different types of scalars
integer_scalar = np.int32(42)
float_scalar = np.float64(3.14159)
complex_scalar = np.complex128(1 + 2j)
print(f"Integer: {integer_scalar}, Type: {type(integer_scalar)}")
print(f"Float: {float_scalar}, Type: {type(float_scalar)}")
print(f"Complex: {complex_scalar}, Type: {type(complex_scalar)}")
2. One-Dimensional Arrays (Vectors)
# Create one-dimensional array
vector = np.array([1, 2, 3, 4, 5])
print(f"Vector: {vector}")
print(f"Shape: {vector.shape}")
print(f"Dimensions: {vector.ndim}")
# Common methods for creating 1D arrays
zeros_vec = np.zeros(5)
ones_vec = np.ones(5)
range_vec = np.arange(0, 10, 2)
linspace_vec = np.linspace(0, 1, 5)
print(f"Zero vector: {zeros_vec}")
print(f"Ones vector: {ones_vec}")
print(f"Range vector: {range_vec}")
print(f"Linspace vector: {linspace_vec}")
3. Two-Dimensional Arrays (Matrices)
# Create two-dimensional array
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"Matrix:\n{matrix}")
print(f"Shape: {matrix.shape}")
print(f"Dimensions: {matrix.ndim}")
# Common methods for creating 2D arrays
zeros_mat = np.zeros((3, 3))
ones_mat = np.ones((2, 4))
identity_mat = np.eye(3)
random_mat = np.random.random((2, 3))
print(f"Zero matrix:\n{zeros_mat}")
print(f"Identity matrix:\n{identity_mat}")
print(f"Random matrix:\n{random_mat}")
4. Multi-Dimensional Arrays (Tensors)
# Create three-dimensional array
tensor = np.random.random((2, 3, 4))
print(f"Tensor shape: {tensor.shape}")
print(f"Tensor dimensions: {tensor.ndim}")
print(f"Tensor size: {tensor.size}")
# Access tensor elements
print(f"First 2D slice:\n{tensor[0]}")
print(f"Specific element: {tensor[0, 1, 2]}")
Basic Array Operations
1. Array Indexing and Slicing
import numpy as np
# 1D array indexing
arr_1d = np.array([10, 20, 30, 40, 50])
print(f"First element: {arr_1d[0]}")
print(f"Last element: {arr_1d[-1]}")
print(f"Slice [1:4]: {arr_1d[1:4]}")
# 2D array indexing
arr_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"Element [1,2]: {arr_2d[1, 2]}")
print(f"Second row: {arr_2d[1, :]}")
print(f"Third column: {arr_2d[:, 2]}")
# Boolean indexing
data = np.array([1, 2, 3, 4, 5, 6])
mask = data > 3
print(f"Elements > 3: {data[mask]}")
2. Array Shape Operations
# Reshape arrays
arr = np.arange(12)
print(f"Original array: {arr}")
# Reshape to 2D array
reshaped = arr.reshape(3, 4)
print(f"Reshaped:\n{reshaped}")
# Flatten array
flattened = reshaped.flatten()
print(f"Flattened: {flattened}")
# Transpose
transposed = reshaped.T
print(f"Transposed:\n{transposed}")
3. Array Concatenation and Splitting
# Array concatenation
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])
# Horizontal stacking
hstacked = np.hstack([arr1, arr2])
print(f"Horizontal stack: {hstacked}")
# Vertical stacking
vstacked = np.vstack([arr1, arr2])
print(f"Vertical stack:\n{vstacked}")
# Array splitting
data = np.arange(10)
split_arrays = np.split(data, 5)
print(f"Split result: {split_arrays}")
Common SciPy Function Patterns
1. Statistical Function Patterns
from scipy import stats
import numpy as np
# Generate test data
np.random.seed(42)
data = np.random.normal(100, 15, 1000)
# Descriptive statistics
print(f"Mean: {np.mean(data):.2f}")
print(f"Median: {np.median(data):.2f}")
print(f"Standard deviation: {np.std(data):.2f}")
print(f"Skewness: {stats.skew(data):.3f}")
print(f"Kurtosis: {stats.kurtosis(data):.3f}")
# Hypothesis testing
t_stat, p_value = stats.ttest_1samp(data, 100)
print(f"One-sample t-test: t={t_stat:.3f}, p={p_value:.3f}")
# Distribution fitting
mu, sigma = stats.norm.fit(data)
print(f"Fitted normal distribution parameters: μ={mu:.2f}, σ={sigma:.2f}")
2. Optimization Function Patterns
from scipy import optimize
import numpy as np
# Define objective function
def objective_function(x):
    """Objective for the 1-D minimization demo: f(x) = x^2 + 10*sin(x)."""
    quadratic_part = x * x
    return quadratic_part + 10 * np.sin(x)
# Find minimum
result = optimize.minimize_scalar(objective_function)
print(f"Minimum location: x={result.x:.3f}")
print(f"Minimum value: f(x)={result.fun:.3f}")
# Solve equation
def equation(x):
    """Cubic f(x) = x^3 - 2x - 5 whose real root fsolve locates near x = 2."""
    cubic_term = x ** 3
    return cubic_term - (2 * x) - 5
root = optimize.fsolve(equation, 2.0)[0]
print(f"Root of equation: x={root:.3f}")
print(f"Verification: f({root:.3f})={equation(root):.6f}")
3. Integration Function Patterns
from scipy import integrate
import numpy as np
# Define integrand
def integrand(t):
    """Gaussian kernel exp(-t^2); its integral over [0, inf) is sqrt(pi)/2."""
    squared = t ** 2
    return np.exp(-squared)
# Numerical integration
result, error = integrate.quad(integrand, 0, np.inf)
print(f"Integration result: {result:.6f}")
print(f"Estimated error: {error:.2e}")
print(f"Theoretical value: {np.sqrt(np.pi)/2:.6f}")
# Multiple integration
def integrand_2d(y, x):
    """Separable integrand f(x, y) = x*y for the dblquad example."""
    product = y * x
    return product
result_2d = integrate.dblquad(integrand_2d, 0, 1, lambda x: 0, lambda x: 1)
print(f"Double integration result: {result_2d[0]:.6f}")
Error Handling and Debugging
1. Common Error Types
import numpy as np
from scipy import stats
# Dimension mismatch error
try:
a = np.array([1, 2, 3])
b = np.array([[1, 2], [3, 4]])
result = a + b # This will raise a broadcasting error
except ValueError as e:
print(f"Dimension error: {e}")
# Numerical computation error
try:
result = np.log(-1) # Logarithm of negative number
except RuntimeWarning as e:
print(f"Numerical warning: {e}")
# Statistical function error
try:
empty_data = np.array([])
mean = np.mean(empty_data)
except RuntimeWarning as e:
print(f"Empty array warning: {e}")2. Debugging Techniques
# Check array properties
def debug_array(arr, name="Array"):
    """
    Print a diagnostic summary of a NumPy array.

    Parameters:
        arr: NumPy array to inspect.
        name: Label used in the printed header (default "Array").
    """
    print(f"{name} information:")
    print(f" Shape: {arr.shape}")
    print(f" Data type: {arr.dtype}")
    print(f" Dimensions: {arr.ndim}")
    print(f" Size: {arr.size}")
    print(f" Memory usage: {arr.nbytes} bytes")
    print(f" Is contiguous: {arr.flags.c_contiguous}")
    # np.min/np.max raise ValueError on an empty array, and np.isnan/np.isinf
    # raise TypeError on non-numeric dtypes -- guard both cases so the
    # debugging helper itself never crashes.
    if arr.size == 0:
        print(" (array is empty: no value statistics)")
    elif np.issubdtype(arr.dtype, np.number):
        print(f" Min value: {np.min(arr)}")
        print(f" Max value: {np.max(arr)}")
        print(f" Contains NaN: {np.any(np.isnan(arr))}")
        print(f" Contains infinity: {np.any(np.isinf(arr))}")
    else:
        print(" (non-numeric dtype: no value statistics)")
    print()
# Usage example
test_array = np.random.random((3, 4))
debug_array(test_array, "Test Array")
Performance Optimization Basics
1. Vectorized Operations
import numpy as np
import time
# Avoid loops, use vectorized operations
n = 1000000
# Inefficient loop approach
start_time = time.time()
result_loop = []
for i in range(n):
result_loop.append(i ** 2)
loop_time = time.time() - start_time
# Efficient vectorized approach
start_time = time.time()
data = np.arange(n)
result_vectorized = data ** 2
vectorized_time = time.time() - start_time
print(f"Loop approach time: {loop_time:.4f} seconds")
print(f"Vectorized time: {vectorized_time:.4f} seconds")
print(f"Performance improvement: {loop_time / vectorized_time:.1f}x")
2. Memory Management
# Use appropriate data types
data_float64 = np.random.random(1000000) # Default float64
data_float32 = np.random.random(1000000).astype(np.float32) # Convert to float32
print(f"float64 memory usage: {data_float64.nbytes / 1024 / 1024:.2f} MB")
print(f"float32 memory usage: {data_float32.nbytes / 1024 / 1024:.2f} MB")
print(f"Memory saved: {(1 - data_float32.nbytes / data_float64.nbytes) * 100:.1f}%")
# In-place operations
data = np.random.random(1000000)
# Create new array (uses more memory)
result1 = data * 2
# In-place operation (saves memory)
data *= 2 # Equivalent to data = data * 2, but more memory efficient
3. Choose Appropriate Algorithms
from scipy import linalg
import numpy as np
import time  # needed for the timing below; missing from the original snippet

# For symmetric positive definite matrices, Cholesky decomposition is more efficient
n = 1000
A = np.random.random((n, n))
A = A @ A.T  # Create symmetric positive definite matrix
b = np.random.random(n)

# General LU decomposition
start_time = time.time()
x1 = linalg.solve(A, b)
lu_time = time.time() - start_time

# Cholesky decomposition (assume_a='pos' tells SciPy the matrix is
# symmetric positive definite, so it can use the cheaper factorization)
start_time = time.time()
x2 = linalg.solve(A, b, assume_a='pos')
chol_time = time.time() - start_time

print(f"LU decomposition time: {lu_time:.4f} seconds")
print(f"Cholesky decomposition time: {chol_time:.4f} seconds")
print(f"Performance improvement: {lu_time / chol_time:.1f}x")
print(f"Solution difference: {np.max(np.abs(x1 - x2)):.2e}")
Practical Application Examples
Example 1: Data Preprocessing
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
# Generate noisy data
np.random.seed(42)
n_samples = 1000
true_signal = np.sin(np.linspace(0, 4*np.pi, n_samples))
noise = np.random.normal(0, 0.3, n_samples)
noisy_data = true_signal + noise
# Data cleaning: remove outliers
z_scores = np.abs(stats.zscore(noisy_data))
threshold = 3
clean_data = noisy_data[z_scores < threshold]
clean_indices = np.where(z_scores < threshold)[0]
print(f"Original data points: {len(noisy_data)}")
print(f"Cleaned data points: {len(clean_data)}")
print(f"Outliers removed: {len(noisy_data) - len(clean_data)}")
# Visualize results
plt.figure(figsize=(12, 8))
plt.subplot(2, 2, 1)
plt.plot(true_signal, label='True Signal', linewidth=2)
plt.title('True Signal')
plt.legend()
plt.grid(True, alpha=0.3)
plt.subplot(2, 2, 2)
plt.plot(noisy_data, alpha=0.7, label='Noisy Data')
plt.title('Noisy Data')
plt.legend()
plt.grid(True, alpha=0.3)
plt.subplot(2, 2, 3)
plt.plot(clean_indices, clean_data, 'g.', alpha=0.7, label='Cleaned Data')
plt.title('Cleaned Data')
plt.legend()
plt.grid(True, alpha=0.3)
plt.subplot(2, 2, 4)
plt.hist(noisy_data, bins=50, alpha=0.7, label='Original Data', density=True)
plt.hist(clean_data, bins=50, alpha=0.7, label='Cleaned Data', density=True)
plt.title('Data Distribution Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Example 2: Simple Machine Learning Workflow
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Generate synthetic data
np.random.seed(42)
n_samples = 200
X = np.random.uniform(-3, 3, n_samples)
y = 2 * X + 1 + np.random.normal(0, 0.5, n_samples) # y = 2x + 1 + noise
# Data preprocessing
X = X.reshape(-1, 1) # sklearn requires 2D array
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Evaluate model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Model parameters:")
print(f" Slope: {model.coef_[0]:.3f} (true value: 2.000)")
print(f" Intercept: {model.intercept_:.3f} (true value: 1.000)")
print(f"Model performance:")
print(f" Mean squared error: {mse:.3f}")
print(f" R² score: {r2:.3f}")
# Statistical test
residuals = y_test - y_pred
_, p_value = stats.normaltest(residuals)
print(f"Residual normality test p-value: {p_value:.3f}")
# Visualization
plt.figure(figsize=(12, 4))
plt.subplot(1, 3, 1)
plt.scatter(X_train, y_train, alpha=0.6, label='Training Data')
plt.scatter(X_test, y_test, alpha=0.6, label='Test Data')
X_line = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
y_line = model.predict(X_line)
plt.plot(X_line, y_line, 'r-', linewidth=2, label='Fitted Line')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Linear Regression Fit')
plt.legend()
plt.grid(True, alpha=0.3)
plt.subplot(1, 3, 2)
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', linewidth=2)
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('Predicted vs True Values')
plt.grid(True, alpha=0.3)
plt.subplot(1, 3, 3)
plt.hist(residuals, bins=20, density=True, alpha=0.7)
x_norm = np.linspace(residuals.min(), residuals.max(), 100)
y_norm = stats.norm.pdf(x_norm, np.mean(residuals), np.std(residuals))
plt.plot(x_norm, y_norm, 'r-', linewidth=2, label='Normal Distribution')
plt.xlabel('Residuals')
plt.ylabel('Density')
plt.title('Residual Distribution')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Best Practices
1. Code Organization
# Recommended import style
import numpy as np
import scipy as sp
from scipy import stats, optimize, integrate, linalg
import matplotlib.pyplot as plt
# Avoid using from scipy import *
# This pollutes the namespace
# Use meaningful variable names
data_samples = np.random.normal(0, 1, 1000) # Good
x = np.random.normal(0, 1, 1000) # Not clear enough
# Add appropriate comments
def calculate_confidence_interval(data, confidence=0.95):
"""
Calculate confidence interval for data
Parameters:
data: Data array
confidence: Confidence level (default 0.95)
Returns:
(lower_bound, upper_bound): Lower and upper bounds of confidence interval
"""
alpha = 1 - confidence
n = len(data)
mean = np.mean(data)
std_err = stats.sem(data) # Standard error
t_value = stats.t.ppf(1 - alpha/2, n-1)
margin_error = t_value * std_err
return mean - margin_error, mean + margin_error2. Error Handling
def safe_divide(a, b):
    """
    Safe element-wise division that tolerates division by zero.

    NumPy never raises ZeroDivisionError for array division -- it emits a
    RuntimeWarning and produces inf/nan instead -- so the original
    ``except ZeroDivisionError`` branch was dead code.  Silence the
    warning explicitly and report inf/nan in the result instead.

    Parameters:
        a: Numerator (array-like or scalar).
        b: Denominator (array-like or scalar).

    Returns:
        The element-wise quotient (entries where b == 0 become inf or nan),
        or None if the inputs cannot be divided at all (e.g. shape mismatch).
    """
    try:
        with np.errstate(divide='ignore', invalid='ignore'):
            result = np.divide(a, b)
    except Exception as e:
        # e.g. incompatible shapes or non-numeric dtypes
        print(f"Unknown error: {e}")
        return None
    # Check for infinity or NaN
    if np.any(np.isinf(result)) or np.any(np.isnan(result)):
        print("Warning: Result contains infinity or NaN values")
    return result
# Usage example
a = np.array([1, 2, 3, 4])
b = np.array([2, 0, 1, 2])
result = safe_divide(a, b)
print(f"Safe division result: {result}")
3. Performance Monitoring
import time
from functools import wraps
def timing_decorator(func):
    """
    Decorator: report how long each call to ``func`` takes.

    Preserves the wrapped function's metadata via functools.wraps and
    returns its result unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        started = time.time()
        value = func(*args, **kwargs)
        elapsed = time.time() - started
        print(f"{func.__name__} execution time: {elapsed:.4f} seconds")
        return value
    return wrapper
@timing_decorator
def matrix_multiplication(n):
    """
    Matrix multiplication performance test: multiply two random n x n matrices.
    """
    left = np.random.random((n, n))
    right = np.random.random((n, n))
    return np.matmul(left, right)
# Test different matrix sizes
for size in [100, 500, 1000]:
print(f"\nMatrix size: {size}x{size}")
result = matrix_multiplication(size)
Summary
In this chapter, we learned about SciPy's basic concepts:
- Core Principles: Based on NumPy arrays, modular design, functional programming
- Data Types: Scalars, vectors, matrices, tensors
- Array Operations: Indexing, slicing, shape operations, concatenation and splitting
- Function Patterns: Common patterns for statistics, optimization, integration
- Error Handling: Common error types and debugging techniques
- Performance Optimization: Vectorization, memory management, algorithm selection
- Best Practices: Code organization, error handling, performance monitoring
After mastering these basic concepts, you can begin learning specific SciPy functionality modules. Next, we will learn more advanced array processing techniques in Array and Matrix Operations.
Practice Exercises
- Array Operations Practice: Create a 5x5 random matrix, calculate its row sums, column sums, and diagonal sum
- Performance Comparison: Compare the performance difference between using loops and vectorized operations to calculate array element squares
- Error Handling: Write a function to safely calculate the logarithm of an array, handling negative and zero values
- Statistical Analysis: Generate two groups of random data, perform a t-test to compare their mean differences
# Practice exercise reference answers
# 1. Array operations practice
import numpy as np
matrix = np.random.random((5, 5))
print(f"Matrix:\n{matrix}")
print(f"Row sums: {np.sum(matrix, axis=1)}")
print(f"Column sums: {np.sum(matrix, axis=0)}")
print(f"Main diagonal sum: {np.trace(matrix)}")
print(f"Anti-diagonal sum: {np.sum(np.fliplr(matrix).diagonal())}")
# 2. Performance comparison
import time
n = 100000
data = np.random.random(n)
# Loop approach
start = time.time()
squared_loop = [x**2 for x in data]
loop_time = time.time() - start
# Vectorized approach
start = time.time()
squared_vectorized = data**2
vectorized_time = time.time() - start
print(f"Loop time: {loop_time:.4f}s")
print(f"Vectorized time: {vectorized_time:.4f}s")
print(f"Performance improvement: {loop_time/vectorized_time:.1f}x")
# 3. Error handling
def safe_log(arr):
"""
Safely compute logarithm, handling negative and zero values
"""
result = np.full_like(arr, np.nan, dtype=float)
positive_mask = arr > 0
result[positive_mask] = np.log(arr[positive_mask])
if np.any(arr <= 0):
print(f"Warning: Found {np.sum(arr <= 0)} non-positive values")
return result
# Test
test_data = np.array([-1, 0, 1, 2.718, 10])
log_result = safe_log(test_data)
print(f"Safe logarithm result: {log_result}")
# 4. Statistical analysis
from scipy import stats
np.random.seed(42)
group1 = np.random.normal(100, 15, 50)
group2 = np.random.normal(105, 15, 50)
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f"Two-sample t-test:")
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.3f}")
print(f"Conclusion: {'Significant difference exists' if p_value < 0.05 else 'No significant difference'}")