SciPy Basic Concepts
In the previous chapter, we learned how to install and configure SciPy. This chapter will delve into the fundamental concepts of SciPy, including data structures, common functions, and programming patterns.
Core Principles of SciPy
1. Based on NumPy Arrays
All SciPy functionality is built on top of NumPy arrays. Understanding NumPy arrays is fundamental to using SciPy:
import numpy as np
import scipy as sp
# NumPy arrays are the foundation of SciPy
data = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
print(f"Data type: {type(data)}")
print(f"Array shape: {data.shape}")
print(f"Data dtype: {data.dtype}")
2. Modular Design
SciPy adopts a modular design, with each module focusing on a specific area of scientific computing:
# Import specific modules
from scipy import stats, optimize, integrate, linalg
# Or import the entire scipy package
import scipy
# View available submodules
print(dir(scipy))
3. Functional Programming Style
SciPy primarily uses a functional programming style, with most operations completed through function calls:
import numpy as np
from scipy import stats
# Functional calls
data = np.random.normal(0, 1, 100)
mean = np.mean(data)
std_dev = np.std(data)
t_stat, p_value = stats.ttest_1samp(data, 0)
print(f"Mean: {mean:.3f}")
print(f"Standard deviation: {std_dev:.3f}")
print(f"t-test p-value: {p_value:.3f}")
Common Data Types
1. Scalars
import numpy as np
# Different types of scalars
integer_scalar = np.int32(42)
float_scalar = np.float64(3.14159)
complex_scalar = np.complex128(1 + 2j)
print(f"Integer: {integer_scalar}, Type: {type(integer_scalar)}")
print(f"Float: {float_scalar}, Type: {type(float_scalar)}")
print(f"Complex: {complex_scalar}, Type: {type(complex_scalar)}")
2. One-Dimensional Arrays (Vectors)
# Create one-dimensional array
vector = np.array([1, 2, 3, 4, 5])
print(f"Vector: {vector}")
print(f"Shape: {vector.shape}")
print(f"Dimensions: {vector.ndim}")
# Common methods for creating 1D arrays
zeros_vec = np.zeros(5)
ones_vec = np.ones(5)
range_vec = np.arange(0, 10, 2)
linspace_vec = np.linspace(0, 1, 5)
print(f"Zero vector: {zeros_vec}")
print(f"Ones vector: {ones_vec}")
print(f"Range vector: {range_vec}")
print(f"Linspace vector: {linspace_vec}")
3. Two-Dimensional Arrays (Matrices)
# Create two-dimensional array
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"Matrix:\n{matrix}")
print(f"Shape: {matrix.shape}")
print(f"Dimensions: {matrix.ndim}")
# Common methods for creating 2D arrays
zeros_mat = np.zeros((3, 3))
ones_mat = np.ones((2, 4))
identity_mat = np.eye(3)
random_mat = np.random.random((2, 3))
print(f"Zero matrix:\n{zeros_mat}")
print(f"Identity matrix:\n{identity_mat}")
print(f"Random matrix:\n{random_mat}")
4. Multi-Dimensional Arrays (Tensors)
# Create three-dimensional array
tensor = np.random.random((2, 3, 4))
print(f"Tensor shape: {tensor.shape}")
print(f"Tensor dimensions: {tensor.ndim}")
print(f"Tensor size: {tensor.size}")
# Access tensor elements
print(f"First 2D slice:\n{tensor[0]}")
print(f"Specific element: {tensor[0, 1, 2]}")
Basic Array Operations
1. Array Indexing and Slicing
import numpy as np
# 1D array indexing
arr_1d = np.array([10, 20, 30, 40, 50])
print(f"First element: {arr_1d[0]}")
print(f"Last element: {arr_1d[-1]}")
print(f"Slice [1:4]: {arr_1d[1:4]}")
# 2D array indexing
arr_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"Element [1,2]: {arr_2d[1, 2]}")
print(f"Second row: {arr_2d[1, :]}")
print(f"Third column: {arr_2d[:, 2]}")
# Boolean indexing
data = np.array([1, 2, 3, 4, 5, 6])
mask = data > 3
print(f"Elements > 3: {data[mask]}")
2. Array Shape Operations
# Reshape arrays
arr = np.arange(12)
print(f"Original array: {arr}")
# Reshape to 2D array
reshaped = arr.reshape(3, 4)
print(f"Reshaped:\n{reshaped}")
# Flatten array
flattened = reshaped.flatten()
print(f"Flattened: {flattened}")
# Transpose
transposed = reshaped.T
print(f"Transposed:\n{transposed}")
3. Array Concatenation and Splitting
# Array concatenation
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])
# Horizontal stacking
hstacked = np.hstack([arr1, arr2])
print(f"Horizontal stack: {hstacked}")
# Vertical stacking
vstacked = np.vstack([arr1, arr2])
print(f"Vertical stack:\n{vstacked}")
# Array splitting
data = np.arange(10)
split_arrays = np.split(data, 5)
print(f"Split result: {split_arrays}")
Common SciPy Function Patterns
1. Statistical Function Patterns
from scipy import stats
import numpy as np
# Generate test data
np.random.seed(42)
data = np.random.normal(100, 15, 1000)
# Descriptive statistics
print(f"Mean: {np.mean(data):.2f}")
print(f"Median: {np.median(data):.2f}")
print(f"Standard deviation: {np.std(data):.2f}")
print(f"Skewness: {stats.skew(data):.3f}")
print(f"Kurtosis: {stats.kurtosis(data):.3f}")
# Hypothesis testing
t_stat, p_value = stats.ttest_1samp(data, 100)
print(f"One-sample t-test: t={t_stat:.3f}, p={p_value:.3f}")
# Distribution fitting
mu, sigma = stats.norm.fit(data)
print(f"Fitted normal distribution parameters: μ={mu:.2f}, σ={sigma:.2f}")
2. Optimization Function Patterns
from scipy import optimize
import numpy as np
# Define objective function
def objective_function(x):
    """Objective for the 1-D minimization demo: f(x) = x^2 + 10*sin(x)."""
    quadratic_part = x * x
    return quadratic_part + 10 * np.sin(x)
# Find minimum
result = optimize.minimize_scalar(objective_function)
print(f"Minimum location: x={result.x:.3f}")
print(f"Minimum value: f(x)={result.fun:.3f}")
# Solve equation
def equation(x):
    """Cubic f(x) = x^3 - 2x - 5 whose real root fsolve locates near x = 2."""
    cubic_term = x ** 3
    return cubic_term - (2 * x) - 5
root = optimize.fsolve(equation, 2.0)[0]
print(f"Root of equation: x={root:.3f}")
print(f"Verification: f({root:.3f})={equation(root):.6f}")
3. Integration Function Patterns
from scipy import integrate
import numpy as np
# Define integrand
def integrand(t):
    """Gaussian kernel exp(-t^2); its integral over [0, inf) is sqrt(pi)/2."""
    squared = t ** 2
    return np.exp(-squared)
# Numerical integration
result, error = integrate.quad(integrand, 0, np.inf)
print(f"Integration result: {result:.6f}")
print(f"Estimated error: {error:.2e}")
print(f"Theoretical value: {np.sqrt(np.pi)/2:.6f}")
# Multiple integration
def integrand_2d(y, x):
    """Separable integrand f(x, y) = x*y for the dblquad example."""
    product = y * x
    return product
result_2d = integrate.dblquad(integrand_2d, 0, 1, lambda x: 0, lambda x: 1)
print(f"Double integration result: {result_2d[0]:.6f}")
Error Handling and Debugging
1. Common Error Types
import numpy as np
from scipy import stats
# Dimension mismatch error
try:
a = np.array([1, 2, 3])
b = np.array([[1, 2], [3, 4]])
result = a + b # This will raise a broadcasting error
except ValueError as e:
print(f"Dimension error: {e}")
# Numerical computation error
try:
result = np.log(-1) # Logarithm of negative number
except RuntimeWarning as e:
print(f"Numerical warning: {e}")
# Statistical function error
try:
empty_data = np.array([])
mean = np.mean(empty_data)
except RuntimeWarning as e:
print(f"Empty array warning: {e}")2. Debugging Techniques
# Check array properties
def debug_array(arr, name="Array"):
    """
    Print a diagnostic summary of a NumPy array.

    Parameters:
        arr: NumPy array to inspect.
        name: Label used in the printed header (default "Array").
    """
    print(f"{name} information:")
    print(f" Shape: {arr.shape}")
    print(f" Data type: {arr.dtype}")
    print(f" Dimensions: {arr.ndim}")
    print(f" Size: {arr.size}")
    print(f" Memory usage: {arr.nbytes} bytes")
    print(f" Is contiguous: {arr.flags.c_contiguous}")
    # np.min/np.max raise ValueError on an empty array, and np.isnan/np.isinf
    # raise TypeError on non-numeric dtypes -- guard both cases so the
    # debugging helper itself never crashes.
    if arr.size == 0:
        print(" (array is empty: no value statistics)")
    elif np.issubdtype(arr.dtype, np.number):
        print(f" Min value: {np.min(arr)}")
        print(f" Max value: {np.max(arr)}")
        print(f" Contains NaN: {np.any(np.isnan(arr))}")
        print(f" Contains infinity: {np.any(np.isinf(arr))}")
    else:
        print(" (non-numeric dtype: no value statistics)")
    print()
# Usage example
test_array = np.random.random((3, 4))
debug_array(test_array, "Test Array")
Performance Optimization Basics
1. Vectorized Operations
import numpy as np
import time
# Avoid loops, use vectorized operations
n = 1000000
# Inefficient loop approach
start_time = time.time()
result_loop = []
for i in range(n):
result_loop.append(i ** 2)
loop_time = time.time() - start_time
# Efficient vectorized approach
start_time = time.time()
data = np.arange(n)
result_vectorized = data ** 2
vectorized_time = time.time() - start_time
print(f"Loop approach time: {loop_time:.4f} seconds")
print(f"Vectorized time: {vectorized_time:.4f} seconds")
print(f"Performance improvement: {loop_time / vectorized_time:.1f}x")
2. Memory Management
# Use appropriate data types
data_float64 = np.random.random(1000000) # Default float64
data_float32 = np.random.random(1000000).astype(np.float32) # Convert to float32
print(f"float64 memory usage: {data_float64.nbytes / 1024 / 1024:.2f} MB")
print(f"float32 memory usage: {data_float32.nbytes / 1024 / 1024:.2f} MB")
print(f"Memory saved: {(1 - data_float32.nbytes / data_float64.nbytes) * 100:.1f}%")
# In-place operations
data = np.random.random(1000000)
# Create new array (uses more memory)
result1 = data * 2
# In-place operation (saves memory)
data *= 2 # Equivalent to data = data * 2, but more memory efficient
3. Choose Appropriate Algorithms
from scipy import linalg
import numpy as np
import time  # needed for the timing below; missing from the original snippet

# For symmetric positive definite matrices, Cholesky decomposition is more efficient
n = 1000
A = np.random.random((n, n))
A = A @ A.T  # Create symmetric positive definite matrix
b = np.random.random(n)

# General LU decomposition
start_time = time.time()
x1 = linalg.solve(A, b)
lu_time = time.time() - start_time

# Cholesky decomposition (assume_a='pos' tells SciPy the matrix is
# symmetric positive definite, so it can use the cheaper factorization)
start_time = time.time()
x2 = linalg.solve(A, b, assume_a='pos')
chol_time = time.time() - start_time

print(f"LU decomposition time: {lu_time:.4f} seconds")
print(f"Cholesky decomposition time: {chol_time:.4f} seconds")
print(f"Performance improvement: {lu_time / chol_time:.1f}x")
print(f"Solution difference: {np.max(np.abs(x1 - x2)):.2e}")
Practical Application Examples
Example 1: Data Preprocessing
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
# Generate noisy data
np.random.seed(42)
n_samples = 1000
true_signal = np.sin(np.linspace(0, 4*np.pi, n_samples))
noise = np.random.normal(0, 0.3, n_samples)
noisy_data = true_signal + noise
# Data cleaning: remove outliers
z_scores = np.abs(stats.zscore(noisy_data))
threshold = 3
clean_data = noisy_data[z_scores < threshold]
clean_indices = np.where(z_scores < threshold)[0]
print(f"Original data points: {len(noisy_data)}")
print(f"Cleaned data points: {len(clean_data)}")
print(f"Outliers removed: {len(noisy_data) - len(clean_data)}")
# Visualize results
plt.figure(figsize=(12, 8))
plt.subplot(2, 2, 1)
plt.plot(true_signal, label='True Signal', linewidth=2)
plt.title('True Signal')
plt.legend()
plt.grid(True, alpha=0.3)
plt.subplot(2, 2, 2)
plt.plot(noisy_data, alpha=0.7, label='Noisy Data')
plt.title('Noisy Data')
plt.legend()
plt.grid(True, alpha=0.3)
plt.subplot(2, 2, 3)
plt.plot(clean_indices, clean_data, 'g.', alpha=0.7, label='Cleaned Data')
plt.title('Cleaned Data')
plt.legend()
plt.grid(True, alpha=0.3)
plt.subplot(2, 2, 4)
plt.hist(noisy_data, bins=50, alpha=0.7, label='Original Data', density=True)
plt.hist(clean_data, bins=50, alpha=0.7, label='Cleaned Data', density=True)
plt.title('Data Distribution Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Example 2: Simple Machine Learning Workflow
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Generate synthetic data
np.random.seed(42)
n_samples = 200
X = np.random.uniform(-3, 3, n_samples)
y = 2 * X + 1 + np.random.normal(0, 0.5, n_samples) # y = 2x + 1 + noise
# Data preprocessing
X = X.reshape(-1, 1) # sklearn requires 2D array
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Evaluate model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Model parameters:")
print(f" Slope: {model.coef_[0]:.3f} (true value: 2.000)")
print(f" Intercept: {model.intercept_:.3f} (true value: 1.000)")
print(f"Model performance:")
print(f" Mean squared error: {mse:.3f}")
print(f" R² score: {r2:.3f}")
# Statistical test
residuals = y_test - y_pred
_, p_value = stats.normaltest(residuals)
print(f"Residual normality test p-value: {p_value:.3f}")
# Visualization
plt.figure(figsize=(12, 4))
plt.subplot(1, 3, 1)
plt.scatter(X_train, y_train, alpha=0.6, label='Training Data')
plt.scatter(X_test, y_test, alpha=0.6, label='Test Data')
X_line = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
y_line = model.predict(X_line)
plt.plot(X_line, y_line, 'r-', linewidth=2, label='Fitted Line')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Linear Regression Fit')
plt.legend()
plt.grid(True, alpha=0.3)
plt.subplot(1, 3, 2)
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', linewidth=2)
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('Predicted vs True Values')
plt.grid(True, alpha=0.3)
plt.subplot(1, 3, 3)
plt.hist(residuals, bins=20, density=True, alpha=0.7)
x_norm = np.linspace(residuals.min(), residuals.max(), 100)
y_norm = stats.norm.pdf(x_norm, np.mean(residuals), np.std(residuals))
plt.plot(x_norm, y_norm, 'r-', linewidth=2, label='Normal Distribution')
plt.xlabel('Residuals')
plt.ylabel('Density')
plt.title('Residual Distribution')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Best Practices
1. Code Organization
# Recommended import style
import numpy as np
import scipy as sp
from scipy import stats, optimize, integrate, linalg
import matplotlib.pyplot as plt
# Avoid using from scipy import *
# This pollutes the namespace
# Use meaningful variable names
data_samples = np.random.normal(0, 1, 1000) # Good
x = np.random.normal(0, 1, 1000) # Not clear enough
# Add appropriate comments
def calculate_confidence_interval(data, confidence=0.95):
"""
Calculate confidence interval for data
Parameters:
data: Data array
confidence: Confidence level (default 0.95)
Returns:
(lower_bound, upper_bound): Lower and upper bounds of confidence interval
"""
alpha = 1 - confidence
n = len(data)
mean = np.mean(data)
std_err = stats.sem(data) # Standard error
t_value = stats.t.ppf(1 - alpha/2, n-1)
margin_error = t_value * std_err
return mean - margin_error, mean + margin_error2. Error Handling
def safe_divide(a, b):
    """
    Safe element-wise division that tolerates division by zero.

    NumPy never raises ZeroDivisionError for array division -- it emits a
    RuntimeWarning and produces inf/nan instead -- so the original
    ``except ZeroDivisionError`` branch was dead code.  Silence the
    warning explicitly and report inf/nan in the result instead.

    Parameters:
        a: Numerator (array-like or scalar).
        b: Denominator (array-like or scalar).

    Returns:
        The element-wise quotient (entries where b == 0 become inf or nan),
        or None if the inputs cannot be divided at all (e.g. shape mismatch).
    """
    try:
        with np.errstate(divide='ignore', invalid='ignore'):
            result = np.divide(a, b)
    except Exception as e:
        # e.g. incompatible shapes or non-numeric dtypes
        print(f"Unknown error: {e}")
        return None
    # Check for infinity or NaN
    if np.any(np.isinf(result)) or np.any(np.isnan(result)):
        print("Warning: Result contains infinity or NaN values")
    return result
# Usage example
a = np.array([1, 2, 3, 4])
b = np.array([2, 0, 1, 2])
result = safe_divide(a, b)
print(f"Safe division result: {result}")
3. Performance Monitoring
import time
from functools import wraps
def timing_decorator(func):
    """
    Decorator: report how long each call to ``func`` takes.

    Preserves the wrapped function's metadata via functools.wraps and
    returns its result unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        started = time.time()
        value = func(*args, **kwargs)
        elapsed = time.time() - started
        print(f"{func.__name__} execution time: {elapsed:.4f} seconds")
        return value
    return wrapper
@timing_decorator
def matrix_multiplication(n):
    """
    Matrix multiplication performance test: multiply two random n x n matrices.
    """
    left = np.random.random((n, n))
    right = np.random.random((n, n))
    return np.matmul(left, right)
# Test different matrix sizes
for size in [100, 500, 1000]:
print(f"\nMatrix size: {size}x{size}")
result = matrix_multiplication(size)
Summary
In this chapter, we learned about SciPy's basic concepts:
- Core Principles: Based on NumPy arrays, modular design, functional programming
- Data Types: Scalars, vectors, matrices, tensors
- Array Operations: Indexing, slicing, shape operations, concatenation and splitting
- Function Patterns: Common patterns for statistics, optimization, integration
- Error Handling: Common error types and debugging techniques
- Performance Optimization: Vectorization, memory management, algorithm selection
- Best Practices: Code organization, error handling, performance monitoring
After mastering these basic concepts, you can begin learning specific SciPy functionality modules. Next, we will learn more advanced array processing techniques in Array and Matrix Operations.
Practice Exercises
- Array Operations Practice: Create a 5x5 random matrix, calculate its row sums, column sums, and diagonal sum
- Performance Comparison: Compare the performance difference between using loops and vectorized operations to calculate array element squares
- Error Handling: Write a function to safely calculate the logarithm of an array, handling negative and zero values
- Statistical Analysis: Generate two groups of random data, perform a t-test to compare their mean differences
# Practice exercise reference answers
# 1. Array operations practice
import numpy as np
matrix = np.random.random((5, 5))
print(f"Matrix:\n{matrix}")
print(f"Row sums: {np.sum(matrix, axis=1)}")
print(f"Column sums: {np.sum(matrix, axis=0)}")
print(f"Main diagonal sum: {np.trace(matrix)}")
print(f"Anti-diagonal sum: {np.sum(np.fliplr(matrix).diagonal())}")
# 2. Performance comparison
import time
n = 100000
data = np.random.random(n)
# Loop approach
start = time.time()
squared_loop = [x**2 for x in data]
loop_time = time.time() - start
# Vectorized approach
start = time.time()
squared_vectorized = data**2
vectorized_time = time.time() - start
print(f"Loop time: {loop_time:.4f}s")
print(f"Vectorized time: {vectorized_time:.4f}s")
print(f"Performance improvement: {loop_time/vectorized_time:.1f}x")
# 3. Error handling
def safe_log(arr):
"""
Safely compute logarithm, handling negative and zero values
"""
result = np.full_like(arr, np.nan, dtype=float)
positive_mask = arr > 0
result[positive_mask] = np.log(arr[positive_mask])
if np.any(arr <= 0):
print(f"Warning: Found {np.sum(arr <= 0)} non-positive values")
return result
# Test
test_data = np.array([-1, 0, 1, 2.718, 10])
log_result = safe_log(test_data)
print(f"Safe logarithm result: {log_result}")
# 4. Statistical analysis
from scipy import stats
np.random.seed(42)
group1 = np.random.normal(100, 15, 50)
group2 = np.random.normal(105, 15, 50)
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f"Two-sample t-test:")
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.3f}")
print(f"Conclusion: {'Significant difference exists' if p_value < 0.05 else 'No significant difference'}")