diff --git a/README.md b/README.md
index a84d743..2087788 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,17 @@ diagrams = ripser(data)['dgms']
 plot_diagrams(diagrams, show=True)
 ```
 
+You can also get a rough, upper-bound estimate of the RAM Ripser will need before running it:
+
+```python
+import numpy as np
+from ripser import estimate_ripser_memory
+
+data = np.random.random((1000, 2))
+estimated_gb = estimate_ripser_memory(data, maxdim=2)
+print(f"Estimated RAM required: {estimated_gb:.2f} GB")
+```
+
 We also supply a Scikit-learn transformer style object if you would prefer to use that:
 
 ```
diff --git a/src/ripser/ripser.py b/src/ripser/ripser.py
index 5695ed4..eb7afc9 100644
--- a/src/ripser/ripser.py
+++ b/src/ripser/ripser.py
@@ -102,6 +102,63 @@ def get_greedy_perm(X, n_perm=None, distance_matrix=False, metric="euclidean"):
     return (idx_perm, lambdas, dperm2all)
 
 
+def estimate_ripser_memory(X, maxdim=1, distance_matrix=False):
+    """Estimate the RAM requirements for running Ripser.
+
+    Parameters
+    ----------
+    X : ndarray (n_samples, n_features) or (n_samples, n_samples)
+        A numpy array containing point cloud data, or a distance matrix.
+
+    maxdim : int, optional (default=1)
+        Maximum homology dimension to be computed.
+
+    distance_matrix : bool, optional (default=False)
+        Whether X is a distance matrix.
+
+    Returns
+    -------
+    float
+        Estimated RAM requirement in GB.
+
+    Notes
+    -----
+    This is a rough upper bound built from three terms:
+
+    1. Memory for the distance matrix: O(n^2), where n is the number of points
+    2. Memory for the simplicial complex: O(n^(d+1)), where d is maxdim
+    3. Memory for the persistence computation: O(n^3) worst case for the
+       boundary matrix reduction
+    """
+    # np.math was removed in NumPy 1.25, so take factorial from the stdlib.
+    from math import factorial
+
+    # Whether X is a point cloud or a distance matrix, its first axis is
+    # the number of points.
+    n_points = X.shape[0]
+
+    # Base memory for the distance matrix (8 bytes per float64)
+    distance_matrix_memory = (n_points * n_points * 8) / (1024**3)  # in GB
+
+    # Memory for the simplicial complex (worst case). For each dimension d
+    # we store combinations of d + 1 points; n^(d+1) / (d+1)! overestimates
+    # comb(n, d + 1), so this stays an upper bound (scipy.special.comb would
+    # give the exact count).
+    complex_memory = 0.0
+    for d in range(maxdim + 1):
+        # Approximate number of d-simplices (overestimate)
+        n_simplices = (n_points ** (d + 1)) / factorial(d + 1)
+        # Each simplex stores (d + 1) 4-byte vertex indices plus one 8-byte
+        # float for its filtration value.
+        complex_memory += (n_simplices * ((d + 1) * 4 + 8)) / (1024**3)  # in GB
+
+    # Memory for the persistence computation: a rough worst-case bound for
+    # reducing the boundary matrix.
+    persistence_memory = (n_points**3 * 8) / (1024**3)  # in GB
+
+    return distance_matrix_memory + complex_memory + persistence_memory
+
+
 def ripser(
     X,
     maxdim=1,
@@ -651,4 +708,4 @@ def plot(
     )
 
 
-__all__ = ["Rips", "ripser", "lower_star_img"]
+__all__ = ["Rips", "ripser", "lower_star_img", "estimate_ripser_memory"]
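For a sense of scale, the three terms for 1000 points at `maxdim=1` work out as follows (a quick sketch of the same arithmetic as the function above, stdlib only):

```python
import math

n = 1000
dist = n * n * 8 / 1024**3                  # distance matrix term: ~0.0075 GB
cplx = sum(n ** (d + 1) / math.factorial(d + 1) * ((d + 1) * 4 + 8)
           for d in range(2)) / 1024**3     # simplex term: ~0.0075 GB
reduction = n**3 * 8 / 1024**3              # matrix reduction term: ~7.45 GB
print(f"{dist + cplx + reduction:.2f} GB")  # ~7.47 GB
```

So for modest `maxdim` the O(n^3) reduction term dominates the estimate.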
diff --git a/test/test_memory_estimation.py b/test/test_memory_estimation.py
new file mode 100644
index 0000000..be54491
--- /dev/null
+++ b/test/test_memory_estimation.py
@@ -0,0 +1,43 @@
+import numpy as np
+
+from ripser import estimate_ripser_memory
+
+
+def test_memory_estimation():
+    # Create a small point cloud
+    n_points = 100
+    n_features = 3
+    X = np.random.rand(n_points, n_features)
+
+    # Estimate memory for increasing homology dimensions
+    mem_dim0 = estimate_ripser_memory(X, maxdim=0)
+    mem_dim1 = estimate_ripser_memory(X, maxdim=1)
+    mem_dim2 = estimate_ripser_memory(X, maxdim=2)
+
+    # The estimate should increase with the homology dimension
+    assert mem_dim0 < mem_dim1 < mem_dim2
+
+    # Test with a distance matrix
+    D = np.random.rand(n_points, n_points)
+    D = (D + D.T) / 2  # make it symmetric
+    np.fill_diagonal(D, 0)  # zero the diagonal
+
+    mem_dist = estimate_ripser_memory(D, maxdim=1, distance_matrix=True)
+    assert mem_dist > 0
+
+
+def test_memory_scaling():
+    # Test how the estimate scales with the number of points
+    n_points_small = 50
+    n_points_large = 100
+    n_features = 3
+
+    X_small = np.random.rand(n_points_small, n_features)
+    X_large = np.random.rand(n_points_large, n_features)
+
+    mem_small = estimate_ripser_memory(X_small, maxdim=1)
+    mem_large = estimate_ripser_memory(X_large, maxdim=1)
+
+    # The dominant O(n^3) term means doubling the number of points should
+    # multiply the estimate by roughly 8; assert the looser factor of 4.
+    assert mem_large > 4 * mem_small
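One natural way to use the estimate is as a guard before kicking off a long computation. A minimal sketch, assuming the optional `psutil` package (not a ripser dependency) for querying free RAM:

```python
import numpy as np
import psutil  # assumed helper; any way of querying free RAM works

from ripser import ripser, estimate_ripser_memory

data = np.random.random((1000, 2))
needed_gb = estimate_ripser_memory(data, maxdim=2)
available_gb = psutil.virtual_memory().available / (1024**3)

if needed_gb < available_gb:
    diagrams = ripser(data, maxdim=2)["dgms"]
else:
    print(f"Skipping: ~{needed_gb:.2f} GB needed, {available_gb:.2f} GB free")
```

Since the estimate is a deliberate overestimate, this errs on the side of skipping runs that might in fact fit.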