Cluster module updates (#214)

espg · web-flow · commit a5e9e4120e40 · 2025-04-17T15:53:32.000-04:00
* update to `prune` function in the clustering module; added `minsize` option for the `method` argument. Prune function also now throws an error when using an unknown `method` keyword (previous behavior was to simply return the matrix unmodified).

* attempting to fix CI/CD so that tests run on external PRs

* new method (and keyword) for static cluster expansion
diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
@@ -1,6 +1,12 @@
 name: Python Package using Conda
 
-on: [push]
+on:
+  push:
+    branches:
+      - 'master'
+      - 'releases/**'
+  pull_request_target:
+    types: [assigned, opened, synchronize, reopened]
 
 jobs:
   build-linux:
diff --git a/pgamit/cluster.py b/pgamit/cluster.py
@@ -20,40 +20,45 @@
                                      _kmeans_single_lloyd)
 
 
-def prune(OC, central_points, method='linear'):
+def prune(OC, central_points, method='minsize'):
     """Prune redundant clusters from over cluster (OC) and other arrays
 
     Parameters
     ----------
 
     OC : bool array of shape (n_clusters, n_coordinates)
-    method : ["linear", None]; defaults linear scan
+    method : ["linear", "minsize"]; "linear" is a row-by-row scan through the
+        cluster matrix, "minsize" will sort matrix rows (i.e., the clusters)
+        according to size and prioritize pruning the smallest clusters first.
 
     Returns
 
     OC : Pruned bool array of shape (n_clusters - N, n_coordinates)
     central_points : Pruned int array of shape (n_clusters -N,)
     """
+    subset = []
+    rowlength = len(OC[0,:])
     if method == "linear":
-        subset = []
-        for i, row in enumerate(OC):
-            mod = OC.copy()
-            mod[i, :] = np.zeros(len(row))
-            counts = mod.sum(axis=0)
-            problems = np.sum(counts == 0)
-            if problems == 0:
-                subset.append(i)
-                OC[i, :] = np.zeros(len(row))
-        # Cast subset list to pandas index
-        dfIndex = pd.Index(subset)
-        # Cast OC to pandas dataframe
-        dfOC = pd.DataFrame(OC)
-        # Apply the 'inverse' index; pruned is boolean numpy index array
-        pruned = ~dfOC.index.isin(dfIndex)
-        return OC[pruned], central_points[pruned]
+        indices = list(range(len(OC)))
+    elif method == "minsize":
+        indices = np.argsort(OC.sum(axis=1))
     else:
-        return OC, central_points
-
+        raise ValueError("Unknown method '" + method + "'")
+    for i in indices:
+        mod = OC.copy()
+        mod[i, :] = np.zeros(rowlength)
+        counts = mod.sum(axis=0)
+        problems = np.sum(counts == 0)
+        if problems == 0:
+            subset.append(i)
+            OC[i, :] = np.zeros(rowlength)
+    # Cast subset list to pandas index
+    dfIndex = pd.Index(subset)
+    # Cast OC to pandas dataframe
+    dfOC = pd.DataFrame(OC)
+    # Apply the 'inverse' index; pruned is boolean numpy index array
+    pruned = ~dfOC.index.isin(dfIndex)
+    return OC[pruned], central_points[pruned]
 
 def select_central_point(coordinates, centroids, metric='euclidean'):
     """Select the nearest central point in a given neighborhood
@@ -95,8 +100,8 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
     return idxs.squeeze()
 
 
-def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
-                 overlap_points=2, rejection_threshold=None):
+def over_cluster(labels, coordinates, metric='haversine', neighbors=5,
+                 overlap_points=2, rejection_threshold=None, method='static'):
     """Expand cluster membership to include edge points of neighbor clusters
 
     Expands an existing clustering to create overlapping membership between
@@ -155,9 +160,12 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
         Sparse matrices are only supported by scikit-learn metrics.  See the
         documentation for scipy.spatial.distance for details on these metrics.
 
-    neighborhood : int greater than or equal to 1, default=3
-        Number of adjacent clusters to include when adding cluster membership
-        overlap. Should be less than the number of unique cluster labels - 1.
+    neighbors : int greater than or equal to 1, default=5
+        For method='static', this is the total number of points that will be
+        added to the seed clusters during cluster expansion.
+        For method='dynamic', this is the (zero-indexed) number of adjacent
+        clusters to include when adding cluster membership overlap. Should be
+        less than the number of unique cluster labels - 1.
 
     overlap_points : int greater than or equal to 1, default=2
         Should not exceed the size of the smallest cluster in `labels`.
@@ -169,6 +177,11 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
         Note that if value other than 'None' is used, there is no guarantee
         that all clusters will have overlap points added.
 
+    method : 'static' (default) or 'dynamic'
+        The 'static' method will always produce an overcluster equal to the
+        `neighbors` parameter; 'dynamic' will produce an overcluster ceiling
+        of (neighbors - 1) * overlap_points, with a floor of neighbors.
+
     Returns
     -------
     expanded_clusters : bool array of shape (n_clusters, n_coordinates)
@@ -183,8 +196,8 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
     clusters = np.unique(labels)
     n_clusters = len(clusters)
 
-    if (n_clusters - 1) < neighborhood:
-        neighborhood = (n_clusters - 1)
+    if (n_clusters - 1) < neighbors:
+        neighbors = (n_clusters - 1)
 
     # reference index for reverse lookups
     ridx = np.array(list(range(len(labels))))
@@ -200,10 +213,11 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
         # Build index tree on members
         nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree',
                                 metric=metric).fit(coordinates[members])
-        # Could be set to '1';
-        # using same check as while loop for consistency
-        coverage = len(np.unique(labels[output[cluster, :]]))
-        while coverage <= neighborhood:
+        if method == 'dynamic':
+            coverage = len(np.unique(labels[output[cluster, :]]))
+        elif method == 'static':
+            coverage = 0
+        while coverage <= neighbors:
             # intersect search tree with non-members
             D, _ = nbrs.kneighbors(coordinates[nonmembers, :])
             # Rejection threshold is lightly tested...
@@ -216,8 +230,12 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
             nonmembers[new_member] = 0
             # Add to member label array
             output[cluster, new_member] = 1
-            # Update current count of over-clustered neighbors
-            coverage = len(np.unique(labels[output[cluster, :]]))
+            if method == 'dynamic':
+                # Update current count of over-clustered neighbors
+                coverage = len(np.unique(labels[output[cluster, :]]))
+            elif method == 'static':
+                # Update current point expansion count
+                coverage += 1
             # Grab label of new member for overlap check
             nm_label = labels[new_member]
             # Check if we've exceeded our overlap allotment...