From 91009b76a000e34f4c303c8b04e6bb611aed46d1 Mon Sep 17 00:00:00 2001
From: espg
Date: Fri, 18 Apr 2025 15:11:49 -0400
Subject: [PATCH 1/8] modifying test to use new keyword

---
 pgamit/tests/test_make_clusters.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pgamit/tests/test_make_clusters.py b/pgamit/tests/test_make_clusters.py
index 7a1cfcb6..ea24cf93 100644
--- a/pgamit/tests/test_make_clusters.py
+++ b/pgamit/tests/test_make_clusters.py
@@ -58,8 +58,9 @@ def test_max_clust_expansion(min_clust, max_clust, neighbors, overlap):
                             max_iter=8000, random_state=42)
     clust.fit(data)
 
-    OC = over_cluster(clust.labels_, data, metric='euclidean',
-                      neighbors=neighbors, overlap_points=overlap)
+    OC = over_cluster(clust.labels_, data, metric='euclidean',
+                      neighbors=neighbors, overlap_points=overlap,
+                      method='dynamic')
 
     expanded_sizes = np.sum(OC, axis=1)
     _, original_sizes = np.unique(clust.labels_, return_counts=True)

From 47eb693a5fc4bbaf01c181ae26ebcecb0090fd42 Mon Sep 17 00:00:00 2001
From: espg
Date: Tue, 13 May 2025 13:08:59 -0800
Subject: [PATCH 2/8] new 'paired' method for `over_cluster`

---
 pgamit/cluster.py | 63 ++++++++++++++++++++++++++++-------------------
 1 file changed, 38 insertions(+), 25 deletions(-)

diff --git a/pgamit/cluster.py b/pgamit/cluster.py
index 3a53a86b..ac427aeb 100644
--- a/pgamit/cluster.py
+++ b/pgamit/cluster.py
@@ -106,7 +106,7 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
 
 
 def over_cluster(labels, coordinates, metric='haversine', neighbors=5,
-                 overlap_points=2, rejection_threshold=None, method='static'):
+                 overlap_points=2, rejection_threshold=5e6, method='static'):
     """Expand cluster membership to include edge points of neighbor clusters
 
     Expands an existing clustering to create overlapping membership between
@@ -222,33 +222,46 @@ def over_cluster(labels, coordinates, metric='haversine', neighbors=5,
             coverage = len(np.unique(labels[output[cluster, :]]))
         elif method == 'static':
             coverage = 0
-        while coverage <= neighbors:
-            # intersect search tree with non-members
+
+        if method == 'paired':
             D, _ = nbrs.kneighbors(coordinates[nonmembers, :])
-            # Rejection threshold is lightly tested...
-            if rejection_threshold:
-                if np.min(D) > rejection_threshold:
-                    break
-            # Select closest external point to add to member cluster
             new_member = ridx[nonmembers][np.argmin(D)]
-            # Remove point from future coordinate distance queries
-            nonmembers[new_member] = 0
-            # Add to member label array
+            cluster_label = labels[new_member]
+            cloc = labels == cluster_label
+            nonmembers[~cloc] = 0
+            rdists, _ = nbrs.radius_neighbors(coordinates[nonmembers, :],
+                                              radius=rejection_threshold,
+                                              return_distance=True)
+            far_member = ridx[nonmembers][np.argmax(rdists)]
             output[cluster, new_member] = 1
-            if method == 'dynamic':
-                # Update current count of over-clustered neighbors
-                coverage = len(np.unique(labels[output[cluster, :]]))
-            elif method == 'static':
-                # Update current point expansion count
-                coverage += 1
-            # Grab label of new member for overlap check
-            nm_label = labels[new_member]
-            # Check if we've exceeded our overlap allotment...
-            if sum(labels[output[cluster, :]] == nm_label) >= overlap_points:
-                # ...if so, remove entire neighboring cluster
-                remove = nm_label == labels
-                nonmembers[remove] = False
-
+            output[cluster, far_member] = 1
+        else:
+            while coverage <= neighbors:
+                # intersect search tree with non-members
+                D, _ = nbrs.kneighbors(coordinates[nonmembers, :])
+                # Rejection threshold is lightly tested...
+                if rejection_threshold:
+                    if np.min(D) > rejection_threshold:
+                        break
+                # Select closest external point to add to member cluster
+                new_member = ridx[nonmembers][np.argmin(D)]
+                # Remove point from future coordinate distance queries
+                nonmembers[new_member] = 0
+                # Add to member label array
+                output[cluster, new_member] = 1
+                if method == 'dynamic':
+                    # Update current count of over-clustered neighbors
+                    coverage = len(np.unique(labels[output[cluster, :]]))
+                elif method == 'static':
+                    # Update current point expansion count
+                    coverage += 1
+                # Grab label of new member for overlap check
+                nm_label = labels[new_member]
+                # Check if we've exceeded our overlap allotment...
+                if sum(labels[output[cluster, :]] == nm_label) >= overlap_points:
+                    # ...if so, remove entire neighboring cluster
+                    remove = nm_label == labels
+                    nonmembers[remove] = False
     return output

From 76269cec6b9314c342f85fbfdec53ec11a3133c5 Mon Sep 17 00:00:00 2001
From: espg
Date: Tue, 13 May 2025 14:59:54 -0800
Subject: [PATCH 3/8] generalized for cases where neighbors doesn't equal '1';
 some flake8 fixes; proper handling of too-long baseline pairs

---
 pgamit/cluster.py | 200 +++++++++++++++++++++++++++-------------------
 1 file changed, 120 insertions(+), 80 deletions(-)

diff --git a/pgamit/cluster.py b/pgamit/cluster.py
index ac427aeb..c809c619 100644
--- a/pgamit/cluster.py
+++ b/pgamit/cluster.py
@@ -9,11 +9,11 @@
 import scipy.sparse as sp
 import heapq
 import networkx as nx
-from tqdm import tqdm
 
 from scipy.spatial.distance import pdist, squareform
 
 from sklearn.neighbors import NearestNeighbors
+from sklearn.metrics import pairwise_distances
 from sklearn.base import _fit_context
 from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
 from sklearn.utils._param_validation import Integral, Interval, StrOptions
@@ -41,7 +41,7 @@ def prune(OC, central_points, method='minsize'):
     central_points : Pruned int array of shape (n_clusters -N,)
     """
     subset = []
-    rowlength = len(OC[0,:])
+    rowlength = len(OC[0, :])
     if method == "linear":
         indices = list(range(len(OC)))
     elif method == "minsize":
@@ -106,13 +106,13 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
 
 
 def over_cluster(labels, coordinates, metric='haversine', neighbors=5,
-                 overlap_points=2, rejection_threshold=5e6, method='static'):
+                 overlap=2, rejection_threshold=5e6, method='static'):
     """Expand cluster membership to include edge points of neighbor clusters
 
     Expands an existing clustering to create overlapping membership between
     clusters. Existing clusters are processed sequentially by looking up
     nearest neighbors as an intersection of the current cluster membership and
-    all cluster's point membership. Once the `overlap_points` for a given
+    all clusters' point membership. Once the `overlap` points for a given
     neighbor cluster have been determined and added to current cluster, the
     remainder of that neighboring cluster is removed from consideration and
     distance query is rerun, with the process repeating until a number of
@@ -172,7 +172,7 @@ def over_cluster(labels, coordinates, metric='haversine', neighbors=5,
         clusters to include when adding cluster membership overlap. Should be
         less than the number of unique cluster labels - 1.
 
-    overlap_points : int greater than or equal to 1, default=2
+    overlap : int greater than or equal to 1, default=2
         Should not exceed the size of the smallest cluster in `labels`.
 
     rejection_threshold : float, default=None
 
@@ -220,31 +220,40 @@ def over_cluster(labels, coordinates, metric='haversine', neighbors=5,
                                 metric=metric).fit(coordinates[members])
         if method == 'dynamic':
             coverage = len(np.unique(labels[output[cluster, :]]))
-        elif method == 'static':
+        else:  # method == 'static' or 'paired'
             coverage = 0
-
-        if method == 'paired':
-            D, _ = nbrs.kneighbors(coordinates[nonmembers, :])
-            new_member = ridx[nonmembers][np.argmin(D)]
-            cluster_label = labels[new_member]
-            cloc = labels == cluster_label
-            nonmembers[~cloc] = 0
-            rdists, _ = nbrs.radius_neighbors(coordinates[nonmembers, :],
-                                              radius=rejection_threshold,
-                                              return_distance=True)
-            far_member = ridx[nonmembers][np.argmax(rdists)]
-            output[cluster, new_member] = 1
-            output[cluster, far_member] = 1
-        else:
-            while coverage <= neighbors:
-                # intersect search tree with non-members
-                D, _ = nbrs.kneighbors(coordinates[nonmembers, :])
+
+        while coverage <= neighbors:
+            # intersect search tree with non-members
+            D, indx = nbrs.kneighbors(coordinates[nonmembers, :])
+            mindex = np.argmin(D)
+            # Select closest external point to add to member cluster
+            new_member = ridx[nonmembers][mindex]
+            # Grab label of new member for overlap and other checks
+            nm_label = labels[new_member]
+            # Paired method removes full cluster from consideration
+            if method == 'paired':
+                # 'remove' is the captured cluster, from which we select pairs
+                remove = nm_label == labels
+                # For simplicity, we use the single point defined by 'mindex'
+                # as the 'member point' to calculate max eligible distance
+                rdists = pairwise_distances(coordinates[members][indx[mindex]],
+                                            coordinates[remove])
+                # Filter too-far points from argmax eligibility
+                rdists[rdists >= rejection_threshold] = 0
+                far_member = ridx[remove][np.argmax(rdists)]
+                # Add near / far points to cluster for overlap
+                output[cluster, new_member] = 1
+                output[cluster, far_member] = 1
+                # Remove captured cluster from further consideration
+                nonmembers[remove] = False
+                # Continue
+                coverage += 1
+            else:
                 # Rejection threshold is lightly tested...
                 if rejection_threshold:
                     if np.min(D) > rejection_threshold:
                         break
-                # Select closest external point to add to member cluster
-                new_member = ridx[nonmembers][np.argmin(D)]
                 # Remove point from future coordinate distance queries
                 nonmembers[new_member] = 0
                 # Add to member label array
                 output[cluster, new_member] = 1
                 if method == 'dynamic':
                     # Update current count of over-clustered neighbors
                     coverage = len(np.unique(labels[output[cluster, :]]))
                 elif method == 'static':
                     # Update current point expansion count
                     coverage += 1
-                # Grab label of new member for overlap check
-                nm_label = labels[new_member]
                 # Check if we've exceeded our overlap allotment...
-                if sum(labels[output[cluster, :]] == nm_label) >= overlap_points:
+                if sum(labels[output[cluster, :]] == nm_label) >= overlap:
                     # ...if so, remove entire neighboring cluster
                     remove = nm_label == labels
                     nonmembers[remove] = False
     return output
 
 
 class DeterministicClustering(object):
-    def __init__(self, target_size=15, tolerance=4, num_tie_points=4, max_dist_to_centroid=5_000_000):
-        self.target_size    = target_size
-        self.num_tie_points = num_tie_points
-        self.tolerance      = tolerance
-        self.max_dist       = max_dist_to_centroid
-        self.points         = np.array([])
-        self.OC             = np.array([])
+    def __init__(self, target_size=15, tolerance=4,
+                 num_tie_points=4, max_dist_to_centroid=5_000_000):
+
+        self.target_size = target_size
+        self.num_tie_points = num_tie_points
+        self.tolerance = tolerance
+        self.max_dist = max_dist_to_centroid
+        self.points = np.array([])
+        self.OC = np.array([])
         # variables to store results
-        self.centroid_ids   = []
-        self.clustered_ids  = []
-        self.tie_ids        = []
+        self.centroid_ids = []
+        self.clustered_ids = []
+        self.tie_ids = []
 
     def constrained_agglomerative(self, points, tie_clusters=True):
         """
-        Perform spatially-constrained agglomerative clustering with centroid snapping.
+        Spatially-constrained agglomerative clustering with centroid snapping.
+
         Parameters:
             points (np.ndarray): Nx3 ECEF coordinates.
-            tie_clusters (bool): to tie clusters together using neighbors, pass True
+            tie_clusters (bool): to tie clusters together using neighbors,
+                pass True
 
         Returns:
             clustered_points (List[List[np.ndarray]]): Points per cluster.
            labels (np.ndarray): Cluster label for each point.
-            final_centroids (List[np.ndarray]): Snapped centroids from points for each cluster.
+            final_centroids (List[np.ndarray]): Snapped centroids from points
+                for each cluster.
""" target_size = self.target_size - margin = self.tolerance + margin = self.tolerance max_dist_to_centroid = self.max_dist points = np.array(points) @@ -327,17 +339,20 @@ def constrained_agglomerative(self, points, tie_clusters=True): merged_points = points[merged_cluster] centroid = merged_points.mean(axis=0) - # compare to square distance to avoid computing the sqrt and save some computation time + # compare to square distance to avoid computing the sqrt + # and save some computation time if np.any(np.einsum('ij,ij->i', merged_points - centroid, - merged_points - centroid) > max_dist_to_centroid ** 2): + merged_points - + centroid) > max_dist_to_centroid ** 2): continue clusters[next_cluster_id] = merged_cluster centroids[next_cluster_id] = centroid del clusters[ci], clusters[cj] del centroids[ci], centroids[cj] - self.update_heap_vectorized(heap, clusters, centroids, centroid, merged_cluster, - next_cluster_id, max_size, max_dist_to_centroid) + self.update_heap_vectorized(heap, clusters, centroids, centroid, + merged_cluster, next_cluster_id, + max_size, max_dist_to_centroid) next_cluster_id += 1 @@ -347,11 +362,14 @@ def constrained_agglomerative(self, points, tie_clusters=True): is_final = cluster_lens >= min_size # Split clusters - final_clusters = [cluster_items[i] for i in range(len(cluster_items)) if is_final[i]] - leftovers = [cluster_items[i] for i in range(len(cluster_items)) if not is_final[i]] + final_clusters = [cluster_items[i] for i in + range(len(cluster_items)) if is_final[i]] + leftovers = [cluster_items[i] for i in + range(len(cluster_items)) if not is_final[i]] # Compute centroids for final clusters - centroids_arr = np.array([points[c].mean(axis=0) for c in final_clusters]) + centroids_arr = np.array([points[c].mean(axis=0) for c in + final_clusters]) # Snap to closest input point in cluster snapped_idxs = [ c[np.argmin(np.linalg.norm(points[c] - centroid, axis=1))] @@ -364,9 +382,11 @@ def constrained_agglomerative(self, points, tie_clusters=True): if len(cluster) == 1: idx = cluster[0] point = points[idx] - dists = np.linalg.norm(np.array(final_centroids) - point, axis=1) + dists = np.linalg.norm(np.array(final_centroids) - point, + axis=1) valid = [i for i in range(len(final_clusters)) if - len(final_clusters[i]) < max_size and dists[i] <= max_dist_to_centroid] + len(final_clusters[i]) < max_size and + dists[i] <= max_dist_to_centroid] if valid: best_fit = valid[np.argmin(dists[valid])] else: @@ -374,33 +394,38 @@ def constrained_agglomerative(self, points, tie_clusters=True): final_clusters[best_fit].append(idx) cluster_points = points[final_clusters[best_fit]] centroid = cluster_points.mean(axis=0) - snapped_idx = final_clusters[best_fit][np.argmin(np.linalg.norm(cluster_points - centroid, axis=1))] + snapped_idx = final_clusters[best_fit][np.argmin( + np.linalg.norm(cluster_points - centroid, axis=1))] final_centroids[best_fit] = points[snapped_idx] - centroid_ids[best_fit] = snapped_idx + centroid_ids[best_fit] = snapped_idx else: centroid = points[cluster].mean(axis=0) - dists = np.linalg.norm(np.array(final_centroids) - centroid, axis=1) + dists = np.linalg.norm(np.array(final_centroids) - centroid, + axis=1) best_fit = None for i in np.argsort(dists): potential = final_clusters[i] + cluster if len(potential) <= max_size: test_points = points[potential] test_centroid = test_points.mean(axis=0) - if np.all(np.linalg.norm(test_points - test_centroid, axis=1) <= max_dist_to_centroid): + if np.all(np.linalg.norm(test_points - + test_centroid, + axis=1) <= + 
+                                  max_dist_to_centroid):
                             best_fit = i
                             break
                 if best_fit is not None:
                     final_clusters[best_fit].extend(cluster)
                     cluster_points = points[final_clusters[best_fit]]
                     centroid = cluster_points.mean(axis=0)
-                    snapped_idx = final_clusters[best_fit][np.argmin(np.linalg.norm(cluster_points - centroid, axis=1))]
+                    snapped_idx = final_clusters[best_fit][np.argmin(
+                        np.linalg.norm(cluster_points - centroid, axis=1))]
                     final_centroids[best_fit] = points[snapped_idx]
-                    centroid_ids[best_fit]   = snapped_idx
+                    centroid_ids[best_fit] = snapped_idx
                 else:
-                    # tqdm.write(' -- cluster %i ended up with %i stations (min was %i)' % (len(final_clusters),
-                    #                                                                       len(cluster), min_size))
                     final_clusters.append(cluster)
-                    snapped_idx = cluster[np.argmin(np.linalg.norm(points[cluster] - centroid, axis=1))]
+                    snapped_idx = cluster[np.argmin(np.linalg.norm(
+                        points[cluster] - centroid, axis=1))]
                     final_centroids.append(points[snapped_idx])
                     centroid_ids.append(snapped_idx)
 
@@ -409,8 +434,8 @@ def constrained_agglomerative(self, points, tie_clusters=True):
             for i in cluster:
                 labels[i] = idx
 
-        self.clustered_ids  = final_clusters
-        self.centroid_ids   = centroid_ids
+        self.clustered_ids = final_clusters
+        self.centroid_ids = centroid_ids
 
         # ties clusters together
         if tie_clusters:
@@ -418,20 +443,24 @@ def constrained_agglomerative(self, points, tie_clusters=True):
 
         return final_clusters, labels, centroid_ids
 
-    def update_heap_vectorized(self, heap, clusters, centroids, centroid, merged_cluster, next_cluster_id, max_size,
+    def update_heap_vectorized(self, heap, clusters, centroids, centroid,
+                               merged_cluster, next_cluster_id, max_size,
                                max_dist_to_centroid):
         """
         Vectorized version to update the heap with valid cluster pairs after a merge.
 
         Parameters:
             heap (List[Tuple[float, int, int]]): The heap to update.
-            clusters (Dict[int, List[int]]): Dictionary of cluster_id -> list of point indices.
-            centroids (Dict[int, np.ndarray]): Dictionary of cluster_id -> centroid coordinates.
+            clusters (Dict[int, List[int]]): Dictionary of cluster_id -> list
+                of point indices.
+            centroids (Dict[int, np.ndarray]): Dictionary of cluster_id ->
+                centroid coordinates.
             centroid (np.ndarray): The centroid of the newly merged cluster.
             merged_cluster (List[int]): Indices of points in the new cluster.
             next_cluster_id (int): The ID of the new cluster.
             max_size (int): Maximum allowed cluster size.
-            max_dist_to_centroid (float): Maximum allowed distance for a valid connection.
+            max_dist_to_centroid (float): Maximum allowed distance for a valid
+                connection.
         """
         # Extract existing cluster IDs and their centroids
         existing_ids = np.array(list(clusters.keys()))
 
@@ -456,13 +485,17 @@ def add_tie_points(self, cluster_labels, num_neighbors=4, max_tie_distance=5_000
         to at least `num_neighbors` external clusters.
 
         Parameters:
-            cluster_labels (np.ndarray): Cluster index (0..K-1) for each station.
-            num_neighbors (int): Minimum number of external links each island must have.
+            cluster_labels (np.ndarray): Cluster index (0..K-1) for each
+                station.
+            num_neighbors (int): Minimum number of external links each island
+                must have.
             max_tie_distance (float): Max allowable tie distance in meters.
 
         Returns:
-            new_clusters (List[List[int]]): Cluster station indices including added tie points.
-            tie_points (List[List[int]]): Tie point indices added to each cluster.
+            new_clusters (List[List[int]]): Cluster station indices including
+                added tie points.
+            tie_points (List[List[int]]): Tie point indices added to each
+                cluster.
""" points = self.points @@ -503,13 +536,13 @@ def add_reciprocal_tie(i, j, pi_idx, pj_idx): pj = [idx for idx in clusters[j] if idx not in used_points] if not pi or not pj: continue - dist_matrix = np.linalg.norm(points[pi][:, None, :] - points[pj][None, :, :], axis=2) + dist_matrix = np.linalg.norm(points[pi][:, None, :] - + points[pj][None, :, :], axis=2) min_idx = np.unravel_index(np.argmin(dist_matrix), dist_matrix.shape) min_dist = dist_matrix[min_idx] if min_dist <= max_tie_distance: add_reciprocal_tie(i, j, pi[min_idx[0]], pj[min_idx[1]]) - # === Step 3: Build the graph from current tie connections === G = nx.Graph() G.add_nodes_from(range(n_clusters)) @@ -536,11 +569,16 @@ def add_reciprocal_tie(i, j, pi_idx, pj_idx): pj = [idx for idx in clusters[j] if idx not in used_points] if not pi or not pj: continue - dist_matrix = np.linalg.norm(points[pi][:, None, :] - points[pj][None, :, :], axis=2) - min_idx = np.unravel_index(np.argmin(dist_matrix), dist_matrix.shape) + dist_matrix = np.linalg.norm(points[pi][:, None, :] - + points[pj][None, :, :], + axis=2) + min_idx = np.unravel_index(np.argmin(dist_matrix), + dist_matrix.shape) dist = dist_matrix[min_idx] if dist <= max_tie_distance: - connection_candidates.append((dist, i, j, pi[min_idx[0]], pj[min_idx[1]])) + connection_candidates.append((dist, i, j, + pi[min_idx[0]], + pj[min_idx[1]])) # Sort connections by shortest distance connection_candidates.sort() @@ -574,18 +612,20 @@ def add_reciprocal_tie(i, j, pi_idx, pj_idx): self.OC = np.array(matrix) self.clustered_ids = new_clusters - self.tie_ids = tie_points + self.tie_ids = tie_points return new_clusters, tie_points def get_cluster_coordinates(self): - return [[self.points[i] for i in cluster] for cluster in self.clustered_ids] + return [[self.points[i] for i in cluster] + for cluster in self.clustered_ids] def get_centroid_coordinates(self): return [self.points[i] for i in self.centroid_ids] def get_tie_coordinates(self): - return [[self.points[i] for i in cluster] for cluster in self.tie_ids] + return [[self.points[i] for i in cluster] + for cluster in self.tie_ids] """Bisecting Q-means clustering.""" @@ -879,7 +919,7 @@ def fit(self, X, y=None, sample_weight=None): cluster_to_bisect = self._bisecting_tree.get_cluster_to_bisect() # Split this cluster into 2 subclusters - #if cluster_to_bisect is not None: + # if cluster_to_bisect is not None: if cluster_to_bisect.score > self.max_size: self._bisect(X, x_squared_norms, sample_weight, cluster_to_bisect) @@ -961,7 +1001,7 @@ def get_cluster_to_bisect(self): max_score = cluster_leaf.score best_cluster_leaf = cluster_leaf - #if max_score >= self.opt_size: + # if max_score >= self.opt_size: if np.isneginf(max_score): self.bisect = False else: From 382fa3728ed871e379156845266c0ad99c87599f Mon Sep 17 00:00:00 2001 From: espg Date: Tue, 13 May 2025 16:09:24 -0800 Subject: [PATCH 4/8] fixed off by one bug in over_cluster; updated documentation; renamed parameter and updated unit test and function calls; updated overcluster default parameters --- pgamit/cluster.py | 32 ++++++++++++++++++++------------ pgamit/network.py | 2 +- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pgamit/cluster.py b/pgamit/cluster.py index c809c619..fe4e64d7 100644 --- a/pgamit/cluster.py +++ b/pgamit/cluster.py @@ -105,7 +105,7 @@ def select_central_point(coordinates, centroids, metric='euclidean'): return idxs.squeeze() -def over_cluster(labels, coordinates, metric='haversine', neighbors=5, +def over_cluster(labels, coordinates, 
metric='euclidean', neighbors=5, overlap=2, rejection_threshold=5e6, method='static'): """Expand cluster membership to include edge points of neighbor clusters @@ -137,11 +137,12 @@ def over_cluster(labels, coordinates, metric='haversine', neighbors=5, clustering in an X,Y,Z projection, those coordinates can be provided in spherical coordinates, provided that 'haversine' is selected for the `metric` parameter. - metric : str or callable, default='haversine' + metric : str or callable, default='euclidean' Metric to use for distance computation. Any metric from scikit-learn or - scipy.spatial.distance can be used. Note that latitude and longitude - values will need to be converted to radians if using the default - 'haversine' distance metric. + scipy.spatial.distance can be used. + + Note that latitude and longitude values will need to be converted to + radians if using the default 'haversine' distance metric. If metric is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. The callable should @@ -168,24 +169,31 @@ def over_cluster(labels, coordinates, metric='haversine', neighbors=5, neighbors: int greater than or equal to 1, default=3 For method='static', this is total number of points that will be added to the seed clusters during cluster expansion. + For method='paired', this is the number of cluster that are used to + tie, with each cluster contributing exactly 2 points. For method='dynamic', this is the (zero-indexed) number of adjacent clusters to include when adding cluster membership overlap. Should be less than the number of unique cluster labels - 1. overlap : int greater than or equal to 1, default=2 - Should not exceed the size of the smallest cluster in `labels`. + Should not exceed the size of the smallest cluster in `labels`. Note + that this parameter has no effect for method='paired'. - rejection_threshold : float, default=None + rejection_threshold : float, default=5e6 Determines if any potential overlapping points should be rejected for being too far (from source centroid or nearest source edge point). - Default of 'None' is equivalent to setting the threshold to infinity. + Value of 'None' is equivalent to setting the threshold to infinity. Note that if value other than 'None' is used, there is no guarantee - that all clusters will have overlap points added. + that all clusters will have overlap points added. This parameter value + is required to be set when using method='paired'. - method : 'static' (default) or 'dynamic' + method : 'static' (default), 'paired', or 'dynamic' The 'static' method will always produce an overcluster equal to the `neighbors` parameter; 'dynamic' will produce an overcluster ceiling - of (neighbors - 1) * overlap_points, with a floor of neighbors. + of (neighbors - 1) * overlap_points, with a floor of neighbors. The + 'paired' method will add 2 * `nieghbors` points per cluster, one + of which is the closest nieghbor and one which is the farthest point + within that same nieghboring cluster. 
 
     Returns
     -------
@@ -221,7 +229,7 @@ def over_cluster(labels, coordinates, metric='euclidean', neighbors=5,
                                 metric=metric).fit(coordinates[members])
         if method == 'dynamic':
             coverage = len(np.unique(labels[output[cluster, :]]))
         else:  # method == 'static' or 'paired'
-            coverage = 0
+            coverage = 1
 
         while coverage <= neighbors:
             # intersect search tree with non-members
diff --git a/pgamit/network.py b/pgamit/network.py
index 667b7cc2..92567d15 100644
--- a/pgamit/network.py
+++ b/pgamit/network.py
@@ -187,7 +187,7 @@ def make_clusters(self, points, stations, net_limit=NET_LIMIT):
         central_points = select_central_point(points, qmean.cluster_centers_)
         # expand the initial clusters to overlap stations with neighbors
         OC = over_cluster(qmean.labels_, points, metric='euclidean',
-                          neighbors=self.ties, overlap_points=2)
+                          neighbors=self.ties, overlap=2)
         # set 'method=None' to disable
         OC, central_points = prune(OC, central_points, method='minsize')
         # calculate all 'tie' stations

From 65a70b51eee634dc91542776aacd1792f8b172d7 Mon Sep 17 00:00:00 2001
From: espg
Date: Tue, 13 May 2025 16:39:21 -0800
Subject: [PATCH 5/8] forgot to push changes from unit test file...

---
 pgamit/tests/test_make_clusters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgamit/tests/test_make_clusters.py b/pgamit/tests/test_make_clusters.py
index d95cc707..6092e01c 100644
--- a/pgamit/tests/test_make_clusters.py
+++ b/pgamit/tests/test_make_clusters.py
@@ -60,7 +60,7 @@ def test_max_clust_expansion(max_clust, neighbors, overlap):
     clust.fit(data)
 
     OC = over_cluster(clust.labels_, data, metric='euclidean',
-                      neighbors=neighbors, overlap_points=overlap,
+                      neighbors=neighbors, overlap=overlap,
                       method='dynamic')
 
     expanded_sizes = np.sum(OC, axis=1)

From d6b082262a323489391b33bf3358594aac8e44f1 Mon Sep 17 00:00:00 2001
From: espg
Date: Fri, 23 May 2025 15:03:57 -0400
Subject: [PATCH 6/8] updating function name from `over_cluster` to
 `overcluster` to match paper discussion

---
 pgamit/cluster.py                  | 6 +++---
 pgamit/network.py                  | 6 +++---
 pgamit/tests/test_make_clusters.py | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pgamit/cluster.py b/pgamit/cluster.py
index fe4e64d7..a15f19cc 100644
--- a/pgamit/cluster.py
+++ b/pgamit/cluster.py
@@ -25,7 +25,7 @@
 
 
 def prune(OC, central_points, method='minsize'):
-    """Prune redundant clusters from over cluster (OC) and other arrays
+    """Prune redundant clusters from overcluster (OC) and other arrays
 
     Parameters
     ----------
@@ -105,7 +105,7 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
     return idxs.squeeze()
 
 
-def over_cluster(labels, coordinates, metric='euclidean', neighbors=5,
+def overcluster(labels, coordinates, metric='euclidean', neighbors=5,
                  overlap=2, rejection_threshold=5e6, method='static'):
     """Expand cluster membership to include edge points of neighbor clusters
 
@@ -267,7 +267,7 @@ def overcluster(labels, coordinates, metric='euclidean', neighbors=5,
                 # Add to member label array
                 output[cluster, new_member] = 1
                 if method == 'dynamic':
-                    # Update current count of over-clustered neighbors
+                    # Update current count of overclustered neighbors
                     coverage = len(np.unique(labels[output[cluster, :]]))
                 elif method == 'static':
                     # Update current point expansion count
diff --git a/pgamit/network.py b/pgamit/network.py
index 92567d15..301aa1c9 100644
--- a/pgamit/network.py
+++ b/pgamit/network.py
@@ -37,7 +37,7 @@
 # app
 from pgamit.pyGamitSession import GamitSession
 from pgamit.pyStation import StationCollection
-from pgamit.cluster import (BisectingQMeans, over_cluster, prune,
+from pgamit.cluster import (BisectingQMeans, overcluster, prune,
                             select_central_point)
 from pgamit.plots import plot_global_network
 
@@ -186,8 +186,8 @@ def make_clusters(self, points, stations, net_limit=NET_LIMIT):
         # snap centroids to closest station coordinate
         central_points = select_central_point(points, qmean.cluster_centers_)
         # expand the initial clusters to overlap stations with neighbors
-        OC = over_cluster(qmean.labels_, points, metric='euclidean',
-                          neighbors=self.ties, overlap=2)
+        OC = overcluster(qmean.labels_, points, metric='euclidean',
+                         neighbors=self.ties, overlap=2)
         # set 'method=None' to disable
         OC, central_points = prune(OC, central_points, method='minsize')
         # calculate all 'tie' stations
diff --git a/pgamit/tests/test_make_clusters.py b/pgamit/tests/test_make_clusters.py
index 6092e01c..ca9812bb 100644
--- a/pgamit/tests/test_make_clusters.py
+++ b/pgamit/tests/test_make_clusters.py
@@ -5,7 +5,7 @@
 import numpy as np
 
 from .common import gen_variable_density_clusters, generate_clustered_data
-from ..cluster import BisectingQMeans, over_cluster
+from ..cluster import BisectingQMeans, overcluster
 
 
 @pytest.mark.parametrize(
@@ -48,7 +48,7 @@ def test_max_clust_expansion(max_clust, neighbors, overlap):
-    """Test algorithmic guarantee of `over_cluster`
+    """Test algorithmic guarantee of `overcluster`
 
     Verify that expanded cluster size is under (<=, less than or equal to):
     [initial cluster size + (neighbors * overlap)]"""
@@ -59,7 +59,7 @@ def test_max_clust_expansion(max_clust, neighbors, overlap):
                             max_iter=8000, random_state=42)
     clust.fit(data)
 
-    OC = over_cluster(clust.labels_, data, metric='euclidean',
+    OC = overcluster(clust.labels_, data, metric='euclidean',
                      neighbors=neighbors, overlap=overlap,
                       method='dynamic')

From 34ad027870da26a398851897a335cf9e6d154ae1 Mon Sep 17 00:00:00 2001
From: espg
Date: Sat, 24 May 2025 22:27:20 -0400
Subject: [PATCH 7/8] better parameter names

---
 pgamit/cluster.py                  | 30 +++++++++++++++---------------
 pgamit/network.py                  |  4 ++--
 pgamit/tests/test_make_clusters.py | 18 +++++++++---------
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/pgamit/cluster.py b/pgamit/cluster.py
index a15f19cc..e3ecf5e6 100644
--- a/pgamit/cluster.py
+++ b/pgamit/cluster.py
@@ -105,18 +105,18 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
     return idxs.squeeze()
 
 
-def overcluster(labels, coordinates, metric='euclidean', neighbors=5,
-                overlap=2, rejection_threshold=5e6, method='static'):
+def overcluster(labels, coordinates, metric='euclidean', overlap=5,
+                nmax=2, rejection_threshold=5e6, method='static'):
     """Expand cluster membership to include edge points of neighbor clusters
 
     Expands an existing clustering to create overlapping membership between
     clusters. Existing clusters are processed sequentially by looking up
     nearest neighbors as an intersection of the current cluster membership and
-    all clusters' point membership. Once the `overlap` points for a given
+    all clusters' point membership. Once the `nmax` points for a given
     neighbor cluster have been determined and added to current cluster, the
     remainder of that neighboring cluster is removed from consideration and
     distance query is rerun, with the process repeating until a number of
-    clusters equal to the `neighborhood` parameter is reached. For stability,
+    clusters equal to the `overlap` parameter is reached. For stability,
     only original points are included for subsequent neighborhood searches;
     that is, Nearest neighbor distances are run as the shortest distance from
     all **original** members of the current cluster.
@@ -166,13 +166,13 @@ def overcluster(labels, coordinates, metric='euclidean', overlap=5,
         Sparse matrices are only supported by scikit-learn metrics. See the
         documentation for scipy.spatial.distance for details on these metrics.
 
-    neighbors: int greater than or equal to 1, default=3
+    overlap: int greater than or equal to 1, default=5
         For method='static', this is total number of points that will be
         added to the seed clusters during cluster expansion.
         For method='paired', this is the number of clusters that are used to
         tie, with each cluster contributing exactly 2 points.
         For method='dynamic', this is the (zero-indexed) number of adjacent
         clusters to include when adding cluster membership overlap. Should be
         less than the number of unique cluster labels - 1.
 
-    overlap : int greater than or equal to 1, default=2
+    nmax : int greater than or equal to 1, default=2
         Should not exceed the size of the smallest cluster in `labels`. Note
         that this parameter has no effect for method='paired'.
 
@@ -189,8 +189,8 @@ def overcluster(labels, coordinates, metric='euclidean', overlap=5,
     method : 'static' (default), 'paired', or 'dynamic'
         The 'static' method will always produce an overcluster equal to the
-        `neighbors` parameter; 'dynamic' will produce an overcluster ceiling
-        of (neighbors - 1) * overlap_points, with a floor of neighbors. The
-        'paired' method will add 2 * `neighbors` points per cluster, one
+        `overlap` parameter; 'dynamic' will produce an overcluster ceiling
+        of (overlap - 1) * nmax, with a floor of overlap. The
+        'paired' method will add 2 * `overlap` points per cluster, one
         of which is the closest neighbor and one which is the farthest point
         within that same neighboring cluster.
@@ -209,8 +209,8 @@ def overcluster(labels, coordinates, metric='euclidean', overlap=5,
 
     clusters = np.unique(labels)
     n_clusters = len(clusters)
-    if (n_clusters - 1) < neighbors:
-        neighbors = (n_clusters - 1)
+    if (n_clusters - 1) < overlap:
+        overlap = (n_clusters - 1)
 
     # reference index for reverse lookups
     ridx = np.array(list(range(len(labels))))
@@ -231,7 +231,7 @@ def overcluster(labels, coordinates, metric='euclidean', overlap=5,
         if method == 'dynamic':
             coverage = len(np.unique(labels[output[cluster, :]]))
         else:  # method == 'static' or 'paired'
             coverage = 1
 
-        while coverage <= neighbors:
+        while coverage <= overlap:
             # intersect search tree with non-members
             D, indx = nbrs.kneighbors(coordinates[nonmembers, :])
             mindex = np.argmin(D)
@@ -273,7 +273,7 @@ def overcluster(labels, coordinates, metric='euclidean', overlap=5,
                     # Update current point expansion count
                     coverage += 1
                 # Check if we've exceeded our overlap allotment...
-                if sum(labels[output[cluster, :]] == nm_label) >= overlap:
+                if sum(labels[output[cluster, :]] == nm_label) >= nmax:
                     # ...if so, remove entire neighboring cluster
                     remove = nm_label == labels
                     nonmembers[remove] = False
@@ -659,7 +659,7 @@ class BisectingQMeans(_BaseKMeans):
 
     Parameters
     ----------
-    max_size: int, default=25
+    qmax: int, default=25
         Hard cutoff to bypass the heuristic when bisecting clusters; no
         clusters greater than this size will be produced.
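+
+        A minimal fit sketch (synthetic coordinates; parameter values are
+        illustrative and mirror the unit tests)::
+
+            import numpy as np
+            points = np.random.rand(300, 3)
+            model = BisectingQMeans(qmax=25, random_state=42)
+            model.fit(points)
+            # every cluster in model.labels_ has at most 25 members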
@@ -759,7 +759,7 @@ class BisectingQMeans(_BaseKMeans):
 
     def __init__(
         self,
-        max_size=25,
+        qmax=25,
         *,
         init="random",
         n_init=1,
@@ -781,7 +781,7 @@ class BisectingQMeans(_BaseKMeans):
             n_init=n_init,
         )
 
-        self.max_size = max_size
+        self.max_size = qmax
         self.copy_x = copy_x
         self.algorithm = algorithm
         self.bisect = True
diff --git a/pgamit/network.py b/pgamit/network.py
index 301aa1c9..e1631ce8 100644
--- a/pgamit/network.py
+++ b/pgamit/network.py
@@ -181,13 +181,13 @@ def __init__(self, cnn, archive, GamitConfig, stations, date,
 
     def make_clusters(self, points, stations, net_limit=NET_LIMIT):
         # Run initial clustering using bisecting 'q-means'
-        qmean = BisectingQMeans(max_size=self.cluster_size, random_state=42)
+        qmean = BisectingQMeans(qmax=self.cluster_size, random_state=42)
         qmean.fit(points)
         # snap centroids to closest station coordinate
         central_points = select_central_point(points, qmean.cluster_centers_)
         # expand the initial clusters to overlap stations with neighbors
         OC = overcluster(qmean.labels_, points, metric='euclidean',
-                         neighbors=self.ties, overlap=2)
+                         overlap=self.ties, nmax=2)
         # set 'method=None' to disable
         OC, central_points = prune(OC, central_points, method='minsize')
         # calculate all 'tie' stations
diff --git a/pgamit/tests/test_make_clusters.py b/pgamit/tests/test_make_clusters.py
index ca9812bb..665c8ea2 100644
--- a/pgamit/tests/test_make_clusters.py
+++ b/pgamit/tests/test_make_clusters.py
@@ -9,7 +9,7 @@
 
 
 @pytest.mark.parametrize(
-    ("max_size", "clust_size"),
+    ("qmax", "clust_size"),
     [
         [5, 10],
         [7, 10],
@@ -20,24 +20,24 @@
         [30, 250],
     ],
 )
-def test_ceiling_variable_density(max_size, clust_size):
+def test_ceiling_variable_density(qmax, clust_size):
     """Test algorithmic guarantee of BisectingQMeans on variable density data
 
-    Verify that when `min_size=2`, that the max per cluster membership is
-    under (<, less than) what parameter `opt_cluster_size` is set to"""
+    Verify that the maximum per-cluster membership does not exceed
+    (<=, less than or equal to) the `qmax` parameter"""
     data = gen_variable_density_clusters(clust_size)
-    clust = BisectingQMeans(max_size=max_size,
+    clust = BisectingQMeans(qmax=qmax,
                             init='random', n_init=50, algorithm='lloyd',
                             max_iter=8000, random_state=42)
     clust.fit(data)
 
     _, counts = np.unique(clust.labels_, return_counts=True)
-    assert np.max(counts) <= max_size
+    assert np.max(counts) <= qmax
 
 
 @pytest.mark.parametrize(
-    ("max_clust", "neighbors", "overlap"),
+    ("qmax", "overlap", "nmax"),
     [
         [5, 5, 2],
         [10, 2, 5],
@@ -47,23 +47,23 @@
         [17, 10, 1],
     ],
 )
-def test_max_clust_expansion(max_clust, neighbors, overlap):
+def test_max_clust_expansion(qmax, overlap, nmax):
     """Test algorithmic guarantee of `overcluster`
 
     Verify that expanded cluster size is under (<=, less than or equal to):
-    [initial cluster size + (neighbors * overlap)]"""
+    [initial cluster size + (overlap * nmax)]"""
     data = gen_variable_density_clusters()
-    clust = BisectingQMeans(max_size=max_clust,
+    clust = BisectingQMeans(qmax=qmax,
                             init='random', n_init=50, algorithm='lloyd',
                             max_iter=8000, random_state=42)
     clust.fit(data)
 
-    OC = overcluster(clust.labels_, data, metric='euclidean',
-                     neighbors=neighbors, overlap=overlap,
-                     method='dynamic')
+    OC = overcluster(clust.labels_, data, metric='euclidean',
+                     overlap=overlap, nmax=nmax,
+                     method='dynamic')
 
     expanded_sizes = np.sum(OC, axis=1)
     _, original_sizes = np.unique(clust.labels_, return_counts=True)
-    assert np.all((expanded_sizes - original_sizes) <= neighbors*overlap)
+    assert np.all((expanded_sizes - original_sizes) <= overlap*nmax)

From e02ba07548064bad541d6339f3fcf4e7f546bf37 Mon Sep 17 00:00:00 2001
From: espg
Date: Fri, 13 Jun 2025 11:06:38 -0600
Subject: [PATCH 8/8] separated agglomerative functions into their own module;
 reduces both modules to below 1000 lines, makes clustering pep8 compliant
 again, displays proper unit test coverage percentages

---
 pgamit/agglomerative.py | 365 ++++++++++++++++++++++++++++++++++++++++
 pgamit/cluster.py       | 362 +--------------------------------------
 2 files changed, 366 insertions(+), 361 deletions(-)
 create mode 100644 pgamit/agglomerative.py

diff --git a/pgamit/agglomerative.py b/pgamit/agglomerative.py
new file mode 100644
index 00000000..e11c18d0
--- /dev/null
+++ b/pgamit/agglomerative.py
@@ -0,0 +1,365 @@
+import heapq
+import networkx as nx
+import numpy as np
+
+from scipy.spatial.distance import pdist, squareform
+from sklearn.neighbors import NearestNeighbors
+
+
+class DeterministicClustering(object):
+    def __init__(self, target_size=15, tolerance=4,
+                 num_tie_points=4, max_dist_to_centroid=5_000_000):
+
+        self.target_size = target_size
+        self.num_tie_points = num_tie_points
+        self.tolerance = tolerance
+        self.max_dist = max_dist_to_centroid
+        self.points = np.array([])
+        self.OC = np.array([])
+        # variables to store results
+        self.centroid_ids = []
+        self.clustered_ids = []
+        self.tie_ids = []
+
+    def constrained_agglomerative(self, points, tie_clusters=True):
+        """
+        Spatially-constrained agglomerative clustering with centroid snapping.
+
+        Parameters:
+            points (np.ndarray): Nx3 ECEF coordinates.
+            tie_clusters (bool): to tie clusters together using neighbors,
+                pass True
+
+        Returns:
+            clustered_points (List[List[np.ndarray]]): Points per cluster.
+            labels (np.ndarray): Cluster label for each point.
+            final_centroids (List[np.ndarray]): Snapped centroids from points
+                for each cluster.
+        """
+        target_size = self.target_size
+        margin = self.tolerance
+        max_dist_to_centroid = self.max_dist
+
+        points = np.array(points)
+        self.points = points
+
+        min_size = target_size - margin
+        max_size = target_size + margin
+
+        clusters = {i: [i] for i in range(len(points))}
+        centroids = {i: points[i] for i in range(len(points))}
+        next_cluster_id = len(points)
+
+        # Vectorized pairwise distance computation
+        pairwise_dists = squareform(pdist(points))
+        i_idx, j_idx = np.triu_indices(len(points), k=1)
+        dists = pairwise_dists[i_idx, j_idx]
+        valid_mask = dists <= max_dist_to_centroid * 2
+        i_valid = i_idx[valid_mask]
+        j_valid = j_idx[valid_mask]
+        d_valid = dists[valid_mask]
+
+        heap = list(zip(d_valid, i_valid, j_valid))
+        heapq.heapify(heap)
+
+        while heap:
+            dist, ci, cj = heapq.heappop(heap)
+            if ci not in clusters or cj not in clusters:
+                continue
+
+            merged_cluster = clusters[ci] + clusters[cj]
+            if len(merged_cluster) > max_size:
+                continue
+
+            merged_points = points[merged_cluster]
+            centroid = merged_points.mean(axis=0)
+            # compare to square distance to avoid computing the sqrt
+            # and save some computation time
+            if np.any(np.einsum('ij,ij->i', merged_points - centroid,
+                                merged_points -
+                                centroid) > max_dist_to_centroid ** 2):
+                continue
+
+            clusters[next_cluster_id] = merged_cluster
+            centroids[next_cluster_id] = centroid
+            del clusters[ci], clusters[cj]
+            del centroids[ci], centroids[cj]
+            self.update_heap_vectorized(heap, clusters, centroids, centroid,
+                                        merged_cluster, next_cluster_id,
+                                        max_size, max_dist_to_centroid)
+
+            next_cluster_id += 1
+
+        # Precompute centroids and classify clusters
+        cluster_items = list(clusters.values())
+        cluster_lens = np.array([len(c) for c in cluster_items])
+        is_final = cluster_lens >= min_size
+
+        # Split clusters
+        final_clusters = [cluster_items[i] for i in
+                          range(len(cluster_items)) if is_final[i]]
+        leftovers = [cluster_items[i] for i in
+                     range(len(cluster_items)) if not is_final[i]]
+
+        # Compute centroids for final clusters
+        centroids_arr = np.array([points[c].mean(axis=0) for c in
+                                  final_clusters])
+        # Snap to closest input point in cluster
+        snapped_idxs = [
+            c[np.argmin(np.linalg.norm(points[c] - centroid, axis=1))]
+            for c, centroid in zip(final_clusters, centroids_arr)
+        ]
+        final_centroids = list(points[snapped_idxs])
+        centroid_ids = snapped_idxs
+
+        for cluster in leftovers:
+            if len(cluster) == 1:
+                idx = cluster[0]
+                point = points[idx]
+                dists = np.linalg.norm(np.array(final_centroids) - point,
+                                       axis=1)
+                valid = [i for i in range(len(final_clusters)) if
+                         len(final_clusters[i]) < max_size and
+                         dists[i] <= max_dist_to_centroid]
+                if valid:
+                    best_fit = valid[np.argmin(dists[valid])]
+                else:
+                    best_fit = np.argmin(dists)
+                final_clusters[best_fit].append(idx)
+                cluster_points = points[final_clusters[best_fit]]
+                centroid = cluster_points.mean(axis=0)
+                snapped_idx = final_clusters[best_fit][np.argmin(
+                    np.linalg.norm(cluster_points - centroid, axis=1))]
+                final_centroids[best_fit] = points[snapped_idx]
+                centroid_ids[best_fit] = snapped_idx
+            else:
+                centroid = points[cluster].mean(axis=0)
+                dists = np.linalg.norm(np.array(final_centroids) - centroid,
+                                       axis=1)
+                best_fit = None
+                for i in np.argsort(dists):
+                    potential = final_clusters[i] + cluster
+                    if len(potential) <= max_size:
+                        test_points = points[potential]
+                        test_centroid = test_points.mean(axis=0)
+                        if np.all(np.linalg.norm(test_points -
+                                                 test_centroid,
+                                                 axis=1) <=
+                                  max_dist_to_centroid):
+                            best_fit = i
+                            break
+                if best_fit is not None:
+                    final_clusters[best_fit].extend(cluster)
+                    cluster_points = points[final_clusters[best_fit]]
+                    centroid = cluster_points.mean(axis=0)
+                    snapped_idx = final_clusters[best_fit][np.argmin(
+                        np.linalg.norm(cluster_points - centroid, axis=1))]
+                    final_centroids[best_fit] = points[snapped_idx]
+                    centroid_ids[best_fit] = snapped_idx
+                else:
+                    final_clusters.append(cluster)
+                    snapped_idx = cluster[np.argmin(np.linalg.norm(
+                        points[cluster] - centroid, axis=1))]
+                    final_centroids.append(points[snapped_idx])
+                    centroid_ids.append(snapped_idx)
+
+        labels = np.zeros(len(points), dtype=int)
+        for idx, cluster in enumerate(final_clusters):
+            for i in cluster:
+                labels[i] = idx
+
+        self.clustered_ids = final_clusters
+        self.centroid_ids = centroid_ids
+
+        # ties clusters together
+        if tie_clusters:
+            self.add_tie_points(labels, self.num_tie_points, self.max_dist)
+
+        return final_clusters, labels, centroid_ids
+
+    def update_heap_vectorized(self, heap, clusters, centroids, centroid,
+                               merged_cluster, next_cluster_id, max_size,
+                               max_dist_to_centroid):
+        """
+        Vectorized version to update heap with valid cluster pairs post merge
+
+        Parameters:
+            heap (List[Tuple[float, int, int]]): The heap to update.
+            clusters (Dict[int, List[int]]): Dictionary of cluster_id -> list
+                of point indices.
+            centroids (Dict[int, np.ndarray]): Dictionary of cluster_id ->
+                centroid coordinates.
+            centroid (np.ndarray): The centroid of the newly merged cluster.
+            merged_cluster (List[int]): Indices of points in the new cluster.
+            next_cluster_id (int): The ID of the new cluster.
+            max_size (int): Maximum allowed cluster size.
+            max_dist_to_centroid (float): Maximum allowed distance for a valid
+                connection.
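+
+        A toy sketch (hypothetical values; assumes an Nx3 ``points`` array,
+        with clusters 3 and 5 remaining after a merge into id 5)::
+
+            heap = []
+            clusters = {3: [4, 5], 5: [0, 1, 2]}
+            centroids = {3: points[[4, 5]].mean(axis=0),
+                         5: points[[0, 1, 2]].mean(axis=0)}
+            self.update_heap_vectorized(heap, clusters, centroids,
+                                        centroids[5], clusters[5], 5,
+                                        19, 5_000_000)
+            # heap now holds (dist, 5, 3) if cluster 3 passes the
+            # size and distance checks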
+        """
+        # Extract existing cluster IDs and their centroids
+        existing_ids = np.array(list(clusters.keys()))
+        existing_centroids = np.array([centroids[k] for k in existing_ids])
+        existing_sizes = np.array([len(clusters[k]) for k in existing_ids])
+
+        # Compute distance from new centroid to all others
+        dists = np.linalg.norm(existing_centroids - centroid, axis=1)
+
+        # Evaluate which existing clusters are valid for merging
+        size_sum = existing_sizes + len(merged_cluster)
+        valid = ((existing_ids != next_cluster_id) &
+                 (size_sum <= max_size) &
+                 (dists <= max_dist_to_centroid * 2))
+
+        # Push valid merge pairs into heap
+        for other_id, dist in zip(existing_ids[valid], dists[valid]):
+            heapq.heappush(heap, (dist, next_cluster_id, other_id))
+
+    def add_tie_points(self, cluster_labels, num_neighbors=4,
+                       max_tie_distance=5_000_000):
+        """
+        Add reciprocal tie points to each cluster from its nearest neighbors,
+        then ensure every disconnected cluster component (island) is connected
+        to at least `num_neighbors` external clusters.
+
+        Parameters:
+            cluster_labels (np.ndarray): Cluster index (0..K-1) for each
+                station.
+            num_neighbors (int): Minimum number of external links each island
+                must have.
+            max_tie_distance (float): Max allowable tie distance in meters.
+
+        Returns:
+            new_clusters (List[List[int]]): Cluster station indices including
+                added tie points.
+            tie_points (List[List[int]]): Tie point indices added to each
+                cluster.
+        """
+
+        points = self.points
+        labels = cluster_labels
+        centroids = self.get_centroid_coordinates()
+
+        n_clusters = len(centroids)
+        n_stations = points.shape[0]
+
+        # === Step 1: Initialize cluster structure ===
+        clusters = [[] for _ in range(n_clusters)]
+        for idx, label in enumerate(labels):
+            clusters[label].append(idx)
+
+        new_clusters = [list(cluster) for cluster in clusters]
+        tie_points = [[] for _ in range(n_clusters)]
+        used_points = set()
+        tie_log = np.zeros((n_clusters, n_clusters), dtype=bool)
+
+        # === Step 2: Initial connections to nearby neighbors using centroids
+        nbrs = NearestNeighbors(n_neighbors=num_neighbors + 1).fit(centroids)
+        _, neighbors = nbrs.kneighbors(centroids)
+
+        def add_reciprocal_tie(i, j, pi_idx, pj_idx):
+            new_clusters[i].append(pj_idx)
+            new_clusters[j].append(pi_idx)
+            tie_points[i].append(pj_idx)
+            tie_points[j].append(pi_idx)
+            used_points.update({pi_idx, pj_idx})
+            tie_log[i, j] = True
+            tie_log[j, i] = True
+
+        for i in range(n_clusters):
+            for j in neighbors[i][1:]:
+                if tie_log[i, j] or tie_log[j, i]:
+                    continue
+                pi = [idx for idx in clusters[i] if idx not in used_points]
+                pj = [idx for idx in clusters[j] if idx not in used_points]
+                if not pi or not pj:
+                    continue
+                dist_matrix = np.linalg.norm(points[pi][:, None, :] -
+                                             points[pj][None, :, :], axis=2)
+                min_idx = np.unravel_index(np.argmin(dist_matrix), dist_matrix.shape)
+                min_dist = dist_matrix[min_idx]
+                if min_dist <= max_tie_distance:
+                    add_reciprocal_tie(i, j, pi[min_idx[0]], pj[min_idx[1]])
+
+        # === Step 3: Build the graph from current tie connections ===
+        G = nx.Graph()
+        G.add_nodes_from(range(n_clusters))
+        for i in range(n_clusters):
+            for j in range(i + 1, n_clusters):
+                if set(tie_points[i]) & set(clusters[j]) or set(tie_points[j]) & set(clusters[i]):
+                    G.add_edge(i, j)
+
+        # === Step 4: Ensure every disconnected component connects to at least `num_neighbors` others ===
+        components = sorted(list(nx.connected_components(G)), key=len)
+
+        while len(components) > 1:
+            updated = False  # Flag to break early if no valid connections were made
+
+            for comp in components:
+                external_nodes = set(range(n_clusters)) - comp
+                connection_candidates = []
+
+                for i in comp:
+                    for j in external_nodes:
+                        if tie_log[i, j] or tie_log[j, i]:
+                            continue
+                        pi = [idx for idx in clusters[i] if idx not in used_points]
+                        pj = [idx for idx in clusters[j] if idx not in used_points]
+                        if not pi or not pj:
+                            continue
+                        dist_matrix = np.linalg.norm(points[pi][:, None, :] -
+                                                     points[pj][None, :, :],
+                                                     axis=2)
+                        min_idx = np.unravel_index(np.argmin(dist_matrix),
+                                                   dist_matrix.shape)
+                        dist = dist_matrix[min_idx]
+                        if dist <= max_tie_distance:
+                            connection_candidates.append((dist, i, j,
+                                                          pi[min_idx[0]],
+                                                          pj[min_idx[1]]))
+
+                # Sort connections by shortest distance
+                connection_candidates.sort()
+                added = 0
+                for conn in connection_candidates:
+                    _, i, j, pi_idx, pj_idx = conn
+                    if not G.has_edge(i, j):
+                        add_reciprocal_tie(i, j, pi_idx, pj_idx)
+                        G.add_edge(i, j)
+                        added += 1
+                        updated = True
+                        if added >= num_neighbors:
+                            break
+
+                if added < num_neighbors:
+                    pass
+                    # tqdm.write(f" -- WARNING: Component containing clusters {sorted(comp)} "
+                    #            f"was only connected to {added} external clusters under the distance constraint.")
+
+            if not updated:
+                # tqdm.write(" -- WARNING: Some disconnected components could not be joined with the rest of the graph.")
+                break
+
+            components = list(nx.connected_components(G))
+
+        # create matrix with clusters and stations
+        matrix = np.zeros((n_clusters, n_stations), dtype=bool)
+        for i, cluster in enumerate(new_clusters):
+            matrix[i, cluster] = True
+
+        self.OC = np.array(matrix)
+
+        self.clustered_ids = new_clusters
+        self.tie_ids = tie_points
+
+        return new_clusters, tie_points
+
+    def get_cluster_coordinates(self):
+        return [[self.points[i] for i in cluster]
+                for cluster in self.clustered_ids]
+
+    def get_centroid_coordinates(self):
+        return [self.points[i] for i in self.centroid_ids]
+
+    def get_tie_coordinates(self):
+        return [[self.points[i] for i in cluster]
+                for cluster in self.tie_ids]
diff --git a/pgamit/cluster.py b/pgamit/cluster.py
index e3ecf5e6..97a334c6 100644
--- a/pgamit/cluster.py
+++ b/pgamit/cluster.py
@@ -7,10 +7,6 @@
 import numpy as np
 import pandas as pd
 import scipy.sparse as sp
-import heapq
-import networkx as nx
-
-from scipy.spatial.distance import pdist, squareform
 
 from sklearn.neighbors import NearestNeighbors
 from sklearn.metrics import pairwise_distances
@@ -105,7 +101,7 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
     return idxs.squeeze()
 
 
-def overcluster(labels, coordinates, metric='euclidean', overlap=5,
+def overcluster(labels, coordinates, metric='euclidean', overlap=4,
                 nmax=2, rejection_threshold=5e6, method='static'):
     """Expand cluster membership to include edge points of neighbor clusters
 
@@ -280,362 +276,6 @@ def overcluster(labels, coordinates, metric='euclidean', overlap=4,
     return output
 
 
-class DeterministicClustering(object):
-    def __init__(self, target_size=15, tolerance=4,
-                 num_tie_points=4, max_dist_to_centroid=5_000_000):
-
-        self.target_size = target_size
-        self.num_tie_points = num_tie_points
-        self.tolerance = tolerance
-        self.max_dist = max_dist_to_centroid
-        self.points = np.array([])
-        self.OC = np.array([])
-        # variables to store results
-        self.centroid_ids = []
-        self.clustered_ids = []
-        self.tie_ids = []
-
-    def constrained_agglomerative(self, points, tie_clusters=True):
-        """
-        Spatially-constrained agglomerative clustering with centroid snapping.
-
-        Parameters:
-            points (np.ndarray): Nx3 ECEF coordinates.
-            tie_clusters (bool): to tie clusters together using neighbors,
-                pass True
-
-        Returns:
-            clustered_points (List[List[np.ndarray]]): Points per cluster.
-            labels (np.ndarray): Cluster label for each point.
-            final_centroids (List[np.ndarray]): Snapped centroids from points
-                for each cluster.
-        """
-        target_size = self.target_size
-        margin = self.tolerance
-        max_dist_to_centroid = self.max_dist
-
-        points = np.array(points)
-        self.points = points
-
-        min_size = target_size - margin
-        max_size = target_size + margin
-
-        clusters = {i: [i] for i in range(len(points))}
-        centroids = {i: points[i] for i in range(len(points))}
-        next_cluster_id = len(points)
-
-        # Vectorized pairwise distance computation
-        pairwise_dists = squareform(pdist(points))
-        i_idx, j_idx = np.triu_indices(len(points), k=1)
-        dists = pairwise_dists[i_idx, j_idx]
-        valid_mask = dists <= max_dist_to_centroid * 2
-        i_valid = i_idx[valid_mask]
-        j_valid = j_idx[valid_mask]
-        d_valid = dists[valid_mask]
-
-        heap = list(zip(d_valid, i_valid, j_valid))
-        heapq.heapify(heap)
-
-        while heap:
-            dist, ci, cj = heapq.heappop(heap)
-            if ci not in clusters or cj not in clusters:
-                continue
-
-            merged_cluster = clusters[ci] + clusters[cj]
-            if len(merged_cluster) > max_size:
-                continue
-
-            merged_points = points[merged_cluster]
-            centroid = merged_points.mean(axis=0)
-            # compare to square distance to avoid computing the sqrt
-            # and save some computation time
-            if np.any(np.einsum('ij,ij->i', merged_points - centroid,
-                                merged_points -
-                                centroid) > max_dist_to_centroid ** 2):
-                continue
-
-            clusters[next_cluster_id] = merged_cluster
-            centroids[next_cluster_id] = centroid
-            del clusters[ci], clusters[cj]
-            del centroids[ci], centroids[cj]
-            self.update_heap_vectorized(heap, clusters, centroids, centroid,
-                                        merged_cluster, next_cluster_id,
-                                        max_size, max_dist_to_centroid)
-
-            next_cluster_id += 1
-
-        # Precompute centroids and classify clusters
-        cluster_items = list(clusters.values())
-        cluster_lens = np.array([len(c) for c in cluster_items])
-        is_final = cluster_lens >= min_size
-
-        # Split clusters
-        final_clusters = [cluster_items[i] for i in
-                          range(len(cluster_items)) if is_final[i]]
-        leftovers = [cluster_items[i] for i in
-                     range(len(cluster_items)) if not is_final[i]]
-
-        # Compute centroids for final clusters
-        centroids_arr = np.array([points[c].mean(axis=0) for c in
-                                  final_clusters])
-        # Snap to closest input point in cluster
-        snapped_idxs = [
-            c[np.argmin(np.linalg.norm(points[c] - centroid, axis=1))]
-            for c, centroid in zip(final_clusters, centroids_arr)
-        ]
-        final_centroids = list(points[snapped_idxs])
-        centroid_ids = snapped_idxs
-
-        for cluster in leftovers:
-            if len(cluster) == 1:
-                idx = cluster[0]
-                point = points[idx]
-                dists = np.linalg.norm(np.array(final_centroids) - point,
-                                       axis=1)
-                valid = [i for i in range(len(final_clusters)) if
-                         len(final_clusters[i]) < max_size and
-                         dists[i] <= max_dist_to_centroid]
-                if valid:
-                    best_fit = valid[np.argmin(dists[valid])]
-                else:
-                    best_fit = np.argmin(dists)
-                final_clusters[best_fit].append(idx)
-                cluster_points = points[final_clusters[best_fit]]
-                centroid = cluster_points.mean(axis=0)
-                snapped_idx = final_clusters[best_fit][np.argmin(
-                    np.linalg.norm(cluster_points - centroid, axis=1))]
-                final_centroids[best_fit] = points[snapped_idx]
-                centroid_ids[best_fit] = snapped_idx
-            else:
-                centroid = points[cluster].mean(axis=0)
-                dists = np.linalg.norm(np.array(final_centroids) - centroid,
-                                       axis=1)
-                best_fit = None
-                for i in np.argsort(dists):
-                    potential = final_clusters[i] + cluster
-                    if len(potential) <= max_size:
-                        test_points = points[potential]
-                        test_centroid = test_points.mean(axis=0)
-                        if np.all(np.linalg.norm(test_points -
-                                                 test_centroid,
-                                                 axis=1) <=
-                                  max_dist_to_centroid):
-                            best_fit = i
-                            break
-                if best_fit is not None:
-                    final_clusters[best_fit].extend(cluster)
-                    cluster_points = points[final_clusters[best_fit]]
-                    centroid = cluster_points.mean(axis=0)
-                    snapped_idx = final_clusters[best_fit][np.argmin(
-                        np.linalg.norm(cluster_points - centroid, axis=1))]
-                    final_centroids[best_fit] = points[snapped_idx]
-                    centroid_ids[best_fit] = snapped_idx
-                else:
-                    final_clusters.append(cluster)
-                    snapped_idx = cluster[np.argmin(np.linalg.norm(
-                        points[cluster] - centroid, axis=1))]
-                    final_centroids.append(points[snapped_idx])
-                    centroid_ids.append(snapped_idx)
-
-        labels = np.zeros(len(points), dtype=int)
-        for idx, cluster in enumerate(final_clusters):
-            for i in cluster:
-                labels[i] = idx
-
-        self.clustered_ids = final_clusters
-        self.centroid_ids = centroid_ids
-
-        # ties clusters together
-        if tie_clusters:
-            self.add_tie_points(labels, self.num_tie_points, self.max_dist)
-
-        return final_clusters, labels, centroid_ids
-
-    def update_heap_vectorized(self, heap, clusters, centroids, centroid,
-                               merged_cluster, next_cluster_id, max_size,
-                               max_dist_to_centroid):
-        """
-        Vectorized version to update the heap with valid cluster pairs after a merge.
-
-        Parameters:
-            heap (List[Tuple[float, int, int]]): The heap to update.
-            clusters (Dict[int, List[int]]): Dictionary of cluster_id -> list
-                of point indices.
-            centroids (Dict[int, np.ndarray]): Dictionary of cluster_id ->
-                centroid coordinates.
-            centroid (np.ndarray): The centroid of the newly merged cluster.
-            merged_cluster (List[int]): Indices of points in the new cluster.
-            next_cluster_id (int): The ID of the new cluster.
-            max_size (int): Maximum allowed cluster size.
-            max_dist_to_centroid (float): Maximum allowed distance for a valid
-                connection.
-        """
-        # Extract existing cluster IDs and their centroids
-        existing_ids = np.array(list(clusters.keys()))
-        existing_centroids = np.array([centroids[k] for k in existing_ids])
-        existing_sizes = np.array([len(clusters[k]) for k in existing_ids])
-
-        # Compute distance from new centroid to all others
-        dists = np.linalg.norm(existing_centroids - centroid, axis=1)
-
-        # Evaluate which existing clusters are valid for merging
-        size_sum = existing_sizes + len(merged_cluster)
-        valid = (existing_ids != next_cluster_id) & (size_sum <= max_size) & (dists <= max_dist_to_centroid * 2)
-
-        # Push valid merge pairs into heap
-        for other_id, dist in zip(existing_ids[valid], dists[valid]):
-            heapq.heappush(heap, (dist, next_cluster_id, other_id))
-
-    def add_tie_points(self, cluster_labels, num_neighbors=4, max_tie_distance=5_000_000):
-        """
-        Add reciprocal tie points to each cluster from its nearest neighbors,
-        then ensure every disconnected cluster component (island) is connected
-        to at least `num_neighbors` external clusters.
-
-        Parameters:
-            cluster_labels (np.ndarray): Cluster index (0..K-1) for each
-                station.
-            num_neighbors (int): Minimum number of external links each island
-                must have.
-            max_tie_distance (float): Max allowable tie distance in meters.
-
-        Returns:
-            new_clusters (List[List[int]]): Cluster station indices including
-                added tie points.
-            tie_points (List[List[int]]): Tie point indices added to each
-                cluster.
-        """
-
-        points = self.points
-        labels = cluster_labels
-        centroids = self.get_centroid_coordinates()
-
-        n_clusters = len(centroids)
-        n_stations = points.shape[0]
-
-        # === Step 1: Initialize cluster structure ===
-        clusters = [[] for _ in range(n_clusters)]
-        for idx, label in enumerate(labels):
-            clusters[label].append(idx)
-
-        new_clusters = [list(cluster) for cluster in clusters]
-        tie_points = [[] for _ in range(n_clusters)]
-        used_points = set()
-        tie_log = np.zeros((n_clusters, n_clusters), dtype=bool)
-
-        # === Step 2: Initial connections to nearby neighbors using centroids ===
-        nbrs = NearestNeighbors(n_neighbors=num_neighbors + 1).fit(centroids)
-        _, neighbors = nbrs.kneighbors(centroids)
-
-        def add_reciprocal_tie(i, j, pi_idx, pj_idx):
-            new_clusters[i].append(pj_idx)
-            new_clusters[j].append(pi_idx)
-            tie_points[i].append(pj_idx)
-            tie_points[j].append(pi_idx)
-            used_points.update({pi_idx, pj_idx})
-            tie_log[i, j] = True
-            tie_log[j, i] = True
-
-        for i in range(n_clusters):
-            for j in neighbors[i][1:]:
-                if tie_log[i, j] or tie_log[j, i]:
-                    continue
-                pi = [idx for idx in clusters[i] if idx not in used_points]
-                pj = [idx for idx in clusters[j] if idx not in used_points]
-                if not pi or not pj:
-                    continue
-                dist_matrix = np.linalg.norm(points[pi][:, None, :] -
-                                             points[pj][None, :, :], axis=2)
-                min_idx = np.unravel_index(np.argmin(dist_matrix), dist_matrix.shape)
-                min_dist = dist_matrix[min_idx]
-                if min_dist <= max_tie_distance:
-                    add_reciprocal_tie(i, j, pi[min_idx[0]], pj[min_idx[1]])
-
-        # === Step 3: Build the graph from current tie connections ===
-        G = nx.Graph()
-        G.add_nodes_from(range(n_clusters))
-        for i in range(n_clusters):
-            for j in range(i + 1, n_clusters):
-                if set(tie_points[i]) & set(clusters[j]) or set(tie_points[j]) & set(clusters[i]):
-                    G.add_edge(i, j)
-
-        # === Step 4: Ensure every disconnected component connects to at least `num_neighbors` others ===
-        components = sorted(list(nx.connected_components(G)), key=len)
-
-        while len(components) > 1:
-            updated = False  # Flag to break early if no valid connections were made
-
-            for comp in components:
-                external_nodes = set(range(n_clusters)) - comp
-                connection_candidates = []
-
-                for i in comp:
-                    for j in external_nodes:
-                        if tie_log[i, j] or tie_log[j, i]:
-                            continue
-                        pi = [idx for idx in clusters[i] if idx not in used_points]
-                        pj = [idx for idx in clusters[j] if idx not in used_points]
-                        if not pi or not pj:
-                            continue
-                        dist_matrix = np.linalg.norm(points[pi][:, None, :] -
-                                                     points[pj][None, :, :],
-                                                     axis=2)
-                        min_idx = np.unravel_index(np.argmin(dist_matrix),
-                                                   dist_matrix.shape)
-                        dist = dist_matrix[min_idx]
-                        if dist <= max_tie_distance:
-                            connection_candidates.append((dist, i, j,
-                                                          pi[min_idx[0]],
-                                                          pj[min_idx[1]]))
-
-                # Sort connections by shortest distance
-                connection_candidates.sort()
-                added = 0
-                for conn in connection_candidates:
-                    _, i, j, pi_idx, pj_idx = conn
-                    if not G.has_edge(i, j):
-                        add_reciprocal_tie(i, j, pi_idx, pj_idx)
-                        G.add_edge(i, j)
-                        added += 1
-                        updated = True
-                        if added >= num_neighbors:
-                            break
-
-                if added < num_neighbors:
-                    pass
-                    # tqdm.write(f" -- WARNING: Component containing clusters {sorted(comp)} "
-                    #            f"was only connected to {added} external clusters under the distance constraint.")
-
-            if not updated:
-                # tqdm.write(" -- WARNING: Some disconnected components could not be joined with the rest of the graph.")
-                break
-
-            components = list(nx.connected_components(G))
-
-        # create matrix with clusters and stations
-        matrix = np.zeros((n_clusters, n_stations), dtype=bool)
-        for i, cluster in enumerate(new_clusters):
-            matrix[i, cluster] = True
-
-        self.OC = np.array(matrix)
-
-        self.clustered_ids = new_clusters
-        self.tie_ids = tie_points
-
-        return new_clusters, tie_points
-
-    def get_cluster_coordinates(self):
-        return [[self.points[i] for i in cluster]
-                for cluster in self.clustered_ids]
-
-    def get_centroid_coordinates(self):
-        return [self.points[i] for i in self.centroid_ids]
-
-    def get_tie_coordinates(self):
-        return [[self.points[i] for i in cluster]
-                for cluster in self.tie_ids]
-
-
 """Bisecting Q-means clustering."""
 # Modified from sklearn _bisecting_k_means.py
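
A usage sketch of the API this series leaves behind (synthetic coordinates;
parameter values mirror the `network.make_clusters` call shown in the diffs):

    import numpy as np
    from pgamit.cluster import (BisectingQMeans, overcluster, prune,
                                select_central_point)

    points = np.random.rand(500, 3)   # stand-in for station coordinates
    # bisecting q-means with a hard per-cluster size ceiling
    qmean = BisectingQMeans(qmax=25, random_state=42)
    qmean.fit(points)
    # snap centroids to the closest station coordinate
    central_points = select_central_point(points, qmean.cluster_centers_)
    # expand clusters so neighbors share overlap (tie) stations
    OC = overcluster(qmean.labels_, points, metric='euclidean',
                     overlap=4, nmax=2)
    # drop clusters made redundant by the expansion
    OC, central_points = prune(OC, central_points, method='minsize')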