From 91009b76a000e34f4c303c8b04e6bb611aed46d1 Mon Sep 17 00:00:00 2001
From: espg
Date: Fri, 18 Apr 2025 15:11:49 -0400
Subject: [PATCH 1/8] modifying test to use new keyword

---
 pgamit/tests/test_make_clusters.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pgamit/tests/test_make_clusters.py b/pgamit/tests/test_make_clusters.py
index 7a1cfcb6..ea24cf93 100644
--- a/pgamit/tests/test_make_clusters.py
+++ b/pgamit/tests/test_make_clusters.py
@@ -58,8 +58,9 @@ def test_max_clust_expansion(min_clust, max_clust, neighbors, overlap):
                             max_iter=8000, random_state=42)
     clust.fit(data)
 
-    OC = over_cluster(clust.labels_, data, metric='euclidean',
-                      neighbors=neighbors, overlap_points=overlap)
+    OC = over_cluster(clust.labels_, data, metric='euclidean',
+                      neighbors=neighbors, overlap_points=overlap,
+                      method='dynamic')
 
     expanded_sizes = np.sum(OC, axis=1)
     _, original_sizes = np.unique(clust.labels_, return_counts=True)

From 47eb693a5fc4bbaf01c181ae26ebcecb0090fd42 Mon Sep 17 00:00:00 2001
From: espg
Date: Tue, 13 May 2025 13:08:59 -0800
Subject: [PATCH 2/8] new 'paired' method for `over_cluster`

---
 pgamit/cluster.py | 63 ++++++++++++++++++++++++++++-------------------
 1 file changed, 38 insertions(+), 25 deletions(-)

diff --git a/pgamit/cluster.py b/pgamit/cluster.py
index 3a53a86b..ac427aeb 100644
--- a/pgamit/cluster.py
+++ b/pgamit/cluster.py
@@ -106,7 +106,7 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
 
 
 def over_cluster(labels, coordinates, metric='haversine', neighbors=5,
-                 overlap_points=2, rejection_threshold=None, method='static'):
+                 overlap_points=2, rejection_threshold=5e6, method='static'):
     """Expand cluster membership to include edge points of neighbor clusters
 
     Expands an existing clustering to create overlapping membership between
@@ -222,33 +222,46 @@ def over_cluster(labels, coordinates, metric='haversine', neighbors=5,
             coverage = len(np.unique(labels[output[cluster, :]]))
         elif method == 'static':
             coverage = 0
-        while coverage <= neighbors:
-            # intersect search tree with non-members
+
+        if method == 'paired':
             D, _ = nbrs.kneighbors(coordinates[nonmembers, :])
-            # Rejection threshold is lightly tested...
-            if rejection_threshold:
-                if np.min(D) > rejection_threshold:
-                    break
-            # Select closest external point to add to member cluster
             new_member = ridx[nonmembers][np.argmin(D)]
-            # Remove point from future coordinate distance queries
-            nonmembers[new_member] = 0
-            # Add to member label array
+            cluster_label = labels[new_member]
+            cloc = labels == cluster_label
+            nonmembers[~cloc] = 0
+            rdists, _ = nbrs.radius_neighbors(coordinates[nonmembers, :],
+                                              radius=rejection_threshold,
+                                              return_distance=True)
+            far_member = ridx[nonmembers][np.argmax(rdists)]
             output[cluster, new_member] = 1
-            if method == 'dynamic':
-                # Update current count of over-clustered neighbors
-                coverage = len(np.unique(labels[output[cluster, :]]))
-            elif method == 'static':
-                # Update current point expansion count
-                coverage += 1
-            # Grab label of new member for overlap check
-            nm_label = labels[new_member]
-            # Check if we've exceeded our overlap allotment...
-            if sum(labels[output[cluster, :]] == nm_label) >= overlap_points:
-                # ...if so, remove entire neighboring cluster
-                remove = nm_label == labels
-                nonmembers[remove] = False
-
+            output[cluster, far_member] = 1
+        else:
+            while coverage <= neighbors:
+                # intersect search tree with non-members
+                D, _ = nbrs.kneighbors(coordinates[nonmembers, :])
+                # Rejection threshold is lightly tested...
+                if rejection_threshold:
+                    if np.min(D) > rejection_threshold:
+                        break
+                # Select closest external point to add to member cluster
+                new_member = ridx[nonmembers][np.argmin(D)]
+                # Remove point from future coordinate distance queries
+                nonmembers[new_member] = 0
+                # Add to member label array
+                output[cluster, new_member] = 1
+                if method == 'dynamic':
+                    # Update current count of over-clustered neighbors
+                    coverage = len(np.unique(labels[output[cluster, :]]))
+                elif method == 'static':
+                    # Update current point expansion count
+                    coverage += 1
+                # Grab label of new member for overlap check
+                nm_label = labels[new_member]
+                # Check if we've exceeded our overlap allotment...
+                if sum(labels[output[cluster, :]] == nm_label) >= overlap_points:
+                    # ...if so, remove entire neighboring cluster
+                    remove = nm_label == labels
+                    nonmembers[remove] = False
     return output

From 76269cec6b9314c342f85fbfdec53ec11a3133c5 Mon Sep 17 00:00:00 2001
From: espg
Date: Tue, 13 May 2025 14:59:54 -0800
Subject: [PATCH 3/8] generalized for cases where neighbors doesn't equal '1';
 some flake8 fixes; proper handling of too-long baseline pairs

---
 pgamit/cluster.py | 200 +++++++++++++++++++++++++++-------------------
 1 file changed, 120 insertions(+), 80 deletions(-)

diff --git a/pgamit/cluster.py b/pgamit/cluster.py
index ac427aeb..c809c619 100644
--- a/pgamit/cluster.py
+++ b/pgamit/cluster.py
@@ -9,11 +9,11 @@
 import scipy.sparse as sp
 import heapq
 import networkx as nx
-from tqdm import tqdm
 
 from scipy.spatial.distance import pdist, squareform
 
 from sklearn.neighbors import NearestNeighbors
+from sklearn.metrics import pairwise_distances
 from sklearn.base import _fit_context
 from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
 from sklearn.utils._param_validation import Integral, Interval, StrOptions
@@ -41,7 +41,7 @@ def prune(OC, central_points, method='minsize'):
     central_points : Pruned int array of shape (n_clusters -N,)
     """
     subset = []
-    rowlength = len(OC[0,:])
+    rowlength = len(OC[0, :])
     if method == "linear":
         indices = list(range(len(OC)))
     elif method == "minsize":
@@ -106,13 +106,13 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
 
 
 def over_cluster(labels, coordinates, metric='haversine', neighbors=5,
-                 overlap_points=2, rejection_threshold=5e6, method='static'):
+                 overlap=2, rejection_threshold=5e6, method='static'):
     """Expand cluster membership to include edge points of neighbor clusters
 
     Expands an existing clustering to create overlapping membership between
     clusters. Existing clusters are processed sequentially by looking up
     nearest neighbors as an intersection of the current cluster membership and
-    all cluster's point membership. Once the `overlap_points` for a given
+    all clusters' point membership. Once the `overlap` points for a given
     neighbor cluster have been determined and added to current cluster, the
     remainder of that neighboring cluster is removed from consideration and
     distance query is rerun, with the process repeating until a number of
@@ -172,7 +172,7 @@ def over_cluster(labels, coordinates, metric='haversine', neighbors=5,
         clusters to include when adding cluster membership overlap. Should be
         less than the number of unique cluster labels - 1.
 
-    overlap_points : int greater than or equal to 1, default=2
+    overlap : int greater than or equal to 1, default=2
         Should not exceed the size of the smallest cluster in `labels`.
 
     rejection_threshold : float, default=None
 
@@ -220,31 +220,40 @@ def over_cluster(labels, coordinates, metric='haversine', neighbors=5,
                                 metric=metric).fit(coordinates[members])
         if method == 'dynamic':
             coverage = len(np.unique(labels[output[cluster, :]]))
-        elif method == 'static':
+        else:  # method == 'static' or 'paired'
             coverage = 0
-
-        if method == 'paired':
-            D, _ = nbrs.kneighbors(coordinates[nonmembers, :])
-            new_member = ridx[nonmembers][np.argmin(D)]
-            cluster_label = labels[new_member]
-            cloc = labels == cluster_label
-            nonmembers[~cloc] = 0
-            rdists, _ = nbrs.radius_neighbors(coordinates[nonmembers, :],
-                                              radius=rejection_threshold,
-                                              return_distance=True)
-            far_member = ridx[nonmembers][np.argmax(rdists)]
-            output[cluster, new_member] = 1
-            output[cluster, far_member] = 1
-        else:
-            while coverage <= neighbors:
-                # intersect search tree with non-members
-                D, _ = nbrs.kneighbors(coordinates[nonmembers, :])
+
+        while coverage <= neighbors:
+            # intersect search tree with non-members
+            D, indx = nbrs.kneighbors(coordinates[nonmembers, :])
+            mindex = np.argmin(D)
+            # Select closest external point to add to member cluster
+            new_member = ridx[nonmembers][mindex]
+            # Grab label of new member for overlap and other checks
+            nm_label = labels[new_member]
+            # Paired method removes full cluster from consideration
+            if method == 'paired':
+                # 'remove' is the captured cluster, from which we select pairs
+                remove = nm_label == labels
+                # For simplicity, we use the single point defined by 'mindex'
+                # as the 'member point' to calculate max eligible distance
+                rdists = pairwise_distances(coordinates[members][indx[mindex]],
+                                            coordinates[remove])
+                # Filter too-far points from argmax eligibility
+                rdists[rdists >= rejection_threshold] = 0
+                far_member = ridx[remove][np.argmax(rdists)]
+                # Add near / far points to cluster for overlap
+                output[cluster, new_member] = 1
+                output[cluster, far_member] = 1
+                # Remove captured cluster from further consideration
+                nonmembers[remove] = False
+                # Continue
+                coverage += 1
+            else:
                 # Rejection threshold is lightly tested...
                 if rejection_threshold:
                     if np.min(D) > rejection_threshold:
                         break
-                # Select closest external point to add to member cluster
-                new_member = ridx[nonmembers][np.argmin(D)]
                 # Remove point from future coordinate distance queries
                 nonmembers[new_member] = 0
                 # Add to member label array
                 output[cluster, new_member] = 1
                 if method == 'dynamic':
                     # Update current count of over-clustered neighbors
                     coverage = len(np.unique(labels[output[cluster, :]]))
                 elif method == 'static':
                     # Update current point expansion count
                     coverage += 1
-                # Grab label of new member for overlap check
-                nm_label = labels[new_member]
                 # Check if we've exceeded our overlap allotment...
-                if sum(labels[output[cluster, :]] == nm_label) >= overlap_points:
+                if sum(labels[output[cluster, :]] == nm_label) >= overlap:
                     # ...if so, remove entire neighboring cluster
                     remove = nm_label == labels
                     nonmembers[remove] = False
     return output
 
 
 class DeterministicClustering(object):
-    def __init__(self, target_size=15, tolerance=4, num_tie_points=4, max_dist_to_centroid=5_000_000):
-        self.target_size    = target_size
-        self.num_tie_points = num_tie_points
-        self.tolerance      = tolerance
-        self.max_dist       = max_dist_to_centroid
-        self.points         = np.array([])
-        self.OC             = np.array([])
+    def __init__(self, target_size=15, tolerance=4,
+                 num_tie_points=4, max_dist_to_centroid=5_000_000):
+
+        self.target_size = target_size
+        self.num_tie_points = num_tie_points
+        self.tolerance = tolerance
+        self.max_dist = max_dist_to_centroid
+        self.points = np.array([])
+        self.OC = np.array([])
         # variables to store results
-        self.centroid_ids   = []
-        self.clustered_ids  = []
-        self.tie_ids        = []
+        self.centroid_ids = []
+        self.clustered_ids = []
+        self.tie_ids = []
 
     def constrained_agglomerative(self, points, tie_clusters=True):
         """
-        Perform spatially-constrained agglomerative clustering with centroid snapping.
+        Spatially-constrained agglomerative clustering with centroid snapping.
+
         Parameters:
             points (np.ndarray): Nx3 ECEF coordinates.
-            tie_clusters (bool): to tie clusters together using neighbors, pass True
+            tie_clusters (bool): to tie clusters together using neighbors,
+                pass True
 
         Returns:
             clustered_points (List[List[np.ndarray]]): Points per cluster.
            labels (np.ndarray): Cluster label for each point.
-            final_centroids (List[np.ndarray]): Snapped centroids from points for each cluster.
+            final_centroids (List[np.ndarray]): Snapped centroids from points
+                for each cluster.
""" target_size = self.target_size - margin = self.tolerance + margin = self.tolerance max_dist_to_centroid = self.max_dist points = np.array(points) @@ -327,17 +339,20 @@ def constrained_agglomerative(self, points, tie_clusters=True): merged_points = points[merged_cluster] centroid = merged_points.mean(axis=0) - # compare to square distance to avoid computing the sqrt and save some computation time + # compare to square distance to avoid computing the sqrt + # and save some computation time if np.any(np.einsum('ij,ij->i', merged_points - centroid, - merged_points - centroid) > max_dist_to_centroid ** 2): + merged_points - + centroid) > max_dist_to_centroid ** 2): continue clusters[next_cluster_id] = merged_cluster centroids[next_cluster_id] = centroid del clusters[ci], clusters[cj] del centroids[ci], centroids[cj] - self.update_heap_vectorized(heap, clusters, centroids, centroid, merged_cluster, - next_cluster_id, max_size, max_dist_to_centroid) + self.update_heap_vectorized(heap, clusters, centroids, centroid, + merged_cluster, next_cluster_id, + max_size, max_dist_to_centroid) next_cluster_id += 1 @@ -347,11 +362,14 @@ def constrained_agglomerative(self, points, tie_clusters=True): is_final = cluster_lens >= min_size # Split clusters - final_clusters = [cluster_items[i] for i in range(len(cluster_items)) if is_final[i]] - leftovers = [cluster_items[i] for i in range(len(cluster_items)) if not is_final[i]] + final_clusters = [cluster_items[i] for i in + range(len(cluster_items)) if is_final[i]] + leftovers = [cluster_items[i] for i in + range(len(cluster_items)) if not is_final[i]] # Compute centroids for final clusters - centroids_arr = np.array([points[c].mean(axis=0) for c in final_clusters]) + centroids_arr = np.array([points[c].mean(axis=0) for c in + final_clusters]) # Snap to closest input point in cluster snapped_idxs = [ c[np.argmin(np.linalg.norm(points[c] - centroid, axis=1))] @@ -364,9 +382,11 @@ def constrained_agglomerative(self, points, tie_clusters=True): if len(cluster) == 1: idx = cluster[0] point = points[idx] - dists = np.linalg.norm(np.array(final_centroids) - point, axis=1) + dists = np.linalg.norm(np.array(final_centroids) - point, + axis=1) valid = [i for i in range(len(final_clusters)) if - len(final_clusters[i]) < max_size and dists[i] <= max_dist_to_centroid] + len(final_clusters[i]) < max_size and + dists[i] <= max_dist_to_centroid] if valid: best_fit = valid[np.argmin(dists[valid])] else: @@ -374,33 +394,38 @@ def constrained_agglomerative(self, points, tie_clusters=True): final_clusters[best_fit].append(idx) cluster_points = points[final_clusters[best_fit]] centroid = cluster_points.mean(axis=0) - snapped_idx = final_clusters[best_fit][np.argmin(np.linalg.norm(cluster_points - centroid, axis=1))] + snapped_idx = final_clusters[best_fit][np.argmin( + np.linalg.norm(cluster_points - centroid, axis=1))] final_centroids[best_fit] = points[snapped_idx] - centroid_ids[best_fit] = snapped_idx + centroid_ids[best_fit] = snapped_idx else: centroid = points[cluster].mean(axis=0) - dists = np.linalg.norm(np.array(final_centroids) - centroid, axis=1) + dists = np.linalg.norm(np.array(final_centroids) - centroid, + axis=1) best_fit = None for i in np.argsort(dists): potential = final_clusters[i] + cluster if len(potential) <= max_size: test_points = points[potential] test_centroid = test_points.mean(axis=0) - if np.all(np.linalg.norm(test_points - test_centroid, axis=1) <= max_dist_to_centroid): + if np.all(np.linalg.norm(test_points - + test_centroid, + axis=1) <= + 
+                                  max_dist_to_centroid):
                             best_fit = i
                             break
                 if best_fit is not None:
                     final_clusters[best_fit].extend(cluster)
                     cluster_points = points[final_clusters[best_fit]]
                     centroid = cluster_points.mean(axis=0)
-                    snapped_idx = final_clusters[best_fit][np.argmin(np.linalg.norm(cluster_points - centroid, axis=1))]
+                    snapped_idx = final_clusters[best_fit][np.argmin(
+                        np.linalg.norm(cluster_points - centroid, axis=1))]
                     final_centroids[best_fit] = points[snapped_idx]
-                    centroid_ids[best_fit]   = snapped_idx
+                    centroid_ids[best_fit] = snapped_idx
                 else:
-                    # tqdm.write(' -- cluster %i ended up with %i stations (min was %i)' % (len(final_clusters),
-                    #                                                                       len(cluster), min_size))
                     final_clusters.append(cluster)
-                    snapped_idx = cluster[np.argmin(np.linalg.norm(points[cluster] - centroid, axis=1))]
+                    snapped_idx = cluster[np.argmin(np.linalg.norm(
+                        points[cluster] - centroid, axis=1))]
                     final_centroids.append(points[snapped_idx])
                     centroid_ids.append(snapped_idx)
 
@@ -409,8 +434,8 @@ def constrained_agglomerative(self, points, tie_clusters=True):
             for i in cluster:
                 labels[i] = idx
 
-        self.clustered_ids  = final_clusters
-        self.centroid_ids   = centroid_ids
+        self.clustered_ids = final_clusters
+        self.centroid_ids = centroid_ids
 
         # ties clusters together
         if tie_clusters:
@@ -418,20 +443,24 @@ def constrained_agglomerative(self, points, tie_clusters=True):
 
         return final_clusters, labels, centroid_ids
 
-    def update_heap_vectorized(self, heap, clusters, centroids, centroid, merged_cluster, next_cluster_id, max_size,
+    def update_heap_vectorized(self, heap, clusters, centroids, centroid,
+                               merged_cluster, next_cluster_id, max_size,
                                max_dist_to_centroid):
         """
         Vectorized version to update the heap with valid cluster pairs after a merge.
 
         Parameters:
             heap (List[Tuple[float, int, int]]): The heap to update.
-            clusters (Dict[int, List[int]]): Dictionary of cluster_id -> list of point indices.
-            centroids (Dict[int, np.ndarray]): Dictionary of cluster_id -> centroid coordinates.
+            clusters (Dict[int, List[int]]): Dictionary of cluster_id -> list
+                of point indices.
+            centroids (Dict[int, np.ndarray]): Dictionary of cluster_id ->
+                centroid coordinates.
             centroid (np.ndarray): The centroid of the newly merged cluster.
             merged_cluster (List[int]): Indices of points in the new cluster.
             next_cluster_id (int): The ID of the new cluster.
             max_size (int): Maximum allowed cluster size.
-            max_dist_to_centroid (float): Maximum allowed distance for a valid connection.
+            max_dist_to_centroid (float): Maximum allowed distance for a valid
+                connection.
         """
         # Extract existing cluster IDs and their centroids
         existing_ids = np.array(list(clusters.keys()))
 
@@ -456,13 +485,17 @@ def add_tie_points(self, cluster_labels, num_neighbors=4, max_tie_distance=5_000
         to at least `num_neighbors` external clusters.
 
         Parameters:
-            cluster_labels (np.ndarray): Cluster index (0..K-1) for each station.
-            num_neighbors (int): Minimum number of external links each island must have.
+            cluster_labels (np.ndarray): Cluster index (0..K-1) for each
+                station.
+            num_neighbors (int): Minimum number of external links each island
+                must have.
             max_tie_distance (float): Max allowable tie distance in meters.
 
         Returns:
-            new_clusters (List[List[int]]): Cluster station indices including added tie points.
-            tie_points (List[List[int]]): Tie point indices added to each cluster.
+            new_clusters (List[List[int]]): Cluster station indices including
+                added tie points.
+            tie_points (List[List[int]]): Tie point indices added to each
+                cluster.
""" points = self.points @@ -503,13 +536,13 @@ def add_reciprocal_tie(i, j, pi_idx, pj_idx): pj = [idx for idx in clusters[j] if idx not in used_points] if not pi or not pj: continue - dist_matrix = np.linalg.norm(points[pi][:, None, :] - points[pj][None, :, :], axis=2) + dist_matrix = np.linalg.norm(points[pi][:, None, :] - + points[pj][None, :, :], axis=2) min_idx = np.unravel_index(np.argmin(dist_matrix), dist_matrix.shape) min_dist = dist_matrix[min_idx] if min_dist <= max_tie_distance: add_reciprocal_tie(i, j, pi[min_idx[0]], pj[min_idx[1]]) - # === Step 3: Build the graph from current tie connections === G = nx.Graph() G.add_nodes_from(range(n_clusters)) @@ -536,11 +569,16 @@ def add_reciprocal_tie(i, j, pi_idx, pj_idx): pj = [idx for idx in clusters[j] if idx not in used_points] if not pi or not pj: continue - dist_matrix = np.linalg.norm(points[pi][:, None, :] - points[pj][None, :, :], axis=2) - min_idx = np.unravel_index(np.argmin(dist_matrix), dist_matrix.shape) + dist_matrix = np.linalg.norm(points[pi][:, None, :] - + points[pj][None, :, :], + axis=2) + min_idx = np.unravel_index(np.argmin(dist_matrix), + dist_matrix.shape) dist = dist_matrix[min_idx] if dist <= max_tie_distance: - connection_candidates.append((dist, i, j, pi[min_idx[0]], pj[min_idx[1]])) + connection_candidates.append((dist, i, j, + pi[min_idx[0]], + pj[min_idx[1]])) # Sort connections by shortest distance connection_candidates.sort() @@ -574,18 +612,20 @@ def add_reciprocal_tie(i, j, pi_idx, pj_idx): self.OC = np.array(matrix) self.clustered_ids = new_clusters - self.tie_ids = tie_points + self.tie_ids = tie_points return new_clusters, tie_points def get_cluster_coordinates(self): - return [[self.points[i] for i in cluster] for cluster in self.clustered_ids] + return [[self.points[i] for i in cluster] + for cluster in self.clustered_ids] def get_centroid_coordinates(self): return [self.points[i] for i in self.centroid_ids] def get_tie_coordinates(self): - return [[self.points[i] for i in cluster] for cluster in self.tie_ids] + return [[self.points[i] for i in cluster] + for cluster in self.tie_ids] """Bisecting Q-means clustering.""" @@ -879,7 +919,7 @@ def fit(self, X, y=None, sample_weight=None): cluster_to_bisect = self._bisecting_tree.get_cluster_to_bisect() # Split this cluster into 2 subclusters - #if cluster_to_bisect is not None: + # if cluster_to_bisect is not None: if cluster_to_bisect.score > self.max_size: self._bisect(X, x_squared_norms, sample_weight, cluster_to_bisect) @@ -961,7 +1001,7 @@ def get_cluster_to_bisect(self): max_score = cluster_leaf.score best_cluster_leaf = cluster_leaf - #if max_score >= self.opt_size: + # if max_score >= self.opt_size: if np.isneginf(max_score): self.bisect = False else: From 382fa3728ed871e379156845266c0ad99c87599f Mon Sep 17 00:00:00 2001 From: espg Date: Tue, 13 May 2025 16:09:24 -0800 Subject: [PATCH 4/8] fixed off by one bug in over_cluster; updated documentation; renamed parameter and updated unit test and function calls; updated overcluster default parameters --- pgamit/cluster.py | 32 ++++++++++++++++++++------------ pgamit/network.py | 2 +- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pgamit/cluster.py b/pgamit/cluster.py index c809c619..fe4e64d7 100644 --- a/pgamit/cluster.py +++ b/pgamit/cluster.py @@ -105,7 +105,7 @@ def select_central_point(coordinates, centroids, metric='euclidean'): return idxs.squeeze() -def over_cluster(labels, coordinates, metric='haversine', neighbors=5, +def over_cluster(labels, coordinates, 
metric='euclidean', neighbors=5, overlap=2, rejection_threshold=5e6, method='static'): """Expand cluster membership to include edge points of neighbor clusters @@ -137,11 +137,12 @@ def over_cluster(labels, coordinates, metric='haversine', neighbors=5, clustering in an X,Y,Z projection, those coordinates can be provided in spherical coordinates, provided that 'haversine' is selected for the `metric` parameter. - metric : str or callable, default='haversine' + metric : str or callable, default='euclidean' Metric to use for distance computation. Any metric from scikit-learn or - scipy.spatial.distance can be used. Note that latitude and longitude - values will need to be converted to radians if using the default - 'haversine' distance metric. + scipy.spatial.distance can be used. + + Note that latitude and longitude values will need to be converted to + radians if using the default 'haversine' distance metric. If metric is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. The callable should @@ -168,24 +169,31 @@ def over_cluster(labels, coordinates, metric='haversine', neighbors=5, neighbors: int greater than or equal to 1, default=3 For method='static', this is total number of points that will be added to the seed clusters during cluster expansion. + For method='paired', this is the number of cluster that are used to + tie, with each cluster contributing exactly 2 points. For method='dynamic', this is the (zero-indexed) number of adjacent clusters to include when adding cluster membership overlap. Should be less than the number of unique cluster labels - 1. overlap : int greater than or equal to 1, default=2 - Should not exceed the size of the smallest cluster in `labels`. + Should not exceed the size of the smallest cluster in `labels`. Note + that this parameter has no effect for method='paired'. - rejection_threshold : float, default=None + rejection_threshold : float, default=5e6 Determines if any potential overlapping points should be rejected for being too far (from source centroid or nearest source edge point). - Default of 'None' is equivalent to setting the threshold to infinity. + Value of 'None' is equivalent to setting the threshold to infinity. Note that if value other than 'None' is used, there is no guarantee - that all clusters will have overlap points added. + that all clusters will have overlap points added. This parameter value + is required to be set when using method='paired'. - method : 'static' (default) or 'dynamic' + method : 'static' (default), 'paired', or 'dynamic' The 'static' method will always produce an overcluster equal to the `neighbors` parameter; 'dynamic' will produce an overcluster ceiling - of (neighbors - 1) * overlap_points, with a floor of neighbors. + of (neighbors - 1) * overlap_points, with a floor of neighbors. The + 'paired' method will add 2 * `nieghbors` points per cluster, one + of which is the closest nieghbor and one which is the farthest point + within that same nieghboring cluster. 
 
     Returns
     -------
@@ -221,7 +229,7 @@ def over_cluster(labels, coordinates, metric='euclidean', neighbors=5,
                                 metric=metric).fit(coordinates[members])
         if method == 'dynamic':
             coverage = len(np.unique(labels[output[cluster, :]]))
         else:  # method == 'static' or 'paired'
-            coverage = 0
+            coverage = 1
 
         while coverage <= neighbors:
             # intersect search tree with non-members
diff --git a/pgamit/network.py b/pgamit/network.py
index 667b7cc2..92567d15 100644
--- a/pgamit/network.py
+++ b/pgamit/network.py
@@ -187,7 +187,7 @@ def make_clusters(self, points, stations, net_limit=NET_LIMIT):
         central_points = select_central_point(points, qmean.cluster_centers_)
         # expand the initial clusters to overlap stations with neighbors
         OC = over_cluster(qmean.labels_, points, metric='euclidean',
-                          neighbors=self.ties, overlap_points=2)
+                          neighbors=self.ties, overlap=2)
         # set 'method=None' to disable
         OC, central_points = prune(OC, central_points, method='minsize')
         # calculate all 'tie' stations

From 65a70b51eee634dc91542776aacd1792f8b172d7 Mon Sep 17 00:00:00 2001
From: espg
Date: Tue, 13 May 2025 16:39:21 -0800
Subject: [PATCH 5/8] forgot to push changes from unit test file...

---
 pgamit/tests/test_make_clusters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgamit/tests/test_make_clusters.py b/pgamit/tests/test_make_clusters.py
index d95cc707..6092e01c 100644
--- a/pgamit/tests/test_make_clusters.py
+++ b/pgamit/tests/test_make_clusters.py
@@ -60,7 +60,7 @@ def test_max_clust_expansion(max_clust, neighbors, overlap):
     clust.fit(data)
 
     OC = over_cluster(clust.labels_, data, metric='euclidean',
-                      neighbors=neighbors, overlap_points=overlap,
+                      neighbors=neighbors, overlap=overlap,
                       method='dynamic')
 
     expanded_sizes = np.sum(OC, axis=1)

From d6b082262a323489391b33bf3358594aac8e44f1 Mon Sep 17 00:00:00 2001
From: espg
Date: Fri, 23 May 2025 15:03:57 -0400
Subject: [PATCH 6/8] updating function name from `over_cluster` to
 `overcluster` to match paper discussion

---
 pgamit/cluster.py                  | 6 +++---
 pgamit/network.py                  | 6 +++---
 pgamit/tests/test_make_clusters.py | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pgamit/cluster.py b/pgamit/cluster.py
index fe4e64d7..a15f19cc 100644
--- a/pgamit/cluster.py
+++ b/pgamit/cluster.py
@@ -25,7 +25,7 @@
 
 
 def prune(OC, central_points, method='minsize'):
-    """Prune redundant clusters from over cluster (OC) and other arrays
+    """Prune redundant clusters from overcluster (OC) and other arrays
 
     Parameters
     ----------
@@ -105,7 +105,7 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
     return idxs.squeeze()
 
 
-def over_cluster(labels, coordinates, metric='euclidean', neighbors=5,
+def overcluster(labels, coordinates, metric='euclidean', neighbors=5,
                  overlap=2, rejection_threshold=5e6, method='static'):
     """Expand cluster membership to include edge points of neighbor clusters
 
@@ -267,7 +267,7 @@ def overcluster(labels, coordinates, metric='euclidean', neighbors=5,
                 # Add to member label array
                 output[cluster, new_member] = 1
                 if method == 'dynamic':
-                    # Update current count of over-clustered neighbors
+                    # Update current count of overclustered neighbors
                     coverage = len(np.unique(labels[output[cluster, :]]))
                 elif method == 'static':
                     # Update current point expansion count
diff --git a/pgamit/network.py b/pgamit/network.py
index 92567d15..301aa1c9 100644
--- a/pgamit/network.py
+++ b/pgamit/network.py
@@ -37,7 +37,7 @@
 # app
 from pgamit.pyGamitSession import GamitSession
 from pgamit.pyStation import StationCollection
-from pgamit.cluster import (BisectingQMeans, over_cluster, prune,
+from pgamit.cluster import (BisectingQMeans, overcluster, prune,
                             select_central_point)
 from pgamit.plots import plot_global_network
 
@@ -186,8 +186,8 @@ def make_clusters(self, points, stations, net_limit=NET_LIMIT):
         # snap centroids to closest station coordinate
         central_points = select_central_point(points, qmean.cluster_centers_)
         # expand the initial clusters to overlap stations with neighbors
-        OC = over_cluster(qmean.labels_, points, metric='euclidean',
-                          neighbors=self.ties, overlap=2)
+        OC = overcluster(qmean.labels_, points, metric='euclidean',
+                         neighbors=self.ties, overlap=2)
         # set 'method=None' to disable
         OC, central_points = prune(OC, central_points, method='minsize')
         # calculate all 'tie' stations
diff --git a/pgamit/tests/test_make_clusters.py b/pgamit/tests/test_make_clusters.py
index 6092e01c..ca9812bb 100644
--- a/pgamit/tests/test_make_clusters.py
+++ b/pgamit/tests/test_make_clusters.py
@@ -5,7 +5,7 @@
 import numpy as np
 
 from .common import gen_variable_density_clusters, generate_clustered_data
-from ..cluster import BisectingQMeans, over_cluster
+from ..cluster import BisectingQMeans, overcluster
 
 
 @pytest.mark.parametrize(
@@ -48,7 +48,7 @@ def test_max_clust_expansion(max_clust, neighbors, overlap):
-    """Test algorithmic guarantee of `over_cluster`
+    """Test algorithmic guarantee of `overcluster`
 
     Verify that expanded cluster size is under (<=, less than or equal to):
     [initial cluster size + (neighbors * overlap)]"""
@@ -59,7 +59,7 @@ def test_max_clust_expansion(max_clust, neighbors, overlap):
                             max_iter=8000, random_state=42)
     clust.fit(data)
 
-    OC = over_cluster(clust.labels_, data, metric='euclidean',
+    OC = overcluster(clust.labels_, data, metric='euclidean',
                      neighbors=neighbors, overlap=overlap,
                       method='dynamic')

From 34ad027870da26a398851897a335cf9e6d154ae1 Mon Sep 17 00:00:00 2001
From: espg
Date: Sat, 24 May 2025 22:27:20 -0400
Subject: [PATCH 7/8] better parameter names

---
 pgamit/cluster.py                  | 30 +++++++++++++++---------------
 pgamit/network.py                  |  4 ++--
 pgamit/tests/test_make_clusters.py | 18 +++++++++---------
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/pgamit/cluster.py b/pgamit/cluster.py
index a15f19cc..e3ecf5e6 100644
--- a/pgamit/cluster.py
+++ b/pgamit/cluster.py
@@ -105,18 +105,18 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
     return idxs.squeeze()
 
 
-def overcluster(labels, coordinates, metric='euclidean', neighbors=5,
-                overlap=2, rejection_threshold=5e6, method='static'):
+def overcluster(labels, coordinates, metric='euclidean', overlap=5,
+                nmax=2, rejection_threshold=5e6, method='static'):
     """Expand cluster membership to include edge points of neighbor clusters
 
     Expands an existing clustering to create overlapping membership between
     clusters. Existing clusters are processed sequentially by looking up
     nearest neighbors as an intersection of the current cluster membership and
-    all clusters' point membership. Once the `overlap` points for a given
+    all clusters' point membership. Once the `nmax` points for a given
     neighbor cluster have been determined and added to current cluster, the
     remainder of that neighboring cluster is removed from consideration and
     distance query is rerun, with the process repeating until a number of
-    clusters equal to the `neighborhood` parameter is reached. For stability,
+    clusters equal to the `overlap` parameter is reached. For stability,
     only original points are included for subsequent neighborhood searches;
     that is, Nearest neighbor distances are run as the shortest distance from
     all **original** members of the current cluster.
@@ -166,13 +166,13 @@ def overcluster(labels, coordinates, metric='euclidean', overlap=5,
         Sparse matrices are only supported by scikit-learn metrics. See the
         documentation for scipy.spatial.distance for details on these metrics.
 
-    neighbors: int greater than or equal to 1, default=3
+    overlap: int greater than or equal to 1, default=5
         For method='static', this is total number of points that will be
         added to the seed clusters during cluster expansion.
         For method='paired', this is the number of clusters that are used to
         tie, with each cluster contributing exactly 2 points.
         For method='dynamic', this is the (zero-indexed) number of adjacent
         clusters to include when adding cluster membership overlap. Should be
         less than the number of unique cluster labels - 1.
 
-    overlap : int greater than or equal to 1, default=2
+    nmax : int greater than or equal to 1, default=2
         Should not exceed the size of the smallest cluster in `labels`. Note
         that this parameter has no effect for method='paired'.
 
@@ -189,8 +189,8 @@ def overcluster(labels, coordinates, metric='euclidean', overlap=5,
     method : 'static' (default), 'paired', or 'dynamic'
         The 'static' method will always produce an overcluster equal to the
-        `neighbors` parameter; 'dynamic' will produce an overcluster ceiling
-        of (neighbors - 1) * overlap_points, with a floor of neighbors. The
-        'paired' method will add 2 * `neighbors` points per cluster, one
+        `overlap` parameter; 'dynamic' will produce an overcluster ceiling
+        of (overlap - 1) * nmax, with a floor of overlap. The
+        'paired' method will add 2 * `overlap` points per cluster, one
         of which is the closest neighbor and one which is the farthest point
         within that same neighboring cluster.
@@ -209,8 +209,8 @@ def overcluster(labels, coordinates, metric='euclidean', overlap=5,
 
     clusters = np.unique(labels)
     n_clusters = len(clusters)
-    if (n_clusters - 1) < neighbors:
-        neighbors = (n_clusters - 1)
+    if (n_clusters - 1) < overlap:
+        overlap = (n_clusters - 1)
 
     # reference index for reverse lookups
     ridx = np.array(list(range(len(labels))))
@@ -231,7 +231,7 @@ def overcluster(labels, coordinates, metric='euclidean', overlap=5,
         if method == 'dynamic':
             coverage = len(np.unique(labels[output[cluster, :]]))
         else:  # method == 'static' or 'paired'
             coverage = 1
 
-        while coverage <= neighbors:
+        while coverage <= overlap:
             # intersect search tree with non-members
             D, indx = nbrs.kneighbors(coordinates[nonmembers, :])
             mindex = np.argmin(D)
@@ -273,7 +273,7 @@ def overcluster(labels, coordinates, metric='euclidean', overlap=5,
                     # Update current point expansion count
                     coverage += 1
                 # Check if we've exceeded our overlap allotment...
-                if sum(labels[output[cluster, :]] == nm_label) >= overlap:
+                if sum(labels[output[cluster, :]] == nm_label) >= nmax:
                     # ...if so, remove entire neighboring cluster
                     remove = nm_label == labels
                     nonmembers[remove] = False
@@ -659,7 +659,7 @@ class BisectingQMeans(_BaseKMeans):
 
     Parameters
     ----------
-    max_size: int, default=25
+    qmax: int, default=25
         Hard cutoff to bypass the heuristic when bisecting clusters; no
         clusters greater than this size will be produced.
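+
+        A minimal fit sketch (synthetic coordinates; parameter values are
+        illustrative and mirror the unit tests)::
+
+            import numpy as np
+            points = np.random.rand(300, 3)
+            model = BisectingQMeans(qmax=25, random_state=42)
+            model.fit(points)
+            # every cluster in model.labels_ has at most 25 members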
@@ -759,7 +759,7 @@ class BisectingQMeans(_BaseKMeans):
 
     def __init__(
         self,
-        max_size=25,
+        qmax=25,
         *,
         init="random",
         n_init=1,
@@ -781,7 +781,7 @@ class BisectingQMeans(_BaseKMeans):
             n_init=n_init,
         )
 
-        self.max_size = max_size
+        self.max_size = qmax
         self.copy_x = copy_x
         self.algorithm = algorithm
         self.bisect = True
diff --git a/pgamit/network.py b/pgamit/network.py
index 301aa1c9..e1631ce8 100644
--- a/pgamit/network.py
+++ b/pgamit/network.py
@@ -181,13 +181,13 @@ def __init__(self, cnn, archive, GamitConfig, stations, date,
 
     def make_clusters(self, points, stations, net_limit=NET_LIMIT):
         # Run initial clustering using bisecting 'q-means'
-        qmean = BisectingQMeans(max_size=self.cluster_size, random_state=42)
+        qmean = BisectingQMeans(qmax=self.cluster_size, random_state=42)
         qmean.fit(points)
         # snap centroids to closest station coordinate
         central_points = select_central_point(points, qmean.cluster_centers_)
         # expand the initial clusters to overlap stations with neighbors
         OC = overcluster(qmean.labels_, points, metric='euclidean',
-                         neighbors=self.ties, overlap=2)
+                         overlap=self.ties, nmax=2)
         # set 'method=None' to disable
         OC, central_points = prune(OC, central_points, method='minsize')
         # calculate all 'tie' stations
diff --git a/pgamit/tests/test_make_clusters.py b/pgamit/tests/test_make_clusters.py
index ca9812bb..665c8ea2 100644
--- a/pgamit/tests/test_make_clusters.py
+++ b/pgamit/tests/test_make_clusters.py
@@ -9,7 +9,7 @@
 
 
 @pytest.mark.parametrize(
-    ("max_size", "clust_size"),
+    ("qmax", "clust_size"),
     [
         [5, 10],
         [7, 10],
@@ -20,24 +20,24 @@
         [30, 250],
     ],
 )
-def test_ceiling_variable_density(max_size, clust_size):
+def test_ceiling_variable_density(qmax, clust_size):
     """Test algorithmic guarantee of BisectingQMeans on variable density data
 
-    Verify that when `min_size=2`, that the max per cluster membership is
-    under (<, less than) what parameter `opt_cluster_size` is set to"""
+    Verify that the maximum per-cluster membership does not exceed
+    (<=, less than or equal to) the `qmax` parameter"""
     data = gen_variable_density_clusters(clust_size)
-    clust = BisectingQMeans(max_size=max_size,
+    clust = BisectingQMeans(qmax=qmax,
                             init='random', n_init=50, algorithm='lloyd',
                             max_iter=8000, random_state=42)
     clust.fit(data)
 
     _, counts = np.unique(clust.labels_, return_counts=True)
-    assert np.max(counts) <= max_size
+    assert np.max(counts) <= qmax
 
 
 @pytest.mark.parametrize(
-    ("max_clust", "neighbors", "overlap"),
+    ("qmax", "overlap", "nmax"),
     [
         [5, 5, 2],
         [10, 2, 5],
@@ -47,23 +47,23 @@
         [17, 10, 1],
     ],
 )
-def test_max_clust_expansion(max_clust, neighbors, overlap):
+def test_max_clust_expansion(qmax, overlap, nmax):
     """Test algorithmic guarantee of `overcluster`
 
     Verify that expanded cluster size is under (<=, less than or equal to):
-    [initial cluster size + (neighbors * overlap)]"""
+    [initial cluster size + (overlap * nmax)]"""
     data = gen_variable_density_clusters()
-    clust = BisectingQMeans(max_size=max_clust,
+    clust = BisectingQMeans(qmax=qmax,
                             init='random', n_init=50, algorithm='lloyd',
                             max_iter=8000, random_state=42)
     clust.fit(data)
 
-    OC = overcluster(clust.labels_, data, metric='euclidean',
-                     neighbors=neighbors, overlap=overlap,
-                     method='dynamic')
+    OC = overcluster(clust.labels_, data, metric='euclidean',
+                     overlap=overlap, nmax=nmax,
+                     method='dynamic')
 
     expanded_sizes = np.sum(OC, axis=1)
     _, original_sizes = np.unique(clust.labels_, return_counts=True)
-    assert np.all((expanded_sizes - original_sizes) <= neighbors*overlap)
+    assert np.all((expanded_sizes - original_sizes) <= overlap*nmax)

From e02ba07548064bad541d6339f3fcf4e7f546bf37 Mon Sep 17 00:00:00 2001
From: espg
Date: Fri, 13 Jun 2025 11:06:38 -0600
Subject: [PATCH 8/8] separated agglomerative functions into their own module;
 reduces both modules to below 1000 lines, makes clustering pep8 compliant
 again, displays proper unit test coverage percentages

---
 pgamit/agglomerative.py | 365 ++++++++++++++++++++++++++++++++++++++++
 pgamit/cluster.py       | 362 +--------------------------------------
 2 files changed, 366 insertions(+), 361 deletions(-)
 create mode 100644 pgamit/agglomerative.py

diff --git a/pgamit/agglomerative.py b/pgamit/agglomerative.py
new file mode 100644
index 00000000..e11c18d0
--- /dev/null
+++ b/pgamit/agglomerative.py
@@ -0,0 +1,365 @@
+import heapq
+import networkx as nx
+import numpy as np
+
+from scipy.spatial.distance import pdist, squareform
+from sklearn.neighbors import NearestNeighbors
+
+
+class DeterministicClustering(object):
+    def __init__(self, target_size=15, tolerance=4,
+                 num_tie_points=4, max_dist_to_centroid=5_000_000):
+
+        self.target_size = target_size
+        self.num_tie_points = num_tie_points
+        self.tolerance = tolerance
+        self.max_dist = max_dist_to_centroid
+        self.points = np.array([])
+        self.OC = np.array([])
+        # variables to store results
+        self.centroid_ids = []
+        self.clustered_ids = []
+        self.tie_ids = []
+
+    def constrained_agglomerative(self, points, tie_clusters=True):
+        """
+        Spatially-constrained agglomerative clustering with centroid snapping.
+
+        Parameters:
+            points (np.ndarray): Nx3 ECEF coordinates.
+            tie_clusters (bool): to tie clusters together using neighbors,
+                pass True
+
+        Returns:
+            clustered_points (List[List[np.ndarray]]): Points per cluster.
+            labels (np.ndarray): Cluster label for each point.
+            final_centroids (List[np.ndarray]): Snapped centroids from points
+                for each cluster.
+        """
+        target_size = self.target_size
+        margin = self.tolerance
+        max_dist_to_centroid = self.max_dist
+
+        points = np.array(points)
+        self.points = points
+
+        min_size = target_size - margin
+        max_size = target_size + margin
+
+        clusters = {i: [i] for i in range(len(points))}
+        centroids = {i: points[i] for i in range(len(points))}
+        next_cluster_id = len(points)
+
+        # Vectorized pairwise distance computation
+        pairwise_dists = squareform(pdist(points))
+        i_idx, j_idx = np.triu_indices(len(points), k=1)
+        dists = pairwise_dists[i_idx, j_idx]
+        valid_mask = dists <= max_dist_to_centroid * 2
+        i_valid = i_idx[valid_mask]
+        j_valid = j_idx[valid_mask]
+        d_valid = dists[valid_mask]
+
+        heap = list(zip(d_valid, i_valid, j_valid))
+        heapq.heapify(heap)
+
+        while heap:
+            dist, ci, cj = heapq.heappop(heap)
+            if ci not in clusters or cj not in clusters:
+                continue
+
+            merged_cluster = clusters[ci] + clusters[cj]
+            if len(merged_cluster) > max_size:
+                continue
+
+            merged_points = points[merged_cluster]
+            centroid = merged_points.mean(axis=0)
+            # compare to square distance to avoid computing the sqrt
+            # and save some computation time
+            if np.any(np.einsum('ij,ij->i', merged_points - centroid,
+                                merged_points -
+                                centroid) > max_dist_to_centroid ** 2):
+                continue
+
+            clusters[next_cluster_id] = merged_cluster
+            centroids[next_cluster_id] = centroid
+            del clusters[ci], clusters[cj]
+            del centroids[ci], centroids[cj]
+            self.update_heap_vectorized(heap, clusters, centroids, centroid,
+                                        merged_cluster, next_cluster_id,
+                                        max_size, max_dist_to_centroid)
+
+            next_cluster_id += 1
+
+        # Precompute centroids and classify clusters
+        cluster_items = list(clusters.values())
+        cluster_lens = np.array([len(c) for c in cluster_items])
+        is_final = cluster_lens >= min_size
+
+        # Split clusters
+        final_clusters = [cluster_items[i] for i in
+                          range(len(cluster_items)) if is_final[i]]
+        leftovers = [cluster_items[i] for i in
+                     range(len(cluster_items)) if not is_final[i]]
+
+        # Compute centroids for final clusters
+        centroids_arr = np.array([points[c].mean(axis=0) for c in
+                                  final_clusters])
+        # Snap to closest input point in cluster
+        snapped_idxs = [
+            c[np.argmin(np.linalg.norm(points[c] - centroid, axis=1))]
+            for c, centroid in zip(final_clusters, centroids_arr)
+        ]
+        final_centroids = list(points[snapped_idxs])
+        centroid_ids = snapped_idxs
+
+        for cluster in leftovers:
+            if len(cluster) == 1:
+                idx = cluster[0]
+                point = points[idx]
+                dists = np.linalg.norm(np.array(final_centroids) - point,
+                                       axis=1)
+                valid = [i for i in range(len(final_clusters)) if
+                         len(final_clusters[i]) < max_size and
+                         dists[i] <= max_dist_to_centroid]
+                if valid:
+                    best_fit = valid[np.argmin(dists[valid])]
+                else:
+                    best_fit = np.argmin(dists)
+                final_clusters[best_fit].append(idx)
+                cluster_points = points[final_clusters[best_fit]]
+                centroid = cluster_points.mean(axis=0)
+                snapped_idx = final_clusters[best_fit][np.argmin(
+                    np.linalg.norm(cluster_points - centroid, axis=1))]
+                final_centroids[best_fit] = points[snapped_idx]
+                centroid_ids[best_fit] = snapped_idx
+            else:
+                centroid = points[cluster].mean(axis=0)
+                dists = np.linalg.norm(np.array(final_centroids) - centroid,
+                                       axis=1)
+                best_fit = None
+                for i in np.argsort(dists):
+                    potential = final_clusters[i] + cluster
+                    if len(potential) <= max_size:
+                        test_points = points[potential]
+                        test_centroid = test_points.mean(axis=0)
+                        if np.all(np.linalg.norm(test_points -
+                                                 test_centroid,
+                                                 axis=1) <=
+                                  max_dist_to_centroid):
+                            best_fit = i
+                            break
+                if best_fit is not None:
+                    final_clusters[best_fit].extend(cluster)
+                    cluster_points = points[final_clusters[best_fit]]
+                    centroid = cluster_points.mean(axis=0)
+                    snapped_idx = final_clusters[best_fit][np.argmin(
+                        np.linalg.norm(cluster_points - centroid, axis=1))]
+                    final_centroids[best_fit] = points[snapped_idx]
+                    centroid_ids[best_fit] = snapped_idx
+                else:
+                    final_clusters.append(cluster)
+                    snapped_idx = cluster[np.argmin(np.linalg.norm(
+                        points[cluster] - centroid, axis=1))]
+                    final_centroids.append(points[snapped_idx])
+                    centroid_ids.append(snapped_idx)
+
+        labels = np.zeros(len(points), dtype=int)
+        for idx, cluster in enumerate(final_clusters):
+            for i in cluster:
+                labels[i] = idx
+
+        self.clustered_ids = final_clusters
+        self.centroid_ids = centroid_ids
+
+        # ties clusters together
+        if tie_clusters:
+            self.add_tie_points(labels, self.num_tie_points, self.max_dist)
+
+        return final_clusters, labels, centroid_ids
+
+    def update_heap_vectorized(self, heap, clusters, centroids, centroid,
+                               merged_cluster, next_cluster_id, max_size,
+                               max_dist_to_centroid):
+        """
+        Vectorized version to update heap with valid cluster pairs post merge
+
+        Parameters:
+            heap (List[Tuple[float, int, int]]): The heap to update.
+            clusters (Dict[int, List[int]]): Dictionary of cluster_id -> list
+                of point indices.
+            centroids (Dict[int, np.ndarray]): Dictionary of cluster_id ->
+                centroid coordinates.
+            centroid (np.ndarray): The centroid of the newly merged cluster.
+            merged_cluster (List[int]): Indices of points in the new cluster.
+            next_cluster_id (int): The ID of the new cluster.
+            max_size (int): Maximum allowed cluster size.
+            max_dist_to_centroid (float): Maximum allowed distance for a valid
+                connection.
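+
+        A toy sketch (hypothetical values; assumes an Nx3 ``points`` array,
+        with clusters 3 and 5 remaining after a merge into id 5)::
+
+            heap = []
+            clusters = {3: [4, 5], 5: [0, 1, 2]}
+            centroids = {3: points[[4, 5]].mean(axis=0),
+                         5: points[[0, 1, 2]].mean(axis=0)}
+            self.update_heap_vectorized(heap, clusters, centroids,
+                                        centroids[5], clusters[5], 5,
+                                        19, 5_000_000)
+            # heap now holds (dist, 5, 3) if cluster 3 passes the
+            # size and distance checks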
+        """
+        # Extract existing cluster IDs and their centroids
+        existing_ids = np.array(list(clusters.keys()))
+        existing_centroids = np.array([centroids[k] for k in existing_ids])
+        existing_sizes = np.array([len(clusters[k]) for k in existing_ids])
+
+        # Compute distance from new centroid to all others
+        dists = np.linalg.norm(existing_centroids - centroid, axis=1)
+
+        # Evaluate which existing clusters are valid for merging
+        size_sum = existing_sizes + len(merged_cluster)
+        valid = ((existing_ids != next_cluster_id) &
+                 (size_sum <= max_size) &
+                 (dists <= max_dist_to_centroid * 2))
+
+        # Push valid merge pairs into heap
+        for other_id, dist in zip(existing_ids[valid], dists[valid]):
+            heapq.heappush(heap, (dist, next_cluster_id, other_id))
+
+    def add_tie_points(self, cluster_labels, num_neighbors=4,
+                       max_tie_distance=5_000_000):
+        """
+        Add reciprocal tie points to each cluster from its nearest neighbors,
+        then ensure every disconnected cluster component (island) is connected
+        to at least `num_neighbors` external clusters.
+
+        Parameters:
+            cluster_labels (np.ndarray): Cluster index (0..K-1) for each
+                station.
+            num_neighbors (int): Minimum number of external links each island
+                must have.
+            max_tie_distance (float): Max allowable tie distance in meters.
+
+        Returns:
+            new_clusters (List[List[int]]): Cluster station indices including
+                added tie points.
+            tie_points (List[List[int]]): Tie point indices added to each
+                cluster.
+        """
+
+        points = self.points
+        labels = cluster_labels
+        centroids = self.get_centroid_coordinates()
+
+        n_clusters = len(centroids)
+        n_stations = points.shape[0]
+
+        # === Step 1: Initialize cluster structure ===
+        clusters = [[] for _ in range(n_clusters)]
+        for idx, label in enumerate(labels):
+            clusters[label].append(idx)
+
+        new_clusters = [list(cluster) for cluster in clusters]
+        tie_points = [[] for _ in range(n_clusters)]
+        used_points = set()
+        tie_log = np.zeros((n_clusters, n_clusters), dtype=bool)
+
+        # === Step 2: Initial connections to nearby neighbors using centroids
+        nbrs = NearestNeighbors(n_neighbors=num_neighbors + 1).fit(centroids)
+        _, neighbors = nbrs.kneighbors(centroids)
+
+        def add_reciprocal_tie(i, j, pi_idx, pj_idx):
+            new_clusters[i].append(pj_idx)
+            new_clusters[j].append(pi_idx)
+            tie_points[i].append(pj_idx)
+            tie_points[j].append(pi_idx)
+            used_points.update({pi_idx, pj_idx})
+            tie_log[i, j] = True
+            tie_log[j, i] = True
+
+        for i in range(n_clusters):
+            for j in neighbors[i][1:]:
+                if tie_log[i, j] or tie_log[j, i]:
+                    continue
+                pi = [idx for idx in clusters[i] if idx not in used_points]
+                pj = [idx for idx in clusters[j] if idx not in used_points]
+                if not pi or not pj:
+                    continue
+                dist_matrix = np.linalg.norm(points[pi][:, None, :] -
+                                             points[pj][None, :, :], axis=2)
+                min_idx = np.unravel_index(np.argmin(dist_matrix), dist_matrix.shape)
+                min_dist = dist_matrix[min_idx]
+                if min_dist <= max_tie_distance:
+                    add_reciprocal_tie(i, j, pi[min_idx[0]], pj[min_idx[1]])
+
+        # === Step 3: Build the graph from current tie connections ===
+        G = nx.Graph()
+        G.add_nodes_from(range(n_clusters))
+        for i in range(n_clusters):
+            for j in range(i + 1, n_clusters):
+                if set(tie_points[i]) & set(clusters[j]) or set(tie_points[j]) & set(clusters[i]):
+                    G.add_edge(i, j)
+
+        # === Step 4: Ensure every disconnected component connects to at least `num_neighbors` others ===
+        components = sorted(list(nx.connected_components(G)), key=len)
+
+        while len(components) > 1:
+            updated = False  # Flag to break early if no valid connections were made
+
+            for comp in components:
+                external_nodes = set(range(n_clusters)) - comp
+                connection_candidates = []
+
+                for i in comp:
+                    for j in external_nodes:
+                        if tie_log[i, j] or tie_log[j, i]:
+                            continue
+                        pi = [idx for idx in clusters[i] if idx not in used_points]
+                        pj = [idx for idx in clusters[j] if idx not in used_points]
+                        if not pi or not pj:
+                            continue
+                        dist_matrix = np.linalg.norm(points[pi][:, None, :] -
+                                                     points[pj][None, :, :],
+                                                     axis=2)
+                        min_idx = np.unravel_index(np.argmin(dist_matrix),
+                                                   dist_matrix.shape)
+                        dist = dist_matrix[min_idx]
+                        if dist <= max_tie_distance:
+                            connection_candidates.append((dist, i, j,
+                                                          pi[min_idx[0]],
+                                                          pj[min_idx[1]]))
+
+                # Sort connections by shortest distance
+                connection_candidates.sort()
+                added = 0
+                for conn in connection_candidates:
+                    _, i, j, pi_idx, pj_idx = conn
+                    if not G.has_edge(i, j):
+                        add_reciprocal_tie(i, j, pi_idx, pj_idx)
+                        G.add_edge(i, j)
+                        added += 1
+                        updated = True
+                        if added >= num_neighbors:
+                            break
+
+                if added < num_neighbors:
+                    pass
+                    # tqdm.write(f" -- WARNING: Component containing clusters {sorted(comp)} "
+                    #            f"was only connected to {added} external clusters under the distance constraint.")
+
+            if not updated:
+                # tqdm.write(" -- WARNING: Some disconnected components could not be joined with the rest of the graph.")
+                break
+
+            components = list(nx.connected_components(G))
+
+        # create matrix with clusters and stations
+        matrix = np.zeros((n_clusters, n_stations), dtype=bool)
+        for i, cluster in enumerate(new_clusters):
+            matrix[i, cluster] = True
+
+        self.OC = np.array(matrix)
+
+        self.clustered_ids = new_clusters
+        self.tie_ids = tie_points
+
+        return new_clusters, tie_points
+
+    def get_cluster_coordinates(self):
+        return [[self.points[i] for i in cluster]
+                for cluster in self.clustered_ids]
+
+    def get_centroid_coordinates(self):
+        return [self.points[i] for i in self.centroid_ids]
+
+    def get_tie_coordinates(self):
+        return [[self.points[i] for i in cluster]
+                for cluster in self.tie_ids]
diff --git a/pgamit/cluster.py b/pgamit/cluster.py
index e3ecf5e6..97a334c6 100644
--- a/pgamit/cluster.py
+++ b/pgamit/cluster.py
@@ -7,10 +7,6 @@
 import numpy as np
 import pandas as pd
 import scipy.sparse as sp
-import heapq
-import networkx as nx
-
-from scipy.spatial.distance import pdist, squareform
 
 from sklearn.neighbors import NearestNeighbors
 from sklearn.metrics import pairwise_distances
@@ -105,7 +101,7 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
     return idxs.squeeze()
 
 
-def overcluster(labels, coordinates, metric='euclidean', overlap=5,
+def overcluster(labels, coordinates, metric='euclidean', overlap=4,
                 nmax=2, rejection_threshold=5e6, method='static'):
     """Expand cluster membership to include edge points of neighbor clusters
 
@@ -280,362 +276,6 @@ def overcluster(labels, coordinates, metric='euclidean', overlap=4,
     return output
 
 
-class DeterministicClustering(object):
-    def __init__(self, target_size=15, tolerance=4,
-                 num_tie_points=4, max_dist_to_centroid=5_000_000):
-
-        self.target_size = target_size
-        self.num_tie_points = num_tie_points
-        self.tolerance = tolerance
-        self.max_dist = max_dist_to_centroid
-        self.points = np.array([])
-        self.OC = np.array([])
-        # variables to store results
-        self.centroid_ids = []
-        self.clustered_ids = []
-        self.tie_ids = []
-
-    def constrained_agglomerative(self, points, tie_clusters=True):
-        """
-        Spatially-constrained agglomerative clustering with centroid snapping.
-
-        Parameters:
-            points (np.ndarray): Nx3 ECEF coordinates.
-            tie_clusters (bool): to tie clusters together using neighbors,
-                pass True
-
-        Returns:
-            clustered_points (List[List[np.ndarray]]): Points per cluster.
-            labels (np.ndarray): Cluster label for each point.
-            final_centroids (List[np.ndarray]): Snapped centroids from points
-                for each cluster.
-        """
-        target_size = self.target_size
-        margin = self.tolerance
-        max_dist_to_centroid = self.max_dist
-
-        points = np.array(points)
-        self.points = points
-
-        min_size = target_size - margin
-        max_size = target_size + margin
-
-        clusters = {i: [i] for i in range(len(points))}
-        centroids = {i: points[i] for i in range(len(points))}
-        next_cluster_id = len(points)
-
-        # Vectorized pairwise distance computation
-        pairwise_dists = squareform(pdist(points))
-        i_idx, j_idx = np.triu_indices(len(points), k=1)
-        dists = pairwise_dists[i_idx, j_idx]
-        valid_mask = dists <= max_dist_to_centroid * 2
-        i_valid = i_idx[valid_mask]
-        j_valid = j_idx[valid_mask]
-        d_valid = dists[valid_mask]
-
-        heap = list(zip(d_valid, i_valid, j_valid))
-        heapq.heapify(heap)
-
-        while heap:
-            dist, ci, cj = heapq.heappop(heap)
-            if ci not in clusters or cj not in clusters:
-                continue
-
-            merged_cluster = clusters[ci] + clusters[cj]
-            if len(merged_cluster) > max_size:
-                continue
-
-            merged_points = points[merged_cluster]
-            centroid = merged_points.mean(axis=0)
-            # compare to square distance to avoid computing the sqrt
-            # and save some computation time
-            if np.any(np.einsum('ij,ij->i', merged_points - centroid,
-                                merged_points -
-                                centroid) > max_dist_to_centroid ** 2):
-                continue
-
-            clusters[next_cluster_id] = merged_cluster
-            centroids[next_cluster_id] = centroid
-            del clusters[ci], clusters[cj]
-            del centroids[ci], centroids[cj]
-            self.update_heap_vectorized(heap, clusters, centroids, centroid,
-                                        merged_cluster, next_cluster_id,
-                                        max_size, max_dist_to_centroid)
-
-            next_cluster_id += 1
-
-        # Precompute centroids and classify clusters
-        cluster_items = list(clusters.values())
-        cluster_lens = np.array([len(c) for c in cluster_items])
-        is_final = cluster_lens >= min_size
-
-        # Split clusters
-        final_clusters = [cluster_items[i] for i in
-                          range(len(cluster_items)) if is_final[i]]
-        leftovers = [cluster_items[i] for i in
-                     range(len(cluster_items)) if not is_final[i]]
-
-        # Compute centroids for final clusters
-        centroids_arr = np.array([points[c].mean(axis=0) for c in
-                                  final_clusters])
-        # Snap to closest input point in cluster
-        snapped_idxs = [
-            c[np.argmin(np.linalg.norm(points[c] - centroid, axis=1))]
-            for c, centroid in zip(final_clusters, centroids_arr)
-        ]
-        final_centroids = list(points[snapped_idxs])
-        centroid_ids = snapped_idxs
-
-        for cluster in leftovers:
-            if len(cluster) == 1:
-                idx = cluster[0]
-                point = points[idx]
-                dists = np.linalg.norm(np.array(final_centroids) - point,
-                                       axis=1)
-                valid = [i for i in range(len(final_clusters)) if
-                         len(final_clusters[i]) < max_size and
-                         dists[i] <= max_dist_to_centroid]
-                if valid:
-                    best_fit = valid[np.argmin(dists[valid])]
-                else:
-                    best_fit = np.argmin(dists)
-                final_clusters[best_fit].append(idx)
-                cluster_points = points[final_clusters[best_fit]]
-                centroid = cluster_points.mean(axis=0)
-                snapped_idx = final_clusters[best_fit][np.argmin(
-                    np.linalg.norm(cluster_points - centroid, axis=1))]
-                final_centroids[best_fit] = points[snapped_idx]
-                centroid_ids[best_fit] = snapped_idx
-            else:
-                centroid = points[cluster].mean(axis=0)
-                dists = np.linalg.norm(np.array(final_centroids) - centroid,
-                                       axis=1)
-                best_fit = None
-                for i in np.argsort(dists):
-                    potential = final_clusters[i] + cluster
-                    if len(potential) <= max_size:
-                        test_points = points[potential]
-                        test_centroid = test_points.mean(axis=0)
-                        if np.all(np.linalg.norm(test_points -
-                                                 test_centroid,
-                                                 axis=1) <=
-                                  max_dist_to_centroid):
-                            best_fit = i
-                            break
-                if best_fit is not None:
-                    final_clusters[best_fit].extend(cluster)
-                    cluster_points = points[final_clusters[best_fit]]
-                    centroid = cluster_points.mean(axis=0)
-                    snapped_idx = final_clusters[best_fit][np.argmin(
-                        np.linalg.norm(cluster_points - centroid, axis=1))]
-                    final_centroids[best_fit] = points[snapped_idx]
-                    centroid_ids[best_fit] = snapped_idx
-                else:
-                    final_clusters.append(cluster)
-                    snapped_idx = cluster[np.argmin(np.linalg.norm(
-                        points[cluster] - centroid, axis=1))]
-                    final_centroids.append(points[snapped_idx])
-                    centroid_ids.append(snapped_idx)
-
-        labels = np.zeros(len(points), dtype=int)
-        for idx, cluster in enumerate(final_clusters):
-            for i in cluster:
-                labels[i] = idx
-
-        self.clustered_ids = final_clusters
-        self.centroid_ids = centroid_ids
-
-        # ties clusters together
-        if tie_clusters:
-            self.add_tie_points(labels, self.num_tie_points, self.max_dist)
-
-        return final_clusters, labels, centroid_ids
-
-    def update_heap_vectorized(self, heap, clusters, centroids, centroid,
-                               merged_cluster, next_cluster_id, max_size,
-                               max_dist_to_centroid):
-        """
-        Vectorized version to update the heap with valid cluster pairs after a merge.
-
-        Parameters:
-            heap (List[Tuple[float, int, int]]): The heap to update.
-            clusters (Dict[int, List[int]]): Dictionary of cluster_id -> list
-                of point indices.
-            centroids (Dict[int, np.ndarray]): Dictionary of cluster_id ->
-                centroid coordinates.
-            centroid (np.ndarray): The centroid of the newly merged cluster.
-            merged_cluster (List[int]): Indices of points in the new cluster.
-            next_cluster_id (int): The ID of the new cluster.
-            max_size (int): Maximum allowed cluster size.
-            max_dist_to_centroid (float): Maximum allowed distance for a valid
-                connection.
-        """
-        # Extract existing cluster IDs and their centroids
-        existing_ids = np.array(list(clusters.keys()))
-        existing_centroids = np.array([centroids[k] for k in existing_ids])
-        existing_sizes = np.array([len(clusters[k]) for k in existing_ids])
-
-        # Compute distance from new centroid to all others
-        dists = np.linalg.norm(existing_centroids - centroid, axis=1)
-
-        # Evaluate which existing clusters are valid for merging
-        size_sum = existing_sizes + len(merged_cluster)
-        valid = (existing_ids != next_cluster_id) & (size_sum <= max_size) & (dists <= max_dist_to_centroid * 2)
-
-        # Push valid merge pairs into heap
-        for other_id, dist in zip(existing_ids[valid], dists[valid]):
-            heapq.heappush(heap, (dist, next_cluster_id, other_id))
-
-    def add_tie_points(self, cluster_labels, num_neighbors=4, max_tie_distance=5_000_000):
-        """
-        Add reciprocal tie points to each cluster from its nearest neighbors,
-        then ensure every disconnected cluster component (island) is connected
-        to at least `num_neighbors` external clusters.
-
-        Parameters:
-            cluster_labels (np.ndarray): Cluster index (0..K-1) for each
-                station.
-            num_neighbors (int): Minimum number of external links each island
-                must have.
-            max_tie_distance (float): Max allowable tie distance in meters.
-
-        Returns:
-            new_clusters (List[List[int]]): Cluster station indices including
-                added tie points.
-            tie_points (List[List[int]]): Tie point indices added to each
-                cluster.
-        """
-
-        points = self.points
-        labels = cluster_labels
-        centroids = self.get_centroid_coordinates()
-
-        n_clusters = len(centroids)
-        n_stations = points.shape[0]
-
-        # === Step 1: Initialize cluster structure ===
-        clusters = [[] for _ in range(n_clusters)]
-        for idx, label in enumerate(labels):
-            clusters[label].append(idx)
-
-        new_clusters = [list(cluster) for cluster in clusters]
-        tie_points = [[] for _ in range(n_clusters)]
-        used_points = set()
-        tie_log = np.zeros((n_clusters, n_clusters), dtype=bool)
-
-        # === Step 2: Initial connections to nearby neighbors using centroids ===
-        nbrs = NearestNeighbors(n_neighbors=num_neighbors + 1).fit(centroids)
-        _, neighbors = nbrs.kneighbors(centroids)
-
-        def add_reciprocal_tie(i, j, pi_idx, pj_idx):
-            new_clusters[i].append(pj_idx)
-            new_clusters[j].append(pi_idx)
-            tie_points[i].append(pj_idx)
-            tie_points[j].append(pi_idx)
-            used_points.update({pi_idx, pj_idx})
-            tie_log[i, j] = True
-            tie_log[j, i] = True
-
-        for i in range(n_clusters):
-            for j in neighbors[i][1:]:
-                if tie_log[i, j] or tie_log[j, i]:
-                    continue
-                pi = [idx for idx in clusters[i] if idx not in used_points]
-                pj = [idx for idx in clusters[j] if idx not in used_points]
-                if not pi or not pj:
-                    continue
-                dist_matrix = np.linalg.norm(points[pi][:, None, :] -
-                                             points[pj][None, :, :], axis=2)
-                min_idx = np.unravel_index(np.argmin(dist_matrix), dist_matrix.shape)
-                min_dist = dist_matrix[min_idx]
-                if min_dist <= max_tie_distance:
-                    add_reciprocal_tie(i, j, pi[min_idx[0]], pj[min_idx[1]])
-
-        # === Step 3: Build the graph from current tie connections ===
-        G = nx.Graph()
-        G.add_nodes_from(range(n_clusters))
-        for i in range(n_clusters):
-            for j in range(i + 1, n_clusters):
-                if set(tie_points[i]) & set(clusters[j]) or set(tie_points[j]) & set(clusters[i]):
-                    G.add_edge(i, j)
-
-        # === Step 4: Ensure every disconnected component connects to at least `num_neighbors` others ===
-        components = sorted(list(nx.connected_components(G)), key=len)
-
-        while len(components) > 1:
-            updated = False  # Flag to break early if no valid connections were made
-
-            for comp in components:
-                external_nodes = set(range(n_clusters)) - comp
-                connection_candidates = []
-
-                for i in comp:
-                    for j in external_nodes:
-                        if tie_log[i, j] or tie_log[j, i]:
-                            continue
-                        pi = [idx for idx in clusters[i] if idx not in used_points]
-                        pj = [idx for idx in clusters[j] if idx not in used_points]
-                        if not pi or not pj:
-                            continue
-                        dist_matrix = np.linalg.norm(points[pi][:, None, :] -
-                                                     points[pj][None, :, :],
-                                                     axis=2)
-                        min_idx = np.unravel_index(np.argmin(dist_matrix),
-                                                   dist_matrix.shape)
-                        dist = dist_matrix[min_idx]
-                        if dist <= max_tie_distance:
-                            connection_candidates.append((dist, i, j,
-                                                          pi[min_idx[0]],
-                                                          pj[min_idx[1]]))
-
-                # Sort connections by shortest distance
-                connection_candidates.sort()
-                added = 0
-                for conn in connection_candidates:
-                    _, i, j, pi_idx, pj_idx = conn
-                    if not G.has_edge(i, j):
-                        add_reciprocal_tie(i, j, pi_idx, pj_idx)
-                        G.add_edge(i, j)
-                        added += 1
-                        updated = True
-                        if added >= num_neighbors:
-                            break
-
-                if added < num_neighbors:
-                    pass
-                    # tqdm.write(f" -- WARNING: Component containing clusters {sorted(comp)} "
-                    #            f"was only connected to {added} external clusters under the distance constraint.")
-
-            if not updated:
-                # tqdm.write(" -- WARNING: Some disconnected components could not be joined with the rest of the graph.")
-                break
-
-            components = list(nx.connected_components(G))
-
-        # create matrix with clusters and stations
-        matrix = np.zeros((n_clusters, n_stations), dtype=bool)
-        for i, cluster in enumerate(new_clusters):
-            matrix[i, cluster] = True
-
-        self.OC = np.array(matrix)
-
-        self.clustered_ids = new_clusters
-        self.tie_ids = tie_points
-
-        return new_clusters, tie_points
-
-    def get_cluster_coordinates(self):
-        return [[self.points[i] for i in cluster]
-                for cluster in self.clustered_ids]
-
-    def get_centroid_coordinates(self):
-        return [self.points[i] for i in self.centroid_ids]
-
-    def get_tie_coordinates(self):
-        return [[self.points[i] for i in cluster]
-                for cluster in self.tie_ids]
-
-
 """Bisecting Q-means clustering."""
 # Modified from sklearn _bisecting_k_means.py
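
A usage sketch of the API this series leaves behind (synthetic coordinates;
parameter values mirror the `network.make_clusters` call shown in the diffs):

    import numpy as np
    from pgamit.cluster import (BisectingQMeans, overcluster, prune,
                                select_central_point)

    points = np.random.rand(500, 3)   # stand-in for station coordinates
    # bisecting q-means with a hard per-cluster size ceiling
    qmean = BisectingQMeans(qmax=25, random_state=42)
    qmean.fit(points)
    # snap centroids to the closest station coordinate
    central_points = select_central_point(points, qmean.cluster_centers_)
    # expand clusters so neighbors share overlap (tie) stations
    OC = overcluster(qmean.labels_, points, metric='euclidean',
                     overlap=4, nmax=2)
    # drop clusters made redundant by the expansion
    OC, central_points = prune(OC, central_points, method='minsize')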