Skip to content

Commit a5e9e41

Browse files
authored
Cluster module updates (#214)
* update to the `prune` function in the clustering module; added the `minsize` keyword for the 'method' argument. The prune function now also throws an error when given an unknown 'method' keyword (previous behavior was to simply return the matrix unmodified). * attempting to fix CI/CD so that tests run on external PRs * new method (and keyword) for static cluster expansion
1 parent 05ddc75 commit a5e9e41

File tree

2 files changed

+58
-34
lines changed

2 files changed

+58
-34
lines changed

.github/workflows/python-package-conda.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
name: Python Package using Conda
22

3-
on: [push]
3+
on:
4+
push:
5+
branches:
6+
- 'master'
7+
- 'releases/**'
8+
pull_request_target:
9+
types: [assigned, opened, synchronize, reopened]
410

511
jobs:
612
build-linux:

pgamit/cluster.py

Lines changed: 51 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -20,40 +20,45 @@
2020
_kmeans_single_lloyd)
2121

2222

23-
def prune(OC, central_points, method='linear'):
23+
def prune(OC, central_points, method='minsize'):
2424
"""Prune redundant clusters from over cluster (OC) and other arrays
2525
2626
Parameters
2727
----------
2828
2929
OC : bool array of shape (n_clusters, n_coordinates)
30-
method : ["linear", None]; defaults linear scan
30+
method : ["linear", "minsize"]; "linear" is a row-by-row scan through the
31+
cluster matrix, "minsize" will sort matrix rows (i.e., the clusters)
32+
according to size and prioritize pruning the smallest clusters first.
3133
3234
Returns
3335
3436
OC : Pruned bool array of shape (n_clusters - N, n_coordinates)
3537
central_points : Pruned int array of shape (n_clusters -N,)
3638
"""
39+
subset = []
40+
rowlength = len(OC[0,:])
3741
if method == "linear":
38-
subset = []
39-
for i, row in enumerate(OC):
40-
mod = OC.copy()
41-
mod[i, :] = np.zeros(len(row))
42-
counts = mod.sum(axis=0)
43-
problems = np.sum(counts == 0)
44-
if problems == 0:
45-
subset.append(i)
46-
OC[i, :] = np.zeros(len(row))
47-
# Cast subset list to pandas index
48-
dfIndex = pd.Index(subset)
49-
# Cast OC to pandas dataframe
50-
dfOC = pd.DataFrame(OC)
51-
# Apply the 'inverse' index; pruned is boolean numpy index array
52-
pruned = ~dfOC.index.isin(dfIndex)
53-
return OC[pruned], central_points[pruned]
42+
indices = list(range(len(OC)))
43+
elif method == "minsize":
44+
indices = np.argsort(OC.sum(axis=1))
5445
else:
55-
return OC, central_points
56-
46+
raise ValueError("Unknown method '" + method + "'")
47+
for i in indices:
48+
mod = OC.copy()
49+
mod[i, :] = np.zeros(rowlength)
50+
counts = mod.sum(axis=0)
51+
problems = np.sum(counts == 0)
52+
if problems == 0:
53+
subset.append(i)
54+
OC[i, :] = np.zeros(rowlength)
55+
# Cast subset list to pandas index
56+
dfIndex = pd.Index(subset)
57+
# Cast OC to pandas dataframe
58+
dfOC = pd.DataFrame(OC)
59+
# Apply the 'inverse' index; pruned is boolean numpy index array
60+
pruned = ~dfOC.index.isin(dfIndex)
61+
return OC[pruned], central_points[pruned]
5762

5863
def select_central_point(coordinates, centroids, metric='euclidean'):
5964
"""Select the nearest central point in a given neighborhood
@@ -95,8 +100,8 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
95100
return idxs.squeeze()
96101

97102

98-
def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
99-
overlap_points=2, rejection_threshold=None):
103+
def over_cluster(labels, coordinates, metric='haversine', neighbors=5,
104+
overlap_points=2, rejection_threshold=None, method='static'):
100105
"""Expand cluster membership to include edge points of neighbor clusters
101106
102107
Expands an existing clustering to create overlapping membership between
@@ -155,9 +160,12 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
155160
Sparse matrices are only supported by scikit-learn metrics. See the
156161
documentation for scipy.spatial.distance for details on these metrics.
157162
158-
neighborhood : int greater than or equal to 1, default=3
159-
Number of adjacent clusters to include when adding cluster membership
160-
overlap. Should be less than the number of unique cluster labels - 1.
163+
neighbors : int greater than or equal to 1, default=5
164+
For method='static', this is the total number of points that will be added
165+
to the seed clusters during cluster expansion.
166+
For method='dynamic', this is the (zero-indexed) number of adjacent
167+
clusters to include when adding cluster membership overlap. Should be
168+
less than the number of unique cluster labels - 1.
161169
162170
overlap_points : int greater than or equal to 1, default=2
163171
Should not exceed the size of the smallest cluster in `labels`.
@@ -169,6 +177,11 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
169177
Note that if value other than 'None' is used, there is no guarantee
170178
that all clusters will have overlap points added.
171179
180+
method : 'static' (default) or 'dynamic'
181+
The 'static' method will always produce an overcluster equal to the
182+
`neighbors` parameter; 'dynamic' will produce an overcluster ceiling
183+
of (neighbors - 1) * overlap_points, with a floor of neighbors.
184+
172185
Returns
173186
-------
174187
expanded_clusters : bool array of shape (n_clusters, n_coordinates)
@@ -183,8 +196,8 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
183196
clusters = np.unique(labels)
184197
n_clusters = len(clusters)
185198

186-
if (n_clusters - 1) < neighborhood:
187-
neighborhood = (n_clusters - 1)
199+
if (n_clusters - 1) < neighbors:
200+
neighbors = (n_clusters - 1)
188201

189202
# reference index for reverse lookups
190203
ridx = np.array(list(range(len(labels))))
@@ -200,10 +213,11 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
200213
# Build index tree on members
201214
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree',
202215
metric=metric).fit(coordinates[members])
203-
# Could be set to '1';
204-
# using same check as while loop for consistency
205-
coverage = len(np.unique(labels[output[cluster, :]]))
206-
while coverage <= neighborhood:
216+
if method == 'dynamic':
217+
coverage = len(np.unique(labels[output[cluster, :]]))
218+
elif method == 'static':
219+
coverage = 0
220+
while coverage <= neighbors:
207221
# intersect search tree with non-members
208222
D, _ = nbrs.kneighbors(coordinates[nonmembers, :])
209223
# Rejection threshold is lightly tested...
@@ -216,8 +230,12 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
216230
nonmembers[new_member] = 0
217231
# Add to member label array
218232
output[cluster, new_member] = 1
219-
# Update current count of over-clustered neighbors
220-
coverage = len(np.unique(labels[output[cluster, :]]))
233+
if method == 'dynamic':
234+
# Update current count of over-clustered neighbors
235+
coverage = len(np.unique(labels[output[cluster, :]]))
236+
elif method == 'static':
237+
# Update current point expansion count
238+
coverage += 1
221239
# Grab label of new member for overlap check
222240
nm_label = labels[new_member]
223241
# Check if we've exceeded our overlap allotment...

0 commit comments

Comments
 (0)