20
20
_kmeans_single_lloyd )
21
21
22
22
23
def prune(OC, central_points, method='minsize'):
    """Prune redundant clusters from over cluster (OC) and other arrays

    A cluster (one row of `OC`) is treated as redundant when removing it
    leaves no coordinate (column) without coverage from the remaining rows.
    Rows are examined one at a time in the order chosen by `method`, and a
    row pruned earlier no longer counts as coverage for later checks.

    The input arrays are not modified; pruned copies are returned.

    Parameters
    ----------
    OC : bool array of shape (n_clusters, n_coordinates)
    central_points : int array of shape (n_clusters,)
    method : ["linear", "minsize"]; "linear" is a row-by-row scan through the
        cluster matrix, "minsize" will sort matrix rows (i.e., the clusters)
        according to size and prioritize pruning the smallest clusters first.
        Ties in size are visited in row order.

    Returns
    -------
    OC : Pruned bool array of shape (n_clusters - N, n_coordinates)
    central_points : Pruned int array of shape (n_clusters - N,)

    Raises
    ------
    ValueError
        If `method` is neither "linear" nor "minsize".
    """
    OC = np.asarray(OC)
    central_points = np.asarray(central_points)
    n_clusters = OC.shape[0]

    if method == "linear":
        order = range(n_clusters)
    elif method == "minsize":
        # Stable sort keeps equal-sized clusters in row order so the result
        # is deterministic.
        order = np.argsort(OC.sum(axis=1), kind="stable")
    else:
        # format() rather than '+' so non-string values (e.g. None) still
        # produce a ValueError instead of a TypeError.
        raise ValueError("Unknown method '{}'".format(method))

    # Running per-coordinate coverage over the rows that are still kept.
    # Subtracting a row gives the coverage "as if that row were removed"
    # without copying and re-summing the whole matrix on every iteration.
    counts = OC.sum(axis=0)
    keep = np.ones(n_clusters, dtype=bool)
    for i in order:
        remaining = counts - OC[i].astype(counts.dtype)
        if np.all(remaining != 0):
            # Every coordinate is still covered without row i: prune it.
            keep[i] = False
            counts = remaining
    return OC[keep], central_points[keep]
57
62
58
63
def select_central_point (coordinates , centroids , metric = 'euclidean' ):
59
64
"""Select the nearest central point in a given neighborhood
@@ -95,8 +100,8 @@ def select_central_point(coordinates, centroids, metric='euclidean'):
95
100
return idxs .squeeze ()
96
101
97
102
98
- def over_cluster (labels , coordinates , metric = 'haversine' , neighborhood = 5 ,
99
- overlap_points = 2 , rejection_threshold = None ):
103
+ def over_cluster (labels , coordinates , metric = 'haversine' , neighbors = 5 ,
104
+ overlap_points = 2 , rejection_threshold = None , method = 'static' ):
100
105
"""Expand cluster membership to include edge points of neighbor clusters
101
106
102
107
Expands an existing clustering to create overlapping membership between
@@ -155,9 +160,12 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
155
160
Sparse matrices are only supported by scikit-learn metrics. See the
156
161
documentation for scipy.spatial.distance for details on these metrics.
157
162
158
- neighborhood : int greater than or equal to 1, default=3
159
- Number of adjacent clusters to include when adding cluster membership
160
- overlap. Should be less than the number of unique cluster labels - 1.
163
+ neighbors : int greater than or equal to 1, default=5
164
+ For method='static', this is the total number of points that will be added
165
+ to the seed clusters during cluster expansion.
166
+ For method='dynamic', this is the (zero-indexed) number of adjacent
167
+ clusters to include when adding cluster membership overlap. Should be
168
+ less than the number of unique cluster labels - 1.
161
169
162
170
overlap_points : int greater than or equal to 1, default=2
163
171
Should not exceed the size of the smallest cluster in `labels`.
@@ -169,6 +177,11 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
169
177
Note that if value other than 'None' is used, there is no guarantee
170
178
that all clusters will have overlap points added.
171
179
180
+ method : 'static' (default) or 'dynamic'
181
+ The 'static' method will always produce an overcluster equal to the
182
+ `neighbors` parameter; 'dynamic' will produce an overcluster ceiling
183
+ of (neighbors - 1) * overlap_points, with a floor of neighbors.
184
+
172
185
Returns
173
186
-------
174
187
expanded_clusters : bool array of shape (n_clusters, n_coordinates)
@@ -183,8 +196,8 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
183
196
clusters = np .unique (labels )
184
197
n_clusters = len (clusters )
185
198
186
- if (n_clusters - 1 ) < neighborhood :
187
- neighborhood = (n_clusters - 1 )
199
+ if (n_clusters - 1 ) < neighbors :
200
+ neighbors = (n_clusters - 1 )
188
201
189
202
# reference index for reverse lookups
190
203
ridx = np .array (list (range (len (labels ))))
@@ -200,10 +213,11 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
200
213
# Build index tree on members
201
214
nbrs = NearestNeighbors (n_neighbors = 1 , algorithm = 'ball_tree' ,
202
215
metric = metric ).fit (coordinates [members ])
203
- # Could be set to '1';
204
- # using same check as while loop for consistency
205
- coverage = len (np .unique (labels [output [cluster , :]]))
206
- while coverage <= neighborhood :
216
+ if method == 'dynamic' :
217
+ coverage = len (np .unique (labels [output [cluster , :]]))
218
+ elif method == 'static' :
219
+ coverage = 0
220
+ while coverage <= neighbors :
207
221
# intersect search tree with non-members
208
222
D , _ = nbrs .kneighbors (coordinates [nonmembers , :])
209
223
# Rejection threshold is lightly tested...
@@ -216,8 +230,12 @@ def over_cluster(labels, coordinates, metric='haversine', neighborhood=5,
216
230
nonmembers [new_member ] = 0
217
231
# Add to member label array
218
232
output [cluster , new_member ] = 1
219
- # Update current count of over-clustered neighbors
220
- coverage = len (np .unique (labels [output [cluster , :]]))
233
+ if method == 'dynamic' :
234
+ # Update current count of over-clustered neighbors
235
+ coverage = len (np .unique (labels [output [cluster , :]]))
236
+ elif method == 'static' :
237
+ # Update current point expansion count
238
+ coverage += 1
221
239
# Grab label of new member for overlap check
222
240
nm_label = labels [new_member ]
223
241
# Check if we've exceeded our overlap allotment...
0 commit comments