Skip to content

Commit d0ea797

Browse files
authored
Qmeans updates (#216)
* modifying test to use new keyword * heavy modification of qmeans clustering algorithm
1 parent 70287ba commit d0ea797

File tree

1 file changed

+11
-67
lines changed

1 file changed

+11
-67
lines changed

pgamit/cluster.py

Lines changed: 11 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -270,18 +270,6 @@ class BisectingQMeans(_BaseKMeans):
270270
271271
Parameters
272272
----------
273-
min_size : int, default=4
274-
The minimum acceptable cluster size. Clusters smaller than this
275-
parameter will **not** be produced by this algorithm.
276-
277-
opt_size : int, default=12
278-
Target optimum cluster size. If the sum membership of a proposed
279-
cluster bisection is less than this value, the cluster will not be
280-
bisected. When combined with the `min_size` parameter above,
281-
these conditions together mean that clusters of sizes smaller than
282-
(`opt_size` - `min_size`) are *a priori* ineligible to be
283-
bisected.
284-
285273
max_size: int, default=25
286274
Hard cutoff to bypass the heuristic when bisecting clusters; no
287275
clusters greater than this size will be produced.
@@ -382,8 +370,6 @@ class BisectingQMeans(_BaseKMeans):
382370

383371
def __init__(
384372
self,
385-
min_size=4,
386-
opt_size=12,
387373
max_size=25,
388374
*,
389375
init="random",
@@ -393,7 +379,7 @@ def __init__(
393379
verbose=0,
394380
tol=1e-4,
395381
copy_x=True,
396-
algorithm="lloyd",
382+
algorithm="elkan",
397383
n_clusters=2, # needed for base class, do not remove
398384
):
399385
super().__init__(
@@ -406,8 +392,6 @@ def __init__(
406392
n_init=n_init,
407393
)
408394

409-
self.min_size = min_size
410-
self.opt_size = opt_size
411395
self.max_size = max_size
412396
self.copy_x = copy_x
413397
self.algorithm = algorithm
@@ -422,40 +406,6 @@ def _warn_mkl_vcomp(self, n_active_threads):
422406
f" variable OMP_NUM_THREADS={n_active_threads}."
423407
)
424408

425-
def _inertia_per_cluster(self, X, centers, labels, sample_weight):
426-
"""Calculate the sum of squared errors (inertia) per cluster.
427-
428-
Parameters
429-
----------
430-
X : {ndarray, csr_matrix} of shape (n_samples, n_features)
431-
The input samples.
432-
433-
centers : ndarray of shape (n_clusters=2, n_features)
434-
The cluster centers.
435-
436-
labels : ndarray of shape (n_samples,)
437-
Index of the cluster each sample belongs to.
438-
439-
sample_weight : ndarray of shape (n_samples,)
440-
The weights for each observation in X.
441-
442-
Returns
443-
-------
444-
inertia_per_cluster : ndarray of shape (n_clusters=2,)
445-
Sum of squared errors (inertia) for each cluster.
446-
"""
447-
# n_clusters = 2 since centers comes from a bisection
448-
n_clusters = centers.shape[0]
449-
_inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense
450-
451-
inertia_per_cluster = np.empty(n_clusters)
452-
for label in range(n_clusters):
453-
inertia_per_cluster[label] = _inertia(X, sample_weight, centers,
454-
labels, self._n_threads,
455-
single_label=label)
456-
457-
return inertia_per_cluster
458-
459409
def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect):
460410
"""Split a cluster into 2 subclusters.
461411
@@ -511,24 +461,12 @@ def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect):
511461
if self.verbose:
512462
print(f"New centroids from bisection: {best_centers}")
513463

514-
scores = self._inertia_per_cluster(X, best_centers, best_labels,
515-
sample_weight)
516464
counts = np.bincount(best_labels, minlength=2)
517-
scores[np.where(counts <
518-
(self.opt_size - self.min_size))] = -np.inf
519-
# case where bisecting is not optimum
520-
if (counts[0] + counts[1]) < self.opt_size:
521-
cluster_to_bisect.score = -np.inf
522-
# bisect as long as the smallest child meets membership constraints
523-
elif ((counts[0] >= self.min_size) and
524-
(counts[1] >= self.min_size)):
465+
scores = counts
466+
if (counts[0] + counts[1] >= self.max_size):
525467
cluster_to_bisect.split(best_labels, best_centers, scores)
526-
# one child will have membership of 3 or less; don't split
527468
else:
528-
if (counts[0] + counts[1] >= self.max_size):
529-
cluster_to_bisect.split(best_labels, best_centers, scores)
530-
else:
531-
cluster_to_bisect.score = -np.inf
469+
self.bisect = False
532470

533471
@_fit_context(prefer_skip_nested_validation=True)
534472
def fit(self, X, y=None, sample_weight=None):
@@ -592,15 +530,20 @@ def fit(self, X, y=None, sample_weight=None):
592530

593531
x_squared_norms = row_norms(X, squared=True)
594532

533+
# run first bisection out of loop to avoid 0-count early termination
534+
cluster_to_bisect = self._bisecting_tree.get_cluster_to_bisect()
535+
self._bisect(X, x_squared_norms, sample_weight, cluster_to_bisect)
595536
while self.bisect:
596537
# Choose the cluster to bisect
597538
cluster_to_bisect = self._bisecting_tree.get_cluster_to_bisect()
598539

599540
# Split this cluster into 2 subclusters
600-
if cluster_to_bisect is not None:
541+
#if cluster_to_bisect is not None:
542+
if cluster_to_bisect.score > self.max_size:
601543
self._bisect(X, x_squared_norms, sample_weight,
602544
cluster_to_bisect)
603545
else:
546+
self.bisect = False
604547
break
605548

606549
# Aggregate final labels and centers from the bisecting tree
@@ -677,6 +620,7 @@ def get_cluster_to_bisect(self):
677620
max_score = cluster_leaf.score
678621
best_cluster_leaf = cluster_leaf
679622

623+
#if max_score >= self.opt_size:
680624
if np.isneginf(max_score):
681625
self.bisect = False
682626
else:

0 commit comments

Comments
 (0)