@@ -270,18 +270,6 @@ class BisectingQMeans(_BaseKMeans):
270
270
271
271
Parameters
272
272
----------
273
- min_size : int, default=4
274
- The minimum acceptable cluster size. Clusters of size <= to this
275
- parameter will **not** be produced by this algorithm.
276
-
277
- opt_size : int, default=12
278
- Target optimum cluster size. If the sum membership of a proposed
279
- cluster bisection is less than this value, the cluster will not be
280
- bisected. When combined with the `min_size` parameter above,
281
- these conditions together mean that clusters of sizes smaller than
282
- (`opt_size` - `min_size`) are *a priori* ineligible to be
283
- bisected.
284
-
285
273
max_size : int, default=25
286
274
Hard cutoff to bypass the heuristic when bisecting clusters; no
287
275
clusters greater than this size will be produced.
@@ -382,8 +370,6 @@ class BisectingQMeans(_BaseKMeans):
382
370
383
371
def __init__ (
384
372
self ,
385
- min_size = 4 ,
386
- opt_size = 12 ,
387
373
max_size = 25 ,
388
374
* ,
389
375
init = "random" ,
@@ -393,7 +379,7 @@ def __init__(
393
379
verbose = 0 ,
394
380
tol = 1e-4 ,
395
381
copy_x = True ,
396
- algorithm = "lloyd " ,
382
+ algorithm = "elkan " ,
397
383
n_clusters = 2 , # needed for base class, do not remove
398
384
):
399
385
super ().__init__ (
@@ -406,8 +392,6 @@ def __init__(
406
392
n_init = n_init ,
407
393
)
408
394
409
- self .min_size = min_size
410
- self .opt_size = opt_size
411
395
self .max_size = max_size
412
396
self .copy_x = copy_x
413
397
self .algorithm = algorithm
@@ -422,40 +406,6 @@ def _warn_mkl_vcomp(self, n_active_threads):
422
406
f" variable OMP_NUM_THREADS={ n_active_threads } ."
423
407
)
424
408
425
- def _inertia_per_cluster (self , X , centers , labels , sample_weight ):
426
- """Calculate the sum of squared errors (inertia) per cluster.
427
-
428
- Parameters
429
- ----------
430
- X : {ndarray, csr_matrix} of shape (n_samples, n_features)
431
- The input samples.
432
-
433
- centers : ndarray of shape (n_clusters=2, n_features)
434
- The cluster centers.
435
-
436
- labels : ndarray of shape (n_samples,)
437
- Index of the cluster each sample belongs to.
438
-
439
- sample_weight : ndarray of shape (n_samples,)
440
- The weights for each observation in X.
441
-
442
- Returns
443
- -------
444
- inertia_per_cluster : ndarray of shape (n_clusters=2,)
445
- Sum of squared errors (inertia) for each cluster.
446
- """
447
- # n_clusters = 2 since centers comes from a bisection
448
- n_clusters = centers .shape [0 ]
449
- _inertia = _inertia_sparse if sp .issparse (X ) else _inertia_dense
450
-
451
- inertia_per_cluster = np .empty (n_clusters )
452
- for label in range (n_clusters ):
453
- inertia_per_cluster [label ] = _inertia (X , sample_weight , centers ,
454
- labels , self ._n_threads ,
455
- single_label = label )
456
-
457
- return inertia_per_cluster
458
-
459
409
def _bisect (self , X , x_squared_norms , sample_weight , cluster_to_bisect ):
460
410
"""Split a cluster into 2 subclusters.
461
411
@@ -511,24 +461,12 @@ def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect):
511
461
if self .verbose :
512
462
print (f"New centroids from bisection: { best_centers } " )
513
463
514
- scores = self ._inertia_per_cluster (X , best_centers , best_labels ,
515
- sample_weight )
516
464
counts = np .bincount (best_labels , minlength = 2 )
517
- scores [np .where (counts <
518
- (self .opt_size - self .min_size ))] = - np .inf
519
- # case where bisecting is not optimum
520
- if (counts [0 ] + counts [1 ]) < self .opt_size :
521
- cluster_to_bisect .score = - np .inf
522
- # bisect as long as the smallest child meets membership constraints
523
- elif ((counts [0 ] >= self .min_size ) and
524
- (counts [1 ] >= self .min_size )):
465
+ scores = counts
466
+ if (counts [0 ] + counts [1 ] >= self .max_size ):
525
467
cluster_to_bisect .split (best_labels , best_centers , scores )
526
- # one child will have membership of 3 or less; don't split
527
468
else :
528
- if (counts [0 ] + counts [1 ] >= self .max_size ):
529
- cluster_to_bisect .split (best_labels , best_centers , scores )
530
- else :
531
- cluster_to_bisect .score = - np .inf
469
+ self .bisect = False
532
470
533
471
@_fit_context (prefer_skip_nested_validation = True )
534
472
def fit (self , X , y = None , sample_weight = None ):
@@ -592,15 +530,20 @@ def fit(self, X, y=None, sample_weight=None):
592
530
593
531
x_squared_norms = row_norms (X , squared = True )
594
532
533
+ # run first bisection out of loop to avoid 0-count early termination
534
+ cluster_to_bisect = self ._bisecting_tree .get_cluster_to_bisect ()
535
+ self ._bisect (X , x_squared_norms , sample_weight , cluster_to_bisect )
595
536
while self .bisect :
596
537
# Choose cluster to bisect
597
538
cluster_to_bisect = self ._bisecting_tree .get_cluster_to_bisect ()
598
539
599
540
# Split this cluster into 2 subclusters
600
- if cluster_to_bisect is not None :
541
+ #if cluster_to_bisect is not None:
542
+ if cluster_to_bisect .score > self .max_size :
601
543
self ._bisect (X , x_squared_norms , sample_weight ,
602
544
cluster_to_bisect )
603
545
else :
546
+ self .bisect = False
604
547
break
605
548
606
549
# Aggregate final labels and centers from the bisecting tree
@@ -677,6 +620,7 @@ def get_cluster_to_bisect(self):
677
620
max_score = cluster_leaf .score
678
621
best_cluster_leaf = cluster_leaf
679
622
623
+ #if max_score >= self.opt_size:
680
624
if np .isneginf (max_score ):
681
625
self .bisect = False
682
626
else :
0 commit comments