Skip to content

Commit 2f2aedb

Browse files
authored
Merge pull request #280 from ntumlgroup/random-fixes
Random fixes
2 parents ca78305 + f3e57bc commit 2f2aedb

File tree

9 files changed

+104
-76
lines changed

9 files changed

+104
-76
lines changed

docs/examples/plot_linear_gridsearch_tutorial.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,8 @@
1111

1212
from sklearn.preprocessing import MultiLabelBinarizer
1313
import libmultilabel.linear as linear
14-
from libmultilabel.linear.preprocessor import read_libmultilabel_format
1514

16-
train_data = read_libmultilabel_format('data/rcv1/train.txt')
15+
train_data = linear.read_libmultilabel_format('data/rcv1/train.txt')
1716
binarizer = MultiLabelBinarizer(sparse_output=True)
1817
binarizer.fit(train_data['label'])
1918
y = binarizer.transform(train_data['label']).astype('d')

libmultilabel/linear/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .linear import *
2-
from .metrics import get_metrics, tabulate_metrics
2+
from .metrics import *
33
from .preprocessor import *
44
from .tree import *
55
from .utils import *

libmultilabel/linear/linear.py

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
import logging
24
import os
35

@@ -11,13 +13,14 @@
1113
'train_cost_sensitive',
1214
'train_cost_sensitive_micro',
1315
'train_binary_and_multiclass',
14-
'predict_values']
16+
'predict_values',
17+
'get_topk_labels']
1518

1619

1720
class FlatModel:
1821
def __init__(self, weights: np.matrix,
1922
bias: float,
20-
thresholds: 'float | np.ndarray',
23+
thresholds: float | np.ndarray,
2124
):
2225
self.weights = weights
2326
self.bias = bias
@@ -68,7 +71,7 @@ def train_1vsrest(y: sparse.csr_matrix,
6871
A model which can be used in predict_values.
6972
"""
7073
# Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
71-
x, options, bias = prepare_options(x, options)
74+
x, options, bias = _prepare_options(x, options)
7275

7376
y = y.tocsc()
7477
num_class = y.shape[1]
@@ -79,14 +82,14 @@ def train_1vsrest(y: sparse.csr_matrix,
7982
logging.info(f'Training one-vs-rest model on {num_class} labels')
8083
for i in tqdm(range(num_class), disable=not verbose):
8184
yi = y[:, i].toarray().reshape(-1)
82-
weights[:, i] = do_train(2*yi - 1, x, options).ravel()
85+
weights[:, i] = _do_train(2*yi - 1, x, options).ravel()
8386

8487
return FlatModel(weights=np.asmatrix(weights),
8588
bias=bias,
8689
thresholds=0)
8790

8891

89-
def prepare_options(x: sparse.csr_matrix, options: str) -> 'tuple[sparse.csr_matrix, str, float]':
92+
def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_matrix, str, float]:
9093
"""Prepare options and x for multi-label training. Called in the first line of
9194
any training function.
9295
@@ -150,7 +153,7 @@ def train_thresholding(y: sparse.csr_matrix,
150153
A model which can be used in predict_values.
151154
"""
152155
# Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
153-
x, options, bias = prepare_options(x, options)
156+
x, options, bias = _prepare_options(x, options)
154157

155158
y = y.tocsc()
156159
num_class = y.shape[1]
@@ -162,7 +165,7 @@ def train_thresholding(y: sparse.csr_matrix,
162165
logging.info(f'Training thresholding model on {num_class} labels')
163166
for i in tqdm(range(num_class), disable=not verbose):
164167
yi = y[:, i].toarray().reshape(-1)
165-
w, t = thresholding_one_label(2*yi - 1, x, options)
168+
w, t = _thresholding_one_label(2*yi - 1, x, options)
166169
weights[:, i] = w.ravel()
167170
thresholds[i] = t
168171

@@ -171,10 +174,10 @@ def train_thresholding(y: sparse.csr_matrix,
171174
thresholds=thresholds)
172175

173176

174-
def thresholding_one_label(y: np.ndarray,
177+
def _thresholding_one_label(y: np.ndarray,
175178
x: sparse.csr_matrix,
176179
options: str
177-
) -> 'tuple[np.ndarray, float]':
180+
) -> tuple[np.ndarray, float]:
178181
"""Outer cross-validation for thresholding on a single label.
179182
180183
Args:
@@ -201,29 +204,29 @@ def thresholding_one_label(y: np.ndarray,
201204
val_idx = perm[mask]
202205
train_idx = perm[mask != True]
203206

204-
scutfbr_w, scutfbr_b_list = scutfbr(
207+
scutfbr_w, scutfbr_b_list = _scutfbr(
205208
y[train_idx], x[train_idx], fbr_list, options)
206209
wTx = (x[val_idx] * scutfbr_w).A1
207210

208211
for i in range(fbr_list.size):
209-
F = fmeasure(y[val_idx], 2*(wTx > -scutfbr_b_list[i]) - 1)
212+
F = _fmeasure(y[val_idx], 2*(wTx > -scutfbr_b_list[i]) - 1)
210213
f_list[i] += F
211214

212215
best_fbr = fbr_list[::-1][np.argmax(f_list[::-1])] # last largest
213216
if np.max(f_list) == 0:
214217
best_fbr = np.min(fbr_list)
215218

216219
# final model
217-
w, b_list = scutfbr(y, x, np.array([best_fbr]), options)
220+
w, b_list = _scutfbr(y, x, np.array([best_fbr]), options)
218221

219222
return w, b_list[0]
220223

221224

222-
def scutfbr(y: np.ndarray,
225+
def _scutfbr(y: np.ndarray,
223226
x: sparse.csr_matrix,
224-
fbr_list: 'list[float]',
227+
fbr_list: list[float],
225228
options: str
226-
) -> 'tuple[np.matrix, np.ndarray]':
229+
) -> tuple[np.matrix, np.ndarray]:
227230
"""Inner cross-validation for SCutfbr heuristic.
228231
229232
Args:
@@ -250,10 +253,10 @@ def scutfbr(y: np.ndarray,
250253
val_idx = perm[mask]
251254
train_idx = perm[mask != True]
252255

253-
w = do_train(y[train_idx], x[train_idx], options)
256+
w = _do_train(y[train_idx], x[train_idx], options)
254257
wTx = (x[val_idx] * w).A1
255258
scut_b = 0.
256-
start_F = fmeasure(y[val_idx], 2*(wTx > -scut_b) - 1)
259+
start_F = _fmeasure(y[val_idx], 2*(wTx > -scut_b) - 1)
257260

258261
# stableness to match the MATLAB implementation
259262
sorted_wTx_index = np.argsort(wTx, kind='stable')
@@ -291,7 +294,7 @@ def scutfbr(y: np.ndarray,
291294
else:
292295
scut_b = -(sorted_wTx[cut] + sorted_wTx[cut + 1]) / 2
293296

294-
F = fmeasure(y_val, 2*(wTx > -scut_b) - 1)
297+
F = _fmeasure(y_val, 2*(wTx > -scut_b) - 1)
295298

296299
for i in range(fbr_list.size):
297300
if F > fbr_list[i]:
@@ -300,10 +303,13 @@ def scutfbr(y: np.ndarray,
300303
b_list[i] -= np.max(wTx)
301304

302305
b_list = b_list / nr_fold
303-
return do_train(y, x, options), b_list
306+
return _do_train(y, x, options), b_list
304307

305308

306-
def do_train(y: np.ndarray, x: sparse.csr_matrix, options: str) -> np.matrix:
309+
def _do_train(y: np.ndarray,
310+
x: sparse.csr_matrix,
311+
options: str
312+
) -> np.matrix:
307313
"""Wrapper around liblinear.liblinearutil.train.
308314
Forcibly suppresses all IO regardless of options.
309315
@@ -351,7 +357,7 @@ def __exit__(self, type, value, traceback):
351357
os.close(self.stderr)
352358

353359

354-
def fmeasure(y_true: np.ndarray, y_pred: np.ndarray) -> float:
360+
def _fmeasure(y_true: np.ndarray, y_pred: np.ndarray) -> float:
355361
"""Calculate F1 score.
356362
357363
Args:
@@ -393,7 +399,7 @@ def train_cost_sensitive(y: sparse.csr_matrix,
393399
A model which can be used in predict_values.
394400
"""
395401
# Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
396-
x, options, bias = prepare_options(x, options)
402+
x, options, bias = _prepare_options(x, options)
397403

398404
y = y.tocsc()
399405
num_class = y.shape[1]
@@ -405,15 +411,15 @@ def train_cost_sensitive(y: sparse.csr_matrix,
405411
f'Training cost-sensitive model for Macro-F1 on {num_class} labels')
406412
for i in tqdm(range(num_class), disable=not verbose):
407413
yi = y[:, i].toarray().reshape(-1)
408-
w = cost_sensitive_one_label(2*yi - 1, x, options)
414+
w = _cost_sensitive_one_label(2*yi - 1, x, options)
409415
weights[:, i] = w.ravel()
410416

411417
return FlatModel(weights=np.asmatrix(weights),
412418
bias=bias,
413419
thresholds=0)
414420

415421

416-
def cost_sensitive_one_label(y: np.ndarray,
422+
def _cost_sensitive_one_label(y: np.ndarray,
417423
x: sparse.csr_matrix,
418424
options: str
419425
) -> np.ndarray:
@@ -436,17 +442,17 @@ def cost_sensitive_one_label(y: np.ndarray,
436442
bestScore = -np.Inf
437443
for a in param_space:
438444
cv_options = f'{options} -w1 {a}'
439-
pred = cross_validate(y, x, cv_options, perm)
440-
score = fmeasure(y, pred)
445+
pred = _cross_validate(y, x, cv_options, perm)
446+
score = _fmeasure(y, pred)
441447
if bestScore < score:
442448
bestScore = score
443449
bestA = a
444450

445451
final_options = f'{options} -w1 {bestA}'
446-
return do_train(y, x, final_options)
452+
return _do_train(y, x, final_options)
447453

448454

449-
def cross_validate(y: np.ndarray,
455+
def _cross_validate(y: np.ndarray,
450456
x: sparse.csr_matrix,
451457
options: str,
452458
perm: np.ndarray
@@ -470,7 +476,7 @@ def cross_validate(y: np.ndarray,
470476
val_idx = perm[mask]
471477
train_idx = perm[mask != True]
472478

473-
w = do_train(y[train_idx], x[train_idx], options)
479+
w = _do_train(y[train_idx], x[train_idx], options)
474480
pred[val_idx] = (x[val_idx] * w).A1 > 0
475481

476482
return 2*pred - 1
@@ -498,7 +504,7 @@ def train_cost_sensitive_micro(y: sparse.csr_matrix,
498504
A model which can be used in predict_values.
499505
"""
500506
# Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
501-
x, options, bias = prepare_options(x, options)
507+
x, options, bias = _prepare_options(x, options)
502508

503509
y = y.tocsc()
504510
num_class = y.shape[1]
@@ -520,7 +526,7 @@ def train_cost_sensitive_micro(y: sparse.csr_matrix,
520526
yi = 2*yi - 1
521527

522528
cv_options = f'{options} -w1 {a}'
523-
pred = cross_validate(yi, x, cv_options, perm)
529+
pred = _cross_validate(yi, x, cv_options, perm)
524530
tp = tp + np.sum(np.logical_and(yi == 1, pred == 1))
525531
fn = fn + np.sum(np.logical_and(yi == 1, pred == -1))
526532
fp = fp + np.sum(np.logical_and(yi == -1, pred == 1))
@@ -533,7 +539,7 @@ def train_cost_sensitive_micro(y: sparse.csr_matrix,
533539
final_options = f'{options} -w1 {bestA}'
534540
for i in range(num_class):
535541
yi = y[:, i].toarray().reshape(-1)
536-
w = do_train(2*yi - 1, x, final_options)
542+
w = _do_train(2*yi - 1, x, final_options)
537543
weights[:, i] = w.ravel()
538544

539545
return FlatModel(weights=np.asmatrix(weights),
@@ -557,7 +563,7 @@ def train_binary_and_multiclass(y: sparse.csr_matrix,
557563
Returns:
558564
A model which can be used in predict_values.
559565
"""
560-
x, options, bias = prepare_options(x, options)
566+
x, options, bias = _prepare_options(x, options)
561567
num_instances, num_labels = y.shape
562568
nonzero_instance_ids, nonzero_label_ids = y.nonzero()
563569
assert len(set(nonzero_instance_ids)) == num_instances, """
@@ -602,7 +608,10 @@ def predict_values(model, x: sparse.csr_matrix) -> np.ndarray:
602608
return model.predict_values(x)
603609

604610

605-
def get_topk_labels(label_mapping: np.ndarray, preds: np.ndarray, top_k: int = 5) -> 'list[list[str]]':
611+
def get_topk_labels(label_mapping: np.ndarray,
612+
preds: np.ndarray,
613+
top_k: int = 5
614+
) -> list[list[str]]:
606615
"""Get top k predictions from decision values.
607616
608617
Args:

libmultilabel/linear/metrics.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1+
from __future__ import annotations
2+
13
import re
24

35
import numpy as np
46

5-
__all__ = ['RPrecision',
6-
'Precision',
7-
'F1',
8-
'MetricCollection',
9-
'get_metrics',
7+
__all__ = ['get_metrics',
108
'tabulate_metrics']
119

1210

@@ -100,20 +98,24 @@ def update(self, preds: np.ndarray, target: np.ndarray) -> None:
10098
for metric in self.metrics.values():
10199
metric.update(preds, target)
102100

103-
def compute(self) -> "dict[str, float]":
101+
def compute(self) -> dict[str, float]:
104102
ret = {}
105103
for name, metric in self.metrics.items():
106104
ret[name] = metric.compute()
107105
return ret
108106

109107

110-
def get_metrics(metric_threshold: float, monitor_metrics: list, num_classes: int, multiclass=False):
108+
def get_metrics(metric_threshold: float,
109+
monitor_metrics: list[str],
110+
num_classes: int,
111+
multiclass: bool = False
112+
) -> MetricCollection:
111113
"""Get a collection of metrics by their names.
112114
113115
Args:
114116
metric_threshold (float): The decision value threshold over which a
115117
label is predicted as positive.
116-
monitor_metrics (list): A list of strings naming the metrics.
118+
monitor_metrics (list[str]): A list of metric names.
117119
num_classes (int): The number of classes.
118120
multiclass (bool, optional): Enable multiclass mode. Defaults to False.
119121
@@ -139,7 +141,7 @@ def get_metrics(metric_threshold: float, monitor_metrics: list, num_classes: int
139141
return MetricCollection(metrics)
140142

141143

142-
def tabulate_metrics(metric_dict, split):
144+
def tabulate_metrics(metric_dict: dict[str, float], split: str) -> str:
143145
msg = f'====== {split} dataset evaluation result =======\n'
144146
header = '|'.join([f'{k:^18}' for k in metric_dict.keys()])
145147
values = '|'.join([f'{x * 100:^18.4f}' if isinstance(x, (np.floating,

0 commit comments

Comments
 (0)