
Commit cb9f8a2

Merge pull request #275 from ntumlgroup/word_dict_none
Set the default value for `word_dict` and `embed_vecs`
2 parents 2255a70 + 64d215c commit cb9f8a2

6 files changed: +76 -45 lines changed

docs/examples/plot_KimCNN_quickstart.py

Lines changed: 1 addition & 4 deletions
@@ -37,7 +37,6 @@
 datasets = load_datasets('data/rcv1/train.txt', 'data/rcv1/test.txt', tokenize_text=True)
 classes = load_or_build_label(datasets)
 word_dict, embed_vecs = load_or_build_text_dict(dataset=datasets['train'], embed_file='glove.6B.300d')
-tokenizer = None
 
 ######################################################################
 # Initialize a model
@@ -91,13 +90,12 @@
 for split in ['train', 'val', 'test']:
     loaders[split] = get_dataset_loader(
         data=datasets[split],
-        word_dict=word_dict,
         classes=classes,
         device=device,
         max_seq_length=512,
         batch_size=8,
         shuffle=True if split == 'train' else False,
-        tokenizer=tokenizer
+        word_dict=word_dict
     )
 
 ######################################################################
@@ -125,4 +123,3 @@
 # 'P@3': 0.7772253751754761,
 # 'P@5': 0.5449321269989014,
 # }
-

docs/examples/plot_bert_quickstart.py

Lines changed: 12 additions & 16 deletions
@@ -18,7 +18,7 @@
 ######################################################################
 # Setup device
 # --------------------
-# If you need to reproduce the results, please use the function ``set_seed``. 
+# If you need to reproduce the results, please use the function ``set_seed``.
 # For example, you will get the same result as you always use the seed ``1337``.
 #
 # For initial a hardware device, please use ``init_device`` to assign the hardware device that you want to use.
@@ -29,12 +29,12 @@
 ######################################################################
 # Load and tokenize data
 # ------------------------------------------
-# We assume that the ``rcv1`` data is located at the directory ``./data/rcv1``, 
+# We assume that the ``rcv1`` data is located at the directory ``./data/rcv1``,
 # and there exist the files ``train.txt`` and ``test.txt``.
-# You can utilize the function ``load_datasets()`` to load the data sets. 
-# By default, LibMultiLabel tokenizes documents, but the BERT model uses its own tokenizer. 
+# You can utilize the function ``load_datasets()`` to load the data sets.
+# By default, LibMultiLabel tokenizes documents, but the BERT model uses its own tokenizer.
 # Thus, we must set ``tokenize_text=False``.
-# Note that ``datasets`` contains three sets: ``datasets['train']``, ``datasets['val']`` and ``datasets['test']``, 
+# Note that ``datasets`` contains three sets: ``datasets['train']``, ``datasets['val']`` and ``datasets['test']``,
 # where ``datasets['train']`` and ``datasets['val']`` are randomly splitted from ``train.txt`` with the ratio ``8:2``.
 #
 # For the labels of the data, we apply the function ``load_or_build_label()`` to generate the label set.
@@ -44,7 +44,6 @@
 
 datasets = load_datasets('data/rcv1/train.txt', 'data/rcv1/test.txt', tokenize_text=False)
 classes = load_or_build_label(datasets)
-word_dict, embed_vecs = None, None
 tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
 
 ######################################################################
@@ -63,8 +62,6 @@
     model_name=model_name,
     network_config=network_config,
     classes=classes,
-    word_dict=word_dict,
-    embed_vecs=embed_vecs,
     learning_rate=learning_rate,
     monitor_metrics=['Micro-F1', 'Macro-F1', 'P@1', 'P@3', 'P@5']
 )
@@ -80,7 +77,7 @@
 # Initialize a trainer
 # ----------------------------
 #
-# We use the function ``init_trainer`` to initialize a trainer. 
+# We use the function ``init_trainer`` to initialize a trainer.
 
 trainer = init_trainer(checkpoint_dir='runs/NN-example', epochs=15, val_metric='P@5')
 
@@ -97,7 +94,6 @@
 for split in ['train', 'val', 'test']:
     loaders[split] = get_dataset_loader(
         data=datasets[split],
-        word_dict=word_dict,
         classes=classes,
         device=device,
         max_seq_length=512,
@@ -112,7 +108,7 @@
 # Train and test a model
 # ------------------------------
 #
-# The bert model training process can be started via 
+# The bert model training process can be started via
 
 trainer.fit(model, loaders['train'], loaders['val'])
 
@@ -125,9 +121,9 @@
 # The results should be similar to::
 #
 # {
-# 'Macro-F1': 0.569891024909958, 
-# 'Micro-F1': 0.8142925500869751, 
-# 'P@1': 0.9552904367446899, 
-# 'P@3': 0.7907078266143799, 
+# 'Macro-F1': 0.569891024909958,
+# 'Micro-F1': 0.8142925500869751,
+# 'P@1': 0.9552904367446899,
+# 'P@3': 0.7907078266143799,
 # 'P@5': 0.5505486726760864
-# }
+# }
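
With this commit, the BERT quickstart no longer defines the placeholder line ``word_dict, embed_vecs = None, None``, and the data loader receives the HuggingFace tokenizer as a keyword-only argument. A minimal sketch of the updated flow, assuming the same RCV1 setup as the quickstart; the `model_name` and `network_config` values below are illustrative placeholders, not part of this commit:

from transformers import AutoTokenizer

from libmultilabel.nn.data_utils import get_dataset_loader, load_datasets, load_or_build_label
from libmultilabel.nn.nn_utils import init_device, init_model

device = init_device()  # pass use_cpu=True to stay on CPU
datasets = load_datasets('data/rcv1/train.txt', 'data/rcv1/test.txt', tokenize_text=False)
classes = load_or_build_label(datasets)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# word_dict and embed_vecs now default to None, so a BERT model simply omits them.
model = init_model(
    model_name='BERT',                                  # illustrative; substitute the quickstart's model_name
    network_config={'lm_weight': 'bert-base-uncased'},  # illustrative network settings
    classes=classes,
    learning_rate=0.0001,
    monitor_metrics=['Micro-F1', 'Macro-F1', 'P@1', 'P@3', 'P@5'],
)

# The loader now takes the tokenizer as a keyword-only argument; no word_dict is passed.
loaders = {
    split: get_dataset_loader(
        data=datasets[split],
        classes=classes,
        device=device,
        max_seq_length=512,
        batch_size=8,
        shuffle=(split == 'train'),
        tokenizer=tokenizer,
    )
    for split in ['train', 'val', 'test']
}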

docs/examples/plot_dataset_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -65,4 +65,4 @@
 
 from libmultilabel.nn.data_utils import load_datasets
 
-datasets = load_datasets(data_sets['train'], data_sets['test'], tokenize_text=False)
+datasets = load_datasets(data_sets['train'], data_sets['test'], tokenize_text=False)

libmultilabel/nn/data_utils.py

Lines changed: 47 additions & 14 deletions
@@ -11,7 +11,7 @@
 from sklearn.preprocessing import MultiLabelBinarizer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import Dataset
-from torchtext.vocab import build_vocab_from_iterator, pretrained_aliases
+from torchtext.vocab import build_vocab_from_iterator, pretrained_aliases, Vocab
 from tqdm import tqdm
 
 transformers.logging.set_verbosity_error()
@@ -22,24 +22,48 @@
 
 
 class TextDataset(Dataset):
-    """Class for text dataset"""
+    """Class for text dataset.
 
-    def __init__(self, data, word_dict, classes, max_seq_length, tokenizer=None, add_special_tokens=True):
+    Args:
+        data (list[dict]): List of instances with index, label, and text.
+        classes (list): List of labels.
+        max_seq_length (int, optional): The maximum number of tokens of a sample.
+        add_special_tokens (bool, optional): Whether to add the special tokens. Defaults to True.
+        tokenizer (transformers.PreTrainedTokenizerBase, optional): HuggingFace's tokenizer of
+            the transformer-based pretrained language model. Defaults to None.
+        word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
+            map tokens to indices. Defaults to None.
+    """
+    def __init__(
+        self,
+        data,
+        classes,
+        max_seq_length,
+        add_special_tokens=True,
+        *,
+        tokenizer=None,
+        word_dict=None,
+    ):
         self.data = data
-        self.word_dict = word_dict
         self.classes = classes
         self.max_seq_length = max_seq_length
-        self.num_classes = len(self.classes)
-        self.label_binarizer = MultiLabelBinarizer().fit([classes])
+        self.word_dict = word_dict
         self.tokenizer = tokenizer
         self.add_special_tokens = add_special_tokens
 
+        self.num_classes = len(self.classes)
+        self.label_binarizer = MultiLabelBinarizer().fit([classes])
+
+        if not isinstance(self.word_dict, Vocab) ^ isinstance(
+                self.tokenizer, transformers.PreTrainedTokenizerBase):
+            raise ValueError(
+                'Please specify exactly one of word_dict or tokenizer')
+
     def __len__(self):
         return len(self.data)
 
     def __getitem__(self, index):
         data = self.data[index]
-
         if self.tokenizer is not None:  # transformers tokenizer
             if self.add_special_tokens:  # tentatively hard code
                 input_ids = self.tokenizer.encode(data['text'],
@@ -83,35 +107,44 @@ def generate_batch(data_batch):
 
 def get_dataset_loader(
     data,
-    word_dict,
     classes,
     device,
     max_seq_length=500,
     batch_size=1,
     shuffle=False,
     data_workers=4,
+    add_special_tokens=True,
+    *,
     tokenizer=None,
-    add_special_tokens=True
+    word_dict=None,
 ):
     """Create a pytorch DataLoader.
 
     Args:
-        data (list): List of training instances with index, label, and tokenized text.
-        word_dict (torchtext.vocab.Vocab): A vocab object which maps tokens to indices.
+        data (list[dict]): List of training instances with index, label, and tokenized text.
        classes (list): List of labels.
         device (torch.device): One of cuda or cpu.
         max_seq_length (int, optional): The maximum number of tokens of a sample. Defaults to 500.
         batch_size (int, optional): Size of training batches. Defaults to 1.
         shuffle (bool, optional): Whether to shuffle training data before each epoch. Defaults to False.
         data_workers (int, optional): Use multi-cpu core for data pre-processing. Defaults to 4.
-        tokenizer (optional): Tokenizer of the transformer-based language model. Defaults to None.
         add_special_tokens (bool, optional): Whether to add the special tokens. Defaults to True.
+        tokenizer (transformers.PreTrainedTokenizerBase, optional): HuggingFace's tokenizer of
+            the transformer-based pretrained language model. Defaults to None.
+        word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
+            map tokens to indices. Defaults to None.
 
     Returns:
         torch.utils.data.DataLoader: A pytorch DataLoader.
     """
-    dataset = TextDataset(data, word_dict, classes, max_seq_length, tokenizer=tokenizer,
-                          add_special_tokens=add_special_tokens)
+    dataset = TextDataset(
+        data,
+        classes,
+        max_seq_length,
+        word_dict=word_dict,
+        tokenizer=tokenizer,
+        add_special_tokens=add_special_tokens
+    )
     dataset_loader = torch.utils.data.DataLoader(
         dataset,
         batch_size=batch_size,
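
`TextDataset` (and therefore `get_dataset_loader`) now requires exactly one of the keyword-only arguments `word_dict` or `tokenizer`; passing both or neither raises the `ValueError` shown above. A small sketch of the two valid call patterns, assuming the RCV1 files used by the quickstarts:

from transformers import AutoTokenizer

from libmultilabel.nn.data_utils import (get_dataset_loader, load_datasets,
                                         load_or_build_label, load_or_build_text_dict)
from libmultilabel.nn.nn_utils import init_device

device = init_device()

# Word-based path (e.g. KimCNN): pre-tokenized text plus a vocabulary passed via word_dict.
datasets = load_datasets('data/rcv1/train.txt', 'data/rcv1/test.txt', tokenize_text=True)
classes = load_or_build_label(datasets)
word_dict, embed_vecs = load_or_build_text_dict(dataset=datasets['train'], embed_file='glove.6B.300d')
word_loader = get_dataset_loader(data=datasets['train'], classes=classes,
                                 device=device, word_dict=word_dict)

# Transformer path (e.g. BERT): raw text plus a HuggingFace tokenizer instead of a vocabulary.
bert_datasets = load_datasets('data/rcv1/train.txt', 'data/rcv1/test.txt', tokenize_text=False)
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_loader = get_dataset_loader(data=bert_datasets['train'], classes=classes,
                                 device=device, tokenizer=bert_tokenizer)

# Supplying both word_dict and tokenizer, or neither, fails fast with:
#   ValueError: Please specify exactly one of word_dict or tokenizer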

libmultilabel/nn/nn_utils.py

Lines changed: 14 additions & 9 deletions
@@ -37,8 +37,8 @@ def init_device(use_cpu=False):
 def init_model(model_name,
                network_config,
                classes,
-               word_dict,
-               embed_vecs,
+               word_dict=None,
+               embed_vecs=None,
                init_weight=None,
                log_path=None,
                learning_rate=0.0001,
@@ -57,8 +57,10 @@ def init_model(model_name,
         model_name (str): Model to be used such as KimCNN.
         network_config (dict): Configuration for defining the network.
         classes (list): List of class names.
-        word_dict (torchtext.vocab.Vocab): A vocab object which maps tokens to indices.
-        embed_vecs (torch.Tensor): The pre-trained word vectors of shape (vocab_size, embed_dim).
+        word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
+            map tokens to indices. Defaults to None.
+        embed_vecs (torch.Tensor, optional): The pre-trained word vectors of shape
+            (vocab_size, embed_dim). Defaults to None.
         init_weight (str): Weight initialization method from `torch.nn.init`.
             For example, the `init_weight` of `torch.nn.init.kaiming_uniform_`
             is `kaiming_uniform`. Defaults to None.
@@ -79,11 +81,14 @@ def init_model(model_name,
         Model: A class that implements `MultiLabelModel` for initializing and training a neural network.
     """
 
-    network = getattr(networks, model_name)(
-        embed_vecs=embed_vecs,
-        num_classes=len(classes),
-        **dict(network_config)
-    )
+    try:
+        network = getattr(networks, model_name)(
+            embed_vecs=embed_vecs,
+            num_classes=len(classes),
+            **dict(network_config)
+        )
+    except:
+        raise AttributeError(f'Failed to initialize {model_name}.')
 
     if init_weight is not None:
         init_weight = networks.get_init_weight_func(
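
`init_model` now defaults `word_dict` and `embed_vecs` to `None` and wraps network construction, so an unknown `model_name` or an incompatible `network_config` surfaces as `AttributeError: Failed to initialize <model_name>.`. A hedged sketch of both call styles; the `network_config` keys and the `'BERT'` model name below are illustrative placeholders, not taken from this commit:

from libmultilabel.nn.data_utils import load_datasets, load_or_build_label, load_or_build_text_dict
from libmultilabel.nn.nn_utils import init_model

datasets = load_datasets('data/rcv1/train.txt', 'data/rcv1/test.txt', tokenize_text=True)
classes = load_or_build_label(datasets)
word_dict, embed_vecs = load_or_build_text_dict(dataset=datasets['train'], embed_file='glove.6B.300d')

# Word-based model: word_dict and embed_vecs are still supplied explicitly.
kimcnn = init_model(
    model_name='KimCNN',
    network_config={'filter_sizes': [2, 4, 8]},  # illustrative config keys
    classes=classes,
    word_dict=word_dict,
    embed_vecs=embed_vecs,
)

# Transformer model: the new defaults let callers omit word_dict and embed_vecs entirely.
bert = init_model(
    model_name='BERT',                                  # illustrative model name
    network_config={'lm_weight': 'bert-base-uncased'},  # illustrative config keys
    classes=classes,
)

# A misspelled model_name no longer escapes as a bare getattr error; it is reported as
#   AttributeError: Failed to initialize <model_name>.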

torch_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -182,13 +182,13 @@ def _get_dataset_loader(self, split, shuffle=False):
         """
         return data_utils.get_dataset_loader(
             data=self.datasets[split],
-            word_dict=self.model.word_dict,
             classes=self.model.classes,
             device=self.device,
             max_seq_length=self.config.max_seq_length,
             batch_size=self.config.batch_size if split == 'train' else self.config.eval_batch_size,
             shuffle=shuffle,
             data_workers=self.config.data_workers,
+            word_dict=self.model.word_dict,
             tokenizer=self.tokenizer,
             add_special_tokens=self.config.add_special_tokens
         )
