Commit 9cc33e2

clarify lora adaptation

1 parent 5902e7e · commit 9cc33e2

File tree: 6 files changed, +41 −11 lines changed

  README.md
  requirements.txt
  setup.py
  wtpsplit/__init__.py
  wtpsplit/train/train_lora.py
  wtpsplit/utils/create_dummy_data.py

README.md

Lines changed: 8 additions & 5 deletions
@@ -194,7 +194,7 @@ pip install adapters==0.2.1 --no-dependencies
 cd ..
 ```
 
-Create data in this format:
+1. Create data in this format:
 ```python
 import torch
 
@@ -217,19 +217,22 @@ torch.save(
     "dummy-dataset.pth"
 )
 ```
-Note that there should not be any newlines within individual sentences! Your corpus should already be well-split.
+Note that there should not be any newlines within individual sentences! This now raises an error: each entry of the list must be a single sentence with no "\n" characters, so your corpus should already be well-split.
 
-Create/adapt config; provide base model via `model_name_or_path` and training data .pth via `text_path`:
+2. Create/adapt a config; provide the base model via `model_name_or_path` and the training data .pth via `text_path`:
 
 
 `configs/lora/lora_dummy_config.json`
 
-Train LoRA:
+We recommend starting from this config and adapting `model_name_or_path`, `output_dir`, and `text_path` if needed.
+You may also wish to adapt other aspects such as `adapter_config` and the batch sizes, but this is more experimental.
+
+3. Train LoRA:
 ```
 python3 wtpsplit/train/train_lora.py configs/lora/lora_dummy_config.json
 ```
 
-Once training is done, provide your saved module's path to SaT:
+4. Once training is done, provide your saved module's path to SaT:
 ```python
 
 sat_lora_adapted = SaT("model-used", lora_path="dummy_lora_path")
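
Read end to end, the renumbered steps form a complete round trip: serialize sentences, point the config at them, train, and reload. Below is a minimal sketch of that flow; the base model name "sat-3l", the language code "en", and the example sentences are illustrative assumptions, not values fixed by this commit.

```python
# Minimal sketch of the four README steps (assumed values: "sat-3l", "en").
import torch

from wtpsplit import SaT

# Step 1: one sentence per list entry, never any "\n" inside an entry.
torch.save(
    {
        "en": {
            "sentence": {
                "dummy-dataset": {
                    "meta": {"train_data": ["First sentence.", "Second sentence."]},
                    "data": ["First sentence.", "Second sentence."],
                }
            }
        }
    },
    "dummy-dataset.pth",
)

# Steps 2 and 3 happen outside Python: point `text_path` in
# configs/lora/lora_dummy_config.json at "dummy-dataset.pth", then run
#   python3 wtpsplit/train/train_lora.py configs/lora/lora_dummy_config.json

# Step 4: load the base model together with the freshly trained LoRA module.
sat_lora_adapted = SaT("sat-3l", lora_path="dummy_lora_path")
print(sat_lora_adapted.split("This is a test This is another test"))
```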

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -18,7 +18,6 @@ cohere
 replicate
 onnx
 onnxruntime
-torchinfo
 mosestokenizer
 cached_property
 tqdm

setup.py

Lines changed: 3 additions & 3 deletions
@@ -2,23 +2,23 @@
 
 setup(
     name="wtpsplit",
-    version="2.1.2",
+    version="2.1.3",
     packages=find_packages(),
     description="Universal Robust, Efficient and Adaptable Sentence Segmentation",
     author="Markus Frohmann, Igor Sterner, Benjamin Minixhofer",
     author_email="markus.frohmann@gmail.com",
     install_requires=[
         "onnxruntime>=1.13.1",
         "transformers>=4.22.2",
-        "huggingface-hub==0.25.2",  # see https://github.com/segment-any-text/wtpsplit/issues/135
+        "huggingface-hub",
         "numpy>=1.0",
         "scikit-learn>=1",
         "tqdm",
         "skops",
         "pandas>=1",
         "cached_property",  # for Py37
         "mosestokenizer",
-        "adapters",
+        "adapters>=1.0.1",
     ],
     url="https://github.com/segment-any-text/wtpsplit",
     package_data={"wtpsplit": ["data/*"]},

wtpsplit/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@
 from wtpsplit.extract import BertCharORTWrapper, SaTORTWrapper, PyTorchWrapper, extract
 from wtpsplit.utils import Constants, indices_to_sentences, sigmoid, token_to_char_probs
 
-__version__ = "2.1.2"
+__version__ = "2.1.3"
 
 warnings.simplefilter("default", DeprecationWarning)  # show by default
 warnings.simplefilter("ignore", category=FutureWarning)  # for tranformers

wtpsplit/train/train_lora.py

Lines changed: 10 additions & 1 deletion
@@ -126,6 +126,10 @@ def prepare_dataset(
     if one_sample_per_line or isinstance(dataset[0], list):
         processed_dataset = []
         for chunk in dataset:
+            if "\n" in chunk:
+                raise ValueError(
+                    "Newlines in text are not supported! Data needs to be processed as a list of sentences."
+                )
             processed_chunk = {}
             processed_chunk["lang"] = lang
             processed_chunk["ends_with_punctuation"] = chunk[-1].endswith(
@@ -137,10 +141,15 @@
         dataset = datasets.Dataset.from_list(processed_dataset)
 
     else:
+        for i, chunk in enumerate(dataset):
+            if "\n" in chunk:
+                raise ValueError(
+                    "Newlines in text are not supported! Data needs to be processed as a list of sentences."
+                )
         dataset = datasets.Dataset.from_list(
             [
                 {
-                    args.text_column: sample + "\n" if sample and sample[-1] != "\n" else sample,  # TODO
+                    args.text_column: sample + "\n" if sample and sample[-1] != "\n" else sample,
                     "lang": lang,
                     "ends_with_punctuation": sample.endswith(tuple(Constants.PUNCTUATION_CHARS)),
                 }
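
Both branches now fail fast on malformed corpora instead of silently training on mis-split text. The contract they enforce can be illustrated standalone; `check_corpus` below is our own hypothetical helper, not a function in wtpsplit:

```python
def check_corpus(corpus: list[str]) -> None:
    """Reject entries containing newlines, mirroring the new prepare_dataset checks."""
    for sentence in corpus:
        if "\n" in sentence:
            raise ValueError(
                "Newlines in text are not supported! "
                "Data needs to be processed as a list of sentences."
            )

check_corpus(["One sentence per entry.", "No embedded newlines."])  # passes
try:
    check_corpus(["Two sentences\nglued into one entry."])
except ValueError as err:
    print(err)  # the error introduced by this commit
```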

wtpsplit/utils/create_dummy_data.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+import torch
+
+torch.save(
+    {
+        "language_code": {
+            "sentence": {
+                "dummy-dataset": {
+                    "meta": {
+                        "train_data": ["train sentence 1", "train sentence 2"],
+                    },
+                    "data": [
+                        "train sentence 1", "train sentence 2"
+                    ]
+                }
+            }
+        }
+    },
+    "dummy-dataset.pth"
+)
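
The new helper writes exactly the nesting that the README's step 1 and train_lora.py expect. A quick sanity check (our own, not part of the repo) is to load the file back and walk that nesting:

```python
import torch

# Load the file written by wtpsplit/utils/create_dummy_data.py and walk
# the expected nesting: language code -> "sentence" -> dataset name.
data = torch.load("dummy-dataset.pth")
sentences = data["language_code"]["sentence"]["dummy-dataset"]["data"]
assert all("\n" not in s for s in sentences)  # the invariant train_lora.py now enforces
print(sentences)  # ['train sentence 1', 'train sentence 2']
```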
