[DEBUG] with expandable segments (#2841)

felipemello1 · Felipe Mello · web-flow · commit 9d91fe39f086 · 2025-06-20T22:02:16.000-04:00
Co-authored-by: Felipe Mello <felipemello@fb.com>
diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml
@@ -51,7 +51,7 @@ jobs:
         if: ${{ matrix.torch-version == 'stable' }}
         run: python -m pip install torch torchvision torchao
       - name: Install recipe-specific dependencies
-        run: python -m pip install lm-eval>=0.4.5
+        run: python -m pip install lm-eval==0.4.8
       - name: Install the torchtune library with dev options
         run: python -m pip install -e ".[dev]"
       - name: Run recipe and unit tests with coverage
diff --git a/.github/workflows/regression_test.yaml b/.github/workflows/regression_test.yaml
@@ -26,8 +26,6 @@ jobs:
         python-version: ['3.11']
         torch-version: ["stable", "nightly"]
       fail-fast: false
-    env:
-      PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
     steps:
       - name: Check out repo
         uses: actions/checkout@v4
@@ -57,7 +55,7 @@ jobs:
         run: python -m pip install torch torchvision torchao
       - name: Install remaining dependencies
         run: |
-          python -m pip install lm-eval>=0.4.5
+          python -m pip install lm-eval==0.4.8
           python -m pip install -e ".[dev]"
       - name: Run regression tests with coverage
         run: pytest tests -m slow_integration_test --silence-s3-logs --cov=. --cov-report=xml --durations=20 -vv
diff --git a/.github/workflows/rl_test.yaml b/.github/workflows/rl_test.yaml
@@ -59,7 +59,7 @@ jobs:
         if: ${{ matrix.torch-version == 'stable' }}
         run: python -m pip install torch torchvision torchao
       - name: Install recipe-specific dependencies
-        run: python -m pip install lm-eval>=0.4.5
+        run: python -m pip install lm-eval==0.4.8
       - name: Install the torchtune library with dev options
         run: python -m pip install -e ".[dev]"
       - name: Install the torchtune libary with async_rl options
diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py
@@ -461,10 +461,10 @@ def __init__(self, cfg: DictConfig) -> None:
         # Double check we have the right Eval Harness version
         from importlib.metadata import version
 
-        if version("lm-eval") < "0.4.5":
+        if version("lm-eval") < "0.4.5" or version("lm-eval") > "0.4.8":
             raise RuntimeError(
-                "This recipe requires EleutherAI Eval Harness v0.4.5 or higher. "
-                "Please install with `pip install lm-eval>=0.4.5`"
+                "This recipe requires EleutherAI Eval Harness between v0.4.5 - 0.4.8."
+                "Please install with `pip install lm-eval==0.4.8`"
             )
 
         # General variable initialization
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -4,6 +4,21 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import os
+import sys
+import warnings
+
+# Default to expandable segments to avoid memory fragmentation and peak reserved memory increasing over time.
+# To override, set PYTORCH_CUDA_ALLOC_CONF yourself (e.g. expandable_segments:False); an existing value is never replaced.
+if "PYTORCH_CUDA_ALLOC_CONF" not in os.environ:
+    if "torch" in sys.modules:
+        warnings.warn(
+            "The 'torch' module has already been imported. "
+            "Setting PYTORCH_CUDA_ALLOC_CONF may not have an effect. "
+            "For best results, set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True before importing 'torch'."
+        )
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
 # Check at the top-level that torchao is installed.
 # This is better than doing it at every import site.
 # We have to do this because it is not currently possible to
diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py
@@ -135,8 +135,8 @@ def test_eval_recipe_errors_without_lm_eval(self, monkeypatch, tmpdir):
         monkeypatch.setattr(sys, "argv", cmd)
         with pytest.raises(
             RuntimeError,
-            match="This recipe requires EleutherAI Eval Harness v0.4.5 or higher. "
-            "Please install with `pip install lm-eval>=0.4.5`",
+            match="This recipe requires EleutherAI Eval Harness between v0.4.5 - 0.4.8."
+            "Please install with `pip install lm-eval==0.4.8`",
         ):
             runpy.run_path(TUNE_PATH, run_name="__main__")
 
diff --git a/torchtune/__init__.py b/torchtune/__init__.py
@@ -7,6 +7,22 @@
 __version__ = ""
 
 
+import os
+import sys
+import warnings
+
+# Default to expandable segments to avoid memory fragmentation and peak reserved memory increasing over time.
+# To override, set PYTORCH_CUDA_ALLOC_CONF yourself (e.g. expandable_segments:False); an existing value is never replaced.
+if "PYTORCH_CUDA_ALLOC_CONF" not in os.environ:
+    if "torch" in sys.modules:
+        warnings.warn(
+            "The 'torch' module has already been imported. "
+            "Setting PYTORCH_CUDA_ALLOC_CONF may not have an effect. "
+            "For best results, set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True before importing 'torch'."
+        )
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
+
 # Check at the top-level that torchao is installed.
 # This is better than doing it at every import site.
 # We have to do this because it is not currently possible to