Skip to content

Commit 9d91fe3

Browse files
felipemello1 and Felipe Mello authored
[DEBUG] with expandable segments (#2841)
Co-authored-by: Felipe Mello <felipemello@fb.com>
1 parent 05b3b07 commit 9d91fe3

File tree

7 files changed

+39
-10
lines changed

7 files changed

+39
-10
lines changed

.github/workflows/gpu_test.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ jobs:
5151
if: ${{ matrix.torch-version == 'stable' }}
5252
run: python -m pip install torch torchvision torchao
5353
- name: Install recipe-specific dependencies
54-
run: python -m pip install lm-eval>=0.4.5
54+
run: python -m pip install lm-eval==0.4.8
5555
- name: Install the torchtune library with dev options
5656
run: python -m pip install -e ".[dev]"
5757
- name: Run recipe and unit tests with coverage

.github/workflows/regression_test.yaml

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,6 @@ jobs:
2626
python-version: ['3.11']
2727
torch-version: ["stable", "nightly"]
2828
fail-fast: false
29-
env:
30-
PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
3129
steps:
3230
- name: Check out repo
3331
uses: actions/checkout@v4
@@ -57,7 +55,7 @@ jobs:
5755
run: python -m pip install torch torchvision torchao
5856
- name: Install remaining dependencies
5957
run: |
60-
python -m pip install lm-eval>=0.4.5
58+
python -m pip install lm-eval==0.4.8
6159
python -m pip install -e ".[dev]"
6260
- name: Run regression tests with coverage
6361
run: pytest tests -m slow_integration_test --silence-s3-logs --cov=. --cov-report=xml --durations=20 -vv

.github/workflows/rl_test.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@ jobs:
5959
if: ${{ matrix.torch-version == 'stable' }}
6060
run: python -m pip install torch torchvision torchao
6161
- name: Install recipe-specific dependencies
62-
run: python -m pip install lm-eval>=0.4.5
62+
run: python -m pip install lm-eval==0.4.8
6363
- name: Install the torchtune library with dev options
6464
run: python -m pip install -e ".[dev]"
6565
- name: Install the torchtune libary with async_rl options

recipes/eleuther_eval.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -461,10 +461,10 @@ def __init__(self, cfg: DictConfig) -> None:
461461
# Double check we have the right Eval Harness version
462462
from importlib.metadata import version
463463

464-
if version("lm-eval") < "0.4.5":
464+
if version("lm-eval") < "0.4.5" or version("lm-eval") > "0.4.8":
465465
raise RuntimeError(
466-
"This recipe requires EleutherAI Eval Harness v0.4.5 or higher. "
467-
"Please install with `pip install lm-eval>=0.4.5`"
466+
"This recipe requires EleutherAI Eval Harness between v0.4.5 - 0.4.8."
467+
"Please install with `pip install lm-eval==0.4.8`"
468468
)
469469

470470
# General variable initialization

tests/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,21 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
import os
8+
import sys
9+
import warnings
10+
11+
# Avoid memory fragmentation and peak reserved memory increasing over time
12+
# To overwrite, set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False
13+
if "PYTORCH_CUDA_ALLOC_CONF" not in os.environ:
14+
if "torch" in sys.modules:
15+
warnings.warn(
16+
"The 'torch' module has already been imported. "
17+
"Setting PYTORCH_CUDA_ALLOC_CONF may not have an effect."
18+
"For best results, set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True before importing 'torch'."
19+
)
20+
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
21+
722
# Check at the top-level that torchao is installed.
823
# This is better than doing it at every import site.
924
# We have to do this because it is not currently possible to

tests/recipes/test_eleuther_eval.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -135,8 +135,8 @@ def test_eval_recipe_errors_without_lm_eval(self, monkeypatch, tmpdir):
135135
monkeypatch.setattr(sys, "argv", cmd)
136136
with pytest.raises(
137137
RuntimeError,
138-
match="This recipe requires EleutherAI Eval Harness v0.4.5 or higher. "
139-
"Please install with `pip install lm-eval>=0.4.5`",
138+
match="This recipe requires EleutherAI Eval Harness between v0.4.5 - 0.4.8."
139+
"Please install with `pip install lm-eval==0.4.8`",
140140
):
141141
runpy.run_path(TUNE_PATH, run_name="__main__")
142142

torchtune/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,22 @@
77
__version__ = ""
88

99

10+
import os
11+
import sys
12+
import warnings
13+
14+
# Avoid memory fragmentation and peak reserved memory increasing over time
15+
# To overwrite, set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False
16+
if "PYTORCH_CUDA_ALLOC_CONF" not in os.environ:
17+
if "torch" in sys.modules:
18+
warnings.warn(
19+
"The 'torch' module has already been imported. "
20+
"Setting PYTORCH_CUDA_ALLOC_CONF may not have an effect."
21+
"For best results, set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True before importing 'torch'."
22+
)
23+
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
24+
25+
1026
# Check at the top-level that torchao is installed.
1127
# This is better than doing it at every import site.
1228
# We have to do this because it is not currently possible to

0 commit comments

Comments
 (0)