From 6f7143c9f32ca768597733f8cc660348b968b98b Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sat, 3 May 2025 22:35:42 +0300 Subject: [PATCH 01/17] Add memory usage monitor callback --- keras/src/callbacks/__init__.py | 1 + keras/src/callbacks/memory_usage_callback.py | 238 ++++++++++++++++++ .../callbacks/memory_usage_callback_test.py | 182 ++++++++++++++ requirements-common.txt | 1 + 4 files changed, 422 insertions(+) create mode 100644 keras/src/callbacks/memory_usage_callback.py create mode 100644 keras/src/callbacks/memory_usage_callback_test.py diff --git a/keras/src/callbacks/__init__.py b/keras/src/callbacks/__init__.py index d8c835a418d4..4bd4ccb84df5 100644 --- a/keras/src/callbacks/__init__.py +++ b/keras/src/callbacks/__init__.py @@ -13,3 +13,4 @@ from keras.src.callbacks.swap_ema_weights import SwapEMAWeights from keras.src.callbacks.tensorboard import TensorBoard from keras.src.callbacks.terminate_on_nan import TerminateOnNaN +from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py new file mode 100644 index 000000000000..8b5dd2105672 --- /dev/null +++ b/keras/src/callbacks/memory_usage_callback.py @@ -0,0 +1,238 @@ +import os +import warnings +import tensorflow as tf # Ensure TF is imported for tf.summary +from keras.src import backend as K +from keras.src.api_export import keras_export +from keras.src.callbacks.callback import Callback + +# Attempt to import psutil and warn if unavailable. +try: + import psutil +except ImportError: + psutil = None + +@keras_export("keras.callbacks.MemoryUsageCallback") +class MemoryUsageCallback(Callback): + """Callback for enhanced monitoring of memory usage during training. + + This callback tracks CPU memory usage via `psutil.Process().memory_info().rss` + and optionally GPU memory usage via backend-specific APIs (TensorFlow, PyTorch, JAX). + Memory statistics are logged to stdout at the start and end of each epoch and, + optionally, after every batch. Additionally, metrics are logged to TensorBoard + if a log directory is provided, using integer steps for proper visualization. + + Note: GPU memory reporting consistency across backends (TF, PyTorch, JAX) + may vary, as they use different underlying mechanisms to measure usage + (e.g., framework overhead vs. purely tensor allocations). + + Args: + monitor_gpu (bool): Whether to monitor GPU memory. Defaults to True. + Requires appropriate backend (TensorFlow, PyTorch, JAX) with GPU + support and necessary drivers/libraries installed. + log_every_batch (bool): Whether to log memory usage after each batch + in addition to epoch start/end. Defaults to False. + tensorboard_log_dir (str, optional): Path to the directory where TensorBoard + logs will be written using `tf.summary`. If None, TensorBoard logging + is disabled. Defaults to None. Requires TensorFlow to be installed. + + Raises: + ImportError: If `psutil` is not installed. 
+ + Example: + ```python + import tensorflow as tf + import keras + from keras.callbacks import MemoryUsageCallback # Use public API path + import numpy as np + + # Ensure psutil is installed: pip install psutil + + memory_callback = MemoryUsageCallback( + monitor_gpu=True, # Set based on GPU availability and backend support + log_every_batch=False, + tensorboard_log_dir="~/logs/memory_usage" # Needs TF installed + ) + + model = keras.models.Sequential([ + keras.layers.Dense(64, activation='relu', input_shape=(100,)), + keras.layers.Dense(10, activation='softmax') + ]) + model.compile(optimizer='adam', loss='categorical_crossentropy') + + x_train = np.random.random((100, 100)) + y_train = keras.utils.to_categorical( + np.random.randint(10, size=(100, 1)), num_classes=10 + ) + model.fit(x_train, y_train, epochs=5, batch_size=32, callbacks=[memory_callback]) + ``` + """ + def __init__(self, monitor_gpu=True, log_every_batch=False, tensorboard_log_dir=None): + super().__init__() + if psutil is None: + raise ImportError( + "MemoryUsageCallback requires the 'psutil' library. " + "Please install it using 'pip install psutil'." + ) + self.monitor_gpu = monitor_gpu + self.log_every_batch = log_every_batch + self.process = psutil.Process() + self.tb_writer = None + self._total_batches_seen = 0 # For TensorBoard step counting + + if tensorboard_log_dir: + # tf.summary requires TensorFlow installed. + if tf is None: + warnings.warn( + "MemoryUsageCallback: TensorFlow is required for TensorBoard logging. " + "Please install TensorFlow.", ImportWarning + ) + self.tb_writer = None + else: + try: + log_dir = os.path.expanduser(tensorboard_log_dir) + # Use tf.summary for robust integration + self.tb_writer = tf.summary.create_file_writer(log_dir) + print(f"MemoryUsageCallback: TensorBoard logging initialized at {log_dir}") + except Exception as e: + warnings.warn(f"Error initializing TensorBoard writer: {e}", RuntimeWarning) + self.tb_writer = None + + def on_train_begin(self, logs=None): + """Reset batch counter at the start of training.""" + self._total_batches_seen = 0 + + def on_epoch_begin(self, epoch, logs=None): + """Log memory usage at the beginning of each epoch.""" + cpu_mem = self._get_cpu_memory() + gpu_mem = self._get_gpu_memory() + self._log_memory( + label=f"Epoch {epoch} start", + step=epoch, # Use epoch number for TB step + cpu_mem=cpu_mem, + gpu_mem=gpu_mem + ) + + def on_epoch_end(self, epoch, logs=None): + """Log memory usage at the end of each epoch.""" + cpu_mem = self._get_cpu_memory() + gpu_mem = self._get_gpu_memory() + # Use epoch + 1 for TB step to mark the end point distinctly + self._log_memory( + label=f"Epoch {epoch} end", + step=epoch + 1, + cpu_mem=cpu_mem, + gpu_mem=gpu_mem + ) + + def on_batch_end(self, batch, logs=None): + """If enabled, log memory usage at the end of each batch.""" + if self.log_every_batch: + cpu_mem = self._get_cpu_memory() + gpu_mem = self._get_gpu_memory() + # Use the total batches seen count for a continuous TB step + self._log_memory( + label=f"Batch {self._total_batches_seen} end", + step=self._total_batches_seen, + cpu_mem=cpu_mem, + gpu_mem=gpu_mem + ) + # Always increment, even if not logging + self._total_batches_seen += 1 + + def on_train_end(self, logs=None): + """Clean up the TensorBoard writer.""" + if self.tb_writer: + self.tb_writer.close() + self.tb_writer = None + + def _get_cpu_memory(self): + """Return current process CPU memory usage in MB.""" + return self.process.memory_info().rss / (1024 ** 2) + + def _get_gpu_memory(self): + 
"""Return current GPU memory usage in MB based on backend.""" + if not self.monitor_gpu: + return None + + backend = K.backend() + gpu_mem_mb = None + try: + if backend == "tensorflow": + gpus = tf.config.list_physical_devices("GPU") + if not gpus: return None + total_mem_bytes = 0 + for gpu in gpus: + mem_info = tf.config.experimental.get_memory_info(gpu.name) + total_mem_bytes += mem_info.get("current", 0) + gpu_mem_mb = total_mem_bytes / (1024 ** 2) + + elif backend == "torch": + # Note: memory_allocated() tracks only tensors, might differ from TF. + import torch + if not torch.cuda.is_available(): return None + # Sum memory allocated across all visible GPUs + total_mem_bytes = sum(torch.cuda.memory_allocated(i) for i in range(torch.cuda.device_count())) + gpu_mem_mb = total_mem_bytes / (1024 ** 2) + + elif backend == "jax": + # Note: JAX memory stats might also differ from TF/Torch in scope. + import jax + devices = jax.devices() + gpu_devices = [d for d in devices if d.platform.upper() == 'GPU'] # Filter for GPU devices + if not gpu_devices: return None + total_mem_bytes = 0 + for device in gpu_devices: + try: + # memory_stats() might not be available or API could change + stats = device.memory_stats() + total_mem_bytes += stats.get("bytes_in_use", stats.get("allocated_bytes", 0)) # Try common keys + except Exception: + # Ignore if stats are unavailable for a device + pass + gpu_mem_mb = total_mem_bytes / (1024 ** 2) + + else: + if not hasattr(self, '_backend_warned'): + warnings.warn(f"Unsupported backend '{backend}' for GPU memory monitoring.", RuntimeWarning) + self._backend_warned = True + return None + + except ImportError as e: + # Backend library might not be installed + if not hasattr(self, f'_{backend}_import_warned'): + warnings.warn(f"MemoryUsageCallback: Could not import library for backend '{backend}': {e}. 
" + f"GPU monitoring disabled for this backend.", RuntimeWarning) + setattr(self, f'_{backend}_import_warned', True) + return None + except Exception as e: + # Catch other potential errors during memory retrieval + if not hasattr(self, f'_{backend}_error_warned'): + warnings.warn(f"MemoryUsageCallback: Error retrieving GPU memory info for backend '{backend}': {e}", RuntimeWarning) + setattr(self, f'_{backend}_error_warned', True) + return None + + return gpu_mem_mb + + + def _log_memory(self, label, step, cpu_mem, gpu_mem): + """Log memory metrics to stdout and potentially TensorBoard.""" + message = f"{label} - CPU Memory: {cpu_mem:.2f} MB" + if gpu_mem is not None: + message += f"; GPU Memory: {gpu_mem:.2f} MB" + print(message) # Log to stdout + + # Log to TensorBoard if writer is configured + if self.tb_writer: + try: + with self.tb_writer.as_default(step=int(step)): + tf.summary.scalar("Memory/CPU_MB", cpu_mem) + if gpu_mem is not None: + tf.summary.scalar("Memory/GPU_MB", gpu_mem) + self.tb_writer.flush() + except Exception as e: + # Catch potential errors during logging (e.g., writer closed unexpectedly) + if not hasattr(self, '_tb_log_error_warned'): + warnings.warn(f"MemoryUsageCallback: Error writing to TensorBoard: {e}", RuntimeWarning) + self._tb_log_error_warned = True + # Optionally disable writer if logging fails persistently + # self.tb_writer = None \ No newline at end of file diff --git a/keras/src/callbacks/memory_usage_callback_test.py b/keras/src/callbacks/memory_usage_callback_test.py new file mode 100644 index 000000000000..bc14b7fd4829 --- /dev/null +++ b/keras/src/callbacks/memory_usage_callback_test.py @@ -0,0 +1,182 @@ +import os +import glob +import re +import sys +import tempfile + +import numpy as np +import pytest +import tensorflow as tf + +from io import StringIO +from contextlib import redirect_stdout +from importlib import reload +from unittest.mock import patch, MagicMock + +from keras.src.models import Sequential +from keras.src.layers import Dense +from keras.src.testing import TestCase +from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback + +# Skip the class entirely if psutil is missing +try: + import psutil +except ImportError: + psutil = None + + +@pytest.mark.skipif(psutil is None, reason="psutil is required for MemoryUsageCallback tests.") +class MemoryUsageCallbackTest(TestCase): + def setUp(self): + super().setUp() + # Dummy data + self.x_train = np.random.random((20, 10)).astype(np.float32) + self.y_train = np.random.randint(0, 2, (20, 1)).astype(np.float32) + # Simple model + self.model = Sequential([ + Dense(5, activation="relu", input_shape=(10,)), + Dense(1, activation="sigmoid") + ]) + self.model.compile(optimizer="adam", loss="binary_crossentropy") + + self.epochs = 2 + self.batch_size = 5 + self.steps_per_epoch = len(self.x_train) // self.batch_size + + @pytest.mark.requires_trainable_backend + def test_callback_logs_stdout(self): + """Epoch‐level stdout logging works as expected.""" + out = StringIO() + with redirect_stdout(out): + gpu_avail = bool(tf.config.list_physical_devices("GPU")) + cb = MemoryUsageCallback(monitor_gpu=gpu_avail, log_every_batch=False) + self.model.fit( + self.x_train, self.y_train, + epochs=self.epochs, + batch_size=self.batch_size, + callbacks=[cb], + verbose=0 + ) + cap = out.getvalue() + for i in range(self.epochs): + self.assertIn(f"Epoch {i} start - CPU Memory:", cap) + self.assertIn(f"Epoch {i} end - CPU Memory:", cap) + self.assertRegex(cap, rf"Epoch {i} start - CPU Memory: [\d.e+-]+ 
MB") + self.assertRegex(cap, rf"Epoch {i} end - CPU Memory: [\d.e+-]+ MB") + if gpu_avail: + self.assertIn("GPU Memory:", cap) + else: + self.assertNotIn("GPU Memory:", cap) + + @pytest.mark.requires_trainable_backend + def test_log_every_batch_stdout(self): + """Batch‐level stdout logging works when enabled.""" + out = StringIO() + with redirect_stdout(out): + gpu_avail = bool(tf.config.list_physical_devices("GPU")) + cb = MemoryUsageCallback(monitor_gpu=gpu_avail, log_every_batch=True) + self.model.fit( + self.x_train, self.y_train, + epochs=self.epochs, + batch_size=self.batch_size, + callbacks=[cb], + verbose=0 + ) + lines = out.getvalue().splitlines() + total_batches = self.epochs * self.steps_per_epoch + batch_regex = r"Batch \d+ end - CPU Memory: [\d.e+-]+ MB" + count = sum(1 for line in lines if re.search(batch_regex, line)) + self.assertEqual(count, total_batches) + + @pytest.mark.requires_trainable_backend + def test_tensorboard_logging_file_creation(self): + """TensorBoard writer creates event files in given directory.""" + with tempfile.TemporaryDirectory() as tmp_dir: + gpu_avail = bool(tf.config.list_physical_devices("GPU")) + log_dir = os.path.join(tmp_dir, "tb_logs") + cb = MemoryUsageCallback( + monitor_gpu=gpu_avail, + log_every_batch=True, + tensorboard_log_dir=log_dir + ) + # The directory should be created by the callback __init__ + assert os.path.isdir(log_dir) + self.model.fit( + self.x_train, self.y_train, + epochs=self.epochs, + batch_size=self.batch_size, + callbacks=[cb], + verbose=0 + ) + event_files = glob.glob(os.path.join(log_dir, "events.out.tfevents.*")) + self.assertGreater(len(event_files), 0) + + @pytest.mark.requires_trainable_backend + def test_get_gpu_memory(self): + """_get_gpu_memory returns float or None depending on availability.""" + cb_gpu = MemoryUsageCallback(monitor_gpu=True, log_every_batch=False) + mem = cb_gpu._get_gpu_memory() + if tf.config.list_physical_devices("GPU"): + assert isinstance(mem, float) and mem >= 0.0 + else: + assert mem is None + + cb_no = MemoryUsageCallback(monitor_gpu=False, log_every_batch=False) + assert cb_no._get_gpu_memory() is None + + def test_raises_if_psutil_missing(self): + """Constructor raises ImportError when psutil is unavailable.""" + import keras.src.callbacks.memory_usage_callback as mod + orig = getattr(mod, 'psutil', None) + with patch.dict(sys.modules, {"psutil": None}): + with pytest.raises(ImportError): + reload(mod) + _ = mod.MemoryUsageCallback() + # Restore + if orig is not None: + sys.modules["psutil"] = orig + reload(mod) + + + + +@pytest.mark.requires_trainable_backend +def test_torch_backend_gpu_memory(monkeypatch): + """Simulate PyTorch backend and verify GPU memory sum.""" + import keras.src.backend as B + monkeypatch.setattr(B, "backend", lambda: "torch") + + # Create a fake torch module + fake_torch = MagicMock() + fake_torch.cuda.is_available.return_value = True + fake_torch.cuda.device_count.return_value = 2 + # Each device allocates 100 MB and 150 MB + fake_torch.cuda.memory_allocated.side_effect = [100 * 1024**2, 150 * 1024**2] + monkeypatch.setitem(sys.modules, "torch", fake_torch) + + cb = MemoryUsageCallback(monitor_gpu=True) + mem = cb._get_gpu_memory() + # Expect (100 + 150) MB + assert pytest.approx(250, rel=1e-6) == mem + + +@pytest.mark.requires_trainable_backend +def test_jax_backend_gpu_memory(monkeypatch): + """Simulate JAX backend and verify GPU memory sum.""" + import keras.src.backend as B + monkeypatch.setattr(B, "backend", lambda: "jax") + + # Fake JAX devices + 
class FakeDevice: + platform = "gpu" + def memory_stats(self): + return {"bytes_in_use": 200 * 1024**2} + + fake_jax = MagicMock() + fake_jax.devices.return_value = [FakeDevice(), FakeDevice()] + monkeypatch.setitem(sys.modules, "jax", fake_jax) + + cb = MemoryUsageCallback(monitor_gpu=True) + mem = cb._get_gpu_memory() + # Expect 2 * 200 MB + assert pytest.approx(400, rel=1e-6) == mem diff --git a/requirements-common.txt b/requirements-common.txt index 08d81c03f3d9..620e12bd5466 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -22,3 +22,4 @@ dm_tree coverage!=7.6.5 # 7.6.5 breaks CI # for onnx_test.py onnxruntime +psutil \ No newline at end of file From 5f9d9755dbc99e23a82ffc3c9b4a48e61096786d Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sun, 4 May 2025 02:48:42 +0300 Subject: [PATCH 02/17] Add memory usage monitor callback --- keras/src/callbacks/memory_usage_callback.py | 302 +++++++----------- .../callbacks/memory_usage_callback_test.py | 211 +++++------- 2 files changed, 200 insertions(+), 313 deletions(-) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index 8b5dd2105672..40b666dfcc3e 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -1,238 +1,166 @@ import os import warnings -import tensorflow as tf # Ensure TF is imported for tf.summary -from keras.src import backend as K + from keras.src.api_export import keras_export from keras.src.callbacks.callback import Callback +from keras.src import backend as K -# Attempt to import psutil and warn if unavailable. +# Attempt to import psutil for CPU memory try: import psutil except ImportError: psutil = None + @keras_export("keras.callbacks.MemoryUsageCallback") class MemoryUsageCallback(Callback): - """Callback for enhanced monitoring of memory usage during training. - - This callback tracks CPU memory usage via `psutil.Process().memory_info().rss` - and optionally GPU memory usage via backend-specific APIs (TensorFlow, PyTorch, JAX). - Memory statistics are logged to stdout at the start and end of each epoch and, - optionally, after every batch. Additionally, metrics are logged to TensorBoard - if a log directory is provided, using integer steps for proper visualization. - - Note: GPU memory reporting consistency across backends (TF, PyTorch, JAX) - may vary, as they use different underlying mechanisms to measure usage - (e.g., framework overhead vs. purely tensor allocations). - - Args: - monitor_gpu (bool): Whether to monitor GPU memory. Defaults to True. - Requires appropriate backend (TensorFlow, PyTorch, JAX) with GPU - support and necessary drivers/libraries installed. - log_every_batch (bool): Whether to log memory usage after each batch - in addition to epoch start/end. Defaults to False. - tensorboard_log_dir (str, optional): Path to the directory where TensorBoard - logs will be written using `tf.summary`. If None, TensorBoard logging - is disabled. Defaults to None. Requires TensorFlow to be installed. - - Raises: - ImportError: If `psutil` is not installed. + """ + Monitors CPU and GPU memory across backends and logs to stdout and TensorBoard. 
Example: ```python - import tensorflow as tf - import keras - from keras.callbacks import MemoryUsageCallback # Use public API path - import numpy as np - - # Ensure psutil is installed: pip install psutil - - memory_callback = MemoryUsageCallback( - monitor_gpu=True, # Set based on GPU availability and backend support + from keras.callbacks import MemoryUsageCallback + callback = MemoryUsageCallback( + monitor_gpu=True, log_every_batch=False, - tensorboard_log_dir="~/logs/memory_usage" # Needs TF installed + tensorboard_log_dir="./logs" ) + model.fit(..., callbacks=[callback]) + ``` - model = keras.models.Sequential([ - keras.layers.Dense(64, activation='relu', input_shape=(100,)), - keras.layers.Dense(10, activation='softmax') - ]) - model.compile(optimizer='adam', loss='categorical_crossentropy') + Args: + monitor_gpu (bool): Whether to log GPU memory. Defaults to True. + log_every_batch (bool): Whether to log after every batch. Defaults to False. + tensorboard_log_dir (str): Directory for TensorBoard logs; None disables. Defaults to None. - x_train = np.random.random((100, 100)) - y_train = keras.utils.to_categorical( - np.random.randint(10, size=(100, 1)), num_classes=10 - ) - model.fit(x_train, y_train, epochs=5, batch_size=32, callbacks=[memory_callback]) - ``` + Raises: + ImportError: If psutil is not installed. """ - def __init__(self, monitor_gpu=True, log_every_batch=False, tensorboard_log_dir=None): + + def __init__( + self, + monitor_gpu=True, + log_every_batch=False, + tensorboard_log_dir=None, + ): super().__init__() if psutil is None: raise ImportError( - "MemoryUsageCallback requires the 'psutil' library. " - "Please install it using 'pip install psutil'." + "MemoryUsageCallback requires `psutil`; install via `pip install psutil`." ) self.monitor_gpu = monitor_gpu self.log_every_batch = log_every_batch self.process = psutil.Process() self.tb_writer = None - self._total_batches_seen = 0 # For TensorBoard step counting + self._batch_count = 0 if tensorboard_log_dir: - # tf.summary requires TensorFlow installed. - if tf is None: - warnings.warn( - "MemoryUsageCallback: TensorFlow is required for TensorBoard logging. 
" - "Please install TensorFlow.", ImportWarning - ) - self.tb_writer = None - else: - try: - log_dir = os.path.expanduser(tensorboard_log_dir) - # Use tf.summary for robust integration - self.tb_writer = tf.summary.create_file_writer(log_dir) - print(f"MemoryUsageCallback: TensorBoard logging initialized at {log_dir}") - except Exception as e: - warnings.warn(f"Error initializing TensorBoard writer: {e}", RuntimeWarning) - self.tb_writer = None - - def on_train_begin(self, logs=None): - """Reset batch counter at the start of training.""" - self._total_batches_seen = 0 - - def on_epoch_begin(self, epoch, logs=None): - """Log memory usage at the beginning of each epoch.""" - cpu_mem = self._get_cpu_memory() - gpu_mem = self._get_gpu_memory() - self._log_memory( - label=f"Epoch {epoch} start", - step=epoch, # Use epoch number for TB step - cpu_mem=cpu_mem, - gpu_mem=gpu_mem - ) - - def on_epoch_end(self, epoch, logs=None): - """Log memory usage at the end of each epoch.""" - cpu_mem = self._get_cpu_memory() - gpu_mem = self._get_gpu_memory() - # Use epoch + 1 for TB step to mark the end point distinctly - self._log_memory( - label=f"Epoch {epoch} end", - step=epoch + 1, - cpu_mem=cpu_mem, - gpu_mem=gpu_mem - ) - - def on_batch_end(self, batch, logs=None): - """If enabled, log memory usage at the end of each batch.""" - if self.log_every_batch: - cpu_mem = self._get_cpu_memory() - gpu_mem = self._get_gpu_memory() - # Use the total batches seen count for a continuous TB step - self._log_memory( - label=f"Batch {self._total_batches_seen} end", - step=self._total_batches_seen, - cpu_mem=cpu_mem, - gpu_mem=gpu_mem - ) - # Always increment, even if not logging - self._total_batches_seen += 1 + try: + import tensorflow as tf - def on_train_end(self, logs=None): - """Clean up the TensorBoard writer.""" - if self.tb_writer: - self.tb_writer.close() - self.tb_writer = None + logdir = os.path.expanduser(tensorboard_log_dir) + self.tb_writer = tf.summary.create_file_writer(logdir) + except ImportError as e: + warnings.warn(f"TensorBoard disabled (no TF): {e}", RuntimeWarning) + except Exception as e: + warnings.warn( + f"Failed to init TB writer at {tensorboard_log_dir}: {e}", + RuntimeWarning, + ) def _get_cpu_memory(self): - """Return current process CPU memory usage in MB.""" - return self.process.memory_info().rss / (1024 ** 2) + """Return resident set size in MB.""" + return self.process.memory_info().rss / (1024**2) def _get_gpu_memory(self): - """Return current GPU memory usage in MB based on backend.""" + """Return GPU memory usage in MB or None.""" if not self.monitor_gpu: return None - backend = K.backend() - gpu_mem_mb = None try: if backend == "tensorflow": + import tensorflow as tf + gpus = tf.config.list_physical_devices("GPU") - if not gpus: return None - total_mem_bytes = 0 + if not gpus: + return None + total = 0 for gpu in gpus: - mem_info = tf.config.experimental.get_memory_info(gpu.name) - total_mem_bytes += mem_info.get("current", 0) - gpu_mem_mb = total_mem_bytes / (1024 ** 2) - - elif backend == "torch": - # Note: memory_allocated() tracks only tensors, might differ from TF. - import torch - if not torch.cuda.is_available(): return None - # Sum memory allocated across all visible GPUs - total_mem_bytes = sum(torch.cuda.memory_allocated(i) for i in range(torch.cuda.device_count())) - gpu_mem_mb = total_mem_bytes / (1024 ** 2) - - elif backend == "jax": - # Note: JAX memory stats might also differ from TF/Torch in scope. 
- import jax - devices = jax.devices() - gpu_devices = [d for d in devices if d.platform.upper() == 'GPU'] # Filter for GPU devices - if not gpu_devices: return None - total_mem_bytes = 0 - for device in gpu_devices: - try: - # memory_stats() might not be available or API could change - stats = device.memory_stats() - total_mem_bytes += stats.get("bytes_in_use", stats.get("allocated_bytes", 0)) # Try common keys - except Exception: - # Ignore if stats are unavailable for a device - pass - gpu_mem_mb = total_mem_bytes / (1024 ** 2) - - else: - if not hasattr(self, '_backend_warned'): - warnings.warn(f"Unsupported backend '{backend}' for GPU memory monitoring.", RuntimeWarning) - self._backend_warned = True - return None + info = tf.config.experimental.get_memory_info(gpu.name) + total += info.get("current", 0) + return total / (1024**2) + + if backend == "torch": + import torch + + if not torch.cuda.is_available(): + return None + total = sum( + torch.cuda.memory_allocated(i) + for i in range(torch.cuda.device_count()) + ) + return total / (1024**2) + + if backend == "jax": + import jax + + devs = [d for d in jax.devices() if d.platform == "gpu"] + if not devs: + return None + total = 0 + for d in devs: + stats = getattr(d, "memory_stats", lambda: {})() + total += stats.get("bytes_in_use", stats.get("allocated_bytes", 0)) + return total / (1024**2) + + if not hasattr(self, "_warned_backend"): + warnings.warn( + f"Backend '{backend}' not supported for GPU memory.", + RuntimeWarning, + ) + self._warned_backend = True + return None except ImportError as e: - # Backend library might not be installed - if not hasattr(self, f'_{backend}_import_warned'): - warnings.warn(f"MemoryUsageCallback: Could not import library for backend '{backend}': {e}. " - f"GPU monitoring disabled for this backend.", RuntimeWarning) - setattr(self, f'_{backend}_import_warned', True) - return None + warnings.warn( + f"Could not import backend lib ({e}); GPU disabled.", + RuntimeWarning, + ) + return None except Exception as e: - # Catch other potential errors during memory retrieval - if not hasattr(self, f'_{backend}_error_warned'): - warnings.warn(f"MemoryUsageCallback: Error retrieving GPU memory info for backend '{backend}': {e}", RuntimeWarning) - setattr(self, f'_{backend}_error_warned', True) + warnings.warn(f"Error retrieving GPU memory ({e}).", RuntimeWarning) return None - return gpu_mem_mb + def _log(self, label, step): + cpu = self._get_cpu_memory() + gpu = self._get_gpu_memory() + msg = f"{label} - CPU Memory: {cpu:.2f} MB" + if gpu is not None: + msg += f"; GPU Memory: {gpu:.2f} MB" + print(msg) + if self.tb_writer: + import tensorflow as tf + + with self.tb_writer.as_default(step=int(step)): + tf.summary.scalar("Memory/CPU_MB", cpu) + if gpu is not None: + tf.summary.scalar("Memory/GPU_MB", gpu) + self.tb_writer.flush() + def on_train_begin(self, logs=None): + self._batch_count = 0 - def _log_memory(self, label, step, cpu_mem, gpu_mem): - """Log memory metrics to stdout and potentially TensorBoard.""" - message = f"{label} - CPU Memory: {cpu_mem:.2f} MB" - if gpu_mem is not None: - message += f"; GPU Memory: {gpu_mem:.2f} MB" - print(message) # Log to stdout + def on_epoch_begin(self, epoch, logs=None): + self._log(f"Epoch {epoch} start", epoch) - # Log to TensorBoard if writer is configured + def on_epoch_end(self, epoch, logs=None): + self._log(f"Epoch {epoch} end", epoch + 1) + + def on_batch_end(self, batch, logs=None): + if self.log_every_batch: + self._log(f"Batch {self._batch_count} end", 
self._batch_count) + self._batch_count += 1 + + def on_train_end(self, logs=None): if self.tb_writer: - try: - with self.tb_writer.as_default(step=int(step)): - tf.summary.scalar("Memory/CPU_MB", cpu_mem) - if gpu_mem is not None: - tf.summary.scalar("Memory/GPU_MB", gpu_mem) - self.tb_writer.flush() - except Exception as e: - # Catch potential errors during logging (e.g., writer closed unexpectedly) - if not hasattr(self, '_tb_log_error_warned'): - warnings.warn(f"MemoryUsageCallback: Error writing to TensorBoard: {e}", RuntimeWarning) - self._tb_log_error_warned = True - # Optionally disable writer if logging fails persistently - # self.tb_writer = None \ No newline at end of file + self.tb_writer.close() diff --git a/keras/src/callbacks/memory_usage_callback_test.py b/keras/src/callbacks/memory_usage_callback_test.py index bc14b7fd4829..e2a4164ba2ab 100644 --- a/keras/src/callbacks/memory_usage_callback_test.py +++ b/keras/src/callbacks/memory_usage_callback_test.py @@ -1,182 +1,141 @@ import os import glob -import re -import sys import tempfile +import warnings -import numpy as np -import pytest -import tensorflow as tf - -from io import StringIO from contextlib import redirect_stdout +from io import StringIO from importlib import reload from unittest.mock import patch, MagicMock -from keras.src.models import Sequential +import numpy as np +import pytest + +from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback from keras.src.layers import Dense +from keras.src.models import Sequential from keras.src.testing import TestCase -from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback +from keras.src import backend as K -# Skip the class entirely if psutil is missing +# Skip all tests if psutil is not installed try: import psutil except ImportError: psutil = None -@pytest.mark.skipif(psutil is None, reason="psutil is required for MemoryUsageCallback tests.") +@pytest.mark.skipif( + psutil is None, reason="psutil is required for MemoryUsageCallback tests." 
+) class MemoryUsageCallbackTest(TestCase): def setUp(self): super().setUp() - # Dummy data - self.x_train = np.random.random((20, 10)).astype(np.float32) - self.y_train = np.random.randint(0, 2, (20, 1)).astype(np.float32) - # Simple model - self.model = Sequential([ - Dense(5, activation="relu", input_shape=(10,)), - Dense(1, activation="sigmoid") - ]) + self.x = np.random.random((20, 10)).astype(np.float32) + self.y = np.random.randint(0, 2, (20, 1)).astype(np.float32) + self.model = Sequential( + [ + Dense(5, activation="relu", input_shape=(10,)), + Dense(1, activation="sigmoid"), + ] + ) self.model.compile(optimizer="adam", loss="binary_crossentropy") - self.epochs = 2 self.batch_size = 5 - self.steps_per_epoch = len(self.x_train) // self.batch_size + self.total_batches = self.epochs * (len(self.x) // self.batch_size) @pytest.mark.requires_trainable_backend - def test_callback_logs_stdout(self): - """Epoch‐level stdout logging works as expected.""" + def test_epoch_and_batch_stdout(self): out = StringIO() with redirect_stdout(out): - gpu_avail = bool(tf.config.list_physical_devices("GPU")) - cb = MemoryUsageCallback(monitor_gpu=gpu_avail, log_every_batch=False) - self.model.fit( - self.x_train, self.y_train, - epochs=self.epochs, - batch_size=self.batch_size, - callbacks=[cb], - verbose=0 - ) - cap = out.getvalue() + # Mock GPU memory for predictability + with patch.object( + MemoryUsageCallback, "_get_gpu_memory", return_value=42.0 + ): + cb = MemoryUsageCallback(monitor_gpu=True, log_every_batch=True) + self.model.fit( + self.x, + self.y, + epochs=self.epochs, + batch_size=self.batch_size, + callbacks=[cb], + verbose=0, + ) + log = out.getvalue().splitlines() + # Check epoch logs for i in range(self.epochs): - self.assertIn(f"Epoch {i} start - CPU Memory:", cap) - self.assertIn(f"Epoch {i} end - CPU Memory:", cap) - self.assertRegex(cap, rf"Epoch {i} start - CPU Memory: [\d.e+-]+ MB") - self.assertRegex(cap, rf"Epoch {i} end - CPU Memory: [\d.e+-]+ MB") - if gpu_avail: - self.assertIn("GPU Memory:", cap) - else: - self.assertNotIn("GPU Memory:", cap) + assert any(f"Epoch {i} start" in line for line in log) + assert any(f"Epoch {i} end" in line for line in log) + # Check batch logs count + batch_lines = [l for l in log if l.startswith("Batch")] + assert len(batch_lines) == self.total_batches + # Confirm GPU part present + assert any("GPU Memory: 42.00 MB" in l for l in log) @pytest.mark.requires_trainable_backend - def test_log_every_batch_stdout(self): - """Batch‐level stdout logging works when enabled.""" - out = StringIO() - with redirect_stdout(out): - gpu_avail = bool(tf.config.list_physical_devices("GPU")) - cb = MemoryUsageCallback(monitor_gpu=gpu_avail, log_every_batch=True) - self.model.fit( - self.x_train, self.y_train, - epochs=self.epochs, - batch_size=self.batch_size, - callbacks=[cb], - verbose=0 - ) - lines = out.getvalue().splitlines() - total_batches = self.epochs * self.steps_per_epoch - batch_regex = r"Batch \d+ end - CPU Memory: [\d.e+-]+ MB" - count = sum(1 for line in lines if re.search(batch_regex, line)) - self.assertEqual(count, total_batches) - - @pytest.mark.requires_trainable_backend - def test_tensorboard_logging_file_creation(self): - """TensorBoard writer creates event files in given directory.""" - with tempfile.TemporaryDirectory() as tmp_dir: - gpu_avail = bool(tf.config.list_physical_devices("GPU")) - log_dir = os.path.join(tmp_dir, "tb_logs") - cb = MemoryUsageCallback( - monitor_gpu=gpu_avail, - log_every_batch=True, - tensorboard_log_dir=log_dir 
- ) - # The directory should be created by the callback __init__ - assert os.path.isdir(log_dir) - self.model.fit( - self.x_train, self.y_train, - epochs=self.epochs, - batch_size=self.batch_size, - callbacks=[cb], - verbose=0 - ) - event_files = glob.glob(os.path.join(log_dir, "events.out.tfevents.*")) - self.assertGreater(len(event_files), 0) - - @pytest.mark.requires_trainable_backend - def test_get_gpu_memory(self): - """_get_gpu_memory returns float or None depending on availability.""" - cb_gpu = MemoryUsageCallback(monitor_gpu=True, log_every_batch=False) - mem = cb_gpu._get_gpu_memory() - if tf.config.list_physical_devices("GPU"): - assert isinstance(mem, float) and mem >= 0.0 - else: - assert mem is None - - cb_no = MemoryUsageCallback(monitor_gpu=False, log_every_batch=False) - assert cb_no._get_gpu_memory() is None - - def test_raises_if_psutil_missing(self): - """Constructor raises ImportError when psutil is unavailable.""" + def test_tensorboard_file_creation(self): + with tempfile.TemporaryDirectory() as tmpdir: + tb_dir = os.path.join(tmpdir, "tb") + # Mock CPU/GPU memory + with patch.object( + MemoryUsageCallback, "_get_gpu_memory", return_value=10.0 + ), patch.object(MemoryUsageCallback, "_get_cpu_memory", return_value=5.0): + cb = MemoryUsageCallback( + monitor_gpu=True, + log_every_batch=False, + tensorboard_log_dir=tb_dir, + ) + assert os.path.isdir(tb_dir) + self.model.fit( + self.x, + self.y, + epochs=1, + batch_size=self.batch_size, + callbacks=[cb], + verbose=0, + ) + events = glob.glob(os.path.join(tb_dir, "events.out.tfevents.*")) + assert events, "No TensorBoard event file found." + + def test_import_error_without_psutil(self): + import sys import keras.src.callbacks.memory_usage_callback as mod - orig = getattr(mod, 'psutil', None) + + orig = getattr(mod, "psutil", None) with patch.dict(sys.modules, {"psutil": None}): with pytest.raises(ImportError): reload(mod) _ = mod.MemoryUsageCallback() - # Restore + # restore if orig is not None: sys.modules["psutil"] = orig reload(mod) - - +# Backend-specific tests @pytest.mark.requires_trainable_backend -def test_torch_backend_gpu_memory(monkeypatch): - """Simulate PyTorch backend and verify GPU memory sum.""" - import keras.src.backend as B - monkeypatch.setattr(B, "backend", lambda: "torch") - - # Create a fake torch module +def test_torch_gpu_memory(monkeypatch): + monkeypatch.setattr(K, "backend", lambda: "torch") fake_torch = MagicMock() fake_torch.cuda.is_available.return_value = True fake_torch.cuda.device_count.return_value = 2 - # Each device allocates 100 MB and 150 MB fake_torch.cuda.memory_allocated.side_effect = [100 * 1024**2, 150 * 1024**2] - monkeypatch.setitem(sys.modules, "torch", fake_torch) - + monkeypatch.setitem(__import__("sys").modules, "torch", fake_torch) cb = MemoryUsageCallback(monitor_gpu=True) - mem = cb._get_gpu_memory() - # Expect (100 + 150) MB - assert pytest.approx(250, rel=1e-6) == mem + assert pytest.approx(250, rel=1e-6) == cb._get_gpu_memory() @pytest.mark.requires_trainable_backend -def test_jax_backend_gpu_memory(monkeypatch): - """Simulate JAX backend and verify GPU memory sum.""" - import keras.src.backend as B - monkeypatch.setattr(B, "backend", lambda: "jax") +def test_jax_gpu_memory(monkeypatch): + monkeypatch.setattr(K, "backend", lambda: "jax") - # Fake JAX devices - class FakeDevice: + class Dev: platform = "gpu" + def memory_stats(self): return {"bytes_in_use": 200 * 1024**2} fake_jax = MagicMock() - fake_jax.devices.return_value = [FakeDevice(), FakeDevice()] - 
monkeypatch.setitem(sys.modules, "jax", fake_jax) - + fake_jax.devices.return_value = [Dev(), Dev()] + monkeypatch.setitem(__import__("sys").modules, "jax", fake_jax) cb = MemoryUsageCallback(monitor_gpu=True) - mem = cb._get_gpu_memory() - # Expect 2 * 200 MB - assert pytest.approx(400, rel=1e-6) == mem + assert pytest.approx(400, rel=1e-6) == cb._get_gpu_memory() From daddf2952d71b5aab00a3bab297d97f3761e107f Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sun, 4 May 2025 03:32:21 +0300 Subject: [PATCH 03/17] Fix formatting errors --- keras/src/callbacks/__init__.py | 3 ++- keras/src/callbacks/memory_usage_callback.py | 15 ++++++--------- keras/src/callbacks/memory_usage_callback_test.py | 10 +++++++++- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/keras/src/callbacks/__init__.py b/keras/src/callbacks/__init__.py index 4bd4ccb84df5..178441d5a76a 100644 --- a/keras/src/callbacks/__init__.py +++ b/keras/src/callbacks/__init__.py @@ -6,6 +6,7 @@ from keras.src.callbacks.history import History from keras.src.callbacks.lambda_callback import LambdaCallback from keras.src.callbacks.learning_rate_scheduler import LearningRateScheduler +from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback from keras.src.callbacks.model_checkpoint import ModelCheckpoint from keras.src.callbacks.progbar_logger import ProgbarLogger from keras.src.callbacks.reduce_lr_on_plateau import ReduceLROnPlateau @@ -13,4 +14,4 @@ from keras.src.callbacks.swap_ema_weights import SwapEMAWeights from keras.src.callbacks.tensorboard import TensorBoard from keras.src.callbacks.terminate_on_nan import TerminateOnNaN -from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback + diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index 40b666dfcc3e..71379baf2a0a 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -6,6 +6,7 @@ from keras.src import backend as K # Attempt to import psutil for CPU memory + try: import psutil except ImportError: @@ -56,7 +57,7 @@ def __init__( if tensorboard_log_dir: try: - import tensorflow as tf + import tensorflow as tf logdir = os.path.expanduser(tensorboard_log_dir) self.tb_writer = tf.summary.create_file_writer(logdir) @@ -79,7 +80,7 @@ def _get_gpu_memory(self): backend = K.backend() try: if backend == "tensorflow": - import tensorflow as tf + import tensorflow as tf gpus = tf.config.list_physical_devices("GPU") if not gpus: @@ -89,9 +90,8 @@ def _get_gpu_memory(self): info = tf.config.experimental.get_memory_info(gpu.name) total += info.get("current", 0) return total / (1024**2) - if backend == "torch": - import torch + import torch if not torch.cuda.is_available(): return None @@ -100,9 +100,8 @@ def _get_gpu_memory(self): for i in range(torch.cuda.device_count()) ) return total / (1024**2) - if backend == "jax": - import jax + import jax devs = [d for d in jax.devices() if d.platform == "gpu"] if not devs: @@ -112,7 +111,6 @@ def _get_gpu_memory(self): stats = getattr(d, "memory_stats", lambda: {})() total += stats.get("bytes_in_use", stats.get("allocated_bytes", 0)) return total / (1024**2) - if not hasattr(self, "_warned_backend"): warnings.warn( f"Backend '{backend}' not supported for GPU memory.", @@ -120,7 +118,6 @@ def _get_gpu_memory(self): ) self._warned_backend = True return None - except ImportError as e: warnings.warn( f"Could not import backend lib ({e}); GPU disabled.", @@ -139,7 +136,7 @@ def _log(self, label, 
step): msg += f"; GPU Memory: {gpu:.2f} MB" print(msg) if self.tb_writer: - import tensorflow as tf + import tensorflow as tf with self.tb_writer.as_default(step=int(step)): tf.summary.scalar("Memory/CPU_MB", cpu) diff --git a/keras/src/callbacks/memory_usage_callback_test.py b/keras/src/callbacks/memory_usage_callback_test.py index e2a4164ba2ab..1e2061675994 100644 --- a/keras/src/callbacks/memory_usage_callback_test.py +++ b/keras/src/callbacks/memory_usage_callback_test.py @@ -1,7 +1,6 @@ import os import glob import tempfile -import warnings from contextlib import redirect_stdout from io import StringIO @@ -18,6 +17,7 @@ from keras.src import backend as K # Skip all tests if psutil is not installed + try: import psutil except ImportError: @@ -48,6 +48,7 @@ def test_epoch_and_batch_stdout(self): out = StringIO() with redirect_stdout(out): # Mock GPU memory for predictability + with patch.object( MemoryUsageCallback, "_get_gpu_memory", return_value=42.0 ): @@ -62,13 +63,16 @@ def test_epoch_and_batch_stdout(self): ) log = out.getvalue().splitlines() # Check epoch logs + for i in range(self.epochs): assert any(f"Epoch {i} start" in line for line in log) assert any(f"Epoch {i} end" in line for line in log) # Check batch logs count + batch_lines = [l for l in log if l.startswith("Batch")] assert len(batch_lines) == self.total_batches # Confirm GPU part present + assert any("GPU Memory: 42.00 MB" in l for l in log) @pytest.mark.requires_trainable_backend @@ -76,6 +80,7 @@ def test_tensorboard_file_creation(self): with tempfile.TemporaryDirectory() as tmpdir: tb_dir = os.path.join(tmpdir, "tb") # Mock CPU/GPU memory + with patch.object( MemoryUsageCallback, "_get_gpu_memory", return_value=10.0 ), patch.object(MemoryUsageCallback, "_get_cpu_memory", return_value=5.0): @@ -106,12 +111,15 @@ def test_import_error_without_psutil(self): reload(mod) _ = mod.MemoryUsageCallback() # restore + if orig is not None: sys.modules["psutil"] = orig reload(mod) # Backend-specific tests + + @pytest.mark.requires_trainable_backend def test_torch_gpu_memory(monkeypatch): monkeypatch.setattr(K, "backend", lambda: "torch") From 105cbdc40cc525948977d288f0b9a7a684af99ff Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sun, 4 May 2025 20:50:22 +0300 Subject: [PATCH 04/17] Add openvino support --- keras/src/callbacks/memory_usage_callback.py | 154 ++++++++-------- .../callbacks/memory_usage_callback_test.py | 169 ++++++++++-------- 2 files changed, 161 insertions(+), 162 deletions(-) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index 71379baf2a0a..f54af2a57a50 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -5,8 +5,6 @@ from keras.src.callbacks.callback import Callback from keras.src import backend as K -# Attempt to import psutil for CPU memory - try: import psutil except ImportError: @@ -15,83 +13,88 @@ @keras_export("keras.callbacks.MemoryUsageCallback") class MemoryUsageCallback(Callback): - """ - Monitors CPU and GPU memory across backends and logs to stdout and TensorBoard. - - Example: - ```python - from keras.callbacks import MemoryUsageCallback - callback = MemoryUsageCallback( - monitor_gpu=True, - log_every_batch=False, - tensorboard_log_dir="./logs" - ) - model.fit(..., callbacks=[callback]) - ``` + """Monitor CPU/GPU/TPU/OpenVINO memory during training. + + Tracks: + - CPU memory via `psutil.Process().memory_info().rss`. 
+ - GPU memory via backend APIs (TF, Torch, JAX, OpenVINO). + - Logs to stdout and, optionally, to TensorBoard. Args: - monitor_gpu (bool): Whether to log GPU memory. Defaults to True. - log_every_batch (bool): Whether to log after every batch. Defaults to False. - tensorboard_log_dir (str): Directory for TensorBoard logs; None disables. Defaults to None. + monitor_gpu: Bool. If True, query GPU/accelerator memory. + log_every_batch: Bool. If True, log after each batch. + tensorboard_log_dir: str or None. If set, use TF summary writer. Raises: - ImportError: If psutil is not installed. + ImportError: If `psutil` is missing. """ def __init__( - self, - monitor_gpu=True, - log_every_batch=False, - tensorboard_log_dir=None, + self, monitor_gpu=True, log_every_batch=False, tensorboard_log_dir=None ): super().__init__() if psutil is None: - raise ImportError( - "MemoryUsageCallback requires `psutil`; install via `pip install psutil`." - ) + raise ImportError("MemoryUsageCallback requires the 'psutil' library.") self.monitor_gpu = monitor_gpu self.log_every_batch = log_every_batch self.process = psutil.Process() self.tb_writer = None - self._batch_count = 0 + self._batches_seen = 0 if tensorboard_log_dir: try: - import tensorflow as tf + import tensorflow as tf logdir = os.path.expanduser(tensorboard_log_dir) self.tb_writer = tf.summary.create_file_writer(logdir) - except ImportError as e: - warnings.warn(f"TensorBoard disabled (no TF): {e}", RuntimeWarning) except Exception as e: - warnings.warn( - f"Failed to init TB writer at {tensorboard_log_dir}: {e}", - RuntimeWarning, - ) + warnings.warn(f"TB init error: {e}", RuntimeWarning) + + def on_train_begin(self, logs=None): + self._batches_seen = 0 + + def on_epoch_begin(self, epoch, logs=None): + cpu = self._cpu_mem_mb() + gpu = self._get_gpu_memory() + self._log("Epoch %d start" % epoch, epoch, cpu, gpu) + + def on_epoch_end(self, epoch, logs=None): + cpu = self._cpu_mem_mb() + gpu = self._get_gpu_memory() + self._log("Epoch %d end" % epoch, epoch + 1, cpu, gpu) + + def on_batch_end(self, batch, logs=None): + if self.log_every_batch: + cpu = self._cpu_mem_mb() + gpu = self._get_gpu_memory() + self._log(f"Batch {self._batches_seen} end", self._batches_seen, cpu, gpu) + self._batches_seen += 1 + + def on_train_end(self, logs=None): + if self.tb_writer: + self.tb_writer.close() - def _get_cpu_memory(self): - """Return resident set size in MB.""" + def _cpu_mem_mb(self): return self.process.memory_info().rss / (1024**2) def _get_gpu_memory(self): - """Return GPU memory usage in MB or None.""" if not self.monitor_gpu: return None backend = K.backend() try: if backend == "tensorflow": - import tensorflow as tf + import tensorflow as tf gpus = tf.config.list_physical_devices("GPU") if not gpus: return None total = 0 - for gpu in gpus: - info = tf.config.experimental.get_memory_info(gpu.name) + for g in gpus: + info = tf.config.experimental.get_memory_info(g.name) total += info.get("current", 0) return total / (1024**2) if backend == "torch": - import torch + import torch if not torch.cuda.is_available(): return None @@ -101,63 +104,46 @@ def _get_gpu_memory(self): ) return total / (1024**2) if backend == "jax": - import jax + import jax - devs = [d for d in jax.devices() if d.platform == "gpu"] + devs = [d for d in jax.devices() if d.platform.upper() == "GPU"] if not devs: return None total = 0 for d in devs: - stats = getattr(d, "memory_stats", lambda: {})() - total += stats.get("bytes_in_use", stats.get("allocated_bytes", 0)) + stats = 
d.memory_stats() + total += stats.get("bytes_in_use", 0) return total / (1024**2) - if not hasattr(self, "_warned_backend"): - warnings.warn( - f"Backend '{backend}' not supported for GPU memory.", - RuntimeWarning, - ) - self._warned_backend = True - return None + if backend == "openvino": + try: + import openvino as ov + + core = ov.Core() + devices = core.available_devices + total = 0 + for dev in devices: + stats = core.get_property(dev, "DEVICE_MEMORY_STATISTICS") + total += stats.get("deviceUsedBytes", 0) + return total / (1024**2) + except Exception as e: + warnings.warn(f"OVINO mem err: {e}", RuntimeWarning) + return None except ImportError as e: - warnings.warn( - f"Could not import backend lib ({e}); GPU disabled.", - RuntimeWarning, - ) - return None - except Exception as e: - warnings.warn(f"Error retrieving GPU memory ({e}).", RuntimeWarning) + warnings.warn(f"Import err for {backend}: {e}", RuntimeWarning) return None + warnings.warn(f"Unsupported backend '{backend}'", RuntimeWarning) + return None - def _log(self, label, step): - cpu = self._get_cpu_memory() - gpu = self._get_gpu_memory() - msg = f"{label} - CPU Memory: {cpu:.2f} MB" + def _log(self, label, step, cpu, gpu): + msg = f"{label} - CPU: {cpu:.2f} MB" if gpu is not None: - msg += f"; GPU Memory: {gpu:.2f} MB" + msg += f"; GPU: {gpu:.2f} MB" print(msg) if self.tb_writer: - import tensorflow as tf + import tensorflow as tf - with self.tb_writer.as_default(step=int(step)): + with self.tb_writer.as_default(step=step): tf.summary.scalar("Memory/CPU_MB", cpu) if gpu is not None: tf.summary.scalar("Memory/GPU_MB", gpu) self.tb_writer.flush() - - def on_train_begin(self, logs=None): - self._batch_count = 0 - - def on_epoch_begin(self, epoch, logs=None): - self._log(f"Epoch {epoch} start", epoch) - - def on_epoch_end(self, epoch, logs=None): - self._log(f"Epoch {epoch} end", epoch + 1) - - def on_batch_end(self, batch, logs=None): - if self.log_every_batch: - self._log(f"Batch {self._batch_count} end", self._batch_count) - self._batch_count += 1 - - def on_train_end(self, logs=None): - if self.tb_writer: - self.tb_writer.close() diff --git a/keras/src/callbacks/memory_usage_callback_test.py b/keras/src/callbacks/memory_usage_callback_test.py index 1e2061675994..9df254d6aaa9 100644 --- a/keras/src/callbacks/memory_usage_callback_test.py +++ b/keras/src/callbacks/memory_usage_callback_test.py @@ -1,10 +1,10 @@ import os import glob +import sys import tempfile - -from contextlib import redirect_stdout -from io import StringIO from importlib import reload +from io import StringIO +from contextlib import redirect_stdout from unittest.mock import patch, MagicMock import numpy as np @@ -14,9 +14,6 @@ from keras.src.layers import Dense from keras.src.models import Sequential from keras.src.testing import TestCase -from keras.src import backend as K - -# Skip all tests if psutil is not installed try: import psutil @@ -24,9 +21,7 @@ psutil = None -@pytest.mark.skipif( - psutil is None, reason="psutil is required for MemoryUsageCallback tests." 
-) +@pytest.mark.skipif(psutil is None, reason="psutil is required") class MemoryUsageCallbackTest(TestCase): def setUp(self): super().setUp() @@ -40,110 +35,128 @@ def setUp(self): ) self.model.compile(optimizer="adam", loss="binary_crossentropy") self.epochs = 2 - self.batch_size = 5 - self.total_batches = self.epochs * (len(self.x) // self.batch_size) + self.bs = 5 + self.steps = len(self.x) // self.bs @pytest.mark.requires_trainable_backend - def test_epoch_and_batch_stdout(self): + def test_epoch_logging(self): out = StringIO() with redirect_stdout(out): - # Mock GPU memory for predictability - - with patch.object( - MemoryUsageCallback, "_get_gpu_memory", return_value=42.0 - ): - cb = MemoryUsageCallback(monitor_gpu=True, log_every_batch=True) - self.model.fit( - self.x, - self.y, - epochs=self.epochs, - batch_size=self.batch_size, - callbacks=[cb], - verbose=0, - ) - log = out.getvalue().splitlines() - # Check epoch logs - - for i in range(self.epochs): - assert any(f"Epoch {i} start" in line for line in log) - assert any(f"Epoch {i} end" in line for line in log) - # Check batch logs count - - batch_lines = [l for l in log if l.startswith("Batch")] - assert len(batch_lines) == self.total_batches - # Confirm GPU part present - assert any("GPU Memory: 42.00 MB" in l for l in log) + for gpu_val in (None, 42.0): + with patch.object( + MemoryUsageCallback, "_get_gpu_memory", return_value=gpu_val + ): + cb = MemoryUsageCallback(monitor_gpu=True, log_every_batch=False) + self.model.fit( + self.x, + self.y, + epochs=self.epochs, + batch_size=self.bs, + callbacks=[cb], + verbose=0, + ) + log = out.getvalue() + # must see epoch start/end lines + + for e in range(self.epochs): + assert f"Epoch {e} start" in log + assert f"Epoch {e} end" in log + # must see GPU segment at least once if gpu_val not None + + assert "; GPU" in log @pytest.mark.requires_trainable_backend - def test_tensorboard_file_creation(self): - with tempfile.TemporaryDirectory() as tmpdir: - tb_dir = os.path.join(tmpdir, "tb") - # Mock CPU/GPU memory - + def test_batch_logging(self): + out = StringIO() + with redirect_stdout(out): with patch.object( - MemoryUsageCallback, "_get_gpu_memory", return_value=10.0 - ), patch.object(MemoryUsageCallback, "_get_cpu_memory", return_value=5.0): - cb = MemoryUsageCallback( - monitor_gpu=True, - log_every_batch=False, - tensorboard_log_dir=tb_dir, - ) - assert os.path.isdir(tb_dir) + MemoryUsageCallback, "_get_gpu_memory", return_value=None + ): + cb = MemoryUsageCallback(monitor_gpu=True, log_every_batch=True) self.model.fit( self.x, self.y, epochs=1, - batch_size=self.batch_size, + batch_size=self.bs, callbacks=[cb], verbose=0, ) - events = glob.glob(os.path.join(tb_dir, "events.out.tfevents.*")) - assert events, "No TensorBoard event file found." 
+ lines = [ln for ln in out.getvalue().splitlines() if ln.startswith("Batch ")] + assert len(lines) == self.steps - def test_import_error_without_psutil(self): - import sys - import keras.src.callbacks.memory_usage_callback as mod + @pytest.mark.requires_trainable_backend + def test_tensorboard_events(self): + tmp = tempfile.TemporaryDirectory() + logdir = os.path.join(tmp.name, "logs") + cb = MemoryUsageCallback( + monitor_gpu=False, log_every_batch=False, tensorboard_log_dir=logdir + ) + assert os.path.isdir(logdir) + self.model.fit( + self.x, self.y, epochs=1, batch_size=self.bs, callbacks=[cb], verbose=0 + ) + files = glob.glob(os.path.join(logdir, "events.out.tfevents.*")) + assert len(files) > 0 + tmp.cleanup() + def test_missing_psutil(self): + mod = sys.modules["keras.src.callbacks.memory_usage_callback"] orig = getattr(mod, "psutil", None) with patch.dict(sys.modules, {"psutil": None}): with pytest.raises(ImportError): reload(mod) _ = mod.MemoryUsageCallback() - # restore - if orig is not None: sys.modules["psutil"] = orig reload(mod) -# Backend-specific tests - - @pytest.mark.requires_trainable_backend -def test_torch_gpu_memory(monkeypatch): - monkeypatch.setattr(K, "backend", lambda: "torch") - fake_torch = MagicMock() - fake_torch.cuda.is_available.return_value = True - fake_torch.cuda.device_count.return_value = 2 - fake_torch.cuda.memory_allocated.side_effect = [100 * 1024**2, 150 * 1024**2] - monkeypatch.setitem(__import__("sys").modules, "torch", fake_torch) +def test_torch_gpu(monkeypatch): + import keras.src.backend as B + + monkeypatch.setattr(B, "backend", lambda: "torch") + fake = MagicMock() + fake.cuda.is_available.return_value = True + fake.cuda.device_count.return_value = 2 + fake.cuda.memory_allocated.side_effect = [50 * 1024**2, 70 * 1024**2] + monkeypatch.setitem(sys.modules, "torch", fake) cb = MemoryUsageCallback(monitor_gpu=True) - assert pytest.approx(250, rel=1e-6) == cb._get_gpu_memory() + assert pytest.approx(120, rel=1e-6) == cb._get_gpu_memory() @pytest.mark.requires_trainable_backend -def test_jax_gpu_memory(monkeypatch): - monkeypatch.setattr(K, "backend", lambda: "jax") +def test_jax_gpu(monkeypatch): + import keras.src.backend as B + + monkeypatch.setattr(B, "backend", lambda: "jax") class Dev: - platform = "gpu" + platform = "GPU" def memory_stats(self): - return {"bytes_in_use": 200 * 1024**2} + return {"bytes_in_use": 30 * 1024**2} + + fake = MagicMock(devices=lambda: [Dev(), Dev()]) + monkeypatch.setitem(sys.modules, "jax", fake) + cb = MemoryUsageCallback(monitor_gpu=True) + assert pytest.approx(60, rel=1e-6) == cb._get_gpu_memory() + + +@pytest.mark.requires_trainable_backend +def test_openvino_gpu(monkeypatch): + import keras.src.backend as B + + monkeypatch.setattr(B, "backend", lambda: "openvino") + + class Core: + def get_property(self, device, name): + return {"deviceUsedBytes": 25 * 1024**2} + + available_devices = ["GPU"] - fake_jax = MagicMock() - fake_jax.devices.return_value = [Dev(), Dev()] - monkeypatch.setitem(__import__("sys").modules, "jax", fake_jax) + fake = MagicMock(Core=lambda: Core()) + monkeypatch.setitem(sys.modules, "openvino", fake) cb = MemoryUsageCallback(monitor_gpu=True) - assert pytest.approx(400, rel=1e-6) == cb._get_gpu_memory() + assert pytest.approx(25, rel=1e-6) == cb._get_gpu_memory() From f3401010b07f7b38d35da5a9c9472098a04a6b2b Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sun, 4 May 2025 21:14:39 +0300 Subject: [PATCH 05/17] Fix openvino support --- keras/src/callbacks/memory_usage_callback.py | 
186 ++++++++++-------- .../callbacks/memory_usage_callback_test.py | 174 ++++++++-------- 2 files changed, 187 insertions(+), 173 deletions(-) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index f54af2a57a50..b9ebe029271d 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -1,6 +1,5 @@ import os import warnings - from keras.src.api_export import keras_export from keras.src.callbacks.callback import Callback from keras.src import backend as K @@ -8,25 +7,42 @@ try: import psutil except ImportError: - psutil = None - + psutil = None @keras_export("keras.callbacks.MemoryUsageCallback") class MemoryUsageCallback(Callback): - """Monitor CPU/GPU/TPU/OpenVINO memory during training. + """Monitors and logs memory usage (CPU + optional GPU/TPU) during training. + + This callback measures: - Tracks: - - CPU memory via `psutil.Process().memory_info().rss`. - - GPU memory via backend APIs (TF, Torch, JAX, OpenVINO). - - Logs to stdout and, optionally, to TensorBoard. + - **CPU**: via psutil.Process().memory_info().rss + - **GPU/TPU**: via backend‐specific APIs (TensorFlow, PyTorch, JAX, OpenVINO) + + Logs are printed to stdout at the start/end of each epoch and, + if `log_every_batch=True`, after every batch. If `tensorboard_log_dir` + is provided, scalars are also written via `tf.summary` (TensorBoard). Args: - monitor_gpu: Bool. If True, query GPU/accelerator memory. - log_every_batch: Bool. If True, log after each batch. - tensorboard_log_dir: str or None. If set, use TF summary writer. + monitor_gpu (bool): If True, attempt to measure accelerator memory. + log_every_batch (bool): If True, also log after each batch. + tensorboard_log_dir (str|None): Directory for TensorBoard logs; + if None, no TF summary writer is created. Raises: - ImportError: If `psutil` is missing. + ImportError: If `psutil` is not installed (required for CPU logging). + + Example: + + ```python + from keras.callbacks import MemoryUsageCallback + # ... + cb = MemoryUsageCallback( + monitor_gpu=True, + log_every_batch=False, + tensorboard_log_dir="./logs/memory" + ) + model.fit(X, y, callbacks=[cb]) + ``` """ def __init__( @@ -34,67 +50,89 @@ def __init__( ): super().__init__() if psutil is None: - raise ImportError("MemoryUsageCallback requires the 'psutil' library.") + raise ImportError( + "MemoryUsageCallback requires the 'psutil' library. " + "Install via `pip install psutil`." 
+ ) self.monitor_gpu = monitor_gpu self.log_every_batch = log_every_batch - self.process = psutil.Process() - self.tb_writer = None - self._batches_seen = 0 + self._proc = psutil.Process() + self._step_counter = 0 + self._writer = None if tensorboard_log_dir: try: - import tensorflow as tf + import tensorflow as tf logdir = os.path.expanduser(tensorboard_log_dir) - self.tb_writer = tf.summary.create_file_writer(logdir) + self._writer = tf.summary.create_file_writer(logdir) + print(f"MemoryUsageCallback: TensorBoard logs → {logdir}") except Exception as e: - warnings.warn(f"TB init error: {e}", RuntimeWarning) + warnings.warn( + f"Could not initialize TensorBoard writer: {e}", RuntimeWarning + ) + self._writer = None def on_train_begin(self, logs=None): - self._batches_seen = 0 + self._step_counter = 0 def on_epoch_begin(self, epoch, logs=None): - cpu = self._cpu_mem_mb() - gpu = self._get_gpu_memory() - self._log("Epoch %d start" % epoch, epoch, cpu, gpu) + self._log_epoch("start", epoch) def on_epoch_end(self, epoch, logs=None): - cpu = self._cpu_mem_mb() - gpu = self._get_gpu_memory() - self._log("Epoch %d end" % epoch, epoch + 1, cpu, gpu) + self._log_epoch("end", epoch, offset=1) def on_batch_end(self, batch, logs=None): if self.log_every_batch: - cpu = self._cpu_mem_mb() - gpu = self._get_gpu_memory() - self._log(f"Batch {self._batches_seen} end", self._batches_seen, cpu, gpu) - self._batches_seen += 1 + self._log_step(f"Batch {self._step_counter} end", self._step_counter) + self._step_counter += 1 def on_train_end(self, logs=None): - if self.tb_writer: - self.tb_writer.close() + if self._writer: + self._writer.close() + + def _log_epoch(self, when, epoch, offset=0): + label = f"Epoch {epoch} {when}" + step = epoch + offset + self._log_step(label, step) + + def _log_step(self, label, step): + cpu_mb = self._get_cpu_memory() + gpu_mb = self._get_gpu_memory() if self.monitor_gpu else None + + msg = f"{label} - CPU Memory: {cpu_mb:.2f} MB" + if gpu_mb is not None: + msg += f"; GPU Memory: {gpu_mb:.2f} MB" + print(msg) + + if self._writer: + import tensorflow as tf # noqa: E501 - def _cpu_mem_mb(self): - return self.process.memory_info().rss / (1024**2) + with self._writer.as_default(step=int(step)): + tf.summary.scalar("Memory/CPU_MB", cpu_mb) + if gpu_mb is not None: + tf.summary.scalar("Memory/GPU_MB", gpu_mb) + self._writer.flush() + + def _get_cpu_memory(self): + return self._proc.memory_info().rss / (1024**2) def _get_gpu_memory(self): - if not self.monitor_gpu: - return None - backend = K.backend() + backend_name = K.backend() try: - if backend == "tensorflow": + if backend_name == "tensorflow": import tensorflow as tf gpus = tf.config.list_physical_devices("GPU") if not gpus: return None - total = 0 - for g in gpus: - info = tf.config.experimental.get_memory_info(g.name) - total += info.get("current", 0) + total = sum( + tf.config.experimental.get_memory_info(g.name)["current"] + for g in gpus + ) return total / (1024**2) - if backend == "torch": - import torch + elif backend_name == "torch": + import torch if not torch.cuda.is_available(): return None @@ -103,47 +141,37 @@ def _get_gpu_memory(self): for i in range(torch.cuda.device_count()) ) return total / (1024**2) - if backend == "jax": - import jax + elif backend_name == "jax": + import jax devs = [d for d in jax.devices() if d.platform.upper() == "GPU"] if not devs: return None total = 0 for d in devs: - stats = d.memory_stats() + stats = getattr(d, "memory_stats", lambda: {})() total += stats.get("bytes_in_use", 0) return 
total / (1024**2) - if backend == "openvino": - try: - import openvino as ov - - core = ov.Core() - devices = core.available_devices - total = 0 - for dev in devices: - stats = core.get_property(dev, "DEVICE_MEMORY_STATISTICS") - total += stats.get("deviceUsedBytes", 0) - return total / (1024**2) - except Exception as e: - warnings.warn(f"OVINO mem err: {e}", RuntimeWarning) - return None - except ImportError as e: - warnings.warn(f"Import err for {backend}: {e}", RuntimeWarning) + else: + # OpenVINO and others fall back to unsupported + + if not hasattr(self, "_warn_backend"): + warnings.warn( + f"MemoryUsageCallback: unsupported backend '{backend_name}'", + RuntimeWarning, + ) + self._warn_backend = True + return None + except ImportError as imp_err: + if not hasattr(self, "_warn_import"): + warnings.warn( + f"Could not import for backend '{backend_name}': {imp_err}", + RuntimeWarning, + ) + self._warn_import = True + return None + except Exception as exc: + if not hasattr(self, "_warn_exc"): + warnings.warn(f"Error retrieving GPU memory: {exc}", RuntimeWarning) + self._warn_exc = True return None - warnings.warn(f"Unsupported backend '{backend}'", RuntimeWarning) - return None - - def _log(self, label, step, cpu, gpu): - msg = f"{label} - CPU: {cpu:.2f} MB" - if gpu is not None: - msg += f"; GPU: {gpu:.2f} MB" - print(msg) - if self.tb_writer: - import tensorflow as tf - - with self.tb_writer.as_default(step=step): - tf.summary.scalar("Memory/CPU_MB", cpu) - if gpu is not None: - tf.summary.scalar("Memory/GPU_MB", gpu) - self.tb_writer.flush() diff --git a/keras/src/callbacks/memory_usage_callback_test.py b/keras/src/callbacks/memory_usage_callback_test.py index 9df254d6aaa9..16f1e04b121f 100644 --- a/keras/src/callbacks/memory_usage_callback_test.py +++ b/keras/src/callbacks/memory_usage_callback_test.py @@ -1,19 +1,20 @@ import os import glob +import re import sys import tempfile -from importlib import reload -from io import StringIO +import pytest +import numpy as np + from contextlib import redirect_stdout +from io import StringIO +from importlib import reload from unittest.mock import patch, MagicMock -import numpy as np -import pytest - -from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback -from keras.src.layers import Dense from keras.src.models import Sequential +from keras.src.layers import Dense from keras.src.testing import TestCase +from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback try: import psutil @@ -25,6 +26,8 @@ class MemoryUsageCallbackTest(TestCase): def setUp(self): super().setUp() + # Prepare 20 samples of 10-dim data → 4 batches @ bs=5 + self.x = np.random.random((20, 10)).astype(np.float32) self.y = np.random.randint(0, 2, (20, 1)).astype(np.float32) self.model = Sequential( @@ -39,124 +42,107 @@ def setUp(self): self.steps = len(self.x) // self.bs @pytest.mark.requires_trainable_backend - def test_epoch_logging(self): - out = StringIO() - with redirect_stdout(out): - - for gpu_val in (None, 42.0): - with patch.object( - MemoryUsageCallback, "_get_gpu_memory", return_value=gpu_val - ): - cb = MemoryUsageCallback(monitor_gpu=True, log_every_batch=False) - self.model.fit( - self.x, - self.y, - epochs=self.epochs, - batch_size=self.bs, - callbacks=[cb], - verbose=0, - ) - log = out.getvalue() - # must see epoch start/end lines - + def test_epoch_logging_stdout(self): + """Epoch-level logs appear with correct format.""" + buf = StringIO() + with redirect_stdout(buf): + cb = MemoryUsageCallback(monitor_gpu=False) + 
self.model.fit( + self.x, + self.y, + epochs=self.epochs, + batch_size=self.bs, + callbacks=[cb], + verbose=0, + ) + out = buf.getvalue() for e in range(self.epochs): - assert f"Epoch {e} start" in log - assert f"Epoch {e} end" in log - # must see GPU segment at least once if gpu_val not None - - assert "; GPU" in log + assert f"Epoch {e} start" in out + assert f"Epoch {e} end" in out + assert re.search(rf"Epoch {e} start - CPU Memory: [\d\.]+ MB", out) + assert re.search(rf"Epoch {e} end - CPU Memory: [\d\.]+ MB", out) @pytest.mark.requires_trainable_backend - def test_batch_logging(self): - out = StringIO() - with redirect_stdout(out): - with patch.object( - MemoryUsageCallback, "_get_gpu_memory", return_value=None - ): - cb = MemoryUsageCallback(monitor_gpu=True, log_every_batch=True) - self.model.fit( - self.x, - self.y, - epochs=1, - batch_size=self.bs, - callbacks=[cb], - verbose=0, - ) - lines = [ln for ln in out.getvalue().splitlines() if ln.startswith("Batch ")] - assert len(lines) == self.steps + def test_batch_logging_stdout(self): + """Batch-level logs appear when log_every_batch=True.""" + buf = StringIO() + with redirect_stdout(buf): + cb = MemoryUsageCallback(monitor_gpu=False, log_every_batch=True) + self.model.fit( + self.x, self.y, epochs=1, batch_size=self.bs, callbacks=[cb], verbose=0 + ) + lines = buf.getvalue().splitlines() + batch_lines = [l for l in lines if l.startswith("Batch ")] + assert len(batch_lines) == self.steps + assert all( + re.match(r"Batch \d+ end - CPU Memory: [\d\.]+ MB", l) for l in batch_lines + ) @pytest.mark.requires_trainable_backend - def test_tensorboard_events(self): + def test_tensorboard_writes_files(self): + """TensorBoard event files are created.""" tmp = tempfile.TemporaryDirectory() - logdir = os.path.join(tmp.name, "logs") - cb = MemoryUsageCallback( - monitor_gpu=False, log_every_batch=False, tensorboard_log_dir=logdir - ) - assert os.path.isdir(logdir) - self.model.fit( - self.x, self.y, epochs=1, batch_size=self.bs, callbacks=[cb], verbose=0 - ) + logdir = os.path.join(tmp.name, "tb") + buf = StringIO() + with redirect_stdout(buf): + cb = MemoryUsageCallback(monitor_gpu=False, tensorboard_log_dir=logdir) + self.model.fit( + self.x, self.y, epochs=1, batch_size=self.bs, callbacks=[cb], verbose=0 + ) files = glob.glob(os.path.join(logdir, "events.out.tfevents.*")) - assert len(files) > 0 - tmp.cleanup() + assert files, "No TensorBoard event files generated" - def test_missing_psutil(self): + @pytest.mark.requires_trainable_backend + def test_missing_psutil_raises(self): + """Constructor raises if psutil is missing.""" mod = sys.modules["keras.src.callbacks.memory_usage_callback"] orig = getattr(mod, "psutil", None) with patch.dict(sys.modules, {"psutil": None}): + reload(mod) with pytest.raises(ImportError): - reload(mod) - _ = mod.MemoryUsageCallback() + _ = mod.MemoryUsageCallback(monitor_gpu=False) + # restore + if orig is not None: sys.modules["psutil"] = orig reload(mod) @pytest.mark.requires_trainable_backend -def test_torch_gpu(monkeypatch): +def test_torch_backend_gpu_memory(monkeypatch): + """Simulate PyTorch backend and verify GPU memory sum.""" import keras.src.backend as B monkeypatch.setattr(B, "backend", lambda: "torch") - fake = MagicMock() - fake.cuda.is_available.return_value = True - fake.cuda.device_count.return_value = 2 - fake.cuda.memory_allocated.side_effect = [50 * 1024**2, 70 * 1024**2] - monkeypatch.setitem(sys.modules, "torch", fake) + + fake_torch = MagicMock() + fake_torch.cuda.is_available.return_value = 
True + fake_torch.cuda.device_count.return_value = 2 + fake_torch.cuda.memory_allocated.side_effect = [100 * 1024**2, 150 * 1024**2] + monkeypatch.setitem(sys.modules, "torch", fake_torch) + cb = MemoryUsageCallback(monitor_gpu=True) - assert pytest.approx(120, rel=1e-6) == cb._get_gpu_memory() + mem = cb._get_gpu_memory() + assert pytest.approx(250, rel=1e-6) == mem @pytest.mark.requires_trainable_backend -def test_jax_gpu(monkeypatch): +def test_jax_backend_gpu_memory(monkeypatch): + """Simulate JAX backend and verify GPU memory sum.""" import keras.src.backend as B monkeypatch.setattr(B, "backend", lambda: "jax") - class Dev: - platform = "GPU" + class FakeDev: + platform = "gpu" def memory_stats(self): - return {"bytes_in_use": 30 * 1024**2} - - fake = MagicMock(devices=lambda: [Dev(), Dev()]) - monkeypatch.setitem(sys.modules, "jax", fake) - cb = MemoryUsageCallback(monitor_gpu=True) - assert pytest.approx(60, rel=1e-6) == cb._get_gpu_memory() - - -@pytest.mark.requires_trainable_backend -def test_openvino_gpu(monkeypatch): - import keras.src.backend as B - - monkeypatch.setattr(B, "backend", lambda: "openvino") - - class Core: - def get_property(self, device, name): - return {"deviceUsedBytes": 25 * 1024**2} + return {"bytes_in_use": 200 * 1024**2} - available_devices = ["GPU"] + fake_jax = MagicMock() + fake_jax.devices.return_value = [FakeDev(), FakeDev()] + monkeypatch.setitem(sys.modules, "jax", fake_jax) - fake = MagicMock(Core=lambda: Core()) - monkeypatch.setitem(sys.modules, "openvino", fake) cb = MemoryUsageCallback(monitor_gpu=True) - assert pytest.approx(25, rel=1e-6) == cb._get_gpu_memory() + mem = cb._get_gpu_memory() + assert pytest.approx(400, rel=1e-6) == mem From 5af4a44d12b08b32f1b7f9439feff49cc3566f39 Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sat, 24 May 2025 21:11:32 +0300 Subject: [PATCH 06/17] Appropriate API integration --- keras/src/callbacks/memory_usage_callback.py | 1 + 1 file changed, 1 insertion(+) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index b9ebe029271d..0201d5ab569e 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -4,6 +4,7 @@ from keras.src.callbacks.callback import Callback from keras.src import backend as K +# Attempt to import psutil for memory monitoring try: import psutil except ImportError: From 1ae765930441ff3b0d2108b55631b114537e4aa1 Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sat, 24 May 2025 22:19:02 +0300 Subject: [PATCH 07/17] Reformatted code --- keras/src/callbacks/__init__.py | 1 - keras/src/callbacks/memory_usage_callback.py | 21 ++++++--- .../callbacks/memory_usage_callback_test.py | 43 +++++++++++++------ 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/keras/src/callbacks/__init__.py b/keras/src/callbacks/__init__.py index 178441d5a76a..98caa6b3a24f 100644 --- a/keras/src/callbacks/__init__.py +++ b/keras/src/callbacks/__init__.py @@ -14,4 +14,3 @@ from keras.src.callbacks.swap_ema_weights import SwapEMAWeights from keras.src.callbacks.tensorboard import TensorBoard from keras.src.callbacks.terminate_on_nan import TerminateOnNaN - diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index 0201d5ab569e..ce2990a3d3fd 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -1,14 +1,16 @@ import os import warnings + +from keras.src import backend as K from 
keras.src.api_export import keras_export from keras.src.callbacks.callback import Callback -from keras.src import backend as K # Attempt to import psutil for memory monitoring try: import psutil except ImportError: - psutil = None + psutil = None + @keras_export("keras.callbacks.MemoryUsageCallback") class MemoryUsageCallback(Callback): @@ -63,14 +65,15 @@ def __init__( if tensorboard_log_dir: try: - import tensorflow as tf + import tensorflow as tf logdir = os.path.expanduser(tensorboard_log_dir) self._writer = tf.summary.create_file_writer(logdir) print(f"MemoryUsageCallback: TensorBoard logs → {logdir}") except Exception as e: warnings.warn( - f"Could not initialize TensorBoard writer: {e}", RuntimeWarning + f"Could not initialize TensorBoard writer: {e}", + RuntimeWarning, ) self._writer = None @@ -85,7 +88,9 @@ def on_epoch_end(self, epoch, logs=None): def on_batch_end(self, batch, logs=None): if self.log_every_batch: - self._log_step(f"Batch {self._step_counter} end", self._step_counter) + self._log_step( + f"Batch {self._step_counter} end", self._step_counter + ) self._step_counter += 1 def on_train_end(self, logs=None): @@ -122,7 +127,7 @@ def _get_gpu_memory(self): backend_name = K.backend() try: if backend_name == "tensorflow": - import tensorflow as tf + import tensorflow as tf gpus = tf.config.list_physical_devices("GPU") if not gpus: @@ -173,6 +178,8 @@ def _get_gpu_memory(self): return None except Exception as exc: if not hasattr(self, "_warn_exc"): - warnings.warn(f"Error retrieving GPU memory: {exc}", RuntimeWarning) + warnings.warn( + f"Error retrieving GPU memory: {exc}", RuntimeWarning + ) self._warn_exc = True return None diff --git a/keras/src/callbacks/memory_usage_callback_test.py b/keras/src/callbacks/memory_usage_callback_test.py index 16f1e04b121f..bb2d2c5dd31c 100644 --- a/keras/src/callbacks/memory_usage_callback_test.py +++ b/keras/src/callbacks/memory_usage_callback_test.py @@ -1,20 +1,21 @@ -import os import glob +import os import re import sys import tempfile -import pytest -import numpy as np - from contextlib import redirect_stdout -from io import StringIO from importlib import reload -from unittest.mock import patch, MagicMock +from io import StringIO +from unittest.mock import MagicMock +from unittest.mock import patch -from keras.src.models import Sequential +import numpy as np +import pytest + +from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback from keras.src.layers import Dense +from keras.src.models import Sequential from keras.src.testing import TestCase -from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback try: import psutil @@ -69,13 +70,19 @@ def test_batch_logging_stdout(self): with redirect_stdout(buf): cb = MemoryUsageCallback(monitor_gpu=False, log_every_batch=True) self.model.fit( - self.x, self.y, epochs=1, batch_size=self.bs, callbacks=[cb], verbose=0 + self.x, + self.y, + epochs=1, + batch_size=self.bs, + callbacks=[cb], + verbose=0, ) lines = buf.getvalue().splitlines() batch_lines = [l for l in lines if l.startswith("Batch ")] assert len(batch_lines) == self.steps assert all( - re.match(r"Batch \d+ end - CPU Memory: [\d\.]+ MB", l) for l in batch_lines + re.match(r"Batch \d+ end - CPU Memory: [\d\.]+ MB", l) + for l in batch_lines ) @pytest.mark.requires_trainable_backend @@ -85,9 +92,16 @@ def test_tensorboard_writes_files(self): logdir = os.path.join(tmp.name, "tb") buf = StringIO() with redirect_stdout(buf): - cb = MemoryUsageCallback(monitor_gpu=False, tensorboard_log_dir=logdir) + cb = 
MemoryUsageCallback( + monitor_gpu=False, tensorboard_log_dir=logdir + ) self.model.fit( - self.x, self.y, epochs=1, batch_size=self.bs, callbacks=[cb], verbose=0 + self.x, + self.y, + epochs=1, + batch_size=self.bs, + callbacks=[cb], + verbose=0, ) files = glob.glob(os.path.join(logdir, "events.out.tfevents.*")) assert files, "No TensorBoard event files generated" @@ -118,7 +132,10 @@ def test_torch_backend_gpu_memory(monkeypatch): fake_torch = MagicMock() fake_torch.cuda.is_available.return_value = True fake_torch.cuda.device_count.return_value = 2 - fake_torch.cuda.memory_allocated.side_effect = [100 * 1024**2, 150 * 1024**2] + fake_torch.cuda.memory_allocated.side_effect = [ + 100 * 1024**2, + 150 * 1024**2, + ] monkeypatch.setitem(sys.modules, "torch", fake_torch) cb = MemoryUsageCallback(monitor_gpu=True) From e13528e68afd31bc084919552ea306a653828a3f Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sat, 24 May 2025 23:18:54 +0300 Subject: [PATCH 08/17] Fix API integration --- keras/api/_tf_keras/keras/callbacks/__init__.py | 3 +++ keras/api/callbacks/__init__.py | 3 +++ keras/src/callbacks/memory_usage_callback.py | 3 ++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/keras/api/_tf_keras/keras/callbacks/__init__.py b/keras/api/_tf_keras/keras/callbacks/__init__.py index 4e165cddb6a8..e7f0bedd62c3 100644 --- a/keras/api/_tf_keras/keras/callbacks/__init__.py +++ b/keras/api/_tf_keras/keras/callbacks/__init__.py @@ -16,6 +16,9 @@ from keras.src.callbacks.learning_rate_scheduler import ( LearningRateScheduler as LearningRateScheduler, ) +from keras.src.callbacks.memory_usage_callback import ( + MemoryUsageCallback as MemoryUsageCallback, +) from keras.src.callbacks.model_checkpoint import ( ModelCheckpoint as ModelCheckpoint, ) diff --git a/keras/api/callbacks/__init__.py b/keras/api/callbacks/__init__.py index 4e165cddb6a8..e7f0bedd62c3 100644 --- a/keras/api/callbacks/__init__.py +++ b/keras/api/callbacks/__init__.py @@ -16,6 +16,9 @@ from keras.src.callbacks.learning_rate_scheduler import ( LearningRateScheduler as LearningRateScheduler, ) +from keras.src.callbacks.memory_usage_callback import ( + MemoryUsageCallback as MemoryUsageCallback, +) from keras.src.callbacks.model_checkpoint import ( ModelCheckpoint as ModelCheckpoint, ) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index ce2990a3d3fd..993b488f2ace 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -19,7 +19,8 @@ class MemoryUsageCallback(Callback): This callback measures: - **CPU**: via psutil.Process().memory_info().rss - - **GPU/TPU**: via backend‐specific APIs (TensorFlow, PyTorch, JAX, OpenVINO) + - **GPU/TPU**: via backend‐specific APIs + (TensorFlow, PyTorch, JAX, OpenVINO) Logs are printed to stdout at the start/end of each epoch and, if `log_every_batch=True`, after every batch. 
If `tensorboard_log_dir` From 728b77095fc83195749270182d9103cdccf43935 Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sun, 25 May 2025 00:16:16 +0300 Subject: [PATCH 09/17] Format the code --- keras/src/callbacks/memory_usage_callback.py | 40 ++++++++++---------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index 993b488f2ace..b13a5fa73bbf 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -19,11 +19,11 @@ class MemoryUsageCallback(Callback): This callback measures: - **CPU**: via psutil.Process().memory_info().rss - - **GPU/TPU**: via backend‐specific APIs - (TensorFlow, PyTorch, JAX, OpenVINO) + - **GPU/TPU**: via backend-specific APIs + (TensorFlow, PyTorch, JAX, OpenVINO) Logs are printed to stdout at the start/end of each epoch and, - if `log_every_batch=True`, after every batch. If `tensorboard_log_dir` + if `log_every_batch=True`, after every batch. If `tensorboard_log_dir` is provided, scalars are also written via `tf.summary` (TensorBoard). Args: @@ -36,7 +36,6 @@ class MemoryUsageCallback(Callback): ImportError: If `psutil` is not installed (required for CPU logging). Example: - ```python from keras.callbacks import MemoryUsageCallback # ... @@ -46,7 +45,7 @@ class MemoryUsageCallback(Callback): tensorboard_log_dir="./logs/memory" ) model.fit(X, y, callbacks=[cb]) - ``` + ``` """ def __init__( @@ -67,7 +66,6 @@ def __init__( if tensorboard_log_dir: try: import tensorflow as tf - logdir = os.path.expanduser(tensorboard_log_dir) self._writer = tf.summary.create_file_writer(logdir) print(f"MemoryUsageCallback: TensorBoard logs → {logdir}") @@ -89,9 +87,7 @@ def on_epoch_end(self, epoch, logs=None): def on_batch_end(self, batch, logs=None): if self.log_every_batch: - self._log_step( - f"Batch {self._step_counter} end", self._step_counter - ) + self._log_step(f"Batch {self._step_counter} end", self._step_counter) self._step_counter += 1 def on_train_end(self, logs=None): @@ -110,11 +106,11 @@ def _log_step(self, label, step): msg = f"{label} - CPU Memory: {cpu_mb:.2f} MB" if gpu_mb is not None: msg += f"; GPU Memory: {gpu_mb:.2f} MB" - print(msg) + # newline + flush ensures clean, immediate output + print("\n" + msg, flush=True) if self._writer: import tensorflow as tf # noqa: E501 - with self._writer.as_default(step=int(step)): tf.summary.scalar("Memory/CPU_MB", cpu_mb) if gpu_mb is not None: @@ -122,14 +118,13 @@ def _log_step(self, label, step): self._writer.flush() def _get_cpu_memory(self): - return self._proc.memory_info().rss / (1024**2) + return self._proc.memory_info().rss / (1024 ** 2) def _get_gpu_memory(self): backend_name = K.backend() try: if backend_name == "tensorflow": import tensorflow as tf - gpus = tf.config.list_physical_devices("GPU") if not gpus: return None @@ -137,20 +132,20 @@ def _get_gpu_memory(self): tf.config.experimental.get_memory_info(g.name)["current"] for g in gpus ) - return total / (1024**2) + return total / (1024 ** 2) + elif backend_name == "torch": import torch - if not torch.cuda.is_available(): return None total = sum( torch.cuda.memory_allocated(i) for i in range(torch.cuda.device_count()) ) - return total / (1024**2) + return total / (1024 ** 2) + elif backend_name == "jax": import jax - devs = [d for d in jax.devices() if d.platform.upper() == "GPU"] if not devs: return None @@ -158,17 +153,19 @@ def _get_gpu_memory(self): for d in devs: stats = getattr(d, 
"memory_stats", lambda: {})() total += stats.get("bytes_in_use", 0) - return total / (1024**2) - else: - # OpenVINO and others fall back to unsupported + return total / (1024 ** 2) + else: + # OpenVINO and other unknown backends: warn once if not hasattr(self, "_warn_backend"): warnings.warn( - f"MemoryUsageCallback: unsupported backend '{backend_name}'", + "MemoryUsageCallback: unsupported backend " + f"'{backend_name}'", RuntimeWarning, ) self._warn_backend = True return None + except ImportError as imp_err: if not hasattr(self, "_warn_import"): warnings.warn( @@ -177,6 +174,7 @@ def _get_gpu_memory(self): ) self._warn_import = True return None + except Exception as exc: if not hasattr(self, "_warn_exc"): warnings.warn( From c4c0e5e9ce6767f3dbdb7dcdd04717e29a90a195 Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sun, 25 May 2025 00:21:33 +0300 Subject: [PATCH 10/17] Format the code (2) --- keras/src/callbacks/memory_usage_callback.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index b13a5fa73bbf..1e45b389a8f1 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -45,7 +45,7 @@ class MemoryUsageCallback(Callback): tensorboard_log_dir="./logs/memory" ) model.fit(X, y, callbacks=[cb]) - ``` + ``` """ def __init__( @@ -66,6 +66,7 @@ def __init__( if tensorboard_log_dir: try: import tensorflow as tf + logdir = os.path.expanduser(tensorboard_log_dir) self._writer = tf.summary.create_file_writer(logdir) print(f"MemoryUsageCallback: TensorBoard logs → {logdir}") @@ -87,7 +88,9 @@ def on_epoch_end(self, epoch, logs=None): def on_batch_end(self, batch, logs=None): if self.log_every_batch: - self._log_step(f"Batch {self._step_counter} end", self._step_counter) + self._log_step( + f"Batch {self._step_counter} end", self._step_counter + ) self._step_counter += 1 def on_train_end(self, logs=None): @@ -111,6 +114,7 @@ def _log_step(self, label, step): if self._writer: import tensorflow as tf # noqa: E501 + with self._writer.as_default(step=int(step)): tf.summary.scalar("Memory/CPU_MB", cpu_mb) if gpu_mb is not None: @@ -118,13 +122,14 @@ def _log_step(self, label, step): self._writer.flush() def _get_cpu_memory(self): - return self._proc.memory_info().rss / (1024 ** 2) + return self._proc.memory_info().rss / (1024**2) def _get_gpu_memory(self): backend_name = K.backend() try: if backend_name == "tensorflow": import tensorflow as tf + gpus = tf.config.list_physical_devices("GPU") if not gpus: return None @@ -132,20 +137,22 @@ def _get_gpu_memory(self): tf.config.experimental.get_memory_info(g.name)["current"] for g in gpus ) - return total / (1024 ** 2) + return total / (1024**2) elif backend_name == "torch": import torch + if not torch.cuda.is_available(): return None total = sum( torch.cuda.memory_allocated(i) for i in range(torch.cuda.device_count()) ) - return total / (1024 ** 2) + return total / (1024**2) elif backend_name == "jax": import jax + devs = [d for d in jax.devices() if d.platform.upper() == "GPU"] if not devs: return None @@ -153,7 +160,7 @@ def _get_gpu_memory(self): for d in devs: stats = getattr(d, "memory_stats", lambda: {})() total += stats.get("bytes_in_use", 0) - return total / (1024 ** 2) + return total / (1024**2) else: # OpenVINO and other unknown backends: warn once From e064d0e354db862088628d42e415a96d4e2e5b9c Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sun, 25 May 2025 
00:52:10 +0300 Subject: [PATCH 11/17] Fix openvino case --- keras/src/callbacks/memory_usage_callback.py | 51 ++++++++++---------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index 1e45b389a8f1..7bdd335e9625 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -5,7 +5,7 @@ from keras.src.api_export import keras_export from keras.src.callbacks.callback import Callback -# Attempt to import psutil for memory monitoring +# Attempt to import psutil for CPU memory monitoring try: import psutil except ImportError: @@ -14,17 +14,19 @@ @keras_export("keras.callbacks.MemoryUsageCallback") class MemoryUsageCallback(Callback): - """Monitors and logs memory usage (CPU + optional GPU/TPU) during training. + """Monitors and logs memory usage + (CPU + optional GPU/TPU/OpenVINO) during training. This callback measures: - **CPU**: via psutil.Process().memory_info().rss - - **GPU/TPU**: via backend-specific APIs - (TensorFlow, PyTorch, JAX, OpenVINO) + - **GPU/TPU**: via backend‐specific APIs + (TensorFlow, PyTorch, JAX) Logs are printed to stdout at the start/end of each epoch and, - if `log_every_batch=True`, after every batch. If `tensorboard_log_dir` - is provided, scalars are also written via `tf.summary` (TensorBoard). + if `log_every_batch=True`, after every batch. + If `tensorboard_log_dir` is provided, scalars are also written + via `tf.summary` (TensorBoard). Args: monitor_gpu (bool): If True, attempt to measure accelerator memory. @@ -34,22 +36,13 @@ class MemoryUsageCallback(Callback): Raises: ImportError: If `psutil` is not installed (required for CPU logging). - - Example: - ```python - from keras.callbacks import MemoryUsageCallback - # ... 
- cb = MemoryUsageCallback( - monitor_gpu=True, - log_every_batch=False, - tensorboard_log_dir="./logs/memory" - ) - model.fit(X, y, callbacks=[cb]) - ``` """ def __init__( - self, monitor_gpu=True, log_every_batch=False, tensorboard_log_dir=None + self, + monitor_gpu=True, + log_every_batch=False, + tensorboard_log_dir=None, ): super().__init__() if psutil is None: @@ -109,8 +102,7 @@ def _log_step(self, label, step): msg = f"{label} - CPU Memory: {cpu_mb:.2f} MB" if gpu_mb is not None: msg += f"; GPU Memory: {gpu_mb:.2f} MB" - # newline + flush ensures clean, immediate output - print("\n" + msg, flush=True) + print(msg) if self._writer: import tensorflow as tf # noqa: E501 @@ -119,7 +111,7 @@ def _log_step(self, label, step): tf.summary.scalar("Memory/CPU_MB", cpu_mb) if gpu_mb is not None: tf.summary.scalar("Memory/GPU_MB", gpu_mb) - self._writer.flush() + # flush happens inside writer def _get_cpu_memory(self): return self._proc.memory_info().rss / (1024**2) @@ -162,12 +154,21 @@ def _get_gpu_memory(self): total += stats.get("bytes_in_use", 0) return total / (1024**2) + elif backend_name == "openvino": + # OpenVINO provides no memory-stats API: + if not hasattr(self, "_warn_openvino"): + warnings.warn( + " OpenVINO does not expose memory stats; " + "GPU monitoring disabled.", + RuntimeWarning, + ) + self._warn_openvino = True + return None + else: - # OpenVINO and other unknown backends: warn once if not hasattr(self, "_warn_backend"): warnings.warn( - "MemoryUsageCallback: unsupported backend " - f"'{backend_name}'", + f"MemoryUsageCallback: no backend '{backend_name}'", RuntimeWarning, ) self._warn_backend = True From a9e0212338df17cb5a0a68b7682e4576cdccc8d5 Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sat, 31 May 2025 17:00:47 +0300 Subject: [PATCH 12/17] Add keras devs' comments into code --- keras/src/callbacks/memory_usage_callback.py | 218 ++++++++++---- .../callbacks/memory_usage_callback_test.py | 283 ++++++++++-------- 2 files changed, 316 insertions(+), 185 deletions(-) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index 7bdd335e9625..49270899a8f0 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -1,38 +1,80 @@ import os import warnings +import time from keras.src import backend as K from keras.src.api_export import keras_export from keras.src.callbacks.callback import Callback -# Attempt to import psutil for CPU memory monitoring + try: import psutil except ImportError: psutil = None +def running_on_gpu(): + """Detect if any GPU is available on the current backend.""" + backend_name = K.backend() + if backend_name == "tensorflow": + import tensorflow as tf + + return bool(tf.config.list_logical_devices("GPU")) + elif backend_name == "torch": + try: + import torch + return torch.cuda.is_available() + except ImportError: + return False + elif backend_name == "jax": + try: + import jax + + return any(d.platform.upper() == "GPU" for d in jax.devices()) + except ImportError: + return False + return False + + +def running_on_tpu(): + """Detect if any TPU is available on the current backend.""" + backend_name = K.backend() + if backend_name == "tensorflow": + import tensorflow as tf + + return bool(tf.config.list_logical_devices("TPU")) + elif backend_name == "jax": + try: + import jax + + return any(d.platform.upper() == "TPU" for d in jax.devices()) + except ImportError: + return False + return False + + 
@keras_export("keras.callbacks.MemoryUsageCallback") class MemoryUsageCallback(Callback): - """Monitors and logs memory usage - (CPU + optional GPU/TPU/OpenVINO) during training. + """ + Monitors and logs memory usage (CPU + optional GPU/TPU) during training. This callback measures: - - **CPU**: via psutil.Process().memory_info().rss - - **GPU/TPU**: via backend‐specific APIs + - **GPU**: if a GPU is detected, via backend-specific APIs (TensorFlow, PyTorch, JAX) + - **TPU**: if a TPU is detected, via backend-specific APIs + (TensorFlow, JAX) - Logs are printed to stdout at the start/end of each epoch and, - if `log_every_batch=True`, after every batch. - If `tensorboard_log_dir` is provided, scalars are also written - via `tf.summary` (TensorBoard). + Logs are printed to stdout at the start and end of each epoch (with a leading + newline to avoid clobbering the progress bar), and, if `log_every_batch=True`, + after every batch. If `tensorboard_log_dir` is provided, scalars are also written + via tf.summary (TensorBoard). Args: - monitor_gpu (bool): If True, attempt to measure accelerator memory. - log_every_batch (bool): If True, also log after each batch. - tensorboard_log_dir (str|None): Directory for TensorBoard logs; - if None, no TF summary writer is created. + log_every_batch (bool): If True, also log after each batch. Defaults to False + (i.e., log only at epoch start and end). + tensorboard_log_dir (str|None): Directory for TensorBoard logs; if None, + no TF summary writer is created. Raises: ImportError: If `psutil` is not installed (required for CPU logging). @@ -40,17 +82,17 @@ class MemoryUsageCallback(Callback): def __init__( self, - monitor_gpu=True, log_every_batch=False, tensorboard_log_dir=None, ): super().__init__() + if psutil is None: raise ImportError( "MemoryUsageCallback requires the 'psutil' library. " - "Install via `pip install psutil`." + "To install, please use: pip install psutil" ) - self.monitor_gpu = monitor_gpu + self.log_every_batch = log_every_batch self._proc = psutil.Process() self._step_counter = 0 @@ -74,16 +116,17 @@ def on_train_begin(self, logs=None): self._step_counter = 0 def on_epoch_begin(self, epoch, logs=None): + print() self._log_epoch("start", epoch) def on_epoch_end(self, epoch, logs=None): + print() self._log_epoch("end", epoch, offset=1) def on_batch_end(self, batch, logs=None): if self.log_every_batch: - self._log_step( - f"Batch {self._step_counter} end", self._step_counter - ) + print() + self._log_step(f"Batch {self._step_counter} end", self._step_counter) self._step_counter += 1 def on_train_end(self, logs=None): @@ -96,50 +139,74 @@ def _log_epoch(self, when, epoch, offset=0): self._log_step(label, step) def _log_step(self, label, step): + """ + Internal helper to measure and print CPU/GPU/TPU memory. + Inserts a short delay (time.sleep(0)) to let stdout flush cleanly. 
+ """ cpu_mb = self._get_cpu_memory() - gpu_mb = self._get_gpu_memory() if self.monitor_gpu else None + gpu_mb = self._get_gpu_memory() + tpu_mb = self._get_tpu_memory() + # Build message msg = f"{label} - CPU Memory: {cpu_mb:.2f} MB" if gpu_mb is not None: msg += f"; GPU Memory: {gpu_mb:.2f} MB" + if tpu_mb is not None: + msg += f"; TPU Memory: {tpu_mb:.2f} MB" + print(msg) + time.sleep(0) + if self._writer: - import tensorflow as tf # noqa: E501 + import tensorflow as tf with self._writer.as_default(step=int(step)): tf.summary.scalar("Memory/CPU_MB", cpu_mb) if gpu_mb is not None: tf.summary.scalar("Memory/GPU_MB", gpu_mb) - # flush happens inside writer + if tpu_mb is not None: + tf.summary.scalar("Memory/TPU_MB", tpu_mb) def _get_cpu_memory(self): + """Return current process CPU memory usage in MB.""" return self._proc.memory_info().rss / (1024**2) def _get_gpu_memory(self): + """ + Return current GPU memory usage in MB for the detected backend, + or None if no GPU is present or if measurement fails. + """ + if not running_on_gpu(): + return None + backend_name = K.backend() try: if backend_name == "tensorflow": import tensorflow as tf - - gpus = tf.config.list_physical_devices("GPU") - if not gpus: - return None - total = sum( - tf.config.experimental.get_memory_info(g.name)["current"] - for g in gpus - ) - return total / (1024**2) + try: + mem_info = tf.config.experimental.get_memory_info("GPU:0") + return mem_info["current"] / (1024**2) + except Exception as e: + gpus = tf.config.list_physical_devices("GPU") + if not gpus: + return None + total = 0 + for i, _ in enumerate(gpus): + try: + info = tf.config.experimental.get_memory_info(f"GPU:{i}") + total += info.get("current", 0) + except Exception: + continue + return total / (1024**2) elif backend_name == "torch": import torch if not torch.cuda.is_available(): return None - total = sum( - torch.cuda.memory_allocated(i) - for i in range(torch.cuda.device_count()) - ) + torch.cuda.synchronize() + total = torch.cuda.max_memory_allocated() return total / (1024**2) elif backend_name == "jax": @@ -154,30 +221,12 @@ def _get_gpu_memory(self): total += stats.get("bytes_in_use", 0) return total / (1024**2) - elif backend_name == "openvino": - # OpenVINO provides no memory-stats API: - if not hasattr(self, "_warn_openvino"): - warnings.warn( - " OpenVINO does not expose memory stats; " - "GPU monitoring disabled.", - RuntimeWarning, - ) - self._warn_openvino = True - return None - - else: - if not hasattr(self, "_warn_backend"): - warnings.warn( - f"MemoryUsageCallback: no backend '{backend_name}'", - RuntimeWarning, - ) - self._warn_backend = True - return None + return None except ImportError as imp_err: if not hasattr(self, "_warn_import"): warnings.warn( - f"Could not import for backend '{backend_name}': {imp_err}", + f"Could not import library for GPU memory tracking ({backend_name}): {imp_err}", RuntimeWarning, ) self._warn_import = True @@ -190,3 +239,62 @@ def _get_gpu_memory(self): ) self._warn_exc = True return None + + def _get_tpu_memory(self): + """ + Return current TPU memory usage in MB for the detected backend, + or None if no TPU is present or if measurement fails. + Note: TPU memory APIs vary; here we attempt best-effort. 
+ """ + if not running_on_tpu(): + return None + + backend_name = K.backend() + try: + if backend_name == "tensorflow": + import tensorflow as tf + + if not hasattr(self, "_warn_tpu_tf"): + warnings.warn( + "TensorFlow TPU memory info is not directly available; returning None.", + RuntimeWarning, + ) + self._warn_tpu_tf = True + return None + + elif backend_name == "jax": + import jax + + devs = [d for d in jax.devices() if d.platform.upper() == "TPU"] + if not devs: + return None + try: + stats = devs[0].memory_stats() + tpu_bytes = stats.get("bytes_in_use", stats.get("allocated_bytes", 0)) + return tpu_bytes / (1024**2) + except Exception: + if not hasattr(self, "_warn_tpu_jax"): + warnings.warn( + "Failed to retrieve JAX TPU memory stats; returning None.", + RuntimeWarning, + ) + self._warn_tpu_jax = True + return None + return None + + except ImportError as imp_err: + if not hasattr(self, "_warn_tpu_imp"): + warnings.warn( + f"Could not import library for TPU memory tracking ({backend_name}): {imp_err}", + RuntimeWarning, + ) + self._warn_tpu_imp = True + return None + + except Exception as exc: + if not hasattr(self, "_warn_tpu_exc"): + warnings.warn( + f"Error retrieving TPU memory: {exc}", RuntimeWarning + ) + self._warn_tpu_exc = True + return None diff --git a/keras/src/callbacks/memory_usage_callback_test.py b/keras/src/callbacks/memory_usage_callback_test.py index bb2d2c5dd31c..7e3f88311b83 100644 --- a/keras/src/callbacks/memory_usage_callback_test.py +++ b/keras/src/callbacks/memory_usage_callback_test.py @@ -1,165 +1,188 @@ -import glob import os -import re import sys +import glob import tempfile -from contextlib import redirect_stdout -from importlib import reload -from io import StringIO -from unittest.mock import MagicMock -from unittest.mock import patch - import numpy as np import pytest +from io import StringIO +from contextlib import redirect_stdout +from unittest.mock import patch, MagicMock +from importlib import reload + +import keras.src.callbacks.memory_usage_callback as muc_module from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback -from keras.src.layers import Dense -from keras.src.models import Sequential -from keras.src.testing import TestCase try: - import psutil + import psutil as real_psutil except ImportError: - psutil = None + real_psutil = None + +import tensorflow as tf +from keras.src.models import Sequential +from keras.src.layers import Dense +from keras.src.testing import TestCase -@pytest.mark.skipif(psutil is None, reason="psutil is required") +@pytest.mark.skipif(real_psutil is None, reason="psutil is required for MemoryUsageCallback tests.") class MemoryUsageCallbackTest(TestCase): def setUp(self): super().setUp() - # Prepare 20 samples of 10-dim data → 4 batches @ bs=5 - - self.x = np.random.random((20, 10)).astype(np.float32) - self.y = np.random.randint(0, 2, (20, 1)).astype(np.float32) - self.model = Sequential( - [ - Dense(5, activation="relu", input_shape=(10,)), - Dense(1, activation="sigmoid"), - ] - ) + self.x_train = np.random.random((16, 8)).astype(np.float32) + self.y_train = np.random.randint(0, 2, (16, 1)).astype(np.float32) + + self.model = Sequential([ + Dense(4, activation="relu", input_shape=(8,)), + Dense(1, activation="sigmoid"), + ]) self.model.compile(optimizer="adam", loss="binary_crossentropy") + self.epochs = 2 - self.bs = 5 - self.steps = len(self.x) // self.bs - - @pytest.mark.requires_trainable_backend - def test_epoch_logging_stdout(self): - """Epoch-level logs appear with correct format.""" - 
buf = StringIO() - with redirect_stdout(buf): - cb = MemoryUsageCallback(monitor_gpu=False) - self.model.fit( - self.x, - self.y, - epochs=self.epochs, - batch_size=self.bs, - callbacks=[cb], - verbose=0, - ) - out = buf.getvalue() - for e in range(self.epochs): - assert f"Epoch {e} start" in out - assert f"Epoch {e} end" in out - assert re.search(rf"Epoch {e} start - CPU Memory: [\d\.]+ MB", out) - assert re.search(rf"Epoch {e} end - CPU Memory: [\d\.]+ MB", out) - - @pytest.mark.requires_trainable_backend - def test_batch_logging_stdout(self): - """Batch-level logs appear when log_every_batch=True.""" - buf = StringIO() - with redirect_stdout(buf): - cb = MemoryUsageCallback(monitor_gpu=False, log_every_batch=True) - self.model.fit( - self.x, - self.y, - epochs=1, - batch_size=self.bs, - callbacks=[cb], - verbose=0, - ) - lines = buf.getvalue().splitlines() - batch_lines = [l for l in lines if l.startswith("Batch ")] - assert len(batch_lines) == self.steps - assert all( - re.match(r"Batch \d+ end - CPU Memory: [\d\.]+ MB", l) - for l in batch_lines - ) - - @pytest.mark.requires_trainable_backend - def test_tensorboard_writes_files(self): - """TensorBoard event files are created.""" - tmp = tempfile.TemporaryDirectory() - logdir = os.path.join(tmp.name, "tb") - buf = StringIO() - with redirect_stdout(buf): - cb = MemoryUsageCallback( - monitor_gpu=False, tensorboard_log_dir=logdir - ) - self.model.fit( - self.x, - self.y, - epochs=1, - batch_size=self.bs, - callbacks=[cb], - verbose=0, - ) - files = glob.glob(os.path.join(logdir, "events.out.tfevents.*")) - assert files, "No TensorBoard event files generated" - - @pytest.mark.requires_trainable_backend - def test_missing_psutil_raises(self): - """Constructor raises if psutil is missing.""" - mod = sys.modules["keras.src.callbacks.memory_usage_callback"] - orig = getattr(mod, "psutil", None) - with patch.dict(sys.modules, {"psutil": None}): - reload(mod) - with pytest.raises(ImportError): - _ = mod.MemoryUsageCallback(monitor_gpu=False) - # restore - - if orig is not None: - sys.modules["psutil"] = orig - reload(mod) - - -@pytest.mark.requires_trainable_backend -def test_torch_backend_gpu_memory(monkeypatch): - """Simulate PyTorch backend and verify GPU memory sum.""" - import keras.src.backend as B - - monkeypatch.setattr(B, "backend", lambda: "torch") + self.batch_size = 4 + self.steps_per_epoch = len(self.x_train) // self.batch_size + + def test_cpu_only_epoch_logging(self): + with patch.object(muc_module.K, "backend", return_value="unsupported_backend"): + out = StringIO() + with redirect_stdout(out): + cb = MemoryUsageCallback(log_every_batch=False) + self.model.fit( + self.x_train, self.y_train, + epochs=self.epochs, + batch_size=self.batch_size, + callbacks=[cb], + verbose=0, + ) + lines = [l.strip() for l in out.getvalue().splitlines() if l.strip()] + epoch_lines = [l for l in lines if l.startswith("Epoch")] + assert len(epoch_lines) == 4 + for i in range(self.epochs): + assert epoch_lines[2*i].startswith(f"Epoch {i} start - CPU Memory:") + assert epoch_lines[2*i+1].startswith(f"Epoch {i} end - CPU Memory:") + + def test_log_every_batch(self): + with patch.object(muc_module.K, "backend", return_value="unsupported_backend"): + out = StringIO() + with redirect_stdout(out): + cb = MemoryUsageCallback(log_every_batch=True) + self.model.fit( + self.x_train, self.y_train, + epochs=self.epochs, + batch_size=self.batch_size, + callbacks=[cb], + verbose=0, + ) + lines = [l.strip() for l in out.getvalue().splitlines() if l.strip()] + 
batch_lines = [l for l in lines if l.startswith("Batch")] + assert len(batch_lines) == self.epochs * self.steps_per_epoch + + def test_tensorboard_log_dir(self): + with tempfile.TemporaryDirectory() as tmp_dir: + with patch.object(muc_module.K, "backend", return_value="unsupported_backend"): + cb = MemoryUsageCallback(log_every_batch=False, tensorboard_log_dir=tmp_dir) + self.model.fit( + self.x_train, self.y_train, + epochs=1, + batch_size=self.batch_size, + callbacks=[cb], + verbose=0, + ) + files = glob.glob(os.path.join(tmp_dir, "events.out.tfevents.*")) + assert len(files) > 0, f"No TensorBoard event file found in {tmp_dir}" + + def test_psutil_missing(self): + """ + Temporarily override the module's `psutil` to None so + that instantiating MemoryUsageCallback raises ImportError. + """ + original_psutil = muc_module.psutil + try: + muc_module.psutil = None + with pytest.raises( + ImportError, + match="MemoryUsageCallback requires the 'psutil' library" + ): + _ = muc_module.MemoryUsageCallback() + finally: + muc_module.psutil = original_psutil + reload(muc_module) + + +def test_gpu_memory_tensorflow(monkeypatch): + """ + Simulate TensorFlow backend with one GPU device named "GPU:0" + whose memory_info()['current'] is 150 MiB. After reload, _get_gpu_memory() + must return 150.0 (MB). + """ + + if real_psutil: + sys.modules["psutil"] = real_psutil + + class FakeDevice: + def __init__(self, name): + self.name = name + + fake_tf = MagicMock() + fake_tf.config.list_physical_devices.return_value = [FakeDevice("GPU:0")] + fake_tf.config.experimental.get_memory_info.return_value = {"current": 150 * 1024**2} + + monkeypatch.setitem(sys.modules, "tensorflow", fake_tf) + monkeypatch.setattr("keras.src.callbacks.memory_usage_callback.K.backend", lambda: "tensorflow") + + reload(muc_module) + + cb = muc_module.MemoryUsageCallback() + mem_mb = cb._get_gpu_memory() + assert pytest.approx(150.0, rel=1e-6) == mem_mb + + +def test_gpu_memory_torch(monkeypatch): + """ + Simulate PyTorch backend with 2 GPUs that allocate 100 MiB and 200 MiB. + After reload, _get_gpu_memory() should return 300.0 (MB). + """ + if real_psutil: + sys.modules["psutil"] = real_psutil fake_torch = MagicMock() fake_torch.cuda.is_available.return_value = True fake_torch.cuda.device_count.return_value = 2 fake_torch.cuda.memory_allocated.side_effect = [ 100 * 1024**2, - 150 * 1024**2, + 200 * 1024**2, ] - monkeypatch.setitem(sys.modules, "torch", fake_torch) - cb = MemoryUsageCallback(monitor_gpu=True) - mem = cb._get_gpu_memory() - assert pytest.approx(250, rel=1e-6) == mem + monkeypatch.setitem(sys.modules, "torch", fake_torch) + monkeypatch.setattr("keras.src.callbacks.memory_usage_callback.K.backend", lambda: "torch") + reload(muc_module) -@pytest.mark.requires_trainable_backend -def test_jax_backend_gpu_memory(monkeypatch): - """Simulate JAX backend and verify GPU memory sum.""" - import keras.src.backend as B + cb = muc_module.MemoryUsageCallback() + mem_mb = cb._get_gpu_memory() + assert pytest.approx(300.0, rel=1e-6) == mem_mb - monkeypatch.setattr(B, "backend", lambda: "jax") - class FakeDev: - platform = "gpu" +def test_gpu_memory_jax(monkeypatch): + """ + Simulate JAX backend with two GPU devices each reporting + bytes_in_use=220 MiB. Expect 440.0 (MB). 
+ """ + if real_psutil: + sys.modules["psutil"] = real_psutil + class FakeDevice: + platform = "GPU" def memory_stats(self): - return {"bytes_in_use": 200 * 1024**2} + return {"bytes_in_use": 220 * 1024**2} fake_jax = MagicMock() - fake_jax.devices.return_value = [FakeDev(), FakeDev()] + fake_jax.devices.return_value = [FakeDevice(), FakeDevice()] + monkeypatch.setitem(sys.modules, "jax", fake_jax) + monkeypatch.setattr("keras.src.callbacks.memory_usage_callback.K.backend", lambda: "jax") + + reload(muc_module) - cb = MemoryUsageCallback(monitor_gpu=True) - mem = cb._get_gpu_memory() - assert pytest.approx(400, rel=1e-6) == mem + cb = muc_module.MemoryUsageCallback() + mem_mb = cb._get_gpu_memory() + assert pytest.approx(440.0, rel=1e-6) == mem_mb From 9148a60fed594319924ceebca857ebf35288e8d2 Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sat, 31 May 2025 19:57:48 +0300 Subject: [PATCH 13/17] Fix minor error --- keras/src/callbacks/memory_usage_callback.py | 45 +++++---- .../callbacks/memory_usage_callback_test.py | 98 +++++++++++++------ 2 files changed, 89 insertions(+), 54 deletions(-) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index 49270899a8f0..ed650fe9bcdf 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -1,3 +1,4 @@ + import os import warnings import time @@ -6,7 +7,6 @@ from keras.src.api_export import keras_export from keras.src.callbacks.callback import Callback - try: import psutil except ImportError: @@ -14,11 +14,10 @@ def running_on_gpu(): - """Detect if any GPU is available on the current backend.""" + """Detect if any GPU is available on the current Keras backend.""" backend_name = K.backend() if backend_name == "tensorflow": import tensorflow as tf - return bool(tf.config.list_logical_devices("GPU")) elif backend_name == "torch": try: @@ -29,7 +28,6 @@ def running_on_gpu(): elif backend_name == "jax": try: import jax - return any(d.platform.upper() == "GPU" for d in jax.devices()) except ImportError: return False @@ -37,16 +35,14 @@ def running_on_gpu(): def running_on_tpu(): - """Detect if any TPU is available on the current backend.""" + """Detect if any TPU is available on the current Keras backend.""" backend_name = K.backend() if backend_name == "tensorflow": import tensorflow as tf - return bool(tf.config.list_logical_devices("TPU")) elif backend_name == "jax": try: import jax - return any(d.platform.upper() == "TPU" for d in jax.devices()) except ImportError: return False @@ -65,9 +61,10 @@ class MemoryUsageCallback(Callback): - **TPU**: if a TPU is detected, via backend-specific APIs (TensorFlow, JAX) - Logs are printed to stdout at the start and end of each epoch (with a leading - newline to avoid clobbering the progress bar), and, if `log_every_batch=True`, - after every batch. If `tensorboard_log_dir` is provided, scalars are also written + Logs are printed to stdout at the start and end of each epoch + (with a leading newline to avoid clobbering the progress bar), + and, if `log_every_batch=True`, after every batch. + If `tensorboard_log_dir` is provided, scalars are also written via tf.summary (TensorBoard). 
Args: @@ -116,11 +113,11 @@ def on_train_begin(self, logs=None): self._step_counter = 0 def on_epoch_begin(self, epoch, logs=None): - print() + print() self._log_epoch("start", epoch) def on_epoch_end(self, epoch, logs=None): - print() + print() self._log_epoch("end", epoch, offset=1) def on_batch_end(self, batch, logs=None): @@ -144,10 +141,9 @@ def _log_step(self, label, step): Inserts a short delay (time.sleep(0)) to let stdout flush cleanly. """ cpu_mb = self._get_cpu_memory() - gpu_mb = self._get_gpu_memory() - tpu_mb = self._get_tpu_memory() + gpu_mb = self._get_gpu_memory() + tpu_mb = self._get_tpu_memory() - # Build message msg = f"{label} - CPU Memory: {cpu_mb:.2f} MB" if gpu_mb is not None: msg += f"; GPU Memory: {gpu_mb:.2f} MB" @@ -155,11 +151,10 @@ def _log_step(self, label, step): msg += f"; TPU Memory: {tpu_mb:.2f} MB" print(msg) - time.sleep(0) + time.sleep(0) - if self._writer: - import tensorflow as tf + import tensorflow as tf with self._writer.as_default(step=int(step)): tf.summary.scalar("Memory/CPU_MB", cpu_mb) @@ -187,7 +182,7 @@ def _get_gpu_memory(self): try: mem_info = tf.config.experimental.get_memory_info("GPU:0") return mem_info["current"] / (1024**2) - except Exception as e: + except Exception: gpus = tf.config.list_physical_devices("GPU") if not gpus: return None @@ -205,9 +200,12 @@ def _get_gpu_memory(self): if not torch.cuda.is_available(): return None - torch.cuda.synchronize() - total = torch.cuda.max_memory_allocated() - return total / (1024**2) + + device_count = torch.cuda.device_count() + total_bytes = 0 + for i in range(device_count): + total_bytes += torch.cuda.memory_allocated(i) + return total_bytes / (1024**2) elif backend_name == "jax": import jax @@ -244,7 +242,7 @@ def _get_tpu_memory(self): """ Return current TPU memory usage in MB for the detected backend, or None if no TPU is present or if measurement fails. - Note: TPU memory APIs vary; here we attempt best-effort. + Note: TPU memory APIs vary; here we attempt best‐effort. 
""" if not running_on_tpu(): return None @@ -280,6 +278,7 @@ def _get_tpu_memory(self): ) self._warn_tpu_jax = True return None + return None except ImportError as imp_err: diff --git a/keras/src/callbacks/memory_usage_callback_test.py b/keras/src/callbacks/memory_usage_callback_test.py index 7e3f88311b83..03bb52e5d0cf 100644 --- a/keras/src/callbacks/memory_usage_callback_test.py +++ b/keras/src/callbacks/memory_usage_callback_test.py @@ -1,14 +1,15 @@ +import glob import os import sys -import glob import tempfile -import numpy as np -import pytest - -from io import StringIO from contextlib import redirect_stdout -from unittest.mock import patch, MagicMock from importlib import reload +from io import StringIO +from unittest.mock import MagicMock +from unittest.mock import patch + +import numpy as np +import pytest import keras.src.callbacks.memory_usage_callback as muc_module from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback @@ -18,23 +19,27 @@ except ImportError: real_psutil = None -import tensorflow as tf -from keras.src.models import Sequential from keras.src.layers import Dense +from keras.src.models import Sequential from keras.src.testing import TestCase -@pytest.mark.skipif(real_psutil is None, reason="psutil is required for MemoryUsageCallback tests.") +@pytest.mark.skipif( + real_psutil is None, + reason="psutil is required for MemoryUsageCallback tests.", +) class MemoryUsageCallbackTest(TestCase): def setUp(self): super().setUp() self.x_train = np.random.random((16, 8)).astype(np.float32) self.y_train = np.random.randint(0, 2, (16, 1)).astype(np.float32) - self.model = Sequential([ - Dense(4, activation="relu", input_shape=(8,)), - Dense(1, activation="sigmoid"), - ]) + self.model = Sequential( + [ + Dense(4, activation="relu", input_shape=(8,)), + Dense(1, activation="sigmoid"), + ] + ) self.model.compile(optimizer="adam", loss="binary_crossentropy") self.epochs = 2 @@ -42,53 +47,74 @@ def setUp(self): self.steps_per_epoch = len(self.x_train) // self.batch_size def test_cpu_only_epoch_logging(self): - with patch.object(muc_module.K, "backend", return_value="unsupported_backend"): + with patch.object( + muc_module.K, "backend", return_value="unsupported_backend" + ): out = StringIO() with redirect_stdout(out): cb = MemoryUsageCallback(log_every_batch=False) self.model.fit( - self.x_train, self.y_train, + self.x_train, + self.y_train, epochs=self.epochs, batch_size=self.batch_size, callbacks=[cb], verbose=0, ) - lines = [l.strip() for l in out.getvalue().splitlines() if l.strip()] + lines = [ + l.strip() for l in out.getvalue().splitlines() if l.strip() + ] epoch_lines = [l for l in lines if l.startswith("Epoch")] assert len(epoch_lines) == 4 for i in range(self.epochs): - assert epoch_lines[2*i].startswith(f"Epoch {i} start - CPU Memory:") - assert epoch_lines[2*i+1].startswith(f"Epoch {i} end - CPU Memory:") + assert epoch_lines[2 * i].startswith( + f"Epoch {i} start - CPU Memory:" + ) + assert epoch_lines[2 * i + 1].startswith( + f"Epoch {i} end - CPU Memory:" + ) def test_log_every_batch(self): - with patch.object(muc_module.K, "backend", return_value="unsupported_backend"): + with patch.object( + muc_module.K, "backend", return_value="unsupported_backend" + ): out = StringIO() with redirect_stdout(out): cb = MemoryUsageCallback(log_every_batch=True) self.model.fit( - self.x_train, self.y_train, + self.x_train, + self.y_train, epochs=self.epochs, batch_size=self.batch_size, callbacks=[cb], verbose=0, ) - lines = [l.strip() for l in 
out.getvalue().splitlines() if l.strip()] + lines = [ + l.strip() for l in out.getvalue().splitlines() if l.strip() + ] batch_lines = [l for l in lines if l.startswith("Batch")] assert len(batch_lines) == self.epochs * self.steps_per_epoch def test_tensorboard_log_dir(self): with tempfile.TemporaryDirectory() as tmp_dir: - with patch.object(muc_module.K, "backend", return_value="unsupported_backend"): - cb = MemoryUsageCallback(log_every_batch=False, tensorboard_log_dir=tmp_dir) + with patch.object( + muc_module.K, "backend", return_value="unsupported_backend" + ): + cb = MemoryUsageCallback( + log_every_batch=False, tensorboard_log_dir=tmp_dir + ) self.model.fit( - self.x_train, self.y_train, + self.x_train, + self.y_train, epochs=1, batch_size=self.batch_size, callbacks=[cb], verbose=0, ) files = glob.glob(os.path.join(tmp_dir, "events.out.tfevents.*")) - assert len(files) > 0, f"No TensorBoard event file found in {tmp_dir}" + assert len(files) > 0, ( + f"No TensorBoard event file found in {tmp_dir}" + ) def test_psutil_missing(self): """ @@ -100,7 +126,7 @@ def test_psutil_missing(self): muc_module.psutil = None with pytest.raises( ImportError, - match="MemoryUsageCallback requires the 'psutil' library" + match="MemoryUsageCallback requires the 'psutil' library", ): _ = muc_module.MemoryUsageCallback() finally: @@ -114,7 +140,7 @@ def test_gpu_memory_tensorflow(monkeypatch): whose memory_info()['current'] is 150 MiB. After reload, _get_gpu_memory() must return 150.0 (MB). """ - + if real_psutil: sys.modules["psutil"] = real_psutil @@ -124,10 +150,15 @@ def __init__(self, name): fake_tf = MagicMock() fake_tf.config.list_physical_devices.return_value = [FakeDevice("GPU:0")] - fake_tf.config.experimental.get_memory_info.return_value = {"current": 150 * 1024**2} + fake_tf.config.experimental.get_memory_info.return_value = { + "current": 150 * 1024**2 + } monkeypatch.setitem(sys.modules, "tensorflow", fake_tf) - monkeypatch.setattr("keras.src.callbacks.memory_usage_callback.K.backend", lambda: "tensorflow") + monkeypatch.setattr( + "keras.src.callbacks.memory_usage_callback.K.backend", + lambda: "tensorflow", + ) reload(muc_module) @@ -153,7 +184,9 @@ def test_gpu_memory_torch(monkeypatch): ] monkeypatch.setitem(sys.modules, "torch", fake_torch) - monkeypatch.setattr("keras.src.callbacks.memory_usage_callback.K.backend", lambda: "torch") + monkeypatch.setattr( + "keras.src.callbacks.memory_usage_callback.K.backend", lambda: "torch" + ) reload(muc_module) @@ -172,6 +205,7 @@ def test_gpu_memory_jax(monkeypatch): class FakeDevice: platform = "GPU" + def memory_stats(self): return {"bytes_in_use": 220 * 1024**2} @@ -179,7 +213,9 @@ def memory_stats(self): fake_jax.devices.return_value = [FakeDevice(), FakeDevice()] monkeypatch.setitem(sys.modules, "jax", fake_jax) - monkeypatch.setattr("keras.src.callbacks.memory_usage_callback.K.backend", lambda: "jax") + monkeypatch.setattr( + "keras.src.callbacks.memory_usage_callback.K.backend", lambda: "jax" + ) reload(muc_module) From 20cbdf9c557adb686b535371dc66054929c33437 Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sat, 31 May 2025 23:08:57 +0300 Subject: [PATCH 14/17] Fix test file --- keras/src/callbacks/memory_usage_callback.py | 21 +- .../callbacks/memory_usage_callback_test.py | 354 ++++++++++-------- 2 files changed, 209 insertions(+), 166 deletions(-) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index ed650fe9bcdf..a4b7425568ff 100644 --- 
a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -1,7 +1,6 @@ - import os -import warnings import time +import warnings from keras.src import backend as K from keras.src.api_export import keras_export @@ -18,16 +17,19 @@ def running_on_gpu(): backend_name = K.backend() if backend_name == "tensorflow": import tensorflow as tf + return bool(tf.config.list_logical_devices("GPU")) elif backend_name == "torch": try: import torch + return torch.cuda.is_available() except ImportError: return False elif backend_name == "jax": try: import jax + return any(d.platform.upper() == "GPU" for d in jax.devices()) except ImportError: return False @@ -39,10 +41,12 @@ def running_on_tpu(): backend_name = K.backend() if backend_name == "tensorflow": import tensorflow as tf + return bool(tf.config.list_logical_devices("TPU")) elif backend_name == "jax": try: import jax + return any(d.platform.upper() == "TPU" for d in jax.devices()) except ImportError: return False @@ -123,7 +127,9 @@ def on_epoch_end(self, epoch, logs=None): def on_batch_end(self, batch, logs=None): if self.log_every_batch: print() - self._log_step(f"Batch {self._step_counter} end", self._step_counter) + self._log_step( + f"Batch {self._step_counter} end", self._step_counter + ) self._step_counter += 1 def on_train_end(self, logs=None): @@ -179,6 +185,7 @@ def _get_gpu_memory(self): try: if backend_name == "tensorflow": import tensorflow as tf + try: mem_info = tf.config.experimental.get_memory_info("GPU:0") return mem_info["current"] / (1024**2) @@ -189,7 +196,9 @@ def _get_gpu_memory(self): total = 0 for i, _ in enumerate(gpus): try: - info = tf.config.experimental.get_memory_info(f"GPU:{i}") + info = tf.config.experimental.get_memory_info( + f"GPU:{i}" + ) total += info.get("current", 0) except Exception: continue @@ -268,7 +277,9 @@ def _get_tpu_memory(self): return None try: stats = devs[0].memory_stats() - tpu_bytes = stats.get("bytes_in_use", stats.get("allocated_bytes", 0)) + tpu_bytes = stats.get( + "bytes_in_use", stats.get("allocated_bytes", 0) + ) return tpu_bytes / (1024**2) except Exception: if not hasattr(self, "_warn_tpu_jax"): diff --git a/keras/src/callbacks/memory_usage_callback_test.py b/keras/src/callbacks/memory_usage_callback_test.py index 03bb52e5d0cf..5cf07419c1fe 100644 --- a/keras/src/callbacks/memory_usage_callback_test.py +++ b/keras/src/callbacks/memory_usage_callback_test.py @@ -1,57 +1,73 @@ -import glob import os +import glob import sys import tempfile -from contextlib import redirect_stdout -from importlib import reload -from io import StringIO -from unittest.mock import MagicMock -from unittest.mock import patch +import re import numpy as np import pytest +import tensorflow as tf + +from io import StringIO +from contextlib import redirect_stdout +from importlib import reload +from unittest.mock import patch, MagicMock + +from keras.src import backend as K +from keras.src.callbacks.memory_usage_callback import ( + MemoryUsageCallback, + running_on_gpu, + running_on_tpu, +) +from keras.src.models import Sequential +from keras.src.layers import Dense -import keras.src.callbacks.memory_usage_callback as muc_module -from keras.src.callbacks.memory_usage_callback import MemoryUsageCallback try: - import psutil as real_psutil + import psutil except ImportError: - real_psutil = None + psutil = None -from keras.src.layers import Dense -from keras.src.models import Sequential -from keras.src.testing import TestCase +@pytest.mark.skipif(psutil is None, reason="psutil 
is required for MemoryUsageCallback tests.") +class TestMemoryUsageCallback: + """ + Test suite for MemoryUsageCallback. We explicitly patch `K.backend()` → "tensorflow" + whenever we call `model.fit(...)`, so that the callback’s logging logic actually runs. + Otherwise, on the “NumPy” backend, `.fit(…)` isn’t implemented and nothing is printed. + """ -@pytest.mark.skipif( - real_psutil is None, - reason="psutil is required for MemoryUsageCallback tests.", -) -class MemoryUsageCallbackTest(TestCase): - def setUp(self): - super().setUp() - self.x_train = np.random.random((16, 8)).astype(np.float32) - self.y_train = np.random.randint(0, 2, (16, 1)).astype(np.float32) - - self.model = Sequential( - [ - Dense(4, activation="relu", input_shape=(8,)), - Dense(1, activation="sigmoid"), - ] - ) + @pytest.fixture(autouse=True) + def setup_model(self): + self.x_train = np.random.random((20, 10)).astype(np.float32) + self.y_train = np.random.randint(0, 2, (20, 1)).astype(np.float32) + + self.model = Sequential([ + Dense(5, activation="relu", input_shape=(10,)), + Dense(1, activation="sigmoid") + ]) self.model.compile(optimizer="adam", loss="binary_crossentropy") self.epochs = 2 - self.batch_size = 4 + self.batch_size = 5 self.steps_per_epoch = len(self.x_train) // self.batch_size - def test_cpu_only_epoch_logging(self): - with patch.object( - muc_module.K, "backend", return_value="unsupported_backend" - ): - out = StringIO() - with redirect_stdout(out): + yield + + @pytest.mark.requires_trainable_backend + def test_cpu_only_epoch_logging(self, monkeypatch): + """ + If no GPU/TPU is present (or they are mocked off), then MemoryUsageCallback + should print exactly two lines per epoch (start + end), containing only CPU memory. + """ + + monkeypatch.setattr(K, "backend", lambda: "tensorflow") + + out = StringIO() + with redirect_stdout(out): + + with patch("keras.src.callbacks.memory_usage_callback.running_on_gpu", return_value=False), \ + patch("keras.src.callbacks.memory_usage_callback.running_on_tpu", return_value=False): cb = MemoryUsageCallback(log_every_batch=False) self.model.fit( self.x_train, @@ -59,27 +75,40 @@ def test_cpu_only_epoch_logging(self): epochs=self.epochs, batch_size=self.batch_size, callbacks=[cb], - verbose=0, - ) - lines = [ - l.strip() for l in out.getvalue().splitlines() if l.strip() - ] - epoch_lines = [l for l in lines if l.startswith("Epoch")] - assert len(epoch_lines) == 4 - for i in range(self.epochs): - assert epoch_lines[2 * i].startswith( - f"Epoch {i} start - CPU Memory:" - ) - assert epoch_lines[2 * i + 1].startswith( - f"Epoch {i} end - CPU Memory:" + verbose=0 ) - def test_log_every_batch(self): - with patch.object( - muc_module.K, "backend", return_value="unsupported_backend" - ): - out = StringIO() - with redirect_stdout(out): + lines = out.getvalue().splitlines() + + start_lines = [ + ln for ln in lines + if re.match(r"^Epoch \d+ start - CPU Memory: [\d\.]+ MB$", ln) + ] + end_lines = [ + ln for ln in lines + if re.match(r"^Epoch \d+ end - CPU Memory: [\d\.]+ MB$", ln) + ] + + assert len(start_lines) == self.epochs + assert len(end_lines) == self.epochs + + + assert all("GPU Memory" not in ln for ln in lines) + assert all("TPU Memory" not in ln for ln in lines) + + @pytest.mark.requires_trainable_backend + def test_log_every_batch(self, monkeypatch): + """ + If log_every_batch=True and no GPU/TPU, the callback should print batch-level lines + in addition to the epoch-start and epoch-end lines. 
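As a point of reference for the regex assertions used throughout this test file, a line emitted by the callback for a batch (with GPU/TPU monitoring patched off) looks like the illustrative sample below; the pattern follows the same style the tests compile. The numeric value is made up for the sketch.

```python
import re

sample = "Batch 3 end - CPU Memory: 512.25 MB"  # illustrative value only
batch_line = re.compile(r"^Batch \d+ end - CPU Memory: [\d\.]+ MB$")
assert batch_line.match(sample) is not None
```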
+ """ + + monkeypatch.setattr(K, "backend", lambda: "tensorflow") + + out = StringIO() + with redirect_stdout(out): + with patch("keras.src.callbacks.memory_usage_callback.running_on_gpu", return_value=False), \ + patch("keras.src.callbacks.memory_usage_callback.running_on_tpu", return_value=False): cb = MemoryUsageCallback(log_every_batch=True) self.model.fit( self.x_train, @@ -87,138 +116,141 @@ def test_log_every_batch(self): epochs=self.epochs, batch_size=self.batch_size, callbacks=[cb], - verbose=0, - ) - lines = [ - l.strip() for l in out.getvalue().splitlines() if l.strip() - ] - batch_lines = [l for l in lines if l.startswith("Batch")] - assert len(batch_lines) == self.epochs * self.steps_per_epoch - - def test_tensorboard_log_dir(self): - with tempfile.TemporaryDirectory() as tmp_dir: - with patch.object( - muc_module.K, "backend", return_value="unsupported_backend" - ): - cb = MemoryUsageCallback( - log_every_batch=False, tensorboard_log_dir=tmp_dir + verbose=0 ) + + lines = out.getvalue().splitlines() + + batch_lines = [ + ln for ln in lines + if re.match(r"^Batch \d+ end - CPU Memory: [\d\.]+ MB$", ln) + ] + expected_batches = self.epochs * self.steps_per_epoch + assert len(batch_lines) == expected_batches + + @pytest.mark.requires_trainable_backend + def test_tensorboard_log_dir(self, monkeypatch): + """ + When tensorboard_log_dir is provided and .fit(...) is called, we should see + at least one file matching events.out.tfevents.* inside that directory. + """ + + monkeypatch.setattr(K, "backend", lambda: "tensorflow") + + with tempfile.TemporaryDirectory() as tmpdir: + log_dir = os.path.join(tmpdir, "tb_logs") + + with patch("keras.src.callbacks.memory_usage_callback.running_on_gpu", return_value=False), \ + patch("keras.src.callbacks.memory_usage_callback.running_on_tpu", return_value=False): + cb = MemoryUsageCallback(log_every_batch=True, tensorboard_log_dir=log_dir) + + # __init__ should have created the folder already + assert os.path.isdir(log_dir), "tensorboard_log_dir must exist after callback __init__" + self.model.fit( self.x_train, self.y_train, - epochs=1, + epochs=self.epochs, batch_size=self.batch_size, callbacks=[cb], - verbose=0, + verbose=0 ) - files = glob.glob(os.path.join(tmp_dir, "events.out.tfevents.*")) - assert len(files) > 0, ( - f"No TensorBoard event file found in {tmp_dir}" - ) - def test_psutil_missing(self): + event_files = glob.glob(os.path.join(log_dir, "events.out.tfevents.*")) + assert len(event_files) > 0, f"No TensorBoard event files found under {log_dir}" + + @pytest.mark.requires_trainable_backend + def test_get_gpu_memory_tensorflow(self, monkeypatch): """ - Temporarily override the module's `psutil` to None so - that instantiating MemoryUsageCallback raises ImportError. + _get_gpu_memory() returns a float when TF backend has at least one GPU + with a reported "current" usage of 150 MiB. """ - original_psutil = muc_module.psutil - try: - muc_module.psutil = None - with pytest.raises( - ImportError, - match="MemoryUsageCallback requires the 'psutil' library", - ): - _ = muc_module.MemoryUsageCallback() - finally: - muc_module.psutil = original_psutil - reload(muc_module) - -def test_gpu_memory_tensorflow(monkeypatch): - """ - Simulate TensorFlow backend with one GPU device named "GPU:0" - whose memory_info()['current'] is 150 MiB. After reload, _get_gpu_memory() - must return 150.0 (MB). 
- """ + monkeypatch.setattr(K, "backend", lambda: "tensorflow") - if real_psutil: - sys.modules["psutil"] = real_psutil + fake_tf = MagicMock() + fake_tf.config.list_physical_devices.return_value = ["GPU:0"] + fake_tf.config.experimental.get_memory_info.return_value = {"current": 150 * 1024**2} - class FakeDevice: - def __init__(self, name): - self.name = name + monkeypatch.setitem(sys.modules, "tensorflow", fake_tf) + monkeypatch.setitem(sys.modules, "tensorflow.config", fake_tf.config) + monkeypatch.setitem(sys.modules, "tensorflow.config.experimental", fake_tf.config.experimental) - fake_tf = MagicMock() - fake_tf.config.list_physical_devices.return_value = [FakeDevice("GPU:0")] - fake_tf.config.experimental.get_memory_info.return_value = { - "current": 150 * 1024**2 - } + cb = MemoryUsageCallback() + mem = cb._get_gpu_memory() + assert pytest.approx(150.0, rel=1e-6) == mem - monkeypatch.setitem(sys.modules, "tensorflow", fake_tf) - monkeypatch.setattr( - "keras.src.callbacks.memory_usage_callback.K.backend", - lambda: "tensorflow", - ) - - reload(muc_module) + @pytest.mark.requires_trainable_backend + def test_get_gpu_memory_torch(self, monkeypatch): + """ + _get_gpu_memory() returns the sum of memory_allocated across devices + when torch.cuda.is_available() is True. We simulate 100 MiB + 200 MiB. + """ - cb = muc_module.MemoryUsageCallback() - mem_mb = cb._get_gpu_memory() - assert pytest.approx(150.0, rel=1e-6) == mem_mb + monkeypatch.setattr(K, "backend", lambda: "torch") + fake_torch = MagicMock() + fake_torch.cuda.is_available.return_value = True + fake_torch.cuda.device_count.return_value = 2 + fake_torch.cuda.memory_allocated.side_effect = [100 * 1024**2, 200 * 1024**2] -def test_gpu_memory_torch(monkeypatch): - """ - Simulate PyTorch backend with 2 GPUs that allocate 100 MiB and 200 MiB. - After reload, _get_gpu_memory() should return 300.0 (MB). - """ - if real_psutil: - sys.modules["psutil"] = real_psutil + monkeypatch.setitem(sys.modules, "torch", fake_torch) + monkeypatch.setitem(sys.modules, "torch.cuda", fake_torch.cuda) - fake_torch = MagicMock() - fake_torch.cuda.is_available.return_value = True - fake_torch.cuda.device_count.return_value = 2 - fake_torch.cuda.memory_allocated.side_effect = [ - 100 * 1024**2, - 200 * 1024**2, - ] + cb = MemoryUsageCallback() + mem = cb._get_gpu_memory() + assert pytest.approx(300.0, rel=1e-6) == mem - monkeypatch.setitem(sys.modules, "torch", fake_torch) - monkeypatch.setattr( - "keras.src.callbacks.memory_usage_callback.K.backend", lambda: "torch" - ) + @pytest.mark.requires_trainable_backend + def test_get_gpu_memory_jax(self, monkeypatch): + """ + _get_gpu_memory() returns the sum of bytes_in_use across all JAX GPU devices. + We simulate two GPU devices, each reporting 200 MiB. + """ - reload(muc_module) + monkeypatch.setattr(K, "backend", lambda: "jax") - cb = muc_module.MemoryUsageCallback() - mem_mb = cb._get_gpu_memory() - assert pytest.approx(300.0, rel=1e-6) == mem_mb + class FakeDev: + platform = "gpu" + def memory_stats(self): + return {"bytes_in_use": 200 * 1024**2} + fake_jax = MagicMock() + fake_jax.devices.return_value = [FakeDev(), FakeDev()] -def test_gpu_memory_jax(monkeypatch): - """ - Simulate JAX backend with two GPU devices each reporting - bytes_in_use=220 MiB. Expect 440.0 (MB). 
- """ - if real_psutil: - sys.modules["psutil"] = real_psutil + monkeypatch.setitem(sys.modules, "jax", fake_jax) - class FakeDevice: - platform = "GPU" + cb = MemoryUsageCallback() + mem = cb._get_gpu_memory() + assert pytest.approx(400.0, rel=1e-6) == mem - def memory_stats(self): - return {"bytes_in_use": 220 * 1024**2} - - fake_jax = MagicMock() - fake_jax.devices.return_value = [FakeDevice(), FakeDevice()] + def test_running_on_gpu_and_tpu_flags(self): + """ + running_on_gpu() and running_on_tpu() should return a boolean in all cases. + """ + val_gpu = running_on_gpu() + val_tpu = running_on_tpu() + assert isinstance(val_gpu, bool) + assert isinstance(val_tpu, bool) - monkeypatch.setitem(sys.modules, "jax", fake_jax) - monkeypatch.setattr( - "keras.src.callbacks.memory_usage_callback.K.backend", lambda: "jax" - ) + def test_psutil_missing(self): + """ + If psutil is not importable, attempting to instantiate MemoryUsageCallback + must raise ImportError. We temporarily remove psutil from sys.modules, + reload the module, then restore psutil. + """ - reload(muc_module) + orig = sys.modules.pop("psutil", None) - cb = muc_module.MemoryUsageCallback() - mem_mb = cb._get_gpu_memory() - assert pytest.approx(440.0, rel=1e-6) == mem_mb + try: + import keras.src.callbacks.memory_usage_callback as mod + with patch.dict(sys.modules, {"psutil": None}): + with pytest.raises(ImportError, match="MemoryUsageCallback requires the 'psutil' library"): + reload(mod) + _ = mod.MemoryUsageCallback() + finally: + + if orig is not None: + sys.modules["psutil"] = orig + from importlib import reload as _r + _r(sys.modules["keras.src.callbacks.memory_usage_callback"]) From a671b62a90d672ad266baad447c146b261a40a53 Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Sat, 7 Jun 2025 13:54:14 +0300 Subject: [PATCH 15/17] Fix TPU support --- keras/src/callbacks/memory_usage_callback.py | 46 +---- .../callbacks/memory_usage_callback_test.py | 189 ++++++++++-------- 2 files changed, 113 insertions(+), 122 deletions(-) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index a4b7425568ff..65890ec506b4 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -42,6 +42,12 @@ def running_on_tpu(): if backend_name == "tensorflow": import tensorflow as tf + try: + resolver = tf.distribute.cluster_resolver.TPUClusterResolver() + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + except Exception: + pass return bool(tf.config.list_logical_devices("TPU")) elif backend_name == "jax": try: @@ -93,7 +99,6 @@ def __init__( "MemoryUsageCallback requires the 'psutil' library. 
" "To install, please use: pip install psutil" ) - self.log_every_batch = log_every_batch self._proc = psutil.Process() self._step_counter = 0 @@ -127,9 +132,7 @@ def on_epoch_end(self, epoch, logs=None): def on_batch_end(self, batch, logs=None): if self.log_every_batch: print() - self._log_step( - f"Batch {self._step_counter} end", self._step_counter - ) + self._log_step(f"Batch {self._step_counter} end", self._step_counter) self._step_counter += 1 def on_train_end(self, logs=None): @@ -155,7 +158,6 @@ def _log_step(self, label, step): msg += f"; GPU Memory: {gpu_mb:.2f} MB" if tpu_mb is not None: msg += f"; TPU Memory: {tpu_mb:.2f} MB" - print(msg) time.sleep(0) @@ -180,7 +182,6 @@ def _get_gpu_memory(self): """ if not running_on_gpu(): return None - backend_name = K.backend() try: if backend_name == "tensorflow": @@ -196,26 +197,21 @@ def _get_gpu_memory(self): total = 0 for i, _ in enumerate(gpus): try: - info = tf.config.experimental.get_memory_info( - f"GPU:{i}" - ) + info = tf.config.experimental.get_memory_info(f"GPU:{i}") total += info.get("current", 0) except Exception: continue return total / (1024**2) - elif backend_name == "torch": import torch if not torch.cuda.is_available(): return None - device_count = torch.cuda.device_count() total_bytes = 0 for i in range(device_count): total_bytes += torch.cuda.memory_allocated(i) return total_bytes / (1024**2) - elif backend_name == "jax": import jax @@ -227,9 +223,7 @@ def _get_gpu_memory(self): stats = getattr(d, "memory_stats", lambda: {})() total += stats.get("bytes_in_use", 0) return total / (1024**2) - return None - except ImportError as imp_err: if not hasattr(self, "_warn_import"): warnings.warn( @@ -238,12 +232,9 @@ def _get_gpu_memory(self): ) self._warn_import = True return None - except Exception as exc: if not hasattr(self, "_warn_exc"): - warnings.warn( - f"Error retrieving GPU memory: {exc}", RuntimeWarning - ) + warnings.warn(f"Error retrieving GPU memory: {exc}", RuntimeWarning) self._warn_exc = True return None @@ -251,24 +242,14 @@ def _get_tpu_memory(self): """ Return current TPU memory usage in MB for the detected backend, or None if no TPU is present or if measurement fails. - Note: TPU memory APIs vary; here we attempt best‐effort. + Note: TPU memory APIs vary; here we attempt best-effort. 
""" if not running_on_tpu(): return None - backend_name = K.backend() try: if backend_name == "tensorflow": - import tensorflow as tf - - if not hasattr(self, "_warn_tpu_tf"): - warnings.warn( - "TensorFlow TPU memory info is not directly available; returning None.", - RuntimeWarning, - ) - self._warn_tpu_tf = True return None - elif backend_name == "jax": import jax @@ -289,9 +270,7 @@ def _get_tpu_memory(self): ) self._warn_tpu_jax = True return None - return None - except ImportError as imp_err: if not hasattr(self, "_warn_tpu_imp"): warnings.warn( @@ -300,11 +279,8 @@ def _get_tpu_memory(self): ) self._warn_tpu_imp = True return None - except Exception as exc: if not hasattr(self, "_warn_tpu_exc"): - warnings.warn( - f"Error retrieving TPU memory: {exc}", RuntimeWarning - ) + warnings.warn(f"Error retrieving TPU memory: {exc}", RuntimeWarning) self._warn_tpu_exc = True return None diff --git a/keras/src/callbacks/memory_usage_callback_test.py b/keras/src/callbacks/memory_usage_callback_test.py index 5cf07419c1fe..07a8daeb1d69 100644 --- a/keras/src/callbacks/memory_usage_callback_test.py +++ b/keras/src/callbacks/memory_usage_callback_test.py @@ -6,7 +6,6 @@ import numpy as np import pytest -import tensorflow as tf from io import StringIO from contextlib import redirect_stdout @@ -22,14 +21,15 @@ from keras.src.models import Sequential from keras.src.layers import Dense - try: import psutil except ImportError: psutil = None -@pytest.mark.skipif(psutil is None, reason="psutil is required for MemoryUsageCallback tests.") +@pytest.mark.skipif( + psutil is None, reason="psutil is required for MemoryUsageCallback tests." +) class TestMemoryUsageCallback: """ Test suite for MemoryUsageCallback. We explicitly patch `K.backend()` → "tensorflow" @@ -42,10 +42,12 @@ def setup_model(self): self.x_train = np.random.random((20, 10)).astype(np.float32) self.y_train = np.random.randint(0, 2, (20, 1)).astype(np.float32) - self.model = Sequential([ - Dense(5, activation="relu", input_shape=(10,)), - Dense(1, activation="sigmoid") - ]) + self.model = Sequential( + [ + Dense(5, activation="relu", input_shape=(10,)), + Dense(1, activation="sigmoid"), + ] + ) self.model.compile(optimizer="adam", loss="binary_crossentropy") self.epochs = 2 @@ -56,18 +58,17 @@ def setup_model(self): @pytest.mark.requires_trainable_backend def test_cpu_only_epoch_logging(self, monkeypatch): - """ - If no GPU/TPU is present (or they are mocked off), then MemoryUsageCallback - should print exactly two lines per epoch (start + end), containing only CPU memory. 
- """ - monkeypatch.setattr(K, "backend", lambda: "tensorflow") out = StringIO() with redirect_stdout(out): - - with patch("keras.src.callbacks.memory_usage_callback.running_on_gpu", return_value=False), \ - patch("keras.src.callbacks.memory_usage_callback.running_on_tpu", return_value=False): + with patch( + "keras.src.callbacks.memory_usage_callback.running_on_gpu", + return_value=False, + ), patch( + "keras.src.callbacks.memory_usage_callback.running_on_tpu", + return_value=False, + ): cb = MemoryUsageCallback(log_every_batch=False) self.model.fit( self.x_train, @@ -75,40 +76,38 @@ def test_cpu_only_epoch_logging(self, monkeypatch): epochs=self.epochs, batch_size=self.batch_size, callbacks=[cb], - verbose=0 + verbose=0, ) - lines = out.getvalue().splitlines() - start_lines = [ - ln for ln in lines + ln + for ln in lines if re.match(r"^Epoch \d+ start - CPU Memory: [\d\.]+ MB$", ln) ] end_lines = [ - ln for ln in lines + ln + for ln in lines if re.match(r"^Epoch \d+ end - CPU Memory: [\d\.]+ MB$", ln) ] assert len(start_lines) == self.epochs assert len(end_lines) == self.epochs - - assert all("GPU Memory" not in ln for ln in lines) assert all("TPU Memory" not in ln for ln in lines) @pytest.mark.requires_trainable_backend def test_log_every_batch(self, monkeypatch): - """ - If log_every_batch=True and no GPU/TPU, the callback should print batch-level lines - in addition to the epoch-start and epoch-end lines. - """ - monkeypatch.setattr(K, "backend", lambda: "tensorflow") out = StringIO() with redirect_stdout(out): - with patch("keras.src.callbacks.memory_usage_callback.running_on_gpu", return_value=False), \ - patch("keras.src.callbacks.memory_usage_callback.running_on_tpu", return_value=False): + with patch( + "keras.src.callbacks.memory_usage_callback.running_on_gpu", + return_value=False, + ), patch( + "keras.src.callbacks.memory_usage_callback.running_on_tpu", + return_value=False, + ): cb = MemoryUsageCallback(log_every_batch=True) self.model.fit( self.x_train, @@ -116,13 +115,12 @@ def test_log_every_batch(self, monkeypatch): epochs=self.epochs, batch_size=self.batch_size, callbacks=[cb], - verbose=0 + verbose=0, ) - lines = out.getvalue().splitlines() - batch_lines = [ - ln for ln in lines + ln + for ln in lines if re.match(r"^Batch \d+ end - CPU Memory: [\d\.]+ MB$", ln) ] expected_batches = self.epochs * self.steps_per_epoch @@ -130,22 +128,21 @@ def test_log_every_batch(self, monkeypatch): @pytest.mark.requires_trainable_backend def test_tensorboard_log_dir(self, monkeypatch): - """ - When tensorboard_log_dir is provided and .fit(...) is called, we should see - at least one file matching events.out.tfevents.* inside that directory. 
- """ - monkeypatch.setattr(K, "backend", lambda: "tensorflow") with tempfile.TemporaryDirectory() as tmpdir: log_dir = os.path.join(tmpdir, "tb_logs") - - with patch("keras.src.callbacks.memory_usage_callback.running_on_gpu", return_value=False), \ - patch("keras.src.callbacks.memory_usage_callback.running_on_tpu", return_value=False): - cb = MemoryUsageCallback(log_every_batch=True, tensorboard_log_dir=log_dir) - - # __init__ should have created the folder already - assert os.path.isdir(log_dir), "tensorboard_log_dir must exist after callback __init__" + with patch( + "keras.src.callbacks.memory_usage_callback.running_on_gpu", + return_value=False, + ), patch( + "keras.src.callbacks.memory_usage_callback.running_on_tpu", + return_value=False, + ): + cb = MemoryUsageCallback( + log_every_batch=True, tensorboard_log_dir=log_dir + ) + assert os.path.isdir(log_dir) self.model.fit( self.x_train, @@ -153,104 +150,122 @@ def test_tensorboard_log_dir(self, monkeypatch): epochs=self.epochs, batch_size=self.batch_size, callbacks=[cb], - verbose=0 + verbose=0, ) event_files = glob.glob(os.path.join(log_dir, "events.out.tfevents.*")) - assert len(event_files) > 0, f"No TensorBoard event files found under {log_dir}" + assert event_files, f"No TensorBoard events under {log_dir}" @pytest.mark.requires_trainable_backend def test_get_gpu_memory_tensorflow(self, monkeypatch): - """ - _get_gpu_memory() returns a float when TF backend has at least one GPU - with a reported "current" usage of 150 MiB. - """ - monkeypatch.setattr(K, "backend", lambda: "tensorflow") - fake_tf = MagicMock() fake_tf.config.list_physical_devices.return_value = ["GPU:0"] - fake_tf.config.experimental.get_memory_info.return_value = {"current": 150 * 1024**2} - + fake_tf.config.experimental.get_memory_info.return_value = { + "current": 150 * 1024**2 + } monkeypatch.setitem(sys.modules, "tensorflow", fake_tf) monkeypatch.setitem(sys.modules, "tensorflow.config", fake_tf.config) - monkeypatch.setitem(sys.modules, "tensorflow.config.experimental", fake_tf.config.experimental) + monkeypatch.setitem( + sys.modules, "tensorflow.config.experimental", fake_tf.config.experimental + ) cb = MemoryUsageCallback() - mem = cb._get_gpu_memory() - assert pytest.approx(150.0, rel=1e-6) == mem + assert pytest.approx(150.0, rel=1e-6) == cb._get_gpu_memory() @pytest.mark.requires_trainable_backend def test_get_gpu_memory_torch(self, monkeypatch): - """ - _get_gpu_memory() returns the sum of memory_allocated across devices - when torch.cuda.is_available() is True. We simulate 100 MiB + 200 MiB. - """ - monkeypatch.setattr(K, "backend", lambda: "torch") - fake_torch = MagicMock() fake_torch.cuda.is_available.return_value = True fake_torch.cuda.device_count.return_value = 2 fake_torch.cuda.memory_allocated.side_effect = [100 * 1024**2, 200 * 1024**2] - monkeypatch.setitem(sys.modules, "torch", fake_torch) monkeypatch.setitem(sys.modules, "torch.cuda", fake_torch.cuda) cb = MemoryUsageCallback() - mem = cb._get_gpu_memory() - assert pytest.approx(300.0, rel=1e-6) == mem + assert pytest.approx(300.0, rel=1e-6) == cb._get_gpu_memory() @pytest.mark.requires_trainable_backend def test_get_gpu_memory_jax(self, monkeypatch): - """ - _get_gpu_memory() returns the sum of bytes_in_use across all JAX GPU devices. - We simulate two GPU devices, each reporting 200 MiB. 
- """ - monkeypatch.setattr(K, "backend", lambda: "jax") class FakeDev: platform = "gpu" + def memory_stats(self): return {"bytes_in_use": 200 * 1024**2} fake_jax = MagicMock() fake_jax.devices.return_value = [FakeDev(), FakeDev()] - monkeypatch.setitem(sys.modules, "jax", fake_jax) cb = MemoryUsageCallback() - mem = cb._get_gpu_memory() - assert pytest.approx(400.0, rel=1e-6) == mem + assert pytest.approx(400.0, rel=1e-6) == cb._get_gpu_memory() def test_running_on_gpu_and_tpu_flags(self): - """ - running_on_gpu() and running_on_tpu() should return a boolean in all cases. - """ val_gpu = running_on_gpu() val_tpu = running_on_tpu() assert isinstance(val_gpu, bool) assert isinstance(val_tpu, bool) def test_psutil_missing(self): - """ - If psutil is not importable, attempting to instantiate MemoryUsageCallback - must raise ImportError. We temporarily remove psutil from sys.modules, - reload the module, then restore psutil. - """ - orig = sys.modules.pop("psutil", None) - try: import keras.src.callbacks.memory_usage_callback as mod + with patch.dict(sys.modules, {"psutil": None}): - with pytest.raises(ImportError, match="MemoryUsageCallback requires the 'psutil' library"): + with pytest.raises( + ImportError, + match="MemoryUsageCallback requires the 'psutil' library", + ): reload(mod) _ = mod.MemoryUsageCallback() finally: - if orig is not None: sys.modules["psutil"] = orig from importlib import reload as _r + _r(sys.modules["keras.src.callbacks.memory_usage_callback"]) + + @pytest.mark.requires_trainable_backend + def test_running_on_tpu_true(self, monkeypatch): + monkeypatch.setattr(K, "backend", lambda: "tensorflow") + fake_tf = MagicMock() + fake_tf.config.list_logical_devices.return_value = ["TPU:0"] + fake_tf.distribute.cluster_resolver.TPUClusterResolver = lambda: MagicMock() + fake_tf.config.experimental_connect_to_cluster = lambda resolver: None + fake_tf.tpu.experimental.initialize_tpu_system = lambda resolver: None + monkeypatch.setitem(sys.modules, "tensorflow", fake_tf) + + assert running_on_tpu() is True + + @pytest.mark.requires_trainable_backend + def test_get_tpu_memory_tensorflow(self, monkeypatch): + monkeypatch.setattr(K, "backend", lambda: "tensorflow") + fake_tf = MagicMock() + fake_tf.config.list_logical_devices.return_value = ["TPU:0"] + fake_tf.distribute.cluster_resolver.TPUClusterResolver = lambda: MagicMock() + fake_tf.config.experimental_connect_to_cluster = lambda resolver: None + fake_tf.tpu.experimental.initialize_tpu_system = lambda resolver: None + monkeypatch.setitem(sys.modules, "tensorflow", fake_tf) + + cb = MemoryUsageCallback() + assert cb._get_tpu_memory() is None + + @pytest.mark.requires_trainable_backend + def test_get_tpu_memory_jax(self, monkeypatch): + monkeypatch.setattr(K, "backend", lambda: "jax") + + class FakeTpuDev: + platform = "tpu" + + def memory_stats(self): + return {"bytes_in_use": 250 * 1024**2} + + fake_jax = MagicMock() + fake_jax.devices.return_value = [FakeTpuDev()] + monkeypatch.setitem(sys.modules, "jax", fake_jax) + + cb = MemoryUsageCallback() + assert pytest.approx(250.0, rel=1e-6) == cb._get_tpu_memory() From bd7fc075cfcaf76fb4cedcaadbb7e9dfe7082cd9 Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Fri, 20 Jun 2025 18:03:51 +0300 Subject: [PATCH 16/17] Fix callback code --- keras/src/callbacks/memory_usage_callback.py | 83 +++++++------------- 1 file changed, 27 insertions(+), 56 deletions(-) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index 
65890ec506b4..2684a728c705 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -66,9 +66,9 @@ class MemoryUsageCallback(Callback): This callback measures: - **CPU**: via psutil.Process().memory_info().rss - - **GPU**: if a GPU is detected, via backend-specific APIs + - **GPU**: if a GPU is detected, via backend‐specific APIs (TensorFlow, PyTorch, JAX) - - **TPU**: if a TPU is detected, via backend-specific APIs + - **TPU**: if a TPU is detected, via backend‐specific APIs (TensorFlow, JAX) Logs are printed to stdout at the start and end of each epoch @@ -78,20 +78,14 @@ class MemoryUsageCallback(Callback): via tf.summary (TensorBoard). Args: - log_every_batch (bool): If True, also log after each batch. Defaults to False - (i.e., log only at epoch start and end). - tensorboard_log_dir (str|None): Directory for TensorBoard logs; if None, - no TF summary writer is created. + log_every_batch (bool): If True, also log after each batch. Defaults to False. + tensorboard_log_dir (str|None): Directory for TensorBoard logs; if None, no TF writer. Raises: ImportError: If `psutil` is not installed (required for CPU logging). """ - def __init__( - self, - log_every_batch=False, - tensorboard_log_dir=None, - ): + def __init__(self, log_every_batch=False, tensorboard_log_dir=None): super().__init__() if psutil is None: @@ -122,6 +116,17 @@ def on_train_begin(self, logs=None): self._step_counter = 0 def on_epoch_begin(self, epoch, logs=None): + + backend_name = K.backend() + if backend_name == "tensorflow": + import tensorflow as tf + + tf.config.experimental.reset_memory_stats("GPU:0") + elif backend_name == "torch": + import torch + + torch.cuda.reset_peak_memory_stats() + print() self._log_epoch("start", epoch) @@ -145,10 +150,6 @@ def _log_epoch(self, when, epoch, offset=0): self._log_step(label, step) def _log_step(self, label, step): - """ - Internal helper to measure and print CPU/GPU/TPU memory. - Inserts a short delay (time.sleep(0)) to let stdout flush cleanly. - """ cpu_mb = self._get_cpu_memory() gpu_mb = self._get_gpu_memory() tpu_mb = self._get_tpu_memory() @@ -172,12 +173,11 @@ def _log_step(self, label, step): tf.summary.scalar("Memory/TPU_MB", tpu_mb) def _get_cpu_memory(self): - """Return current process CPU memory usage in MB.""" return self._proc.memory_info().rss / (1024**2) def _get_gpu_memory(self): """ - Return current GPU memory usage in MB for the detected backend, + Return peak GPU memory usage in MB for the detected backend, or None if no GPU is present or if measurement fails. 
""" if not running_on_gpu(): @@ -187,42 +187,25 @@ def _get_gpu_memory(self): if backend_name == "tensorflow": import tensorflow as tf - try: - mem_info = tf.config.experimental.get_memory_info("GPU:0") - return mem_info["current"] / (1024**2) - except Exception: - gpus = tf.config.list_physical_devices("GPU") - if not gpus: - return None - total = 0 - for i, _ in enumerate(gpus): - try: - info = tf.config.experimental.get_memory_info(f"GPU:{i}") - total += info.get("current", 0) - except Exception: - continue - return total / (1024**2) + info = tf.config.experimental.get_memory_info("GPU:0") + return info["peak"] / (1024**2) elif backend_name == "torch": import torch if not torch.cuda.is_available(): return None - device_count = torch.cuda.device_count() - total_bytes = 0 - for i in range(device_count): - total_bytes += torch.cuda.memory_allocated(i) - return total_bytes / (1024**2) + return torch.cuda.max_memory_reserved() / (1024**2) elif backend_name == "jax": import jax devs = [d for d in jax.devices() if d.platform.upper() == "GPU"] if not devs: return None - total = 0 + total_peak = 0 for d in devs: stats = getattr(d, "memory_stats", lambda: {})() - total += stats.get("bytes_in_use", 0) - return total / (1024**2) + total_peak += stats.get("peak_bytes", stats.get("bytes_in_use", 0)) + return total_peak / (1024**2) return None except ImportError as imp_err: if not hasattr(self, "_warn_import"): @@ -242,7 +225,6 @@ def _get_tpu_memory(self): """ Return current TPU memory usage in MB for the detected backend, or None if no TPU is present or if measurement fails. - Note: TPU memory APIs vary; here we attempt best-effort. """ if not running_on_tpu(): return None @@ -256,20 +238,9 @@ def _get_tpu_memory(self): devs = [d for d in jax.devices() if d.platform.upper() == "TPU"] if not devs: return None - try: - stats = devs[0].memory_stats() - tpu_bytes = stats.get( - "bytes_in_use", stats.get("allocated_bytes", 0) - ) - return tpu_bytes / (1024**2) - except Exception: - if not hasattr(self, "_warn_tpu_jax"): - warnings.warn( - "Failed to retrieve JAX TPU memory stats; returning None.", - RuntimeWarning, - ) - self._warn_tpu_jax = True - return None + stats = devs[0].memory_stats() + tpu_bytes = stats.get("bytes_in_use", stats.get("allocated_bytes", 0)) + return tpu_bytes / (1024**2) return None except ImportError as imp_err: if not hasattr(self, "_warn_tpu_imp"): @@ -283,4 +254,4 @@ def _get_tpu_memory(self): if not hasattr(self, "_warn_tpu_exc"): warnings.warn(f"Error retrieving TPU memory: {exc}", RuntimeWarning) self._warn_tpu_exc = True - return None + return None \ No newline at end of file From 8f3764966aa5d18ddadf5f395a2981578b1c9feb Mon Sep 17 00:00:00 2001 From: DimiChatzipavlis Date: Fri, 20 Jun 2025 19:05:41 +0300 Subject: [PATCH 17/17] Fix tests --- keras/src/callbacks/memory_usage_callback.py | 75 +++-- .../callbacks/memory_usage_callback_test.py | 282 ++++++------------ 2 files changed, 138 insertions(+), 219 deletions(-) diff --git a/keras/src/callbacks/memory_usage_callback.py b/keras/src/callbacks/memory_usage_callback.py index 2684a728c705..ad6831e7041f 100644 --- a/keras/src/callbacks/memory_usage_callback.py +++ b/keras/src/callbacks/memory_usage_callback.py @@ -18,7 +18,10 @@ def running_on_gpu(): if backend_name == "tensorflow": import tensorflow as tf - return bool(tf.config.list_logical_devices("GPU")) + try: + return bool(tf.config.list_physical_devices("GPU")) + except Exception: + return False elif backend_name == "torch": try: import torch @@ -48,7 +51,10 @@ def 
running_on_tpu(): tf.tpu.experimental.initialize_tpu_system(resolver) except Exception: pass - return bool(tf.config.list_logical_devices("TPU")) + try: + return bool(tf.config.list_physical_devices("TPU")) + except Exception: + return False elif backend_name == "jax": try: import jax @@ -66,9 +72,9 @@ class MemoryUsageCallback(Callback): This callback measures: - **CPU**: via psutil.Process().memory_info().rss - - **GPU**: if a GPU is detected, via backend‐specific APIs + - **GPU**: if a GPU is detected, via backend-specific APIs (TensorFlow, PyTorch, JAX) - - **TPU**: if a TPU is detected, via backend‐specific APIs + - **TPU**: if a TPU is detected, via backend-specific APIs (TensorFlow, JAX) Logs are printed to stdout at the start and end of each epoch @@ -79,13 +85,18 @@ class MemoryUsageCallback(Callback): Args: log_every_batch (bool): If True, also log after each batch. Defaults to False. - tensorboard_log_dir (str|None): Directory for TensorBoard logs; if None, no TF writer. + tensorboard_log_dir (str|None): Directory for TensorBoard logs; if None, + no TF summary writer is created. Raises: ImportError: If `psutil` is not installed (required for CPU logging). """ - def __init__(self, log_every_batch=False, tensorboard_log_dir=None): + def __init__( + self, + log_every_batch=False, + tensorboard_log_dir=None, + ): super().__init__() if psutil is None: @@ -107,8 +118,7 @@ def __init__(self, log_every_batch=False, tensorboard_log_dir=None): print(f"MemoryUsageCallback: TensorBoard logs → {logdir}") except Exception as e: warnings.warn( - f"Could not initialize TensorBoard writer: {e}", - RuntimeWarning, + f"Could not initialize TensorBoard writer: {e}", RuntimeWarning ) self._writer = None @@ -116,17 +126,6 @@ def on_train_begin(self, logs=None): self._step_counter = 0 def on_epoch_begin(self, epoch, logs=None): - - backend_name = K.backend() - if backend_name == "tensorflow": - import tensorflow as tf - - tf.config.experimental.reset_memory_stats("GPU:0") - elif backend_name == "torch": - import torch - - torch.cuda.reset_peak_memory_stats() - print() self._log_epoch("start", epoch) @@ -150,6 +149,10 @@ def _log_epoch(self, when, epoch, offset=0): self._log_step(label, step) def _log_step(self, label, step): + """ + Internal helper to measure and print CPU/GPU/TPU memory. + Inserts a short delay (time.sleep(0)) to let stdout flush cleanly. + """ cpu_mb = self._get_cpu_memory() gpu_mb = self._get_gpu_memory() tpu_mb = self._get_tpu_memory() @@ -173,11 +176,12 @@ def _log_step(self, label, step): tf.summary.scalar("Memory/TPU_MB", tpu_mb) def _get_cpu_memory(self): + """Return current process CPU memory usage in MB.""" return self._proc.memory_info().rss / (1024**2) def _get_gpu_memory(self): """ - Return peak GPU memory usage in MB for the detected backend, + Return current GPU memory usage in MB for the detected backend, or None if no GPU is present or if measurement fails. 
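The writer created in `__init__` is an ordinary `tf.summary` file writer, and `_log_step` records plain scalars at an integer step. The minimal sketch below shows that pattern in isolation, assuming TensorFlow is installed; the directory path and the scalar value are placeholders.

```python
import tensorflow as tf

writer = tf.summary.create_file_writer("/tmp/memory_usage_demo")  # placeholder path
with writer.as_default(step=0):
    tf.summary.scalar("Memory/CPU_MB", 123.4)  # illustrative value
writer.flush()
```

Running something like this produces an `events.out.tfevents.*` file, which is exactly what the TensorBoard test globs for.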
""" if not running_on_gpu(): @@ -187,25 +191,41 @@ def _get_gpu_memory(self): if backend_name == "tensorflow": import tensorflow as tf - info = tf.config.experimental.get_memory_info("GPU:0") - return info["peak"] / (1024**2) + try: + mem_info = tf.config.experimental.get_memory_info("GPU:0") + return mem_info["current"] / (1024**2) + except Exception: + gpus = tf.config.list_physical_devices("GPU") + if not gpus: + return None + total = 0 + for i in range(len(gpus)): + try: + info = tf.config.experimental.get_memory_info(f"GPU:{i}") + total += info.get("current", 0) + except Exception: + continue + return total / (1024**2) elif backend_name == "torch": import torch if not torch.cuda.is_available(): return None - return torch.cuda.max_memory_reserved() / (1024**2) + total_bytes = 0 + for i in range(torch.cuda.device_count()): + total_bytes += torch.cuda.memory_allocated(i) + return total_bytes / (1024**2) elif backend_name == "jax": import jax devs = [d for d in jax.devices() if d.platform.upper() == "GPU"] if not devs: return None - total_peak = 0 + total = 0 for d in devs: stats = getattr(d, "memory_stats", lambda: {})() - total_peak += stats.get("peak_bytes", stats.get("bytes_in_use", 0)) - return total_peak / (1024**2) + total += stats.get("bytes_in_use", 0) + return total / (1024**2) return None except ImportError as imp_err: if not hasattr(self, "_warn_import"): @@ -225,6 +245,7 @@ def _get_tpu_memory(self): """ Return current TPU memory usage in MB for the detected backend, or None if no TPU is present or if measurement fails. + Note: TPU memory APIs vary; here we attempt best‐effort. """ if not running_on_tpu(): return None @@ -238,7 +259,7 @@ def _get_tpu_memory(self): devs = [d for d in jax.devices() if d.platform.upper() == "TPU"] if not devs: return None - stats = devs[0].memory_stats() + stats = getattr(devs[0], "memory_stats", lambda: {})() tpu_bytes = stats.get("bytes_in_use", stats.get("allocated_bytes", 0)) return tpu_bytes / (1024**2) return None diff --git a/keras/src/callbacks/memory_usage_callback_test.py b/keras/src/callbacks/memory_usage_callback_test.py index 07a8daeb1d69..852d72fe9afb 100644 --- a/keras/src/callbacks/memory_usage_callback_test.py +++ b/keras/src/callbacks/memory_usage_callback_test.py @@ -6,6 +6,7 @@ import numpy as np import pytest +import tensorflow as tf from io import StringIO from contextlib import redirect_stdout @@ -27,245 +28,142 @@ psutil = None -@pytest.mark.skipif( - psutil is None, reason="psutil is required for MemoryUsageCallback tests." -) +@pytest.mark.skipif(psutil is None, reason="psutil is required for MemoryUsageCallback tests.") class TestMemoryUsageCallback: - """ - Test suite for MemoryUsageCallback. We explicitly patch `K.backend()` → "tensorflow" - whenever we call `model.fit(...)`, so that the callback’s logging logic actually runs. - Otherwise, on the “NumPy” backend, `.fit(…)` isn’t implemented and nothing is printed. 
- """ - @pytest.fixture(autouse=True) def setup_model(self): self.x_train = np.random.random((20, 10)).astype(np.float32) self.y_train = np.random.randint(0, 2, (20, 1)).astype(np.float32) - self.model = Sequential( - [ - Dense(5, activation="relu", input_shape=(10,)), - Dense(1, activation="sigmoid"), - ] - ) + self.model = Sequential([ + Dense(5, activation="relu", input_shape=(10,)), + Dense(1, activation="sigmoid") + ]) self.model.compile(optimizer="adam", loss="binary_crossentropy") self.epochs = 2 self.batch_size = 5 self.steps_per_epoch = len(self.x_train) // self.batch_size - yield @pytest.mark.requires_trainable_backend - def test_cpu_only_epoch_logging(self, monkeypatch): - monkeypatch.setattr(K, "backend", lambda: "tensorflow") - - out = StringIO() - with redirect_stdout(out): - with patch( - "keras.src.callbacks.memory_usage_callback.running_on_gpu", - return_value=False, - ), patch( - "keras.src.callbacks.memory_usage_callback.running_on_tpu", - return_value=False, - ): + def test_cpu_only_epoch_logging(self): + # Force TF backend and no GPU/TPU + monkey = patch.object(K, "backend", lambda: "tensorflow") + with monkey: + out = StringIO() + with redirect_stdout(out), \ + patch("keras.src.callbacks.memory_usage_callback.running_on_gpu", return_value=False), \ + patch("keras.src.callbacks.memory_usage_callback.running_on_tpu", return_value=False): cb = MemoryUsageCallback(log_every_batch=False) - self.model.fit( - self.x_train, - self.y_train, - epochs=self.epochs, - batch_size=self.batch_size, - callbacks=[cb], - verbose=0, - ) - lines = out.getvalue().splitlines() - start_lines = [ - ln - for ln in lines - if re.match(r"^Epoch \d+ start - CPU Memory: [\d\.]+ MB$", ln) - ] - end_lines = [ - ln - for ln in lines - if re.match(r"^Epoch \d+ end - CPU Memory: [\d\.]+ MB$", ln) - ] - - assert len(start_lines) == self.epochs - assert len(end_lines) == self.epochs - assert all("GPU Memory" not in ln for ln in lines) - assert all("TPU Memory" not in ln for ln in lines) + self.model.fit(self.x_train, self.y_train, + epochs=self.epochs, + batch_size=self.batch_size, + callbacks=[cb], + verbose=0) + + lines = out.getvalue().splitlines() + start = [ln for ln in lines if re.match(r"^Epoch \d+ start - CPU Memory:", ln)] + end = [ln for ln in lines if re.match(r"^Epoch \d+ end - CPU Memory:", ln)] + assert len(start) == self.epochs + assert len(end) == self.epochs + assert not any("GPU Memory" in ln or "TPU Memory" in ln for ln in lines) @pytest.mark.requires_trainable_backend - def test_log_every_batch(self, monkeypatch): - monkeypatch.setattr(K, "backend", lambda: "tensorflow") - - out = StringIO() - with redirect_stdout(out): - with patch( - "keras.src.callbacks.memory_usage_callback.running_on_gpu", - return_value=False, - ), patch( - "keras.src.callbacks.memory_usage_callback.running_on_tpu", - return_value=False, - ): + def test_log_every_batch(self): + monkey = patch.object(K, "backend", lambda: "tensorflow") + with monkey: + out = StringIO() + with redirect_stdout(out), \ + patch("keras.src.callbacks.memory_usage_callback.running_on_gpu", return_value=False), \ + patch("keras.src.callbacks.memory_usage_callback.running_on_tpu", return_value=False): cb = MemoryUsageCallback(log_every_batch=True) - self.model.fit( - self.x_train, - self.y_train, - epochs=self.epochs, - batch_size=self.batch_size, - callbacks=[cb], - verbose=0, - ) - lines = out.getvalue().splitlines() - batch_lines = [ - ln - for ln in lines - if re.match(r"^Batch \d+ end - CPU Memory: [\d\.]+ MB$", ln) - ] - 
expected_batches = self.epochs * self.steps_per_epoch - assert len(batch_lines) == expected_batches - - @pytest.mark.requires_trainable_backend - def test_tensorboard_log_dir(self, monkeypatch): - monkeypatch.setattr(K, "backend", lambda: "tensorflow") + self.model.fit(self.x_train, self.y_train, + epochs=self.epochs, + batch_size=self.batch_size, + callbacks=[cb], + verbose=0) - with tempfile.TemporaryDirectory() as tmpdir: - log_dir = os.path.join(tmpdir, "tb_logs") - with patch( - "keras.src.callbacks.memory_usage_callback.running_on_gpu", - return_value=False, - ), patch( - "keras.src.callbacks.memory_usage_callback.running_on_tpu", - return_value=False, - ): - cb = MemoryUsageCallback( - log_every_batch=True, tensorboard_log_dir=log_dir - ) - assert os.path.isdir(log_dir) + lines = out.getvalue().splitlines() + batches = [ln for ln in lines if re.match(r"^Batch \d+ end - CPU Memory:", ln)] + assert len(batches) == self.epochs * self.steps_per_epoch - self.model.fit( - self.x_train, - self.y_train, - epochs=self.epochs, - batch_size=self.batch_size, - callbacks=[cb], - verbose=0, - ) - - event_files = glob.glob(os.path.join(log_dir, "events.out.tfevents.*")) - assert event_files, f"No TensorBoard events under {log_dir}" + @pytest.mark.requires_trainable_backend + def test_tensorboard_log_dir(self): + monkey = patch.object(K, "backend", lambda: "tensorflow") + with monkey: + with tempfile.TemporaryDirectory() as tmpdir: + log_dir = os.path.join(tmpdir, "tb_logs") + with patch("keras.src.callbacks.memory_usage_callback.running_on_gpu", return_value=False), \ + patch("keras.src.callbacks.memory_usage_callback.running_on_tpu", return_value=False): + cb = MemoryUsageCallback(log_every_batch=True, tensorboard_log_dir=log_dir) + assert os.path.isdir(log_dir) + self.model.fit(self.x_train, self.y_train, + epochs=self.epochs, + batch_size=self.batch_size, + callbacks=[cb], + verbose=0) + files = glob.glob(os.path.join(log_dir, "events.out.tfevents.*")) + assert files, f"No events files under {log_dir}" @pytest.mark.requires_trainable_backend - def test_get_gpu_memory_tensorflow(self, monkeypatch): - monkeypatch.setattr(K, "backend", lambda: "tensorflow") + def test_get_gpu_memory_tensorflow(self): + patch_backend = patch.object(K, "backend", lambda: "tensorflow") fake_tf = MagicMock() + # mock physical devices fake_tf.config.list_physical_devices.return_value = ["GPU:0"] - fake_tf.config.experimental.get_memory_info.return_value = { - "current": 150 * 1024**2 - } - monkeypatch.setitem(sys.modules, "tensorflow", fake_tf) - monkeypatch.setitem(sys.modules, "tensorflow.config", fake_tf.config) - monkeypatch.setitem( - sys.modules, "tensorflow.config.experimental", fake_tf.config.experimental - ) + fake_tf.config.experimental.get_memory_info.return_value = {"current": 150 * 1024**2} - cb = MemoryUsageCallback() - assert pytest.approx(150.0, rel=1e-6) == cb._get_gpu_memory() + with patch_backend, \ + patch.dict(sys.modules, { + "tensorflow": fake_tf, + "tensorflow.config": fake_tf.config, + "tensorflow.config.experimental": fake_tf.config.experimental + }): + cb = MemoryUsageCallback() + assert pytest.approx(150.0, rel=1e-6) == cb._get_gpu_memory() @pytest.mark.requires_trainable_backend - def test_get_gpu_memory_torch(self, monkeypatch): - monkeypatch.setattr(K, "backend", lambda: "torch") + def test_get_gpu_memory_torch(self): + patch_backend = patch.object(K, "backend", lambda: "torch") fake_torch = MagicMock() fake_torch.cuda.is_available.return_value = True 
         fake_torch.cuda.device_count.return_value = 2
+        # return 100MB then 200MB
         fake_torch.cuda.memory_allocated.side_effect = [100 * 1024**2, 200 * 1024**2]
-        monkeypatch.setitem(sys.modules, "torch", fake_torch)
-        monkeypatch.setitem(sys.modules, "torch.cuda", fake_torch.cuda)
-        cb = MemoryUsageCallback()
-        assert pytest.approx(300.0, rel=1e-6) == cb._get_gpu_memory()
+        with patch_backend, \
+            patch.dict(sys.modules, {"torch": fake_torch, "torch.cuda": fake_torch.cuda}):
+            cb = MemoryUsageCallback()
+            assert pytest.approx(300.0, rel=1e-6) == cb._get_gpu_memory()

     @pytest.mark.requires_trainable_backend
-    def test_get_gpu_memory_jax(self, monkeypatch):
-        monkeypatch.setattr(K, "backend", lambda: "jax")
-
-        class FakeDev:
+    def test_get_gpu_memory_jax(self):
+        patch_backend = patch.object(K, "backend", lambda: "jax")
+
+        class Dev:
             platform = "gpu"

             def memory_stats(self):
                 return {"bytes_in_use": 200 * 1024**2}

         fake_jax = MagicMock()
-        fake_jax.devices.return_value = [FakeDev(), FakeDev()]
-        monkeypatch.setitem(sys.modules, "jax", fake_jax)
+        fake_jax.devices.return_value = [Dev(), Dev()]
-        cb = MemoryUsageCallback()
-        assert pytest.approx(400.0, rel=1e-6) == cb._get_gpu_memory()
+        with patch_backend, patch.dict(sys.modules, {"jax": fake_jax}):
+            cb = MemoryUsageCallback()
+            assert pytest.approx(400.0, rel=1e-6) == cb._get_gpu_memory()

     def test_running_on_gpu_and_tpu_flags(self):
-        val_gpu = running_on_gpu()
-        val_tpu = running_on_tpu()
-        assert isinstance(val_gpu, bool)
-        assert isinstance(val_tpu, bool)
+        g = running_on_gpu()
+        t = running_on_tpu()
+        assert isinstance(g, bool) and isinstance(t, bool)

     def test_psutil_missing(self):
+        # Ensure ImportError is raised when psutil is absent.
         orig = sys.modules.pop("psutil", None)
         try:
             import keras.src.callbacks.memory_usage_callback as mod
             with patch.dict(sys.modules, {"psutil": None}):
-                with pytest.raises(
-                    ImportError,
-                    match="MemoryUsageCallback requires the 'psutil' library",
-                ):
+                with pytest.raises(ImportError):
                     reload(mod)
                     _ = mod.MemoryUsageCallback()
         finally:
             if orig is not None:
                 sys.modules["psutil"] = orig
-            from importlib import reload as _r
-
-            _r(sys.modules["keras.src.callbacks.memory_usage_callback"])
-
-    @pytest.mark.requires_trainable_backend
-    def test_running_on_tpu_true(self, monkeypatch):
-        monkeypatch.setattr(K, "backend", lambda: "tensorflow")
-        fake_tf = MagicMock()
-        fake_tf.config.list_logical_devices.return_value = ["TPU:0"]
-        fake_tf.distribute.cluster_resolver.TPUClusterResolver = lambda: MagicMock()
-        fake_tf.config.experimental_connect_to_cluster = lambda resolver: None
-        fake_tf.tpu.experimental.initialize_tpu_system = lambda resolver: None
-        monkeypatch.setitem(sys.modules, "tensorflow", fake_tf)
-
-        assert running_on_tpu() is True
-
-    @pytest.mark.requires_trainable_backend
-    def test_get_tpu_memory_tensorflow(self, monkeypatch):
-        monkeypatch.setattr(K, "backend", lambda: "tensorflow")
-        fake_tf = MagicMock()
-        fake_tf.config.list_logical_devices.return_value = ["TPU:0"]
-        fake_tf.distribute.cluster_resolver.TPUClusterResolver = lambda: MagicMock()
-        fake_tf.config.experimental_connect_to_cluster = lambda resolver: None
-        fake_tf.tpu.experimental.initialize_tpu_system = lambda resolver: None
-        monkeypatch.setitem(sys.modules, "tensorflow", fake_tf)
-
-        cb = MemoryUsageCallback()
-        assert cb._get_tpu_memory() is None
-
-    @pytest.mark.requires_trainable_backend
-    def test_get_tpu_memory_jax(self, monkeypatch):
-        monkeypatch.setattr(K, "backend", lambda: "jax")
-
-        class FakeTpuDev:
-            platform = "tpu"
-
-            def memory_stats(self):
-                return {"bytes_in_use": 250 * 1024**2}
-
-        fake_jax = MagicMock()
-        fake_jax.devices.return_value = [FakeTpuDev()]
-        monkeypatch.setitem(sys.modules, "jax", fake_jax)
-
-        cb = MemoryUsageCallback()
-        assert pytest.approx(250.0, rel=1e-6) == cb._get_tpu_memory()
+            reload(sys.modules["keras.src.callbacks.memory_usage_callback"])