diff --git a/.github/workflows/test-examples.yml b/.github/workflows/test-examples.yml index c071934..51c4572 100644 --- a/.github/workflows/test-examples.yml +++ b/.github/workflows/test-examples.yml @@ -92,10 +92,9 @@ jobs: COMET_INTERNAL_SENTRY_DSN: ${{ secrets.COMET_INTERNAL_SENTRY_DSN }} COMET_WORKSPACE: cometexamples-tests - name: debugging-save-logs - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: runner.debug == '1' && failure() with: - name: debug-logs path: ${{ env.COMET_LOG_DIR }} test-scripts: diff --git a/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_huggingface_transformers.ipynb b/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_huggingface_transformers.ipynb index bff82ca..50e0971 100644 --- a/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_huggingface_transformers.ipynb +++ b/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_huggingface_transformers.ipynb @@ -41,7 +41,7 @@ }, "outputs": [], "source": [ - "%pip install \"comet_ml>=3.31.5\" \"ray[air]>=2.1.0\" \"transformers>=4.43.0\" \"accelerate>=0.12.0\" \"datasets\" \"sentencepiece\" scipy \"scikit-learn\" protobuf \"torch>=1.3\" evaluate" + "%pip install \"comet_ml>=3.49.0\" \"ray[air]>=2.1.0\" \"transformers>=4.43.0\" \"accelerate>=0.12.0\" \"datasets\" \"sentencepiece\" scipy \"scikit-learn\" protobuf \"torch>=1.3\" evaluate" ] }, { @@ -62,7 +62,6 @@ "outputs": [], "source": [ "import comet_ml\n", - "import comet_ml.integration.ray\n", "\n", "comet_ml.init()" ] @@ -101,7 +100,9 @@ "\n", "import ray.train.huggingface.transformers\n", "from ray.train import ScalingConfig, RunConfig\n", - "from ray.train.torch import TorchTrainer" + "from ray.train.torch import TorchTrainer\n", + "import comet_ml.integration.ray\n", + "from comet_ml.integration.ray import comet_worker" ] }, { @@ -164,63 +165,62 @@ "metadata": {}, "outputs": [], "source": [ + "@comet_worker\n", "def train_func(config):\n", " from comet_ml import get_running_experiment\n", - " from comet_ml.integration.ray import comet_worker_logger\n", - "\n", - " with comet_worker_logger(config) as experiment:\n", - " small_train_dataset, small_eval_dataset = get_dataset()\n", - "\n", - " # Model\n", - " model = AutoModelForSequenceClassification.from_pretrained(\n", - " \"google-bert/bert-base-cased\", num_labels=5\n", - " )\n", - "\n", - " # Evaluation Metrics\n", - " metric = evaluate.load(\"accuracy\")\n", - "\n", - " def compute_metrics(eval_pred):\n", - " logits, labels = eval_pred\n", - " predictions = np.argmax(logits, axis=-1)\n", - "\n", - " experiment = comet_ml.get_running_experiment()\n", - " if experiment:\n", - " experiment.log_confusion_matrix(predictions, labels)\n", - "\n", - " return metric.compute(predictions=predictions, references=labels)\n", - "\n", - " # Hugging Face Trainer\n", - " training_args = TrainingArguments(\n", - " do_eval=True,\n", - " do_train=True,\n", - " eval_strategy=\"epoch\",\n", - " num_train_epochs=config[\"epochs\"],\n", - " output_dir=\"./results\",\n", - " overwrite_output_dir=True,\n", - " per_device_eval_batch_size=4,\n", - " per_device_train_batch_size=4,\n", - " report_to=[\"comet_ml\"],\n", - " seed=SEED,\n", - " )\n", - " trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " train_dataset=small_train_dataset,\n", - " eval_dataset=small_eval_dataset,\n", - " compute_metrics=compute_metrics,\n", - " )\n", - "\n", - " # Report Metrics and Checkpoints to Ray Train\n", - " callback = ray.train.huggingface.transformers.RayTrainReportCallback()\n", - " trainer.add_callback(callback)\n", - "\n", - " # Prepare Transformers Trainer\n", - " trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)\n", - "\n", - " # Start Training\n", - " trainer.train()\n", - "\n", - " comet_ml.get_running_experiment().end()" + "\n", + " small_train_dataset, small_eval_dataset = get_dataset()\n", + "\n", + " # Model\n", + " model = AutoModelForSequenceClassification.from_pretrained(\n", + " \"google-bert/bert-base-cased\", num_labels=5\n", + " )\n", + "\n", + " # Evaluation Metrics\n", + " metric = evaluate.load(\"accuracy\")\n", + "\n", + " def compute_metrics(eval_pred):\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + "\n", + " experiment = comet_ml.get_running_experiment()\n", + " if experiment:\n", + " experiment.log_confusion_matrix(predictions, labels)\n", + "\n", + " return metric.compute(predictions=predictions, references=labels)\n", + "\n", + " # Hugging Face Trainer\n", + " training_args = TrainingArguments(\n", + " do_eval=True,\n", + " do_train=True,\n", + " eval_strategy=\"epoch\",\n", + " num_train_epochs=config[\"epochs\"],\n", + " output_dir=\"./results\",\n", + " overwrite_output_dir=True,\n", + " per_device_eval_batch_size=4,\n", + " per_device_train_batch_size=4,\n", + " report_to=[\"comet_ml\"],\n", + " seed=SEED,\n", + " )\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=small_train_dataset,\n", + " eval_dataset=small_eval_dataset,\n", + " compute_metrics=compute_metrics,\n", + " )\n", + "\n", + " # Report Metrics and Checkpoints to Ray Train\n", + " callback = ray.train.huggingface.transformers.RayTrainReportCallback()\n", + " trainer.add_callback(callback)\n", + "\n", + " # Prepare Transformers Trainer\n", + " trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)\n", + "\n", + " # Start Training\n", + " trainer.train()\n", + "\n", + " comet_ml.end()" ] }, { @@ -240,16 +240,15 @@ " scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)\n", " config = {\"use_gpu\": use_gpu, \"epochs\": 2}\n", "\n", - " callback = comet_ml.integration.ray.CometTrainLoggerCallback(\n", - " config, project_name=\"comet-example-ray-train-hugginface-transformers\"\n", - " )\n", - "\n", " ray_trainer = TorchTrainer(\n", " train_func,\n", " scaling_config=scaling_config,\n", " train_loop_config=config,\n", - " run_config=RunConfig(callbacks=[callback]),\n", " )\n", + " comet_ml.integration.ray.comet_ray_train_logger(\n", + " ray_trainer, project_name=\"comet-example-ray-train-hugginface-transformers\"\n", + " )\n", + "\n", " result = ray_trainer.fit()" ] }, @@ -278,13 +277,6 @@ "\n", "train(num_workers, use_gpu=False, epochs=5)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_keras.ipynb b/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_keras.ipynb index d0de715..3bd5a3c 100644 --- a/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_keras.ipynb +++ b/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_keras.ipynb @@ -41,7 +41,7 @@ }, "outputs": [], "source": [ - "%pip install -U \"comet_ml>=3.44.0\" \"ray[air]>=2.1.0\" \"keras<3\" \"tensorflow<2.16.0\"" + "%pip install -U \"comet_ml>=3.49.0\" \"ray[air]>=2.1.0\" \"keras<3\" \"tensorflow<2.16.0\"" ] }, { @@ -88,6 +88,7 @@ "import os\n", "\n", "import comet_ml.integration.ray\n", + "from comet_ml.integration.ray import comet_worker\n", "\n", "import numpy as np\n", "import ray\n", @@ -172,45 +173,43 @@ }, "outputs": [], "source": [ + "@comet_worker\n", "def train_func(config: dict):\n", - " from comet_ml.integration.ray import comet_worker_logger\n", " from ray.air import session\n", "\n", " per_worker_batch_size = config.get(\"batch_size\", 64)\n", " epochs = config.get(\"epochs\", 3)\n", " steps_per_epoch = config.get(\"steps_per_epoch\", 70)\n", "\n", - " with comet_worker_logger(config) as experiment:\n", + " tf_config = json.loads(os.environ[\"TF_CONFIG\"])\n", + " num_workers = len(tf_config[\"cluster\"][\"worker\"])\n", "\n", - " tf_config = json.loads(os.environ[\"TF_CONFIG\"])\n", - " num_workers = len(tf_config[\"cluster\"][\"worker\"])\n", + " strategy = tf.distribute.MultiWorkerMirroredStrategy()\n", "\n", - " strategy = tf.distribute.MultiWorkerMirroredStrategy()\n", + " global_batch_size = per_worker_batch_size * num_workers\n", + " multi_worker_dataset = mnist_dataset(global_batch_size)\n", "\n", - " global_batch_size = per_worker_batch_size * num_workers\n", - " multi_worker_dataset = mnist_dataset(global_batch_size)\n", - "\n", - " with strategy.scope():\n", - " # Model building/compiling need to be within `strategy.scope()`.\n", - " multi_worker_model = build_cnn_model()\n", - " learning_rate = config.get(\"lr\", 0.001)\n", - " multi_worker_model.compile(\n", - " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", - " optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),\n", - " metrics=[\"accuracy\"],\n", - " )\n", + " with strategy.scope():\n", + " # Model building/compiling need to be within `strategy.scope()`.\n", + " multi_worker_model = build_cnn_model()\n", + " learning_rate = config.get(\"lr\", 0.001)\n", + " multi_worker_model.compile(\n", + " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", + " optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),\n", + " metrics=[\"accuracy\"],\n", + " )\n", "\n", - " callbacks = []\n", - " if session.get_world_rank() == 0:\n", - " callbacks.append(experiment.get_callback(\"tf-keras\"))\n", + " callbacks = []\n", + " if session.get_world_rank() == 0:\n", + " callbacks.append(comet_ml.get_running_experiment().get_callback(\"tf-keras\"))\n", "\n", - " history = multi_worker_model.fit(\n", - " multi_worker_dataset,\n", - " epochs=epochs,\n", - " steps_per_epoch=steps_per_epoch,\n", - " callbacks=callbacks,\n", - " )\n", - " results = history.history\n", + " history = multi_worker_model.fit(\n", + " multi_worker_dataset,\n", + " epochs=epochs,\n", + " steps_per_epoch=steps_per_epoch,\n", + " callbacks=callbacks,\n", + " )\n", + " results = history.history\n", "\n", " return results" ] @@ -233,14 +232,15 @@ ") -> Result:\n", " config = {\"lr\": 1e-3, \"batch_size\": 64, \"epochs\": epochs}\n", "\n", - " callback = comet_ml.integration.ray.CometTrainLoggerCallback(config)\n", - "\n", " trainer = TensorflowTrainer(\n", " train_loop_per_worker=train_func,\n", " train_loop_config=config,\n", " scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),\n", - " run_config=RunConfig(callbacks=[callback]),\n", " )\n", + " comet_ml.integration.ray.comet_ray_train_logger(\n", + " trainer, project_name=\"comet-example-ray-train-keras\"\n", + " )\n", + "\n", " results = trainer.fit()\n", " return results" ] @@ -270,6 +270,15 @@ "\n", "train_tensorflow_mnist(num_workers, use_gpu=False, epochs=10)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "comet_ml.end()" + ] } ], "metadata": { @@ -291,7 +300,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_pytorch_lightning.ipynb b/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_pytorch_lightning.ipynb index fcdfde9..30457a1 100644 --- a/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_pytorch_lightning.ipynb +++ b/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_pytorch_lightning.ipynb @@ -41,7 +41,7 @@ }, "outputs": [], "source": [ - "%pip install \"comet_ml>=3.47.1\" \"ray[air]>=2.1.0\" \"lightning\" \"torchvision\"" + "%pip install \"comet_ml>=3.49.0\" \"ray[air]>=2.1.0\" \"lightning\" \"torchvision\"" ] }, { @@ -62,7 +62,6 @@ "outputs": [], "source": [ "import comet_ml\n", - "import comet_ml.integration.ray\n", "\n", "comet_ml.login()" ] @@ -87,6 +86,9 @@ "import os\n", "import tempfile\n", "\n", + "import comet_ml.integration.ray\n", + "from comet_ml.integration.ray import comet_worker\n", + "\n", "import torch\n", "from torch.utils.data import DataLoader\n", "from torchvision.models import resnet18\n", @@ -153,45 +155,44 @@ "metadata": {}, "outputs": [], "source": [ + "@comet_worker\n", "def train_func(config):\n", - " from comet_ml.integration.ray import comet_worker_logger\n", " from lightning.pytorch.loggers import CometLogger\n", "\n", - " with comet_worker_logger(config) as experiment:\n", - " # Data\n", - " transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])\n", - " data_dir = os.path.join(tempfile.gettempdir(), \"data\")\n", - " train_data = FashionMNIST(\n", - " root=data_dir, train=True, download=True, transform=transform\n", - " )\n", - " train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)\n", + " # Data\n", + " transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])\n", + " data_dir = os.path.join(tempfile.gettempdir(), \"data\")\n", + " train_data = FashionMNIST(\n", + " root=data_dir, train=True, download=True, transform=transform\n", + " )\n", + " train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)\n", "\n", - " # Training\n", - " model = ImageClassifier()\n", + " # Training\n", + " model = ImageClassifier()\n", "\n", - " comet_logger = CometLogger()\n", + " comet_logger = CometLogger()\n", "\n", - " # Temporary workaround, can be removed once\n", - " # https://github.com/Lightning-AI/pytorch-lightning/pull/20275 has\n", - " # been merged and released\n", - " comet_logger._experiment = experiment\n", + " # Temporary workaround, can be removed once\n", + " # https://github.com/Lightning-AI/pytorch-lightning/pull/20275 has\n", + " # been merged and released\n", + " comet_logger._experiment = comet_ml.get_running_experiment()\n", "\n", - " # [1] Configure PyTorch Lightning Trainer.\n", - " trainer = pl.Trainer(\n", - " max_epochs=config[\"epochs\"],\n", - " devices=\"auto\",\n", - " accelerator=\"auto\",\n", - " strategy=ray.train.lightning.RayDDPStrategy(),\n", - " plugins=[ray.train.lightning.RayLightningEnvironment()],\n", - " callbacks=[ray.train.lightning.RayTrainReportCallback()],\n", - " logger=comet_logger,\n", - " # [1a] Optionally, disable the default checkpointing behavior\n", - " # in favor of the `RayTrainReportCallback` above.\n", - " enable_checkpointing=False,\n", - " log_every_n_steps=2,\n", - " )\n", - " trainer = ray.train.lightning.prepare_trainer(trainer)\n", - " trainer.fit(model, train_dataloaders=train_dataloader)" + " # [1] Configure PyTorch Lightning Trainer.\n", + " trainer = pl.Trainer(\n", + " max_epochs=config[\"epochs\"],\n", + " devices=\"auto\",\n", + " accelerator=\"auto\",\n", + " strategy=ray.train.lightning.RayDDPStrategy(),\n", + " plugins=[ray.train.lightning.RayLightningEnvironment()],\n", + " callbacks=[ray.train.lightning.RayTrainReportCallback()],\n", + " logger=comet_logger,\n", + " # [1a] Optionally, disable the default checkpointing behavior\n", + " # in favor of the `RayTrainReportCallback` above.\n", + " enable_checkpointing=False,\n", + " log_every_n_steps=2,\n", + " )\n", + " trainer = ray.train.lightning.prepare_trainer(trainer)\n", + " trainer.fit(model, train_dataloaders=train_dataloader)" ] }, { @@ -211,15 +212,14 @@ " scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)\n", " config = {\"use_gpu\": use_gpu, \"epochs\": epochs}\n", "\n", - " callback = comet_ml.integration.ray.CometTrainLoggerCallback(\n", - " config, project_name=\"comet-example-ray-train-pytorch-lightning\"\n", - " )\n", - "\n", " ray_trainer = TorchTrainer(\n", " train_func,\n", " scaling_config=scaling_config,\n", " train_loop_config=config,\n", - " run_config=RunConfig(callbacks=[callback]),\n", + " # run_config=RunConfig(callbacks=[callback]),\n", + " )\n", + " comet_ml.integration.ray.comet_ray_train_logger(\n", + " ray_trainer, project_name=\"comet-example-ray-train-pytorch-lightning\"\n", " )\n", " result = ray_trainer.fit()" ] @@ -279,7 +279,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_xgboost.ipynb b/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_xgboost.ipynb index 0271ba5..6794523 100644 --- a/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_xgboost.ipynb +++ b/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_xgboost.ipynb @@ -42,7 +42,7 @@ "outputs": [], "source": [ "# XGBoost is pinned because of https://github.com/ray-project/ray/issues/46476\n", - "%pip install -U \"comet_ml>=3.44.0\" \"ray[air]>=2.1.0\" xgboost_ray \"pandas!=2.2.0\" \"xgboost!=2.1.0\"" + "%pip install -U \"comet_ml>=3.49.0\" \"ray[air]>=2.1.0\" xgboost_ray \"pandas!=2.2.0\" \"xgboost!=2.1.0\"" ] }, { @@ -63,7 +63,6 @@ "outputs": [], "source": [ "import comet_ml\n", - "import comet_ml.integration.ray\n", "\n", "comet_ml.login(project_name=\"comet-example-ray-train-xgboost\")" ] @@ -89,7 +88,8 @@ "import ray\n", "from ray.air.config import RunConfig, ScalingConfig\n", "from ray.train import Result\n", - "from ray.train.xgboost import XGBoostTrainer" + "from ray.train.xgboost import XGBoostTrainer\n", + "import comet_ml.integration.ray" ] }, { @@ -130,7 +130,6 @@ " num_workers: int = 2, use_gpu: bool = False, num_boost_round: int = 20\n", ") -> Result:\n", " config = {}\n", - " callback = comet_ml.integration.ray.CometTrainLoggerCallback(config)\n", "\n", " trainer = XGBoostTrainer(\n", " scaling_config=ScalingConfig(\n", @@ -151,7 +150,9 @@ " \"random_state\": 536,\n", " },\n", " datasets={\"train\": train_dataset, \"valid\": valid_dataset},\n", - " run_config=RunConfig(callbacks=[callback]),\n", + " )\n", + " comet_ml.integration.ray.comet_ray_train_logger(\n", + " trainer, project_name=\"comet-example-ray-train-xgboost\"\n", " )\n", " result = trainer.fit()\n", " return result" @@ -182,6 +183,15 @@ "\n", "train_xgboost(num_workers, use_gpu=False, num_boost_round=10)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "comet_ml.end()" + ] } ], "metadata": { @@ -203,7 +213,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.1" } }, "nbformat": 4,