Commit 0ca5113

Merge pull request #210 from comet-ml/update-ray-train-new-api
Update Ray Train examples for new public API
2 parents 55e9018 + 4fff205

5 files changed (+160, -150 lines)


.github/workflows/test-examples.yml

Lines changed: 1 addition & 2 deletions
@@ -92,10 +92,9 @@ jobs:
       COMET_INTERNAL_SENTRY_DSN: ${{ secrets.COMET_INTERNAL_SENTRY_DSN }}
       COMET_WORKSPACE: cometexamples-tests
     - name: debugging-save-logs
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       if: runner.debug == '1' && failure()
       with:
-        name: debug-logs
         path: ${{ env.COMET_LOG_DIR }}
 
   test-scripts:

integrations/model-training/ray-train/notebooks/Comet_with_ray_train_huggingface_transformers.ipynb

Lines changed: 62 additions & 70 deletions
@@ -41,7 +41,7 @@
 },
 "outputs": [],
 "source": [
-"%pip install \"comet_ml>=3.31.5\" \"ray[air]>=2.1.0\" \"transformers>=4.43.0\" \"accelerate>=0.12.0\" \"datasets\" \"sentencepiece\" scipy \"scikit-learn\" protobuf \"torch>=1.3\" evaluate"
+"%pip install \"comet_ml>=3.49.0\" \"ray[air]>=2.1.0\" \"transformers>=4.43.0\" \"accelerate>=0.12.0\" \"datasets\" \"sentencepiece\" scipy \"scikit-learn\" protobuf \"torch>=1.3\" evaluate"
 ]
 },
 {
@@ -62,7 +62,6 @@
 "outputs": [],
 "source": [
 "import comet_ml\n",
-"import comet_ml.integration.ray\n",
 "\n",
 "comet_ml.init()"
 ]
@@ -101,7 +100,9 @@
 "\n",
 "import ray.train.huggingface.transformers\n",
 "from ray.train import ScalingConfig, RunConfig\n",
-"from ray.train.torch import TorchTrainer"
+"from ray.train.torch import TorchTrainer\n",
+"import comet_ml.integration.ray\n",
+"from comet_ml.integration.ray import comet_worker"
 ]
 },
 {
@@ -164,63 +165,62 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"@comet_worker\n",
 "def train_func(config):\n",
 "    from comet_ml import get_running_experiment\n",
-"    from comet_ml.integration.ray import comet_worker_logger\n",
-"\n",
-"    with comet_worker_logger(config) as experiment:\n",
-"        small_train_dataset, small_eval_dataset = get_dataset()\n",
-"\n",
-"        # Model\n",
-"        model = AutoModelForSequenceClassification.from_pretrained(\n",
-"            \"google-bert/bert-base-cased\", num_labels=5\n",
-"        )\n",
-"\n",
-"        # Evaluation Metrics\n",
-"        metric = evaluate.load(\"accuracy\")\n",
-"\n",
-"        def compute_metrics(eval_pred):\n",
-"            logits, labels = eval_pred\n",
-"            predictions = np.argmax(logits, axis=-1)\n",
-"\n",
-"            experiment = comet_ml.get_running_experiment()\n",
-"            if experiment:\n",
-"                experiment.log_confusion_matrix(predictions, labels)\n",
-"\n",
-"            return metric.compute(predictions=predictions, references=labels)\n",
-"\n",
-"        # Hugging Face Trainer\n",
-"        training_args = TrainingArguments(\n",
-"            do_eval=True,\n",
-"            do_train=True,\n",
-"            eval_strategy=\"epoch\",\n",
-"            num_train_epochs=config[\"epochs\"],\n",
-"            output_dir=\"./results\",\n",
-"            overwrite_output_dir=True,\n",
-"            per_device_eval_batch_size=4,\n",
-"            per_device_train_batch_size=4,\n",
-"            report_to=[\"comet_ml\"],\n",
-"            seed=SEED,\n",
-"        )\n",
-"        trainer = Trainer(\n",
-"            model=model,\n",
-"            args=training_args,\n",
-"            train_dataset=small_train_dataset,\n",
-"            eval_dataset=small_eval_dataset,\n",
-"            compute_metrics=compute_metrics,\n",
-"        )\n",
-"\n",
-"        # Report Metrics and Checkpoints to Ray Train\n",
-"        callback = ray.train.huggingface.transformers.RayTrainReportCallback()\n",
-"        trainer.add_callback(callback)\n",
-"\n",
-"        # Prepare Transformers Trainer\n",
-"        trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)\n",
-"\n",
-"        # Start Training\n",
-"        trainer.train()\n",
-"\n",
-"        comet_ml.get_running_experiment().end()"
+"\n",
+"    small_train_dataset, small_eval_dataset = get_dataset()\n",
+"\n",
+"    # Model\n",
+"    model = AutoModelForSequenceClassification.from_pretrained(\n",
+"        \"google-bert/bert-base-cased\", num_labels=5\n",
+"    )\n",
+"\n",
+"    # Evaluation Metrics\n",
+"    metric = evaluate.load(\"accuracy\")\n",
+"\n",
+"    def compute_metrics(eval_pred):\n",
+"        logits, labels = eval_pred\n",
+"        predictions = np.argmax(logits, axis=-1)\n",
+"\n",
+"        experiment = comet_ml.get_running_experiment()\n",
+"        if experiment:\n",
+"            experiment.log_confusion_matrix(predictions, labels)\n",
+"\n",
+"        return metric.compute(predictions=predictions, references=labels)\n",
+"\n",
+"    # Hugging Face Trainer\n",
+"    training_args = TrainingArguments(\n",
+"        do_eval=True,\n",
+"        do_train=True,\n",
+"        eval_strategy=\"epoch\",\n",
+"        num_train_epochs=config[\"epochs\"],\n",
+"        output_dir=\"./results\",\n",
+"        overwrite_output_dir=True,\n",
+"        per_device_eval_batch_size=4,\n",
+"        per_device_train_batch_size=4,\n",
+"        report_to=[\"comet_ml\"],\n",
+"        seed=SEED,\n",
+"    )\n",
+"    trainer = Trainer(\n",
+"        model=model,\n",
+"        args=training_args,\n",
+"        train_dataset=small_train_dataset,\n",
+"        eval_dataset=small_eval_dataset,\n",
+"        compute_metrics=compute_metrics,\n",
+"    )\n",
+"\n",
+"    # Report Metrics and Checkpoints to Ray Train\n",
+"    callback = ray.train.huggingface.transformers.RayTrainReportCallback()\n",
+"    trainer.add_callback(callback)\n",
+"\n",
+"    # Prepare Transformers Trainer\n",
+"    trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)\n",
+"\n",
+"    # Start Training\n",
+"    trainer.train()\n",
+"\n",
+"    comet_ml.end()"
 ]
 },
 {
@@ -240,16 +240,15 @@
 "    scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)\n",
 "    config = {\"use_gpu\": use_gpu, \"epochs\": 2}\n",
 "\n",
-"    callback = comet_ml.integration.ray.CometTrainLoggerCallback(\n",
-"        config, project_name=\"comet-example-ray-train-hugginface-transformers\"\n",
-"    )\n",
-"\n",
 "    ray_trainer = TorchTrainer(\n",
 "        train_func,\n",
 "        scaling_config=scaling_config,\n",
 "        train_loop_config=config,\n",
-"        run_config=RunConfig(callbacks=[callback]),\n",
 "    )\n",
+"    comet_ml.integration.ray.comet_ray_train_logger(\n",
+"        ray_trainer, project_name=\"comet-example-ray-train-hugginface-transformers\"\n",
+"    )\n",
+"\n",
 "    result = ray_trainer.fit()"
 ]
 },
@@ -278,13 +277,6 @@
 "\n",
 "train(num_workers, use_gpu=False, epochs=5)"
 ]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": []
 }
 ],
 "metadata": {

integrations/model-training/ray-train/notebooks/Comet_with_ray_train_keras.ipynb

Lines changed: 41 additions & 32 deletions
@@ -41,7 +41,7 @@
 },
 "outputs": [],
 "source": [
-"%pip install -U \"comet_ml>=3.44.0\" \"ray[air]>=2.1.0\" \"keras<3\" \"tensorflow<2.16.0\""
+"%pip install -U \"comet_ml>=3.49.0\" \"ray[air]>=2.1.0\" \"keras<3\" \"tensorflow<2.16.0\""
 ]
 },
 {
@@ -88,6 +88,7 @@
 "import os\n",
 "\n",
 "import comet_ml.integration.ray\n",
+"from comet_ml.integration.ray import comet_worker\n",
 "\n",
 "import numpy as np\n",
 "import ray\n",
@@ -172,45 +173,43 @@
 },
 "outputs": [],
 "source": [
+"@comet_worker\n",
 "def train_func(config: dict):\n",
-"    from comet_ml.integration.ray import comet_worker_logger\n",
 "    from ray.air import session\n",
 "\n",
 "    per_worker_batch_size = config.get(\"batch_size\", 64)\n",
 "    epochs = config.get(\"epochs\", 3)\n",
 "    steps_per_epoch = config.get(\"steps_per_epoch\", 70)\n",
 "\n",
-"    with comet_worker_logger(config) as experiment:\n",
+"    tf_config = json.loads(os.environ[\"TF_CONFIG\"])\n",
+"    num_workers = len(tf_config[\"cluster\"][\"worker\"])\n",
 "\n",
-"        tf_config = json.loads(os.environ[\"TF_CONFIG\"])\n",
-"        num_workers = len(tf_config[\"cluster\"][\"worker\"])\n",
+"    strategy = tf.distribute.MultiWorkerMirroredStrategy()\n",
 "\n",
-"        strategy = tf.distribute.MultiWorkerMirroredStrategy()\n",
+"    global_batch_size = per_worker_batch_size * num_workers\n",
+"    multi_worker_dataset = mnist_dataset(global_batch_size)\n",
 "\n",
-"        global_batch_size = per_worker_batch_size * num_workers\n",
-"        multi_worker_dataset = mnist_dataset(global_batch_size)\n",
-"\n",
-"        with strategy.scope():\n",
-"            # Model building/compiling need to be within `strategy.scope()`.\n",
-"            multi_worker_model = build_cnn_model()\n",
-"            learning_rate = config.get(\"lr\", 0.001)\n",
-"            multi_worker_model.compile(\n",
-"                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
-"                optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),\n",
-"                metrics=[\"accuracy\"],\n",
-"            )\n",
+"    with strategy.scope():\n",
+"        # Model building/compiling need to be within `strategy.scope()`.\n",
+"        multi_worker_model = build_cnn_model()\n",
+"        learning_rate = config.get(\"lr\", 0.001)\n",
+"        multi_worker_model.compile(\n",
+"            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
+"            optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),\n",
+"            metrics=[\"accuracy\"],\n",
+"        )\n",
 "\n",
-"        callbacks = []\n",
-"        if session.get_world_rank() == 0:\n",
-"            callbacks.append(experiment.get_callback(\"tf-keras\"))\n",
+"    callbacks = []\n",
+"    if session.get_world_rank() == 0:\n",
+"        callbacks.append(comet_ml.get_running_experiment().get_callback(\"tf-keras\"))\n",
 "\n",
-"        history = multi_worker_model.fit(\n",
-"            multi_worker_dataset,\n",
-"            epochs=epochs,\n",
-"            steps_per_epoch=steps_per_epoch,\n",
-"            callbacks=callbacks,\n",
-"        )\n",
-"        results = history.history\n",
+"    history = multi_worker_model.fit(\n",
+"        multi_worker_dataset,\n",
+"        epochs=epochs,\n",
+"        steps_per_epoch=steps_per_epoch,\n",
+"        callbacks=callbacks,\n",
+"    )\n",
+"    results = history.history\n",
 "\n",
 "    return results"
 ]
@@ -233,14 +232,15 @@
 ") -> Result:\n",
 "    config = {\"lr\": 1e-3, \"batch_size\": 64, \"epochs\": epochs}\n",
 "\n",
-"    callback = comet_ml.integration.ray.CometTrainLoggerCallback(config)\n",
-"\n",
 "    trainer = TensorflowTrainer(\n",
 "        train_loop_per_worker=train_func,\n",
 "        train_loop_config=config,\n",
 "        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),\n",
-"        run_config=RunConfig(callbacks=[callback]),\n",
 "    )\n",
+"    comet_ml.integration.ray.comet_ray_train_logger(\n",
+"        trainer, project_name=\"comet-example-ray-train-keras\"\n",
+"    )\n",
+"\n",
 "    results = trainer.fit()\n",
 "    return results"
 ]
@@ -270,6 +270,15 @@
 "\n",
 "train_tensorflow_mnist(num_workers, use_gpu=False, epochs=10)"
 ]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"comet_ml.end()"
+]
 }
 ],
 "metadata": {
@@ -291,7 +300,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.12"
+"version": "3.9.1"
 }
 },
 "nbformat": 4,
