Commit 951e720

ENH XPU support for boft_dreambooth example (#2679)
Signed-off-by: Yao, Matrix <matrix.yao@intel.com>
1 parent 49b29c1 commit 951e720

File tree: 5 files changed, +49 / -29 lines

examples/boft_dreambooth/boft_dreambooth.md

Lines changed: 11 additions & 0 deletions
@@ -40,6 +40,7 @@ cd peft/examples/boft_dreambooth
 
 Set up your environment: install PEFT, and all the required libraries. At the time of writing this guide we recommend installing PEFT from source. The following environment setup should work on A100 and H100:
 
+### CUDA
 ```bash
 conda create --name peft python=3.10
 conda activate peft
@@ -48,6 +49,16 @@ conda install xformers -c xformers
 pip install -r requirements.txt
 pip install git+https://github.com/huggingface/peft
 ```
+The following environment setup is validated to work on Intel XPU:
+
+### Intel XPU
+```bash
+conda create --name peft python=3.10
+conda activate peft
+pip install torch==2.8.0.dev20250615+xpu torchvision==0.23.0.dev20250615+xpu torchaudio==2.8.0.dev20250615+xpu --index-url https://download.pytorch.org/whl/nightly/xpu --no-cache-dir
+pip install -r requirements.txt
+pip install git+https://github.com/huggingface/peft
+```
 
 ## Download the data
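Since the Intel XPU setup above installs from the PyTorch nightly index, it is worth confirming the build actually sees the GPU before training. A minimal sanity check, assuming a working Intel driver and runtime (this snippet is illustrative, not part of the commit):

```python
# Verify that the installed PyTorch nightly exposes an Intel XPU device.
import torch

print(torch.__version__)         # expect a 2.8.0.dev*+xpu build
print(torch.xpu.is_available())  # True once driver and runtime are set up
if torch.xpu.is_available():
    print(torch.xpu.get_device_name(0))  # assumes at least one XPU at index 0
```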

examples/boft_dreambooth/dreambooth_inference.ipynb

Lines changed: 3 additions & 9 deletions
@@ -44,8 +44,10 @@
    "outputs": [],
    "source": [
     "def get_boft_sd_pipeline(\n",
-    "    ckpt_dir, base_model_name_or_path=None, epoch=int, dtype=torch.float32, device=\"cuda\", adapter_name=\"default\"\n",
+    "    ckpt_dir, base_model_name_or_path=None, epoch=int, dtype=torch.float32, device=\"auto\", adapter_name=\"default\"\n",
     "):\n",
+    "    if device == \"auto\":\n",
+    "        device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
     "\n",
     "    if base_model_name_or_path is None:\n",
     "        raise ValueError(\"Please specify the base model name or path\")\n",
@@ -152,14 +154,6 @@
    "image = pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]\n",
    "image"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f534eca2-94a4-432b-b092-7149ac44b12f",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
 "metadata": {
examples/boft_dreambooth/requirements.txt

Lines changed: 5 additions & 5 deletions
@@ -1,13 +1,13 @@
-transformers=>4.48.0
-accelerate==0.25.0
+transformers==4.54.0
+accelerate==1.9.0
 evaluate
 tqdm
-datasets==2.16.1
-diffusers==0.17.1
+datasets==4.0.0
+diffusers==0.34.0
 Pillow
 huggingface_hub
 safetensors
 nb_conda_kernels
 ipykernel
 ipywidgets
-wandb==0.16.1
+wandb==0.21.0

examples/boft_dreambooth/train_dreambooth.py

Lines changed: 22 additions & 9 deletions
@@ -139,7 +139,7 @@ def main(args):
         cur_class_images = len(list(class_images_dir.iterdir()))
 
         if cur_class_images < args.num_class_images:
-            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+            torch_dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
             if args.prior_generation_precision == "fp32":
                 torch_dtype = torch.float32
             elif args.prior_generation_precision == "fp16":
@@ -176,6 +176,8 @@ def main(args):
             del pipeline
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
+            elif torch.xpu.is_available():
+                torch.xpu.empty_cache()
 
     # Handle the repository creation
     if accelerator.is_main_process:
@@ -263,7 +265,9 @@ def main(args):
     text_encoder.to(accelerator.device, dtype=weight_dtype)
 
     if args.enable_xformers_memory_efficient_attention:
-        if is_xformers_available():
+        if accelerator.device.type == "xpu":
+            logger.warn("XPU doesn't support xformers yet, ignoring it.")
+        elif is_xformers_available():
             unet.enable_xformers_memory_efficient_attention()
         else:
             raise ValueError("xformers is not available. Make sure it is installed correctly")
@@ -276,7 +280,7 @@ def main(args):
 
     # Enable TF32 for faster training on Ampere GPUs,
     # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
-    if args.allow_tf32:
+    if args.allow_tf32 and torch.cuda.is_available():
         torch.backends.cuda.matmul.allow_tf32 = True
 
     if args.scale_lr:
@@ -581,18 +585,27 @@ def main(args):
                     )
 
                 del pipeline
-                torch.cuda.empty_cache()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                elif torch.xpu.is_available():
+                    torch.xpu.empty_cache()
 
             if global_step >= args.max_train_steps:
                 break
 
-    # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
+    # Printing the accelerator memory usage details such as allocated memory, peak memory, and total memory usage
     if not args.no_tracemalloc:
-        accelerator.print(f"GPU Memory before entering the train : {b2mb(tracemalloc.begin)}")
-        accelerator.print(f"GPU Memory consumed at the end of the train (end-begin): {tracemalloc.used}")
-        accelerator.print(f"GPU Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}")
         accelerator.print(
-            f"GPU Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
+            f"{accelerator.device.type.upper()} Memory before entering the train : {b2mb(tracemalloc.begin)}"
+        )
+        accelerator.print(
+            f"{accelerator.device.type.upper()} Memory consumed at the end of the train (end-begin): {tracemalloc.used}"
+        )
+        accelerator.print(
+            f"{accelerator.device.type.upper()} Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}"
+        )
+        accelerator.print(
+            f"{accelerator.device.type.upper()} Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
         )
 
         accelerator.print(f"CPU Memory before entering the train : {b2mb(tracemalloc.cpu_begin)}")

examples/boft_dreambooth/utils/tracemalloc.py

Lines changed: 8 additions & 6 deletions
@@ -13,10 +13,12 @@ def b2mb(x):
 # This context manager is used to track the peak memory usage of the process
 class TorchTracemalloc:
     def __enter__(self):
+        self.device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
+        self.device_module = getattr(torch, self.device_type, torch.cuda)
         gc.collect()
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()  # reset the peak gauge to zero
-        self.begin = torch.cuda.memory_allocated()
+        self.device_module.empty_cache()
+        self.device_module.reset_peak_memory_stats()  # reset the peak gauge to zero
+        self.begin = self.device_module.memory_allocated()
         self.process = psutil.Process()
 
         self.cpu_begin = self.cpu_mem_used()
@@ -46,9 +48,9 @@ def __exit__(self, *exc):
         self.peak_monitoring = False
 
         gc.collect()
-        torch.cuda.empty_cache()
-        self.end = torch.cuda.memory_allocated()
-        self.peak = torch.cuda.max_memory_allocated()
+        self.device_module.empty_cache()
+        self.end = self.device_module.memory_allocated()
+        self.peak = self.device_module.max_memory_allocated()
         self.used = b2mb(self.end - self.begin)
         self.peaked = b2mb(self.peak - self.begin)
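With `device_module` resolved once in `__enter__`, the context manager reports memory for CUDA and XPU alike. A usage sketch, assuming the import path used by the example scripts and that `__enter__` returns `self` (neither is visible in the hunks above):

```python
import torch

from utils.tracemalloc import TorchTracemalloc  # path assumed from this example dir

# Pick whichever accelerator is present; falls back to CUDA on older PyTorch.
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"

with TorchTracemalloc() as tracemalloc:
    x = torch.randn(4096, 4096, device=device)  # allocate some accelerator memory
    y = x @ x

# used/peaked are filled in by __exit__, already converted to MB by b2mb
print(f"used: {tracemalloc.used} MB, peaked: {tracemalloc.peaked} MB")
```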
