
[bug]: Latents to image step uses excessive resources and time when run for the first time with a new resolution #7976

Open
@max-maag

Description

Is there an existing issue for this problem?

  • I have searched the existing issues

Operating system

Linux

GPU vendor

AMD (ROCm)

GPU model

RX 7800 XT

GPU VRAM

16 GB

Version number

5.10.1

Browser

Firefox 138.0 (64-bit)

Python dependencies

{
  "version": "5.10.1",
  "dependencies": {
    "accelerate": "1.6.0",
    "compel": "2.0.2",
    "cuda": null,
    "diffusers": "0.33.0",
    "numpy": "1.26.3",
    "opencv": "4.9.0.80",
    "onnx": "1.16.1",
    "pillow": "11.0.0",
    "python": "3.12.9",
    "torch": "2.6.0+rocm6.2.4",
    "torchvision": "0.21.0+rocm6.2.4",
    "transformers": "4.51.3",
    "xformers": null
  },
  "config": {
    "schema_version": "4.0.2",
    "legacy_models_yaml_path": null,
    "host": "127.0.0.1",
    "port": 9090,
    "allow_origins": ["localhost"],
    "allow_credentials": true,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
    "ssl_certfile": null,
    "ssl_keyfile": null,
    "log_tokenization": false,
    "patchmatch": false,
    "models_dir": "/usr/share/ai-models/nsfw",
    "convert_cache_dir": "models/.convert_cache",
    "download_cache_dir": "models/.download_cache",
    "legacy_conf_dir": "configs",
    "db_dir": "nsfw/databases",
    "outputs_dir": "nsfw/outputs",
    "custom_nodes_dir": "nodes",
    "style_presets_dir": "style_presets",
    "workflow_thumbnails_dir": "workflow_thumbnails",
    "log_handlers": ["console"],
    "log_format": "color",
    "log_level": "info",
    "log_sql": false,
    "log_level_network": "warning",
    "use_memory_db": false,
    "dev_reload": false,
    "profile_graphs": false,
    "profile_prefix": null,
    "profiles_dir": "profiles",
    "max_cache_ram_gb": null,
    "max_cache_vram_gb": null,
    "log_memory_usage": false,
    "device_working_mem_gb": 3,
    "enable_partial_loading": false,
    "keep_ram_copy_of_weights": true,
    "ram": null,
    "vram": null,
    "lazy_offload": true,
    "pytorch_cuda_alloc_conf": null,
    "device": "auto",
    "precision": "float16",
    "sequential_guidance": true,
    "attention_type": "auto",
    "attention_slice_size": "auto",
    "force_tiled_decode": false,
    "pil_compress_level": 1,
    "max_queue_size": 100,
    "clear_queue_on_startup": true,
    "allow_nodes": null,
    "deny_nodes": null,
    "node_cache_size": 512,
    "hashing_algorithm": "blake3_single",
    "remote_api_tokens": null,
    "scan_models_on_startup": false
  },
  "set_config_fields": [
    "precision", "allow_origins", "db_dir", "outputs_dir",
    "max_queue_size", "models_dir", "clear_queue_on_startup", "legacy_models_yaml_path",
    "sequential_guidance", "patchmatch"
  ]
}

What happened

When generating a 1024x1024 SDXL¹ image for the first time, the latents to image step (l2i) takes three and a half minutes. In subsequent runs with only the seed changed, l2i takes roughly half a second. Changing the resolution results in one slow generation, with subsequent ones being quick again.

The workflow editor as well as the queue item JSON show the l2i node as not using the cache, even though its behavior indicates that something is cached.

While the l2i node is running, CPU and GPU utilization are at 100%², RAM usage is at 11 GB / 32 GB, and VRAM usage is at 9.5 GB / 16 GB.

Canceling the generation while l2i is running only succeeds visually: on the canvas, the little icon showing the active and pending count decrements and disappears when it hits 0, and the queue item list shows the item as canceled. However, CPU and GPU utilization remain high. If the server is left running, utilization eventually drops, the graph stats are printed to the console, and the image appears on the canvas or in the gallery. This might be an unrelated issue that is only noticeable because l2i takes this long.

While l2i is running, sending SIGINT to the server by pressing Ctrl+C in the terminal prints the usual stack trace:

^C[2025-04-30 14:04:11,359]::[ModelInstallService]::INFO --> Installer thread 134159527483072 exiting
Traceback (most recent call last):
  File "/opt/InvokeAI/.venv/bin/invokeai-web", line 12, in <module>
    sys.exit(run_app())
             ^^^^^^^^^
  File "/opt/InvokeAI/.venv/lib/python3.12/site-packages/invokeai/app/run_app.py", line 92, in run_app
    loop.run_until_complete(server.serve())
  File "/home/mmaag/.local/share/uv/python/cpython-3.12.9-linux-x86_64-gnu/lib/python3.12/asyncio/base_events.py", line 678, in run_until_complete
    self.run_forever()
  File "/home/mmaag/.local/share/uv/python/cpython-3.12.9-linux-x86_64-gnu/lib/python3.12/asyncio/base_events.py", line 645, in run_forever
    self._run_once()
  File "/home/mmaag/.local/share/uv/python/cpython-3.12.9-linux-x86_64-gnu/lib/python3.12/asyncio/base_events.py", line 1999, in _run_once
    handle._run()
  File "/home/mmaag/.local/share/uv/python/cpython-3.12.9-linux-x86_64-gnu/lib/python3.12/asyncio/events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "/opt/InvokeAI/.venv/lib/python3.12/site-packages/uvicorn/server.py", line 69, in serve
    with self.capture_signals():
         ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mmaag/.local/share/uv/python/cpython-3.12.9-linux-x86_64-gnu/lib/python3.12/contextlib.py", line 144, in __exit__
    next(self.gen)
  File "/opt/InvokeAI/.venv/lib/python3.12/site-packages/uvicorn/server.py", line 330, in capture_signals
    signal.raise_signal(captured_signal)
KeyboardInterrupt

However, CPU and GPU utilization remain high and the process does not exit. Sending SIGINT again finally terminates the process with the message:

^CException ignored in: <module 'threading' from '/home/mmaag/.local/share/uv/python/cpython-3.12.9-linux-x86_64-gnu/lib/python3.12/threading.py'>
Traceback (most recent call last):
  File "/home/mmaag/.local/share/uv/python/cpython-3.12.9-linux-x86_64-gnu/lib/python3.12/threading.py", line 1624, in _shutdown
    lock.acquire()
KeyboardInterrupt: 
/home/mmaag/.local/share/uv/python/cpython-3.12.9-linux-x86_64-gnu/lib/python3.12/tempfile.py:936: ResourceWarning: Implicitly cleaning up <TemporaryDirectory '/tmp/tmppvbrln3n'>
  _warnings.warn(warn_message, ResourceWarning)

This too might be an unrelated issue.
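As a possible way to narrow down where the process is actually stuck, a diagnostic sketch (my own suggestion, not something Invoke ships): registering Python's standard-library faulthandler somewhere near server startup would make it possible to dump every thread's stack while l2i appears hung, without killing the process.

# Diagnostic sketch (assumption: added manually near server startup for debugging only).
# After registering, `kill -USR1 <pid>` prints every thread's traceback to stderr
# without interrupting the process, which should show where the time is being spent.
import faulthandler
import signal

faulthandler.register(signal.SIGUSR1, all_threads=True)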

What you expected to happen

l2i does not take three and a half minutes. In v5.9.1 this step took around 20 seconds for a new resolution.

How to reproduce the problem

  • Open the canvas.
  • Enter settings to generate any image you like.
  • Select a resolution that you have not used before.

Alternatively, install a fresh copy of Invoke if you want to use one of the standard resolutions.
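To time the slow step outside of Invoke, here is a rough repro sketch. It rests on my assumption that l2i is dominated by the SDXL VAE decode; the model ID ("stabilityai/sdxl-vae") and the latent shape for a 1024x1024 image are illustrative choices, not taken from Invoke's code. Run against the same torch/ROCm install, it shows whether the first decode at a fresh resolution is much slower than the second.

# Rough repro sketch (assumption: l2i is dominated by the SDXL VAE decode).
# Times the first and second decode of a random latent at a given resolution.
import time
import torch
from diffusers import AutoencoderKL

device = torch.device("cuda")  # ROCm builds of torch expose the GPU as "cuda"
vae = AutoencoderKL.from_pretrained(
    "stabilityai/sdxl-vae", torch_dtype=torch.float16
).to(device)

# A 1024x1024 image corresponds to a 4-channel 128x128 latent for SDXL.
latents = torch.randn(1, 4, 128, 128, dtype=torch.float16, device=device)

for run in ("first", "second"):
    start = time.perf_counter()
    with torch.no_grad():
        vae.decode(latents / vae.config.scaling_factor)
    torch.cuda.synchronize()
    print(f"{run} decode: {time.perf_counter() - start:.1f}s")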

Additional context

The issue does not exist in v5.9.1. It is present in v5.10.0 and v5.10.1. I initially noticed the issue after upgrading an existing installation, but it is also present with a fresh installation.

I made sure Invoke is detecting my GPU: it correctly prints AMD Radeon Graphics as the Torch device, GPU utilization is high when invoking, and denoising doesn't take forever.

I made sure Invoke is actually running the same node graph. The issue also happens when using the workflow editor. In addition, diffing the queue item JSONs only shows different node IDs and seeds but no difference in structure.
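For anyone who wants to repeat that comparison, a minimal sketch of the approach (the two file names are placeholders for the exported queue items of a quick and a slow run):

# Minimal comparison sketch; file names are placeholders for the exported
# queue item JSONs of a quick run and a slow run.
import difflib
import json

with open("queue_item_quick.json") as f_a, open("queue_item_slow.json") as f_b:
    a = json.dumps(json.load(f_a), indent=2, sort_keys=True).splitlines()
    b = json.dumps(json.load(f_b), indent=2, sort_keys=True).splitlines()

# Remaining differences should only be node IDs and seed values.
for line in difflib.unified_diff(a, b, lineterm=""):
    print(line)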

Stats for a quick run:


                          Node   Calls   Seconds  VRAM Used
             sdxl_model_loader       1    0.000s     0.325G
            sdxl_compel_prompt       2    0.005s     0.325G
                       collect       2    0.000s     0.325G
                         noise       1    0.003s     0.325G
               denoise_latents       1   31.860s     5.366G
                 core_metadata       1    0.000s     5.209G
                           l2i       1    0.630s     8.217G

Stats for a slow run:

                          Node   Calls   Seconds  VRAM Used
             sdxl_model_loader       1    0.006s     0.000G
            sdxl_compel_prompt       2    2.196s     1.569G
                       collect       2    0.001s     1.561G
                         noise       1    0.003s     1.561G
               denoise_latents       1   33.674s     6.598G
                 core_metadata       1    0.000s     6.441G
                           l2i       1  212.695s    12.107G

Stats for a 'slow' run in v5.9.1:

                          Node   Calls   Seconds  VRAM Used
             sdxl_model_loader       1    0.000s     6.567G
            sdxl_compel_prompt       2    0.008s     6.567G
                       collect       2    0.001s     6.567G
                         noise       1    0.002s     6.567G
               denoise_latents       1   18.406s     7.291G
                 core_metadata       1    0.000s     6.567G
                           l2i       1   20.318s    12.457G


Discord username

No response

Footnotes

  1. 512x512 SD 1.5 has the same issue, with a less drastic impact on run time.

  2. 100% CPU utilization here means that one core is fully in use.
