Skip to content

GPU: Refcount Vulkan allocations to fix transfer corruption on defrag#15127

Merged
thatcosmonaut merged 4 commits intolibsdl-org:mainfrom
thatcosmonaut:defrag_corruption_fix
Mar 10, 2026
Merged

GPU: Refcount Vulkan allocations to fix transfer corruption on defrag#15127
thatcosmonaut merged 4 commits intolibsdl-org:mainfrom
thatcosmonaut:defrag_corruption_fix

Conversation

@thatcosmonaut
Copy link
Copy Markdown
Collaborator

@thatcosmonaut thatcosmonaut commented Feb 27, 2026

Attempting to resolve #15123

The problem is that defrag can execute on allocations that currently have pending transfer operations. Furthermore, in a multi-threaded situation it's possible to queue up a transfer on a resource whose block is currently being defragmented. The fix is to refcount those allocations on transfer operations, and then only defrag blocks that don't have any references. Additionally we perform some mutex locks to prevent transfers on blocks that are currently being restructured for defrag.

This should probably be stress tested a bit on some real apps because the defrag function is very sensitive.

Running this test program demonstrates that the patch resolves the issue in the single-threaded case.

#include <SDL3/SDL.h>

#define BUFFER_SIZE 4096
#define INIT_PATTERN 0x55
#define TEST_PATTERN 0xAA

static SDL_GPUBuffer* create_gpu_buffer(SDL_GPUDevice *device, Uint32 size) {
    SDL_GPUBufferCreateInfo info = {
        SDL_GPU_BUFFERUSAGE_VERTEX,
        size,
    };
    return SDL_CreateGPUBuffer(device, &info);
}

static SDL_GPUTransferBuffer* create_upload_buffer(SDL_GPUDevice *device, Uint32 size) {
    SDL_GPUTransferBufferCreateInfo info = {
        SDL_GPU_TRANSFERBUFFERUSAGE_UPLOAD,
        size,
    };
    return SDL_CreateGPUTransferBuffer(device, &info);
}

static SDL_GPUTransferBuffer* create_download_buffer(SDL_GPUDevice *device, Uint32 size) {
    SDL_GPUTransferBufferCreateInfo info = {
        SDL_GPU_TRANSFERBUFFERUSAGE_DOWNLOAD,
        size,
    };
    return SDL_CreateGPUTransferBuffer(device, &info);
}

static SDL_GPUCommandBuffer* record_upload(SDL_GPUDevice *device,
                                           SDL_GPUTransferBuffer *src,
                                           SDL_GPUBuffer *dst,
                                           Uint32 size) {
    SDL_GPUCommandBuffer *cmd = SDL_AcquireGPUCommandBuffer(device);
    SDL_GPUCopyPass *pass = SDL_BeginGPUCopyPass(cmd);
    SDL_GPUTransferBufferLocation srcLoc = { src };
    SDL_GPUBufferRegion dstRegion = { dst, 0, size };
    SDL_UploadToGPUBuffer(pass, &srcLoc, &dstRegion, false);
    SDL_EndGPUCopyPass(pass);
    return cmd;
}

static SDL_GPUCommandBuffer* record_download(SDL_GPUDevice *device,
                                             SDL_GPUBuffer *src,
                                             SDL_GPUTransferBuffer *dst,
                                             Uint32 size) {
    SDL_GPUCommandBuffer *cmd = SDL_AcquireGPUCommandBuffer(device);
    SDL_GPUCopyPass *pass = SDL_BeginGPUCopyPass(cmd);
    SDL_GPUBufferRegion srcRegion = { src, 0, size };
    SDL_GPUTransferBufferLocation dstLoc = { dst, };
    SDL_DownloadFromGPUBuffer(pass, &srcRegion, &dstLoc);
    SDL_EndGPUCopyPass(pass);
    return cmd;
}

static void submit_and_wait(SDL_GPUDevice *device, SDL_GPUCommandBuffer *cmd) {
    SDL_SubmitGPUCommandBuffer(cmd);
    SDL_WaitForGPUIdle(device);
}

static void fill_transfer_buffer(SDL_GPUDevice *device, SDL_GPUTransferBuffer *buffer,
                                 unsigned char pattern, Uint32 size) {
    void *data = SDL_MapGPUTransferBuffer(device, buffer, false);
    SDL_memset(data, pattern, size);
    SDL_UnmapGPUTransferBuffer(device, buffer);
}

static void read_and_print_gpu_buffer(SDL_GPUDevice *device,
                                      SDL_GPUBuffer *gpuBuffer,
                                      SDL_GPUTransferBuffer *downloadBuffer,
                                      Uint32 size,
                                      const char *label) {
    submit_and_wait(device, record_download(device, gpuBuffer, downloadBuffer, size));
    unsigned char *data = (unsigned char *)SDL_MapGPUTransferBuffer(device, downloadBuffer, false);
    SDL_Log("%s: [", label);
    for (int i = 0; i < 16 && i < (int)size; i++) {
        SDL_Log("%02X ", data[i]);
    }
    SDL_Log("...]\n");
    SDL_UnmapGPUTransferBuffer(device, downloadBuffer);
}

#define NUM_FRAG_BUFFERS 14 // Fill ~14MB of the 16MB page
static const size_t FRAG_BUFFER_SIZE = 1 * 1024 * 1024;  // 1 MB each
static const size_t TRIGGER_BUFFER_SIZE = 2 * 1024 * 1024;  // 2 MB
static SDL_GPUBuffer* TEMP_BUFFERS[NUM_FRAG_BUFFERS];

static void trigger_defragmentation(SDL_GPUDevice *device) {
    SDL_Log("Allocating %d x %zu KB buffers...\n", NUM_FRAG_BUFFERS, FRAG_BUFFER_SIZE / 1024);
    for (int i = 0; i < NUM_FRAG_BUFFERS; i++) {
        TEMP_BUFFERS[i] = create_gpu_buffer(device, FRAG_BUFFER_SIZE);
    }
    SDL_Log("Created %d buffers\n", NUM_FRAG_BUFFERS);

    SDL_Log("Releasing alternating buffers to create fragmentation...\n");
    for (size_t i = 0; i < NUM_FRAG_BUFFERS; i += 2) {
        SDL_ReleaseGPUBuffer(device, TEMP_BUFFERS[i]);
        TEMP_BUFFERS[i] = NULL;
    }

    submit_and_wait(device, SDL_AcquireGPUCommandBuffer(device));

    SDL_Log("Allocating %zu KB trigger buffer (won't fit in 1MB holes)...\n", TRIGGER_BUFFER_SIZE / 1024);
    SDL_GPUBuffer *triggerBuffer = create_gpu_buffer(device, TRIGGER_BUFFER_SIZE);

    SDL_Log("Submitting empty command buffer to execute defrag...\n");
    submit_and_wait(device, SDL_AcquireGPUCommandBuffer(device));
    SDL_Log("Empty command buffer completed (defrag should have run)\n");

    for (size_t i = 0; i < NUM_FRAG_BUFFERS; i += 1) {
        if (TEMP_BUFFERS[i]) {
            SDL_ReleaseGPUBuffer(device, TEMP_BUFFERS[i]);
        }
    }
    SDL_ReleaseGPUBuffer(device, triggerBuffer);
}

int main() {
    SDL_SetHint(SDL_HINT_VIDEO_DRIVER, "offscreen");
    SDL_Init(SDL_INIT_VIDEO);

    SDL_GPUDevice *device = SDL_CreateGPUDevice(
        SDL_GPU_SHADERFORMAT_SPIRV,
        true,
        NULL
    );

    SDL_Log("GPU device created successfully\n");

    SDL_GPUBuffer *targetBuffer = create_gpu_buffer(device, BUFFER_SIZE);

    SDL_GPUTransferBuffer *uploadBuffer = create_upload_buffer(device, BUFFER_SIZE);
    SDL_GPUTransferBuffer *downloadBuffer = create_download_buffer(device, BUFFER_SIZE);

    SDL_Log("Buffers created\n");

    SDL_Log("\n[Step 1] Initialize buffer with 0x%02X\n", INIT_PATTERN);
    fill_transfer_buffer(device, uploadBuffer, INIT_PATTERN, BUFFER_SIZE);
    submit_and_wait(device, record_upload(device, uploadBuffer, targetBuffer, BUFFER_SIZE));
    read_and_print_gpu_buffer(device, targetBuffer, downloadBuffer, BUFFER_SIZE,
                              "  Buffer contents");

    SDL_Log("\n[Step 2] Record cmdDeferredWrite to write 0x%02X (but don't submit yet)\n", TEST_PATTERN);
    fill_transfer_buffer(device, uploadBuffer, TEST_PATTERN, BUFFER_SIZE);
    SDL_GPUCommandBuffer *cmdDeferredWrite = record_upload(device, uploadBuffer, targetBuffer, BUFFER_SIZE);
    read_and_print_gpu_buffer(device, targetBuffer, downloadBuffer, BUFFER_SIZE,
                              "  Buffer contents (cmdDeferredWrite not yet submitted)");

    SDL_Log("\n[Step 3] Trigger defragmentation (cmdDeferredWrite still pending)\n");
    trigger_defragmentation(device);
    read_and_print_gpu_buffer(device, targetBuffer, downloadBuffer, BUFFER_SIZE,
                              "  Buffer contents (after defrag)");

    SDL_Log("\n[Step 4] Submit cmdDeferredWrite (AFTER defrag has run)\n");
    submit_and_wait(device, cmdDeferredWrite);
    read_and_print_gpu_buffer(device, targetBuffer, downloadBuffer, BUFFER_SIZE,
                              "  Buffer contents (after cmdDeferredWrite)");

    submit_and_wait(device, record_download(device, targetBuffer, downloadBuffer, BUFFER_SIZE));
    unsigned char *bytes = (unsigned char *)SDL_MapGPUTransferBuffer(device, downloadBuffer, false);
    bool bug_detected = (bytes[0] == INIT_PATTERN);
    SDL_UnmapGPUTransferBuffer(device, downloadBuffer);

    SDL_Log("\n=== RESULT ===\n");
    if (bug_detected) {
        SDL_Log("BUG DETECTED: Buffer contains 0x%02X, expected 0x%02X\n", INIT_PATTERN, TEST_PATTERN);
        SDL_Log("The deferred write went to the OLD buffer location (before defrag moved it).\n");
    } else {
        SDL_Log("TEST PASSED: Buffer contains 0x%02X as expected.\n", TEST_PATTERN);
    }

    SDL_ReleaseGPUBuffer(device, targetBuffer);
    SDL_ReleaseGPUTransferBuffer(device, uploadBuffer);
    SDL_ReleaseGPUTransferBuffer(device, downloadBuffer);
    SDL_DestroyGPUDevice(device);
    SDL_Quit();

    return bug_detected ? 1 : 0;
}

@slouken
Copy link
Copy Markdown
Collaborator

slouken commented Feb 27, 2026

I don't know the code well enough to know if this is a correct solution, but I did notice that you're mixing locks and atomic variables. If the variable you're looking at is protected by a lock, you don't need to make the variable atomic.

@thatcosmonaut
Copy link
Copy Markdown
Collaborator Author

I don't know the code well enough to know if this is a correct solution, but I did notice that you're mixing locks and atomic variables. If the variable you're looking at is protected by a lock, you don't need to make the variable atomic.

The atomics are for refcounting, they are generally not protected by locks.

@slouken
Copy link
Copy Markdown
Collaborator

slouken commented Feb 27, 2026

One missing unlock, but this otherwise looks good to me. Feel free to squash and merge and cherry-pick at will.

VkDeviceSize usedSpace;
Uint8 *mapPointer;
SDL_Mutex *memoryLock;
SDL_AtomicInt referenceCount; // Used to avoid defrag races
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe specify when referenceCount should be incremented/decremented here in a comment so anyone showing up later knows when to do it

// Find an allocation that doesn't have any pending transfer operations
Sint32 indexToDefrag = -1;
for (Sint32 i = renderer->allocationsToDefragCount - 1; i >= 0; i -= 1) {
if (SDL_GetAtomicInt(&renderer->allocationsToDefrag[i]->referenceCount) == 0) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest asserting in debug builds that this value never drops below 0, since if it does due to a bug things will break silently

Copy link
Copy Markdown
Collaborator

@flibitijibibo flibitijibibo left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry this took so long - as far as functionality is concerned it all seems to work with SOR4 which is our best stress test for threaded loading and unloading, but it's not torturethread stressful, so this should probably only be merged once we know for sure the lock timing is correct.

@kg
Copy link
Copy Markdown
Contributor

kg commented Mar 10, 2026

Forgot to come back and say that I also tested this and it seems to work fine for my game.

@thatcosmonaut thatcosmonaut merged commit b4b9a03 into libsdl-org:main Mar 10, 2026
45 checks passed
@thatcosmonaut thatcosmonaut deleted the defrag_corruption_fix branch March 10, 2026 20:34
@thatcosmonaut
Copy link
Copy Markdown
Collaborator Author

Merged, this can be cherry-picked.

@slouken
Copy link
Copy Markdown
Collaborator

slouken commented Mar 10, 2026

Merged, this can be cherry-picked.

This doesn't cleanly merge, can you create a PR for that?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

SDL_GPU vulkan backend: defragmentation causes data corruption with multiple command buffers

4 participants