Skip to content

[Bug] Very rare nondeterministic error #571

@roxblnfk

Description

@roxblnfk

From Slack chat

Describe the bug

I'm very infrequently getting determinism errors for a reasonably simple workflow
When replaying, I get a [TMPRL1100] nondeterministic workflow: extra replay command for ScheduleActivityTask: (ActivityId:61, ActivityType:(Name:Programmes.Programme_getContent)

From the Temporal UI it looks like the worker did not schedule get Programme_getContent as expected (seems to have stalled?), but on replay it is doing

Temporal UI

History: d0abc853-0fca-43d9-ab68-b54c61cc5ee7_events (1).json

Minimal Reproduction

<?php

declare(strict_types=1);

namespace Temporal\Tests\Workflow;

use Ramsey\Uuid\Uuid;
use React\Promise\PromiseInterface;
use Temporal\Activity;
use Temporal\Activity\ActivityInterface;
use Temporal\Client\WorkflowClientInterface;
use Temporal\Internal\Workflow\ActivityProxy;
use Temporal\Promise;
use Temporal\Workflow;
use Temporal\Workflow\WorkflowInterface;
use Temporal\Workflow\WorkflowMethod;

#[WorkflowInterface]
class UserCaseWorkflow
{
    private ActivityProxy|UserCaseActivity $activity;
    private int $moduleIndex = 0;
    private bool $currentRead = false;
    private string $currentUrl = '';

    public function __construct()
    {
        $this->activity = Workflow::newActivityStub(
            UserCaseActivity::class,
            Activity\ActivityOptions::new()
                ->withStartToCloseTimeout('1 day'),
        );
    }

    #[WorkflowMethod(name: 'UserCaseWorkflow')]
    public function run(string $enrolment)
    {
        while (true) {
            $nextContent = yield $this->activity->getContent($enrolment, $this->moduleIndex + 1);

            if (!$nextContent) {
                break;
            }

            $this->currentRead = false;
            $this->currentUrl = $nextContent;
            $this->moduleIndex++;

            yield $this->activity->moduleStarted($enrolment, $this->moduleIndex, $nextContent);

            $delay = yield Workflow::sideEffect(static fn() => '6 seconds');

            yield Promise::all([
                Workflow::timer($delay),
                Workflow::async(function () use ($enrolment) {
                    yield Workflow::await(fn() => $this->currentRead);

                    yield $this->activity->moduleCompleted($enrolment, $this->moduleIndex);
                }),
            ]);
        }

        yield $this->activity->programmeCompleted($enrolment);
    }

    #[Workflow\SignalMethod]
    public function markCompleted(int $index): void
    {
        if ($this->moduleIndex === $index) {
            $this->currentRead = true;
        }
    }
}

#[ActivityInterface('UserCaseActivity.')]
class UserCaseActivity
{
    public function __construct(
        private WorkflowClientInterface $client,
    ) {}

    /**
     * @return PromiseInterface<non-empty-string|null>
     */
    public function getContent(string $enrolment, int $moduleIndex)
    {
        return \mt_rand(0, 10 - $moduleIndex) === 0 ? null : Uuid::uuid4()->__toString();
    }

    /**
     * @return PromiseInterface<null>
     */
    public function moduleStarted(string $enrolment, int $moduleIndex, string $content)
    {
        \usleep((int)\round(2.2 * 1_000_000.0));

        $this->client->newRunningWorkflowStub(
            UserCaseWorkflow::class,
            Activity::getInfo()->workflowExecution->getID(),
        )->markCompleted($moduleIndex);
    }

    /**
     * @return PromiseInterface<null>
     */
    public function moduleCompleted(string $enrolment, int $moduleIndex) {}

    /**
     * @return PromiseInterface<null>
     */
    public function programmeCompleted(string $enrolment) {}
}

Similar issue. Got log (PHP 8.3.19):

ERROR    temporal        Workflow panic    {"Namespace": "default", "TaskQueue": "order.creation", "WorkerID": "order.creation:9a1377d9-6662-486c-a9e3-cbbc90fd2e7f", "WorkflowType": "order.creation", "WorkflowID": "5fc5df88-0200-4a13-abde-deb61a2ed28f", "RunID": "e4570e70-10de-465c-a9ed-2c4967841532", "Attempt": 5821, "Error": "[TMPRL1100] lookup failed for scheduledEventID to activityID: scheduleEventID: 71, activityID: 71", "StackTrace": "process event for order.creation [panic]:\ngo.temporal.io/sdk/internal.panicIllegalState(...)\n\tgo.temporal.io/sdk@v1.33.0/internal/internal_command_state_machine.go:521\ngo.temporal.io/sdk/internal.(*commandsHelper).handleActivityTaskScheduled(0xc001c2d040, {0xc001f1c7fe, 0x2}, 0x47)\n\tgo.temporal.io/sdk@v1.33.0/internal/internal_command_state_machine.go:1156 +0xf8\ngo.temporal.io/sdk/internal.(*workflowExecutionEventHandlerImpl).ProcessEvent(0xc00240cb58, 0xc002005e60, 0x3?, 0x0)\n\tgo.temporal.io/sdk@v1.33.0/internal/internal_event_handlers.go:1231 +0x3f2\ngo.temporal.io/sdk/internal.(*workflowExecutionContextImpl).ProcessWorkflowTask(0xc0005654a0, 0xc0023aa420)\n\tgo.temporal.io/sdk@v1.33.0/internal/internal_task_handlers.go:1182 +0x1a8a\ngo.temporal.io/sdk/internal.(*workflowTaskHandlerImpl).ProcessWorkflowTask(0xc001266000, 0xc0023aa420, 0xc0005654a0, 0xc001b8cba0)\n\tgo.temporal.io/sdk@v1.33.0/internal/internal_task_handlers.go:929 +0x59e\ngo.temporal.io/sdk/internal.(*workflowTaskPoller).processWorkflowTask(0xc00012a000, 0xc0023aa420)\n\tgo.temporal.io/sdk@v1.33.0/internal/internal_task_pollers.go:424 +0x3db\ngo.temporal.io/sdk/internal.(*workflowTaskPoller).ProcessTask(0xc00012a000, {0x1f67f40, 0xc0023aa420})\n\tgo.temporal.io/sdk@v1.33.0/internal/internal_task_pollers.go:372 +0x205\ngo.temporal.io/sdk/internal.(*baseWorker).processTaskAsync.func1()\n\tgo.temporal.io/sdk@v1.33.0/internal/internal_worker_base.go:453 +0x12f\ncreated by go.temporal.io/sdk/internal.(*baseWorker).processTaskAsync in goroutine 203\n\tgo.temporal.io/sdk@v1.33.0/internal/internal_worker_base.go:432 +0x8c"}
WARN    temporal        Failed to process workflow task.    {"Namespace": "default", "TaskQueue": "order.creation", "WorkerID": "order.creation:9a1377d9-6662-486c-a9e3-cbbc90fd2e7f", "WorkflowType": "order.creation", "WorkflowID": "5fc5df88-0200-4a13-abde-deb61a2ed28f", "RunID": "e4570e70-10de-465c-a9ed-2c4967841532", "Attempt": 5822, "Error": "[TMPRL1100] lookup failed for scheduledEventID to activityID: scheduleEventID: 71, activityID: 71"}

Metadata

Metadata

Labels

BugSomething isn't working

Type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions