🛡️ fix: Address Codex round 3 — approval expiry lifecycle completeness

Three P2 findings on 780833d908, all valid:

- H1 status consistency: /chat/status now treats a non-expired
  requires_action job as active (matching /chat/active), so a client
  refreshing while an approval is pending resumes/subscribes instead of
  treating the run as finished and stranding it.
- H2 active expiry: cleanup now finalizes past-expiry requires_action jobs
  (→ aborted) in both stores instead of only filtering them from the active
  list — an expired prompt no longer lingers resident until key TTL. Redis
  routes through transitionStatus (terminal content cleanup); in-memory marks
  terminal + reclaims.
- H3 resumed liveness: in-memory stale-running check uses
  max(lastActivity, lastActiveAt, createdAt), so a just-resumed job isn't
  reaped on a stale per-chunk lastActivity entry before the next chunk.

Test added: in-memory cleanup finalizes + reclaims a past-expiry approval.
tsc + lint clean; policy + type-contract specs pass.
This commit is contained in:
Danny Avila 2026-06-16 14:32:06 -04:00
parent 780833d908
commit 4324a4e776
4 changed files with 68 additions and 5 deletions

View file

@ -202,7 +202,14 @@ router.get('/chat/status/:conversationId', async (req, res) => {
// Get resume state which contains aggregatedContent
// Avoid calling both getStreamInfo and getResumeState (both fetch content)
const resumeState = await GenerationJobManager.getResumeState(conversationId);
const isActive = job.status === 'running';
// A job paused for human review is still active (consistent with /chat/active),
// so the client resumes/subscribes rather than treating it as finished — unless
// its prompt has already expired (cleanup/expiry will finalize it).
const pendingExpired =
job.metadata.pendingAction?.expiresAt != null &&
job.metadata.pendingAction.expiresAt <= Date.now();
const isActive =
job.status === 'running' || (job.status === 'requires_action' && !pendingExpired);
res.json({
active: isActive,

View file

@ -228,3 +228,24 @@ describe('ApprovalLifecycle via GenerationJobManager.approvals (in-memory)', ()
});
});
});
describe('InMemoryJobStore — approval expiry cleanup', () => {
test('cleanup() finalizes and reclaims a past-expiry pending-approval job', async () => {
const store = new InMemoryJobStore({ ttlAfterComplete: 0 });
await store.createJob('s1', 'u1');
const action = buildPendingAction(
buildToolApprovalPayload([{ name: 'shell', arguments: {}, tool_call_id: 'c1' }]),
{ streamId: 's1', ttlMs: -1000 },
);
await store.transitionStatus('s1', {
from: 'running',
to: 'requires_action',
patch: { pendingAction: action, pendingActionId: action.actionId },
});
// A past-expiry approval must be finalized + reclaimed, not left resident.
await store.cleanup();
expect(await store.getJob('s1')).toBeNull();
});
});

View file

@ -207,14 +207,32 @@ export class InMemoryJobStore implements IJobStore {
if (this.ttlAfterComplete === 0 || now - job.completedAt > this.ttlAfterComplete) {
toDelete.push(streamId);
}
} else if (job.status === 'requires_action' && isPendingActionExpired(job)) {
// Past-due approval: finalize it (aborted) so it stops occupying the
// user slot and its content state is reclaimed, mirroring
// ApprovalLifecycle.expire(). Skipping it (active-list filter) alone
// would leave the job resident indefinitely.
job.status = 'aborted';
job.completedAt = now;
job.error = 'Approval expired before a decision was made';
delete job.pendingAction;
delete job.pendingActionId;
if (this.ttlAfterComplete === 0) {
toDelete.push(streamId);
}
} else if (this.staleJobTimeout > 0 && job.status === 'running') {
// Failsafe: reap jobs stuck in "running" with no generation activity for
// longer than the stale timeout. These are crashed/hung generations that
// never reached a terminal state; without this they accumulate their
// content state in memory until the process OOMs. Reaping keys off last
// activity (not creation time) so a long but live stream is never reaped,
// mirroring RedisJobStore refreshing the running TTL on each chunk.
const lastActive = this.lastActivity.get(streamId) ?? job.lastActiveAt ?? job.createdAt;
// content state in memory until the process OOMs. Reaping keys off the
// most recent liveness signal (not creation time) so a long but live
// stream is never reaped, and a just-resumed approval (fresh
// `lastActiveAt`) wins over a stale per-chunk `lastActivity` entry.
const lastActive = Math.max(
this.lastActivity.get(streamId) ?? 0,
job.lastActiveAt ?? 0,
job.createdAt,
);
if (now - lastActive > this.staleJobTimeout) {
toDelete.push(streamId);
staleRunning++;

View file

@ -675,6 +675,23 @@ export class RedisJobStore implements IJobStore {
return 1;
}
// Past-due approval: finalize it (aborted) so it stops occupying the
// slot and its stream contents are reclaimed, mirroring
// ApprovalLifecycle.expire(). transitionStatus runs the terminal
// content cleanup (sets, chunks, run-steps, userJobs, completed TTL).
if (isPendingActionExpired(job)) {
await this.transitionStatus(streamId, {
from: 'requires_action',
to: 'aborted',
clear: ['pendingAction', 'pendingActionId'],
patch: {
error: 'Approval expired before a decision was made',
completedAt: Date.now(),
},
});
return 1;
}
return 0;
}),
);