fix: harden Hermes stream recovery around tool boundaries (#189)

2026-04-24 15:42:42 +02:00
parent edd41e6eb7
commit 009acc1c28
6 changed files with 496 additions and 114 deletions
@@ -72,7 +72,9 @@ export function streamRunEvents(
        onDone()
      }
    } catch {
-      onEvent({ event: 'message', delta: e.data })
+      // Some SSE adapters may deliver raw text frames. Treat them as
+      // assistant deltas so they render instead of being silently ignored.
+      onEvent({ event: 'message.delta', delta: e.data })
    }
  }

@@ -1,5 +1,5 @@
 import { startRun, streamRunEvents, type ChatMessage, type RunEvent } from '@/api/hermes/chat'
-import { deleteSession as deleteSessionApi, fetchSession, fetchSessions, fetchSessionUsageSingle, type HermesMessage, type SessionSummary } from '@/api/hermes/sessions'
+import { deleteSession as deleteSessionApi, fetchSession, fetchSessions, fetchSessionUsageSingle, type HermesMessage, type SessionDetail, type SessionSummary } from '@/api/hermes/sessions'
 import { defineStore } from 'pinia'
 import { ref, computed } from 'vue'
 import { useAppStore } from './app'
@@ -162,6 +162,42 @@ function mapHermesSession(s: SessionSummary): Session {
  }
 }

+function assistantTranscriptLength(msgs: Message[]): number {
+  return msgs.reduce((total, m) => total + (m.role === 'assistant' ? (m.content?.length ?? 0) : 0), 0)
+}
+
+function userTurnCount(msgs: Message[]): number {
+  return msgs.filter(m => m.role === 'user').length
+}
+
+function serverMessagesAreAheadOrEqual(local: Message[], server: Message[]): boolean {
+  const localUsers = userTurnCount(local)
+  const serverUsers = userTurnCount(server)
+  return serverUsers > localUsers
+    || (serverUsers === localUsers && assistantTranscriptLength(server) >= assistantTranscriptLength(local))
+}
+
+function serverHasCaughtUpToLocalTurn(local: Message[], server: Message[]): boolean {
+  return userTurnCount(server) >= userTurnCount(local)
+}
+
+function applySessionMetaFromDetail(target: Session, detail: SessionDetail) {
+  if (detail.title) target.title = detail.title
+  target.endedAt = detail.ended_at != null ? Math.round(detail.ended_at * 1000) : null
+  target.lastActiveAt = detail.last_active != null ? Math.round(detail.last_active * 1000) : target.lastActiveAt
+  target.updatedAt = Math.round((detail.last_active || detail.ended_at || target.updatedAt / 1000) * 1000)
+}
+
+function applyServerMessagesIfAhead(target: Session, detail: SessionDetail): boolean {
+  const mapped = mapHermesMessages(detail.messages || [])
+  applySessionMetaFromDetail(target, detail)
+  if (serverMessagesAreAheadOrEqual(target.messages, mapped)) {
+    target.messages = mapped
+    return true
+  }
+  return false
+}
+
 // Cache keys for stale-while-revalidate loading of sessions / messages.
 // All keys include the active profile name to isolate cache between profiles.
 // Rendering from cache on boot avoids the multi-round-trip wait the user sees
@@ -356,8 +392,10 @@ export const useChatStore = defineStore('chat', () => {
    saveJsonWithLegacy(inFlightKey(sid), { runId, startedAt: Date.now() } as InFlightRun, legacyInFlightKey(sid))
  }

-  function clearInFlight(sid: string) {
+  function clearInFlight(sid: string, runId?: string): boolean {
+    if (runId && readInFlight(sid)?.runId !== runId) return false
    removeItemWithLegacy(inFlightKey(sid), legacyInFlightKey(sid))
+    return true
  }

  function readInFlight(sid: string): InFlightRun | null {
@@ -381,8 +419,8 @@ export const useChatStore = defineStore('chat', () => {
  }

  // Poll fetchSession while an in-flight run is recovering. Exits when the
-  // server's message signature is stable for POLL_STABLE_EXITS ticks (run
-  // presumed done), TTL elapses, or the user explicitly starts streaming.
+  // server reports a terminal session and its message signature is stable for
+  // POLL_STABLE_EXITS ticks, TTL elapses, or the user explicitly starts streaming.
  function startPolling(sid: string) {
    if (pollTimers.has(sid)) return
    resumingRuns.value = new Set([...resumingRuns.value, sid])
@@ -403,34 +441,21 @@ export const useChatStore = defineStore('chat', () => {
        const mapped = mapHermesMessages(detail.messages || [])
        const target = sessions.value.find(s => s.id === sid)
        if (!target) return
-        // Use the same "content-aware" comparison as switchSession: server
-        // is ahead iff it knows about at least as many user turns and its
-        // last assistant text is at least as long as ours.
-        const local = target.messages
-        const localLastAssistant = [...local].reverse().find(m => m.role === 'assistant')
-        const serverLastAssistant = [...mapped].reverse().find(m => m.role === 'assistant')
-        const localAssistantLen = localLastAssistant?.content?.length ?? 0
-        const serverAssistantLen = serverLastAssistant?.content?.length ?? 0
-        const localUsers = local.filter(m => m.role === 'user').length
-        const serverUsers = mapped.filter(m => m.role === 'user').length
-        const serverIsCaughtUp = serverUsers >= localUsers
-        // Same rationale as switchSession: strictly more user turns means
-        // server is ahead (new turn complete). Equal user turns + longer
-        // assistant means server caught up on the current turn.
-        const serverIsAhead =
-          serverUsers > localUsers
-          || (serverUsers === localUsers && serverAssistantLen >= localAssistantLen)
+        const serverIsCaughtUp = serverHasCaughtUpToLocalTurn(target.messages, mapped)
+        const serverIsAhead = serverMessagesAreAheadOrEqual(target.messages, mapped)
+        const serverIsTerminal = detail.ended_at != null
+
+        applySessionMetaFromDetail(target, detail)
        if (serverIsAhead) {
          target.messages = mapped
-          if (detail.title && !target.title) target.title = detail.title
          if (sid === activeSessionId.value) persistActiveMessages()
        }
-        // Stability detection ONLY matters when the server has at least as
-        // many user turns as we do. Otherwise the server is still catching
-        // up (e.g. the new turn we just sent hasn't been flushed server-side
-        // yet) and a "stable" signature is a false positive — the stability
-        // is the server NOT having our latest turn, not the run being done.
-        if (!serverIsCaughtUp) {
+
+        // Stability detection ONLY matters when the server has caught up to
+        // our latest user turn AND the session is terminal. During long tool
+        // calls the persisted transcript may be stable while the run is still
+        // active; treating that as completion is the truncation failure mode.
+        if (!serverIsCaughtUp || !serverIsTerminal) {
          pollSignatures.delete(sid)
        } else {
          const last = mapped[mapped.length - 1]
@@ -439,14 +464,13 @@ export const useChatStore = defineStore('chat', () => {
          if (prev && prev.sig === sig) {
            prev.stableTicks += 1
            if (prev.stableTicks >= POLL_STABLE_EXITS) {
-              // Run is done on the server. Force-apply server view even if
-              // our "don't retreat" guard above skipped it — the server is
-              // now the authoritative source of truth.
-              target.messages = mapped
-              if (detail.title) target.title = detail.title
+              // Server confirms this run is terminal. Keep any longer local
+              // stream text if the final session export lags behind.
+              if (serverIsAhead) target.messages = mapped
              if (sid === activeSessionId.value) persistActiveMessages()
-              clearInFlight(sid)
-              stopPolling(sid)
+              if (clearInFlight(sid, inFlight.runId)) {
+                stopPolling(sid)
+              }
            }
          } else {
            pollSignatures.set(sid, { sig, stableTicks: 0 })
@@ -521,9 +545,9 @@ export const useChatStore = defineStore('chat', () => {
    }
  }

-  // Re-pull active session from server and overwrite local messages. Used on
-  // SSE drop and on tab-visible events — mobile browsers kill EventSource
-  // while backgrounded, but the backend run usually completes anyway.
+  // Re-pull active session from server without retreating from longer local
+  // streamed text. Used on SSE drop and tab-visible events — mobile browsers
+  // can kill EventSource while the backend run continues.
  async function refreshActiveSession(): Promise<boolean> {
    const sid = activeSessionId.value
    if (!sid) return false
@@ -532,17 +556,28 @@ export const useChatStore = defineStore('chat', () => {
      if (!detail) return false
      const target = sessions.value.find(s => s.id === sid)
      if (!target) return false
-      const mapped = mapHermesMessages(detail.messages || [])
-      target.messages = mapped
-      if (detail.title) target.title = detail.title
-      persistActiveMessages()
-      return true
+      const applied = applyServerMessagesIfAhead(target, detail)
+      if (applied && sid === activeSessionId.value) persistActiveMessages()
+      return applied
    } catch (err) {
      console.error('Failed to refresh active session:', err)
      return false
    }
  }

+  async function reconcileSessionAfterCompletion(sid: string): Promise<void> {
+    try {
+      const detail = await fetchSession(sid)
+      if (!detail) return
+      const target = sessions.value.find(s => s.id === sid)
+      if (!target) return
+      const applied = applyServerMessagesIfAhead(target, detail)
+      if (applied && sid === activeSessionId.value) persistActiveMessages()
+    } catch (err) {
+      console.error('Failed to reconcile completed session:', err)
+    }
+  }
+

  function createSession(): Session {
    const session: Session = {
@@ -588,34 +623,14 @@ export const useChatStore = defineStore('chat', () => {
      const detail = await fetchSession(sessionId)
      if (detail && detail.messages) {
        const mapped = mapHermesMessages(detail.messages)
-        // Pick whichever view has more information. Simple length comparison
-        // is wrong because mapHermesMessages folds tool_call-only assistant
-        // msgs and matches them with tool-result msgs — so post-fold `mapped`
-        // can be SHORTER than the raw SSE-built local array even when the
-        // server is strictly ahead. Instead, compare the last assistant
-        // message content: if the server's is at least as long, the server
-        // is up-to-date (and has the final complete text); otherwise keep
-        // local (in-flight window where server hasn't flushed the new turn).
-        const local = activeSession.value.messages
-        const localLastAssistant = [...local].reverse().find(m => m.role === 'assistant')
-        const serverLastAssistant = [...mapped].reverse().find(m => m.role === 'assistant')
-        const localAssistantLen = localLastAssistant?.content?.length ?? 0
-        const serverAssistantLen = serverLastAssistant?.content?.length ?? 0
-        const localUsers = local.filter(m => m.role === 'user').length
-        const serverUsers = mapped.filter(m => m.role === 'user').length
-        // Trust server when:
-        //   - it has STRICTLY MORE user turns than we do (new turn landed),
-        //     OR
-        //   - same user-turn count AND server's last assistant is at least
-        //     as long as ours (same turn, server caught up or further)
-        // Otherwise keep local (protects against the server-not-yet-flushed
-        // race during in-flight runs). Length comparison alone is wrong
-        // across different turns because each turn's last assistant is
-        // unrelated to the previous turn's.
-        const serverIsAhead =
-          serverUsers > localUsers
-          || (serverUsers === localUsers && serverAssistantLen >= localAssistantLen)
-        if (serverIsAhead) {
+        // Pick whichever view has more information. Simple array length
+        // comparison is wrong because mapHermesMessages folds tool-call-only
+        // assistant msgs into tool-result msgs. Also, tool boundaries can
+        // split one assistant turn into pre-tool and post-tool assistant
+        // segments, so comparing only the last assistant segment can retreat
+        // a fuller local transcript to stale pre-tool server text. Compare
+        // user-turn count plus total assistant transcript length instead.
+        if (serverMessagesAreAheadOrEqual(activeSession.value.messages, mapped)) {
          activeSession.value.messages = mapped
        }
        // Update title: use Hermes title, or fallback to first user message
@@ -823,7 +838,8 @@ export const useChatStore = defineStore('chat', () => {
            case 'run.started':
              break

-            case 'message.delta': {
+            case 'message.delta':
+            case 'message': {
              const msgs = getSessionMsgs(sid)
              const last = msgs[msgs.length - 1]
              if (last?.role === 'assistant' && last.isStreaming) {
@@ -888,14 +904,14 @@ export const useChatStore = defineStore('chat', () => {
              }
              cleanup()
              updateSessionTitle(sid)
-              // the in-flight marker. If the browser is reloading right now
-              // and kills us between the two localStorage writes, we want
-              // the next page load to still see in-flight === true (so
-              // polling kicks in and recovers) rather than the other way
-              // around (cleared in-flight + stale streaming cache = UI stuck).
+              // Persist the terminal local view before clearing the in-flight
+              // marker. If final SSE deltas were missed, reconcile once from
+              // the authoritative session export without retreating from
+              // longer local text.
              if (sid === activeSessionId.value) persistActiveMessages()
-              clearInFlight(sid)
-              stopPolling(sid)
+              void reconcileSessionAfterCompletion(sid).finally(() => {
+                if (readInFlight(sid)?.runId === runId) startPolling(sid)
+              })
              break
            }

@@ -923,8 +939,9 @@ export const useChatStore = defineStore('chat', () => {
              })
              cleanup()
              if (sid === activeSessionId.value) persistActiveMessages()
-              clearInFlight(sid)
-              stopPolling(sid)
+              if (clearInFlight(sid, runId)) {
+                stopPolling(sid)
+              }
              break
            }
          }
@@ -953,13 +970,9 @@ export const useChatStore = defineStore('chat', () => {
          if (last?.isStreaming) {
            updateMessage(sid, last.id, { isStreaming: false })
          }
-          // Any tool messages still marked 'running' will be replaced by the
-          // server's view after refresh; clear their spinner state now.
-          msgs.forEach((m, i) => {
-            if (m.role === 'tool' && m.toolStatus === 'running') {
-              msgs[i] = { ...m, toolStatus: 'done' }
-            }
-          })
+          // Keep running tool state until refresh/polling sees the server's
+          // terminal transcript. A dropped SSE connection is not proof the
+          // tool completed.
          cleanup()
          if (sid === activeSessionId.value) {
            void refreshActiveSession()
@@ -96,28 +96,43 @@ function buildProxyHeaders(ctx: Context, upstream: string): Record<string, strin
 const SSE_EVENTS_PATH = /^\/v1\/runs\/([^/]+)\/events$/

 /**
- * Parse SSE text chunks and extract run.completed events.
- * Returns the run_id if a run.completed was found.
+ * Parse one complete SSE event block and record usage for run.completed.
+ * The public stream is forwarded elsewhere; parser failures are accounting-only
+ * and must never abort the client stream.
 */
-function extractRunCompletedFromChunk(chunk: string): string | null {
-  // SSE format: each line is "data: {...}\n\n"
-  const lines = chunk.split('\n')
-  for (const line of lines) {
-    if (!line.startsWith('data: ')) continue
-    try {
-      const data = JSON.parse(line.slice(6))
-      if (data.event === 'run.completed' && data.usage && data.run_id) {
-        const sessionId = getSessionForRun(data.run_id)
-        if (sessionId) {
-          updateUsage(sessionId, data.usage.input_tokens, data.usage.output_tokens)
-          return data.run_id
-        }
-      }
-    } catch { /* not JSON, skip */ }
+function extractRunCompletedFromBlock(block: string): string | null {
+  const dataLines: string[] = []
+  for (const rawLine of block.split(/\r?\n/)) {
+    if (!rawLine.startsWith('data:')) continue
+    let data = rawLine.slice(5)
+    if (data.startsWith(' ')) data = data.slice(1)
+    dataLines.push(data)
  }
+  if (dataLines.length === 0) return null
+
+  try {
+    const data = JSON.parse(dataLines.join('\n'))
+    if (data.event === 'run.completed' && data.usage && data.run_id) {
+      const sessionId = getSessionForRun(data.run_id)
+      if (sessionId) {
+        updateUsage(sessionId, data.usage.input_tokens, data.usage.output_tokens)
+        return data.run_id
+      }
+    }
+  } catch { /* not JSON or usage accounting failed; skip */ }
  return null
 }

+function takeSSEBlock(buffer: string): { block: string; rest: string } | null {
+  const lf = buffer.indexOf('\n\n')
+  const crlf = buffer.indexOf('\r\n\r\n')
+  if (lf === -1 && crlf === -1) return null
+  if (crlf !== -1 && (lf === -1 || crlf < lf)) {
+    return { block: buffer.slice(0, crlf), rest: buffer.slice(crlf + 4) }
+  }
+  return { block: buffer.slice(0, lf), rest: buffer.slice(lf + 2) }
+}
+
 /**
 * Stream an SSE response while intercepting run.completed events.
 */
@@ -142,18 +157,18 @@ async function streamSSE(ctx: Context, res: Response): Promise<void> {
      // Also decode for interception
      buffer += decoder.decode(value, { stream: true })

-      // Process complete SSE lines (delimited by double newline)
-      let newlineIdx: number
-      while ((newlineIdx = buffer.indexOf('\n\n')) !== -1) {
-        const eventBlock = buffer.slice(0, newlineIdx)
-        buffer = buffer.slice(newlineIdx + 2)
-        extractRunCompletedFromChunk(eventBlock)
+      // Process complete SSE event blocks (LF or CRLF blank-line delimiters).
+      let next: { block: string; rest: string } | null
+      while ((next = takeSSEBlock(buffer)) !== null) {
+        buffer = next.rest
+        extractRunCompletedFromBlock(next.block)
      }
    }

+    buffer += decoder.decode()
    // Process remaining buffer
    if (buffer.trim()) {
-      extractRunCompletedFromChunk(buffer)
+      extractRunCompletedFromBlock(buffer)
    }
  } finally {
    ctx.res.end()
@@ -232,6 +247,9 @@ export async function proxy(ctx: Context) {
    // Intercept SSE streams for /v1/runs/{id}/events
    const sseMatch = upstreamPath.match(SSE_EVENTS_PATH)
    if (sseMatch) {
+      ctx.set('Content-Type', 'text/event-stream')
+      ctx.set('Cache-Control', 'no-cache, no-transform')
+      ctx.set('X-Accel-Buffering', 'no')
      await streamSSE(ctx, res)
      return
    }
@@ -0,0 +1,59 @@
+// @vitest-environment jsdom
+import { beforeEach, describe, expect, it, vi } from 'vitest'
+import { streamRunEvents, type RunEvent } from '@/api/hermes/chat'
+
+class MockEventSource {
+  static instances: MockEventSource[] = []
+
+  url: string
+  onmessage: ((event: { data: string }) => void) | null = null
+  onerror: (() => void) | null = null
+  close = vi.fn()
+
+  constructor(url: string) {
+    this.url = url
+    MockEventSource.instances.push(this)
+  }
+
+  emit(data: string) {
+    this.onmessage?.({ data })
+  }
+
+  fail() {
+    this.onerror?.()
+  }
+}
+
+describe('streamRunEvents', () => {
+  beforeEach(() => {
+    window.localStorage.clear()
+    MockEventSource.instances = []
+    vi.stubGlobal('EventSource', MockEventSource)
+  })
+
+  it('maps non-JSON EventSource data to message.delta so raw text is rendered', () => {
+    const events: RunEvent[] = []
+
+    streamRunEvents('run-raw', event => events.push(event), vi.fn(), vi.fn())
+    MockEventSource.instances[0].emit('原因：raw fallback')
+
+    expect(events).toEqual([{ event: 'message.delta', delta: '原因：raw fallback' }])
+  })
+
+  it('parses colon-containing JSON deltas and closes on completion', () => {
+    const events: RunEvent[] = []
+    const onDone = vi.fn()
+
+    streamRunEvents('run-json', event => events.push(event), onDone, vi.fn())
+    const source = MockEventSource.instances[0]
+    source.emit(JSON.stringify({ event: 'message.delta', delta: '让我直接读文件：A: B' }))
+    source.emit(JSON.stringify({ event: 'run.completed' }))
+
+    expect(events).toEqual([
+      { event: 'message.delta', delta: '让我直接读文件：A: B' },
+      { event: 'run.completed' },
+    ])
+    expect(source.close).toHaveBeenCalledTimes(1)
+    expect(onDone).toHaveBeenCalledTimes(1)
+  })
+})
@@ -41,13 +41,36 @@ function makeSummary(id: string, title = 'Session') {
  }
 }

-function makeDetail(id: string, messages: Array<Record<string, any>>) {
+function makeDetail(id: string, messages: Array<Record<string, any>>, overrides: Record<string, any> = {}) {
  return {
    ...makeSummary(id),
+    ...overrides,
    messages,
  }
 }

+function makeHermesMessage(
+  id: number,
+  role: 'user' | 'assistant' | 'system' | 'tool',
+  content: string,
+  overrides: Record<string, any> = {},
+) {
+  return {
+    id,
+    session_id: overrides.session_id || 'sess-1',
+    role,
+    content,
+    tool_call_id: null,
+    tool_calls: null,
+    tool_name: null,
+    timestamp: 1710000000 + id,
+    token_count: null,
+    finish_reason: null,
+    reasoning: null,
+    ...overrides,
+  }
+}
+
 async function flushPromises() {
  await Promise.resolve()
  await Promise.resolve()
@@ -294,4 +317,186 @@ describe('Chat Store', () => {
    expect(store.isRunActive).toBe(false)
    expect(window.localStorage.getItem(inFlightKey('sess-1'))).toBeNull()
  })
+
+  it('keeps colon deltas before and after a tool boundary', async () => {
+    mockChatApi.streamRunEvents.mockImplementation((
+      _runId: string,
+      onEvent: (event: any) => void,
+    ) => {
+      onEvent({ event: 'message.delta', delta: '让我直接读文件：' })
+      onEvent({ event: 'tool.started', tool: 'read_file', preview: 'notes.md' })
+      onEvent({ event: 'tool.completed' })
+      onEvent({ event: 'message.delta', delta: '读取后结论: final' })
+      onEvent({ event: 'run.completed' })
+      return { abort: vi.fn() }
+    })
+
+    const store = useChatStore()
+    await flushPromises()
+    await store.sendMessage('check file')
+    await flushPromises()
+
+    const assistantText = store.messages
+      .filter(m => m.role === 'assistant')
+      .map(m => m.content)
+      .join('')
+    expect(assistantText).toBe('让我直接读文件：读取后结论: final')
+    expect(store.messages.some(m => m.role === 'tool' && m.toolName === 'read_file' && m.toolStatus === 'done')).toBe(true)
+  })
+
+  it('renders raw SSE fallback message events as assistant deltas', async () => {
+    mockChatApi.streamRunEvents.mockImplementation((
+      _runId: string,
+      onEvent: (event: any) => void,
+    ) => {
+      onEvent({ event: 'message', delta: '原因：raw fallback' })
+      onEvent({ event: 'run.completed' })
+      return { abort: vi.fn() }
+    })
+
+    const store = useChatStore()
+    await flushPromises()
+    await store.sendMessage('raw stream')
+    await flushPromises()
+
+    expect(store.messages.some(m => m.role === 'assistant' && m.content === '原因：raw fallback')).toBe(true)
+  })
+
+  it('does not stop polling when server messages are stable but the session is still active', async () => {
+    vi.useFakeTimers()
+
+    let fetchSessionCalls = 0
+    mockSessionsApi.fetchSession.mockImplementation(async () => {
+      fetchSessionCalls += 1
+      if (fetchSessionCalls === 1) return null
+      return makeDetail('sess-1', [
+        makeHermesMessage(1, 'user', 'tool gap prompt'),
+        makeHermesMessage(2, 'assistant', '让我直接读文件：'),
+      ], { ended_at: null })
+    })
+
+    mockChatApi.streamRunEvents.mockImplementation((
+      _runId: string,
+      onEvent: (event: any) => void,
+      _onDone: () => void,
+      onError: (err: Error) => void,
+    ) => {
+      onEvent({ event: 'message.delta', delta: '让我直接读文件：' })
+      setTimeout(() => onError(new Error('SSE connection error')), 0)
+      return { abort: vi.fn() }
+    })
+
+    const store = useChatStore()
+    await flushPromises()
+    await store.sendMessage('tool gap prompt')
+    const sid = store.activeSessionId!
+
+    await vi.advanceTimersByTimeAsync(0)
+    await flushPromises()
+    await vi.advanceTimersByTimeAsync(9000)
+    await flushPromises()
+
+    expect(window.localStorage.getItem(inFlightKey(sid))).toBeTruthy()
+    expect(store.isRunActive).toBe(true)
+  })
+
+  it('reconciles the final session after run.completed to recover missed deltas', async () => {
+    let fetchSessionCalls = 0
+    mockSessionsApi.fetchSession.mockImplementation(async () => {
+      fetchSessionCalls += 1
+      if (fetchSessionCalls === 1) return null
+      return makeDetail('sess-1', [
+        makeHermesMessage(1, 'user', 'finish prompt'),
+        makeHermesMessage(2, 'assistant', '让我直接读文件：读取后结论：完整回答'),
+      ], { ended_at: 1710000010 })
+    })
+
+    mockChatApi.streamRunEvents.mockImplementation((
+      _runId: string,
+      onEvent: (event: any) => void,
+    ) => {
+      onEvent({ event: 'message.delta', delta: '让我直接读文件：' })
+      onEvent({ event: 'run.completed' })
+      return { abort: vi.fn() }
+    })
+
+    const store = useChatStore()
+    await flushPromises()
+    await store.sendMessage('finish prompt')
+    await flushPromises()
+
+    expect(store.messages.some(m => m.role === 'assistant' && m.content === '让我直接读文件：读取后结论：完整回答')).toBe(true)
+  })
+
+  it('does not replace longer local tool-boundary text with a stale shorter final fetch', async () => {
+    let fetchSessionCalls = 0
+    const stalePrefix = '让我直接读文件：较长的工具前说明'
+    mockSessionsApi.fetchSession.mockImplementation(async () => {
+      fetchSessionCalls += 1
+      if (fetchSessionCalls === 1) return null
+      return makeDetail('sess-1', [
+        makeHermesMessage(1, 'user', 'stale prompt'),
+        makeHermesMessage(2, 'assistant', stalePrefix),
+      ], { ended_at: 1710000010 })
+    })
+
+    mockChatApi.streamRunEvents.mockImplementation((
+      _runId: string,
+      onEvent: (event: any) => void,
+    ) => {
+      onEvent({ event: 'message.delta', delta: stalePrefix })
+      onEvent({ event: 'tool.started', tool: 'read_file', preview: 'notes.md' })
+      onEvent({ event: 'tool.completed' })
+      onEvent({ event: 'message.delta', delta: 'OK' })
+      onEvent({ event: 'run.completed' })
+      return { abort: vi.fn() }
+    })
+
+    const store = useChatStore()
+    await flushPromises()
+    await store.sendMessage('stale prompt')
+    await flushPromises()
+
+    const assistantText = store.messages
+      .filter(m => m.role === 'assistant')
+      .map(m => m.content)
+      .join('')
+    expect(assistantText).toBe(`${stalePrefix}OK`)
+    expect(store.messages.some(m => m.role === 'tool' && m.toolStatus === 'done')).toBe(true)
+  })
+
+  it('does not let delayed completion reconciliation clear a newer in-flight run', async () => {
+    let resolveReconcile: ((detail: any) => void) | null = null
+    const reconcilePromise = new Promise<any>(resolve => { resolveReconcile = resolve })
+    mockSessionsApi.fetchSession.mockImplementation(() => reconcilePromise)
+    mockChatApi.startRun
+      .mockResolvedValueOnce({ run_id: 'run-1', status: 'queued' })
+      .mockResolvedValueOnce({ run_id: 'run-2', status: 'queued' })
+    let firstRunEvent: ((event: any) => void) | null = null
+    mockChatApi.streamRunEvents.mockImplementation((
+      runId: string,
+      onEvent: (event: any) => void,
+    ) => {
+      if (runId === 'run-1') firstRunEvent = onEvent
+      return { abort: vi.fn() }
+    })
+
+    const store = useChatStore()
+    await flushPromises()
+    await store.sendMessage('first')
+    firstRunEvent!({ event: 'run.completed' })
+    await flushPromises()
+    const sid = store.activeSessionId!
+    await store.sendMessage('second')
+
+    expect(JSON.parse(window.localStorage.getItem(inFlightKey(sid)) || '{}').runId).toBe('run-2')
+
+    resolveReconcile!(makeDetail(sid, [
+      makeHermesMessage(1, 'user', 'first', { session_id: sid }),
+      makeHermesMessage(2, 'assistant', 'first done', { session_id: sid }),
+    ], { ended_at: 1710000010 }))
+    await flushPromises()
+
+    expect(JSON.parse(window.localStorage.getItem(inFlightKey(sid)) || '{}').runId).toBe('run-2')
+  })
 })
@@ -414,4 +414,89 @@ describe('SSE stream interception — run.completed', () => {

    expect(mockUpdateUsage).toHaveBeenCalledWith('session-split', 200, 50)
  })
+
+  it('forwards colon-containing SSE deltas around tool events unchanged and disables buffering', async () => {
+    const runId = 'run-colon-tool'
+    const sseData = [
+      `data: ${JSON.stringify({ event: 'message.delta', run_id: runId, delta: '让我直接读文件：A: B' })}\n\n`,
+      `data: ${JSON.stringify({ event: 'tool.started', run_id: runId, tool: 'read_file', preview: 'file:a.md' })}\n\n`,
+      `data: ${JSON.stringify({ event: 'tool.completed', run_id: runId })}\n\n`,
+      `data: ${JSON.stringify({ event: 'message.delta', run_id: runId, delta: '继续：done' })}\n\n`,
+    ]
+
+    mockFetch.mockResolvedValue({
+      status: 200,
+      headers: new Headers({ 'content-type': 'text/event-stream' }),
+      body: createSSEBody(sseData),
+    })
+
+    const ctx = createMockCtx({
+      path: `/api/hermes/v1/runs/${runId}/events`,
+      search: '',
+    })
+
+    await proxy(ctx)
+
+    const forwarded = ctx.res.write.mock.calls
+      .map(([chunk]: [Uint8Array]) => new TextDecoder().decode(chunk))
+      .join('')
+    expect(forwarded).toBe(sseData.join(''))
+    expect(ctx.set).toHaveBeenCalledWith('Content-Type', 'text/event-stream')
+    expect(ctx.set).toHaveBeenCalledWith('Cache-Control', 'no-cache, no-transform')
+    expect(ctx.set).toHaveBeenCalledWith('X-Accel-Buffering', 'no')
+  })
+
+  it('intercepts run.completed with CRLF delimiters and data without a space', async () => {
+    const runId = 'run-crlf'
+    setRunSession(runId, 'session-crlf')
+    const completedJson = JSON.stringify({ event: 'run.completed', run_id: runId, usage: { input_tokens: 321, output_tokens: 45, total_tokens: 366 } })
+    const sseData = [`data:${completedJson}\r\n\r\n`]
+
+    mockFetch.mockResolvedValue({
+      status: 200,
+      headers: new Headers({ 'content-type': 'text/event-stream' }),
+      body: createSSEBody(sseData),
+    })
+
+    const ctx = createMockCtx({
+      path: `/api/hermes/v1/runs/${runId}/events`,
+      search: '',
+    })
+
+    await proxy(ctx)
+
+    expect(mockUpdateUsage).toHaveBeenCalledWith('session-crlf', 321, 45)
+  })
+
+  it('does not let usage accounting failures abort the SSE stream', async () => {
+    const runId = 'run-usage-fails'
+    setRunSession(runId, 'session-usage-fails')
+    mockUpdateUsage.mockImplementationOnce(() => {
+      throw new Error('usage db unavailable')
+    })
+    const sseData = [
+      `data: ${JSON.stringify({ event: 'message.delta', run_id: runId, delta: 'before：' })}\n\n`,
+      `data: ${JSON.stringify({ event: 'run.completed', run_id: runId, usage: { input_tokens: 1, output_tokens: 2, total_tokens: 3 } })}\n\n`,
+    ]
+
+    mockFetch.mockResolvedValue({
+      status: 200,
+      headers: new Headers({ 'content-type': 'text/event-stream' }),
+      body: createSSEBody(sseData),
+    })
+
+    const ctx = createMockCtx({
+      path: `/api/hermes/v1/runs/${runId}/events`,
+      search: '',
+    })
+
+    await proxy(ctx)
+
+    const forwarded = ctx.res.write.mock.calls
+      .map(([chunk]: [Uint8Array]) => new TextDecoder().decode(chunk))
+      .join('')
+    expect(ctx.status).toBe(200)
+    expect(forwarded).toBe(sseData.join(''))
+    expect(ctx.res.end).toHaveBeenCalled()
+  })
 })