Hermes-ui/packages/server/src/lib/context-compressor/index.ts

/**
 * Chat Context Compressor
 *
 * Compresses 1:1 chat conversation history before sending to upstream.
 * Uses the Hermes structured summary prompt for LLM-based compression.
 *
 * Algorithm:
 * 1. If total tokens < trigger threshold → return as-is (this module does not
 *    perform that check before summarizing; presumably the caller does)
 * 2. Pre-clean: truncate old tool results (no LLM call)
 * 3. Load snapshot from SQLite for incremental update
 * 4. Keep the last `tailMessageCount` messages verbatim (default 10; tail
 *    protection by message count)
 * 5. Summarize everything before the tail
 * 6. Save snapshot: last_message_index = index where compression ends
 */

import { encodingForModel, getEncoding } from 'js-tiktoken'
import { randomUUID } from 'crypto'
import { mkdir, writeFile } from 'fs/promises'
import { resolve } from 'path'
import { logger } from '../../services/logger'
import { AgentBridgeClient, type AgentBridgeRunResult } from '../../services/hermes/agent-bridge'
import {
  getCompressionSnapshot,
  saveCompressionSnapshot,
  deleteCompressionSnapshot,
} from '../../db/hermes/compression-snapshot'

// ─── Types ───────────────────────────────────────────────

/**
 * One element of a structured (non-string) message content array.
 *
 * Within this file only `type`, `text` and `path` are read: text blocks
 * contribute `text`, while image/file blocks are rendered as
 * `[Image: <path>]` / `[File: <path>]` placeholders.
 */
export interface ContentBlock {
  /** Discriminator selecting how the block is rendered to text. */
  type: 'text' | 'image' | 'file'
  /** Text payload; read for `type === 'text'` blocks. */
  text?: string
  /** Path used in the placeholder for image/file blocks. */
  path?: string
  /** Inline payload descriptor; not read anywhere in this file. */
  source?: { type: string; media_type?: string; data?: string }
}

/**
 * A single conversation message as consumed by the compressor.
 * NOTE(review): the shape looks like an OpenAI-style chat message — confirm
 * against the producer of these messages.
 */
export interface ChatMessage {
  /** Role string; this file special-cases 'user', 'assistant', 'system' and 'tool'. */
  role: string
  /** Plain text, or an array of content blocks that is flattened to text when needed. */
  content: string | ContentBlock[]
  /** Tool invocations attached to an assistant message; `arguments` is handled as a raw string. */
  tool_calls?: Array<{ id: string; type: string; function: { name: string; arguments: string } }>
  /** Not read in this file; presumably links a tool result to its call. */
  tool_call_id?: string
  /** Tool name; used to label tool results ('unknown' / 'tool' when absent). */
  name?: string
  /** Not read in this file. */
  reasoning_content?: string | null
}

/** Tunables for ChatContextCompressor; defaults live in DEFAULT_COMPRESSION_CONFIG. */
export interface CompressionConfig {
  /**
   * Token budget for the assembled context (DEFAULT_COMPRESSION_CONFIG: 100_000).
   * In this file it decides whether a too-small incremental segment is folded
   * into the summary, and caps the final output via enforceCompressedBudget.
   * A value <= 0 disables the budget enforcement.
   */
  triggerTokens: number
  /** Summary token target quoted to the summarizer in the prompt (default: 8000) */
  summaryBudget: number
  /** Number of earliest messages to keep verbatim (default: 0) */
  headMessageCount: number
  /** Number of recent messages to keep verbatim (default: 10) */
  tailMessageCount: number
  /** Timeout for LLM summarization call (default: 300_000ms) */
  summarizationTimeoutMs: number
}

/**
 * Baseline configuration; ChatContextCompressor spreads caller overrides on
 * top of these values.
 */
export const DEFAULT_COMPRESSION_CONFIG: CompressionConfig = {
  // Token budget for the assembled context.
  triggerTokens: 100_000,
  // Target size quoted to the summarizer ("Target ~N tokens").
  summaryBudget: 8_000,
  // No leading messages are protected by default.
  headMessageCount: 0,
  // The ten most recent messages are kept verbatim.
  tailMessageCount: 10,
  // Five minutes for the summarization call.
  summarizationTimeoutMs: 300_000,
}

/** Output of ChatContextCompressor.compress: the messages to send plus bookkeeping. */
export interface CompressedResult {
  /** Message list to forward upstream (may contain a synthetic summary message). */
  messages: ChatMessage[]
  meta: {
    /** Length of the input message array. */
    totalMessages: number
    /** true when the output includes a summary message (fresh or from a snapshot). */
    compressed: boolean
    /** true = actually called LLM to summarize; false = assembled from existing snapshot or returned as-is */
    llmCompressed: boolean
    /** Token count of SUMMARY_PREFIX plus the summary text; 0 when no summary is present. */
    summaryTokenEstimate: number
    /** Number of original messages kept verbatim; reported as 0 when budget enforcement reduced the output to the summary alone. */
    verbatimCount: number
    /** Index of the last input message covered by the summary; -1 when nothing was summarized. */
    compressedStartIndex: number
  }
}

/** Optional routing hints for the summarization run issued by callSummarizer. */
export interface SummarizerOptions {
  /** Profile forwarded to the agent bridge; callSummarizer falls back to 'default'. */
  profile?: string
  /** Model override; only sent with the bridge request when truthy. */
  model?: string | null
  /** Provider override; only sent with the bridge request when truthy. */
  provider?: string | null
  /** Worker key for the bridge; when omitted one is derived from profile and session id. */
  workerKey?: string
}

// User message sent to the bridge to start the summarization run; the actual
// summarization prompt travels in the conversation history.
const SUMMARIZER_TRIGGER_MESSAGE = 'Generate the context checkpoint summary now.'
// Directory (resolved against process.cwd()) that receives the debug dump.
const SUMMARIZER_DEBUG_DIR = 'logs/context-compressor'
// File name of the debug dump; it is overwritten on every summarization call.
const SUMMARIZER_DEBUG_FILE = 'summarizer-debug.json'

/**
 * Best-effort debug aid: persists the summarizer request payload as pretty
 * JSON under `<cwd>/logs/context-compressor/summarizer-debug.json`.
 *
 * Does nothing unless NODE_ENV is exactly 'development'. Filesystem errors
 * are logged as warnings and never propagated to the caller.
 *
 * @param payload Arbitrary JSON-serializable data to dump.
 */
async function writeSummarizerDebugDump(payload: Record<string, unknown>): Promise<void> {
  const isDevelopment = process.env.NODE_ENV === 'development'
  if (!isDevelopment) return

  try {
    const targetDir = resolve(process.cwd(), SUMMARIZER_DEBUG_DIR)
    const targetFile = resolve(targetDir, SUMMARIZER_DEBUG_FILE)
    const serialized = JSON.stringify(payload, null, 2) + '\n'

    await mkdir(targetDir, { recursive: true })
    await writeFile(targetFile, serialized, 'utf8')
  } catch (err) {
    logger.warn(err, '[context-compressor] failed to write summarizer debug dump')
  }
}

// ─── Token counting ─────────────────────────────────────

/** Module-wide tokenizer instance, created on first use and reused afterwards. */
let _encoder: ReturnType<typeof getEncoding> | null = null

/** Returns the shared `cl100k_base` encoder, constructing it on the first call. */
function getEncoder() {
  _encoder ??= getEncoding('cl100k_base')
  return _encoder
}

/**
 * Counts tokens in `text` with the shared encoder.
 *
 * If encoding throws for any reason, falls back to a character heuristic:
 * 1.5 tokens per character matched by the CJK/full-width pattern and
 * 0.25 tokens per remaining character, rounded up.
 *
 * @param text Text to measure.
 * @returns Token count (exact when the encoder works, estimated otherwise).
 */
export function countTokens(text: string): number {
  try {
    const encoded = getEncoder().encode(text)
    return encoded.length
  } catch {
    const wideMatches = text.match(/[\u2e80-\u9fff\uac00-\ud7af\u3000-\u303f\uff00-\uffef]/g)
    const wideCount = wideMatches?.length ?? 0
    const narrowCount = text.length - wideCount
    return Math.ceil(wideCount * 1.5 + narrowCount / 4)
  }
}

/**
 * Per-model encoder cache. `null` records that no encoder could be resolved
 * for the model, so the failing lookup is not repeated on every call.
 */
const _modelEncoders = new Map<string, ReturnType<typeof encodingForModel> | null>()

/**
 * Counts tokens in `text` using the tokenizer associated with `model`.
 *
 * The encoder is resolved once per model name and cached, because building a
 * tokenizer on every call is needlessly expensive. When the model is unknown
 * to the tokenizer library, or encoding throws, the count falls back to
 * {@link countTokens}.
 *
 * @param text Text to measure.
 * @param model Model name used to select the tokenizer.
 * @returns Token count for `text`.
 */
export function countTokensForModel(text: string, model: string): number {
  let encoder = _modelEncoders.get(model)
  if (encoder === undefined) {
    try {
      encoder = encodingForModel(model as Parameters<typeof encodingForModel>[0])
    } catch {
      // Unknown model name: remember the failure and use the generic counter.
      encoder = null
    }
    _modelEncoders.set(model, encoder)
  }

  if (!encoder) return countTokens(text)

  try {
    return encoder.encode(text).length
  } catch {
    return countTokens(text)
  }
}

/**
 * Estimates the token footprint of one message.
 *
 * Counts the message content (content blocks are flattened to text, with
 * image/file blocks rendered as `[Image: path]` / `[File: path]`) and also the
 * name and raw arguments of every attached tool call. Tool calls are part of
 * the message, so ignoring them would make an assistant turn that carries
 * large tool arguments look almost free and under-estimate the context size.
 *
 * @param message Message to measure.
 * @returns Estimated token count; 0 for a message with no content and no tool calls.
 */
function messageTokenEstimate(message: ChatMessage): number {
  const { content } = message
  let total = 0

  if (typeof content === 'string') {
    total += countTokens(content)
  } else if (Array.isArray(content)) {
    total += countTokens(content.map(block => {
      if (block.type === 'text') return block.text || ''
      if (block.type === 'image') return `[Image: ${block.path || ''}]`
      if (block.type === 'file') return `[File: ${block.path || ''}]`
      return ''
    }).join(''))
  }

  for (const call of message.tool_calls ?? []) {
    total += countTokens(`${call.function?.name ?? ''}${call.function?.arguments ?? ''}`)
  }

  return total
}

/**
 * Sums the per-message token estimates of a message list.
 *
 * @param messages Messages to measure.
 * @returns Total estimated tokens; 0 for an empty list.
 */
function messagesTokenEstimate(messages: ChatMessage[]): number {
  let total = 0
  for (const entry of messages) {
    total += messageTokenEstimate(entry)
  }
  return total
}

/**
 * Shortens `text` so that it fits within `tokenBudget` tokens, appending a
 * truncation marker.
 *
 * The marker's own tokens are reserved up front, so the returned text
 * (body + marker) stays within the budget instead of overshooting it by the
 * marker's length. If the budget is too small to hold the marker at all, the
 * text is cut to the budget and returned without a marker.
 *
 * The cut point is found by binary search over prefix length, which relies on
 * the token count being (approximately) non-decreasing with prefix length.
 *
 * @param text Text to shorten.
 * @param tokenBudget Maximum tokens allowed; values <= 0 disable truncation.
 * @returns `text` unchanged when it already fits, otherwise the shortened text.
 */
function truncateTextToTokenBudget(text: string, tokenBudget: number): string {
  if (tokenBudget <= 0 || countTokens(text) <= tokenBudget) return text

  const marker = '\n\n[Summary truncated to fit context budget]'
  const markerTokens = countTokens(marker)
  const markerFits = tokenBudget > markerTokens
  const bodyBudget = markerFits ? tokenBudget - markerTokens : tokenBudget

  // Largest prefix length whose token count is within the body budget.
  let lo = 0
  let hi = text.length
  while (lo < hi) {
    const mid = Math.ceil((lo + hi) / 2)
    if (countTokens(text.slice(0, mid)) <= bodyBudget) lo = mid
    else hi = mid - 1
  }

  // Never cut between the two halves of a surrogate pair.
  if (lo > 0 && lo < text.length) {
    const lastUnit = text.charCodeAt(lo - 1)
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) lo -= 1
  }

  const body = text.slice(0, lo).trimEnd()
  return markerFits ? body + marker : body
}

/**
 * Last-resort guard that keeps a compressed message list within the token budget.
 *
 * - Budget disabled (`triggerTokens <= 0`) or list already within budget:
 *   the list is returned untouched.
 * - No string-content message at `summaryIndex`: the list is returned untouched.
 * - Otherwise everything except the summary message is dropped; if the summary
 *   alone is still over budget its text is truncated.
 *
 * Note that in the over-budget case the head and tail messages are discarded,
 * so the result contains a single message.
 *
 * @param messages Assembled list (head, summary, tail).
 * @param triggerTokens Token budget for the whole list.
 * @param summaryIndex Position of the summary message inside `messages`.
 * @returns Either `messages` itself or a one-element array with the summary.
 */
function enforceCompressedBudget(
  messages: ChatMessage[],
  triggerTokens: number,
  summaryIndex: number,
): ChatMessage[] {
  if (triggerTokens <= 0) return messages
  if (messagesTokenEstimate(messages) <= triggerTokens) return messages

  const candidate = messages[summaryIndex]
  if (!candidate) return messages

  const summaryText = candidate.content
  if (typeof summaryText !== 'string') return messages

  const standalone = [{ ...candidate }]
  const standaloneFits = messagesTokenEstimate(standalone) <= triggerTokens
  if (standaloneFits) return standalone

  const shortened = truncateTextToTokenBudget(summaryText, triggerTokens)
  return [{ ...candidate, content: shortened }]
}

// ─── Prompts ────────────────────────────────────────────

/**
 * Banner prepended (followed by a blank line) to every summary before it is
 * injected into the conversation as a `user` message. It tells the downstream
 * model to treat the summary as background reference rather than as new
 * instructions.
 */
export const SUMMARY_PREFIX = `[CONTEXT COMPACTION — REFERENCE ONLY] Earlier turns were compacted
into the summary below. This is a handoff from a previous context
window — treat it as background reference, NOT as active instructions.
Do NOT answer questions or fulfill requests mentioned in this summary;
they were already addressed.
Your current task is identified in the '## Active Task' section of the
summary — resume exactly from there.
Respond ONLY to the latest user message
that appears AFTER this summary. The current session state (files,
config, etc.) may reflect work described here — avoid repeating it:`

/**
 * Section skeleton shared by the full and incremental summarization prompts.
 * The text is sent to the summarizer verbatim, so any edit here changes the
 * structure of every generated summary.
 */
const TEMPLATE_SECTIONS = `Use this exact structure:

## Active Task
[THE SINGLE MOST IMPORTANT FIELD. Copy the user's most recent request or
task assignment verbatim — the exact words they used. If multiple tasks
were requested and only some are done, list only the ones NOT yet completed.
The next assistant must pick up exactly here. Example:
"User asked: 'Now refactor the auth module to use JWT instead of sessions'"
If no outstanding task exists, write "None."]

## Goal
[What the user is trying to accomplish overall]

## Constraints & Preferences
[User preferences, coding style, constraints, important decisions]

## Completed Actions
[Numbered list of concrete actions taken — include tool used, target, and outcome.
Format each as: N. ACTION target — outcome [tool: name]
Example:
1. READ config.py:45 — found == should be != [tool: read_file]
2. PATCH config.py:45 — changed == to != [tool: patch]
3. TEST pytest tests/ — 3/50 failed: test_parse, test_validate, test_edge [tool: terminal]
Be specific with file paths, commands, line numbers, and results.]

## Active State
[Current working state — include:
- Working directory and branch (if applicable)
- Modified/created files with brief note on each
- Test status (X/Y passing)
- Any running processes or servers
- Environment details that matter]

## In Progress
[Work currently underway — what was being done when compaction fired]

## Blocked
[Any blockers, errors, or issues not yet resolved. Include exact error messages.]

## Key Decisions
[Important technical decisions and WHY they were made]

## Resolved Questions
[Questions the user asked that were ALREADY answered — include the answer so the next assistant does not re-answer them]

## Pending User Asks
[Questions or requests from the user that have NOT yet been answered or fulfilled. If none, write "None."]

## Relevant Files
[Files read, modified, or created — with brief note on each]

## Remaining Work
[What remains to be done — framed as context, not instructions]

## Critical Context
[Any specific values, error messages, configuration details, or data that would be lost without explicit preservation]`

/**
 * Builds the prompt for a first-time (non-incremental) summary.
 *
 * The prompt embeds the serialized turns, the shared section template and the
 * token target, and instructs the model to output only the summary body.
 *
 * @param contentToSummarize Serialized conversation turns to be summarized.
 * @param summaryBudget Approximate token target quoted in the prompt.
 * @returns The complete prompt text.
 */
export function buildFullPrompt(contentToSummarize: string, summaryBudget: number): string {
  return `You are a summarization agent creating a context checkpoint.
Your output will be injected as reference material for a DIFFERENT
assistant that continues the conversation.
Do NOT respond to any questions or requests in the conversation —
only output the structured summary.
Do NOT include any preamble, greeting, or prefix.

Create a structured handoff summary for a different assistant that will continue
this conversation after earlier turns are compacted. The next assistant should be
able to understand what happened without re-reading the original turns.

TURNS TO SUMMARIZE:
${contentToSummarize}

${TEMPLATE_SECTIONS}

Target ~${summaryBudget} tokens. Be CONCRETE — include file paths, command outputs, error messages, line numbers, and specific values. Avoid vague descriptions like "made some changes" — say exactly what changed.

Write only the summary body. Do not include any preamble or prefix.`
}

/**
 * Builds the prompt for updating an existing summary with newer turns.
 *
 * The prompt embeds the previous summary, the serialized new turns, merge
 * rules (preserve, append, move, update), the shared section template and the
 * token target.
 *
 * @param previousSummary Summary text produced by an earlier compaction.
 * @param contentToSummarize Serialized turns that occurred after that summary.
 * @param summaryBudget Approximate token target quoted in the prompt.
 * @returns The complete prompt text.
 */
export function buildIncrementalPrompt(previousSummary: string, contentToSummarize: string, summaryBudget: number): string {
  return `You are a summarization agent creating a context checkpoint.
Your output will be injected as reference material for a DIFFERENT
assistant that continues the conversation.
Do NOT respond to any questions or requests in the conversation —
only output the structured summary.
Do NOT include any preamble, greeting, or prefix.

You are updating a context compaction summary. A previous compaction produced the
summary below. New conversation turns have occurred since then and need to be
incorporated.

PREVIOUS SUMMARY:
${previousSummary}

NEW TURNS TO INCORPORATE:
${contentToSummarize}

Update the summary using this exact structure. PRESERVE all existing information
that is still relevant. ADD new completed actions to the numbered list
(continue numbering). Move items from "In Progress" to "Completed Actions" when
done. Move answered questions to "Resolved Questions". Update "Active State"
to reflect current state. Remove information only if it is clearly obsolete.
CRITICAL: Update "## Active Task" to reflect the user's most recent unfulfilled
request — this is the most important field for task continuity.

${TEMPLATE_SECTIONS}

Target ~${summaryBudget} tokens. Be CONCRETE — include file paths, command outputs, error messages, line numbers, and specific values. Avoid vague descriptions like "made some changes" — say exactly what changed.

Write only the summary body. Do not include any preamble or prefix.`
}

// ─── Pre-cleaning ───────────────────────────────────────

/**
 * Renders messages as plain text for inclusion in a summarization prompt.
 *
 * Each message becomes `<label>: <text>`, where the label is the role, or
 * `[tool:<name>]` for tool results. Entries are separated by a blank line.
 *
 * - Content blocks are flattened; image/file blocks become placeholders.
 * - Tool results longer than 5500 characters keep their first 4000 and last
 *   1500 characters around a truncation marker.
 * - An assistant message with tool calls emits one entry listing the calls
 *   (arguments capped at 1500 characters) and, if it also has non-blank
 *   text, a second entry with that text.
 *
 * @param messages Messages to serialize, in order.
 * @returns The serialized transcript.
 */
export function serializeForSummary(messages: ChatMessage[]): string {
  const renderBlock = (block: ContentBlock): string => {
    switch (block.type) {
      case 'text':
        return block.text || ''
      case 'image':
        return `[Image: ${block.path || ''}]`
      case 'file':
        return `[File: ${block.path || ''}]`
      default:
        return ''
    }
  }

  const flatten = (content: string | ContentBlock[]): string => {
    if (typeof content === 'string') return content
    return Array.isArray(content) ? content.map(renderBlock).join('') : ''
  }

  const entries = messages.flatMap((msg): string[] => {
    const isTool = msg.role === 'tool'
    const label = isTool ? `[tool:${msg.name || 'unknown'}]` : msg.role

    let text = flatten(msg.content || '')
    if (isTool && text.length > 5500) {
      text = text.slice(0, 4000) + '\n... [truncated]\n...' + text.slice(-1500)
    }

    const calls = msg.role === 'assistant' ? msg.tool_calls : undefined
    if (!calls?.length) return [`${label}: ${text}`]

    const callLines = calls.map(call => {
      const rawArgs = call.function.arguments
      const shownArgs = rawArgs.length > 1500 ? rawArgs.slice(0, 1500) + '...' : rawArgs
      return `[tool_call: ${call.function.name}(${shownArgs})]`
    })
    const emitted = [`${label}: ${callLines.join('\n')}`]
    if (text.trim()) emitted.push(`${label}: ${text}`)
    return emitted
  })

  return entries.join('\n\n')
}

/**
 * Convert messages to conversation history format for LLM API.
 * Tool calls are converted to text format within assistant messages.
 *
 * Rules:
 * - `tool` messages are rendered as `[Tool result: <name>]` text (capped at
 *   4000 characters) and appended to the most recent assistant entry; when no
 *   assistant entry exists yet, a new assistant entry is created.
 * - Assistant messages with tool calls list each call (arguments capped at
 *   4000 characters) after their text, if any.
 * - User, assistant and system messages are flattened to text and capped at
 *   4000 characters.
 * - Messages with any other role are skipped.
 *
 * Structured (`ContentBlock[]`) content is flattened to text for every role,
 * including tool results and tool-calling assistant messages, so it is never
 * stringified as "[object Object]".
 *
 * @param messages Messages to convert, in order.
 * @returns Text-only `{ role, content }` entries.
 */
export function buildConversationHistory(messages: ChatMessage[]): Array<{ role: string; content: string }> {
  const MAX_CHARS = 4000

  // Flatten string-or-blocks content into plain text.
  const toText = (content: string | ContentBlock[] | null | undefined): string => {
    if (typeof content === 'string') return content
    if (!Array.isArray(content)) return ''
    return content.map(block => {
      if (block.type === 'text') return block.text || ''
      if (block.type === 'image') return `[Image: ${block.path || ''}]`
      if (block.type === 'file') return `[File: ${block.path || ''}]`
      return ''
    }).join('')
  }

  // Cap text at MAX_CHARS, marking the cut with an ellipsis.
  const clip = (text: string): string =>
    text.length > MAX_CHARS ? `${text.slice(0, MAX_CHARS)}...` : text

  const result: Array<{ role: string; content: string }> = []

  for (const msg of messages) {
    if (msg.role === 'tool') {
      const toolText = `[Tool result: ${msg.name || 'unknown'}]\n${clip(toText(msg.content))}`

      // Attach the result to the most recent assistant entry, if there is one.
      let lastAssistant: { role: string; content: string } | undefined
      for (let i = result.length - 1; i >= 0; i--) {
        if (result[i].role === 'assistant') {
          lastAssistant = result[i]
          break
        }
      }

      if (lastAssistant) {
        lastAssistant.content += `\n\n${toolText}`
      } else {
        // Fallback: create an assistant message
        result.push({ role: 'assistant', content: toolText })
      }
    } else if (msg.role === 'assistant' && msg.tool_calls?.length) {
      const toolsInfo = msg.tool_calls.map(tc => {
        return `[Calling tool: ${tc.function.name} with arguments: ${clip(tc.function.arguments)}]`
      }).join('\n')
      // The assistant's own text is intentionally left uncapped here.
      const text = toText(msg.content)
      result.push({ role: msg.role, content: text ? `${text}\n\n${toolsInfo}` : toolsInfo })
    } else if (msg.role === 'user' || msg.role === 'assistant' || msg.role === 'system') {
      result.push({ role: msg.role, content: clip(toText(msg.content)) })
    }
    // Skip other roles
  }

  return result
}

/**
 * Replaces the content of older tool results with a short one-line preview.
 *
 * The last `keepRecentCount` messages are returned untouched. In everything
 * before them, each `tool` message has its content replaced by
 * `[<name>] <first 100 characters, newlines as spaces>` (plus `...` when
 * longer); non-tool messages pass through unchanged.
 *
 * The split point is computed as an explicit index rather than with
 * `slice(-keepRecentCount)`, because `slice(-0)` returns the whole array and
 * would silently disable pruning when `keepRecentCount` is 0. Negative counts
 * are treated as 0.
 *
 * @param messages Conversation messages, oldest first.
 * @param keepRecentCount Number of trailing messages to leave untouched.
 * @returns `messages` itself when nothing lies before the protected tail,
 *          otherwise a new array; the input messages are never mutated.
 */
export function pruneOldToolResults(messages: ChatMessage[], keepRecentCount: number): ChatMessage[] {
  const keep = Math.max(0, Math.trunc(keepRecentCount))
  const splitIndex = messages.length - keep
  if (splitIndex <= 0) return messages

  const head = messages.slice(0, splitIndex)
  const tail = messages.slice(splitIndex)

  const pruned = head.map(msg => {
    if (msg.role !== 'tool') return msg

    let content = ''
    if (typeof msg.content === 'string') {
      content = msg.content
    } else if (Array.isArray(msg.content)) {
      // Non-text blocks are reduced to a `[type]` tag in the preview.
      content = msg.content.map(block => {
        if (block.type === 'text') return block.text || ''
        return `[${block.type}]`
      }).join('')
    }

    const preview = content.slice(0, 100).replace(/\n/g, ' ')
    const truncated = content.length > 100 ? '...' : ''
    return { ...msg, content: `[${msg.name || 'tool'}] ${preview}${truncated}` }
  })

  return [...pruned, ...tail]
}

/**
 * Pruning applied when LLM summarization fails. Currently a direct delegate
 * to pruneOldToolResults with the same arguments.
 */
function pruneFallbackToolResults(messages: ChatMessage[], keepRecentCount: number): ChatMessage[] {
  return pruneOldToolResults(messages, keepRecentCount)
}

// ─── LLM Summarization ──────────────────────────────────

/**
 * Runs one summarization request through the agent bridge and returns the
 * model's text.
 *
 * A throw-away bridge session is created for the call and destroyed
 * afterwards whether or not the run succeeded (destroy errors are ignored).
 * The prompt travels as conversation history; when `previousSummary` is
 * given, it is placed in front of the prompt as an extra user/assistant
 * exchange. In development the request payload is also written to the debug
 * dump file.
 *
 * @param upstream Unused; kept for signature compatibility.
 * @param apiKey Unused; kept for signature compatibility.
 * @param prompt Full summarization prompt.
 * @param history Unused; kept for signature compatibility.
 * @param timeoutMs Run timeout; the bridge itself is given 15 s more.
 * @param previousSummary Earlier summary to show the model before the prompt.
 * @param summarizer Profile name, or an options object (profile, model,
 *        provider, workerKey).
 * @returns The trimmed summary text.
 * @throws Error when the bridge reports an error status or the response is empty.
 */
export async function callSummarizer(
  upstream: string,
  apiKey: string | undefined,
  prompt: string,
  history: Array<{ role: string; content: string }>,
  timeoutMs: number,
  previousSummary?: string,
  summarizer?: string | SummarizerOptions,
): Promise<string> {
  void upstream
  void apiKey
  void history

  let options: SummarizerOptions
  if (typeof summarizer === 'string') {
    options = { profile: summarizer }
  } else {
    options = summarizer || {}
  }
  const profile = options.profile || 'default'

  const promptTurn = { role: 'user', content: prompt }
  const convHistory: Array<{ role: string; content: string }> = previousSummary
    ? [
        { role: 'user', content: `[Previous summary]\n${previousSummary}` },
        { role: 'assistant', content: 'Understood, I will update the summary.' },
        promptTurn,
      ]
    : [promptTurn]

  const bridgeTimeoutMs = timeoutMs + 15_000
  const bridge = new AgentBridgeClient({ timeoutMs: bridgeTimeoutMs })

  const timeStamp = Date.now().toString(36)
  const uniqueSuffix = randomUUID().replace(/-/g, '').slice(0, 12)
  const sessionId = `compress_${timeStamp}_${uniqueSuffix}`
  const workerKey = options.workerKey || `${profile}:compression:${sessionId}`
  const message = SUMMARIZER_TRIGGER_MESSAGE

  await writeSummarizerDebugDump({
    writtenAt: new Date().toISOString(),
    sessionId,
    workerKey,
    profile,
    model: options.model || null,
    provider: options.provider || null,
    message,
    convHistory,
  })

  try {
    const request = {
      action: 'chat',
      session_id: sessionId,
      message,
      conversation_history: convHistory,
      profile,
      worker_key: workerKey,
      source: 'api_server',
      wait: true,
      timeout: Math.ceil(timeoutMs / 1000),
      ...(options.model ? { model: options.model } : {}),
      ...(options.provider ? { provider: options.provider } : {}),
    }
    const result = await bridge.request<AgentBridgeRunResult>(request, { timeoutMs: bridgeTimeoutMs })

    if (result.status === 'error') {
      throw new Error(result.error || 'Summarization bridge run failed')
    }

    const payload = result.result as any
    const rawText = payload?.final_response || result.output || ''
    const output = String(rawText).trim()
    if (!output) throw new Error('Empty summarization response')
    return output
  } finally {
    // Always release the temporary session; cleanup failures are ignored.
    await bridge.destroy(sessionId, profile, workerKey).catch(() => undefined)
  }
}

// ─── Main Compressor ────────────────────────────────────

/**
 * Compresses a chat history into `[head…, summary, tail…]` using an LLM
 * summarizer, with per-session snapshots so that later calls only have to
 * summarize the messages added since the previous compression.
 */
export class ChatContextCompressor {
  private config: CompressionConfig

  /**
   * @param opts.config Overrides merged over DEFAULT_COMPRESSION_CONFIG.
   *        Properties explicitly set to `undefined` are ignored so they
   *        cannot erase a default.
   */
  constructor(opts?: {
    config?: Partial<CompressionConfig>
  }) {
    const overrides: Partial<CompressionConfig> = {}
    for (const [key, value] of Object.entries(opts?.config ?? {})) {
      if (value !== undefined) overrides[key as keyof CompressionConfig] = value
    }
    this.config = { ...DEFAULT_COMPRESSION_CONFIG, ...overrides }
  }

  /**
   * Assemble and compress conversation history.
   *
   * Flow (as implemented):
   * 1. Look up the session's snapshot (only when `sessionId` is given).
   * 2. No snapshot → full compress: summarize everything outside the
   *    protected head/tail windows.
   * 3. Snapshot whose index is inside the history → incremental compress:
   *    reuse the stored summary and fold in only the newer messages.
   * 4. Snapshot whose index is out of range (stale) → incremental compress
   *    using the stored summary plus the last `tailMessageCount` messages.
   *
   * This method does not compare the history against `triggerTokens` before
   * summarizing; `triggerTokens` is only used to decide how an incremental
   * segment is split and to cap the final output.
   *
   * @param messages Full conversation history, oldest first.
   * @param upstream Forwarded to callSummarizer (currently unused there).
   * @param apiKey Forwarded to callSummarizer (currently unused there).
   * @param sessionId Snapshot key; without it no snapshot is read or written.
   * @param summarizer Profile name or options for the summarization run.
   * @returns The messages to send plus compression metadata.
   */
  async compress(
    messages: ChatMessage[],
    upstream: string,
    apiKey: string | undefined,
    sessionId?: string,
    summarizer?: string | SummarizerOptions,
  ): Promise<CompressedResult> {
    const total = messages.length

    // Baseline metadata describing an uncompressed pass-through.
    const baseMeta: CompressedResult['meta'] = {
      totalMessages: total,
      compressed: false,
      llmCompressed: false,
      summaryTokenEstimate: 0,
      verbatimCount: total,
      compressedStartIndex: -1,
    }

    const snapshot = sessionId ? getCompressionSnapshot(sessionId) : null

    if (!snapshot || !sessionId) {
      // No snapshot → full compress (compress all messages)
      logger.info(
        '[context-compressor] session=%s: full compress %d messages',
        sessionId, total,
      )
      return this.fullCompress(messages, upstream, apiKey, sessionId, baseMeta, summarizer)
    }

    const snapshotInRange = snapshot.lastMessageIndex >= 0 && snapshot.lastMessageIndex < total
    if (snapshotInRange) {
      // Has snapshot → incremental compress (merge old summary with new messages)
      logger.info(
        '[context-compressor] session=%s: incremental compress with snapshot at index %d',
        sessionId, snapshot.lastMessageIndex,
      )
      return this.incrementalCompress(
        messages, snapshot, upstream, apiKey, sessionId, baseMeta, summarizer,
      )
    }

    // Stale snapshot: keep its summary but re-anchor it just before the tail window.
    const fallbackLastMessageIndex = Math.max(-1, total - this.config.tailMessageCount - 1)
    logger.warn(
      '[context-compressor] session=%s: stale snapshot index %d for %d messages; using summary plus tail from index %d',
      sessionId, snapshot.lastMessageIndex, total, fallbackLastMessageIndex,
    )
    return this.incrementalCompress(
      messages,
      { summary: snapshot.summary, lastMessageIndex: fallbackLastMessageIndex },
      upstream,
      apiKey,
      sessionId,
      baseMeta,
      summarizer,
    )
  }

  /**
   * Extends an existing summary with the messages that follow the snapshot.
   *
   * Messages after `snapshot.lastMessageIndex` are split into a part to
   * summarize and a verbatim tail of up to `tailMessageCount` messages. When
   * there is nothing to summarize, the stored summary plus all new messages
   * is returned without an LLM call. When summarization fails, the stored
   * summary plus the new messages is returned after tool-result pruning and
   * budget enforcement, and the snapshot is left unchanged.
   */
  private async incrementalCompress(
    messages: ChatMessage[],
    snapshot: { summary: string; lastMessageIndex: number },
    upstream: string,
    apiKey: string | undefined,
    sessionId: string,
    meta: CompressedResult['meta'],
    summarizer?: string | SummarizerOptions,
  ): Promise<CompressedResult> {
    const { summary: previousSummary, lastMessageIndex } = snapshot
    const total = messages.length
    const headCount = Math.min(this.config.headMessageCount, Math.max(0, lastMessageIndex + 1))
    const head = messages.slice(0, headCount)
    const newMessages = messages.slice(lastMessageIndex + 1)
    const tailCount = this.config.tailMessageCount
    const previousSummaryMessage: ChatMessage = { role: 'user', content: SUMMARY_PREFIX + '\n\n' + previousSummary }
    const assembledWithPrevious = [
      ...head,
      previousSummaryMessage,
      ...newMessages,
    ]
    const assembledOverBudget = messagesTokenEstimate(assembledWithPrevious) > this.config.triggerTokens
    const canKeepTailWindow = newMessages.length > tailCount

    // If the new segment itself is too small to split but already over budget,
    // fold all new messages into the existing summary instead of preserving them verbatim.
    const tailStart = assembledOverBudget && !canKeepTailWindow
      ? newMessages.length
      : Math.max(0, newMessages.length - tailCount)
    const toCompress = newMessages.slice(0, tailStart)
    const tail = newMessages.slice(tailStart)

    if (toCompress.length === 0) {
      return {
        messages: assembledWithPrevious,
        meta: {
          ...meta,
          compressed: true,
          llmCompressed: false,
          summaryTokenEstimate: countTokens(SUMMARY_PREFIX + previousSummary),
          verbatimCount: head.length + newMessages.length,
          compressedStartIndex: lastMessageIndex,
        },
      }
    }

    logger.info(
      '[context-compressor] [incremental-llm] compressing %d of %d new messages, keeping %d tail',
      toCompress.length, newMessages.length, tail.length,
    )

    let summary: string
    try {
      const contentToSummarize = serializeForSummary(toCompress)
      const prompt = buildIncrementalPrompt(previousSummary, contentToSummarize, this.config.summaryBudget)

      const t0 = Date.now()
      summary = await callSummarizer(upstream, apiKey, prompt, [], this.config.summarizationTimeoutMs, previousSummary, summarizer)
      logger.info('[context-compressor] incremental-llm done in %dms, %d chars', Date.now() - t0, summary.length)
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err)
      logger.warn('[context-compressor] incremental-llm failed: %s — keeping new messages verbatim', reason)
      const prunedFallback = pruneFallbackToolResults(assembledWithPrevious, this.config.tailMessageCount)
      const budgetedFallback = enforceCompressedBudget(prunedFallback, this.config.triggerTokens, head.length)
      return {
        messages: budgetedFallback,
        meta: {
          ...meta,
          compressed: true,
          llmCompressed: false,
          summaryTokenEstimate: countTokens(SUMMARY_PREFIX + previousSummary),
          // A shorter list means budget enforcement reduced it to the summary alone.
          verbatimCount: budgetedFallback.length === assembledWithPrevious.length ? head.length + newMessages.length : 0,
          compressedStartIndex: lastMessageIndex,
        },
      }
    }

    const assembled: ChatMessage[] = [
      ...head,
      { role: 'user', content: SUMMARY_PREFIX + '\n\n' + summary },
      ...tail,
    ]
    const result = enforceCompressedBudget(assembled, this.config.triggerTokens, head.length)

    // Index of the last original message now covered by the summary.
    const newLastIndex = lastMessageIndex + tailStart
    if (sessionId) {
      saveCompressionSnapshot(sessionId, summary, newLastIndex, total)
    }

    return {
      messages: result,
      meta: {
        ...meta,
        compressed: true,
        llmCompressed: true,
        summaryTokenEstimate: countTokens(SUMMARY_PREFIX + summary),
        verbatimCount: result.length === assembled.length ? head.length + tail.length : 0,
        compressedStartIndex: newLastIndex,
      },
    }
  }

  /**
   * Summarizes a history that has no usable snapshot.
   *
   * The first `headMessageCount` and last `tailMessageCount` messages are
   * kept verbatim when the history is longer than both windows combined;
   * otherwise the whole history is summarized. An empty history is returned
   * as-is without calling the summarizer. When summarization fails, the
   * original messages are returned with old tool results pruned and `meta`
   * unchanged. A snapshot is saved only when `sessionId` is provided.
   */
  private async fullCompress(
    messages: ChatMessage[],
    upstream: string,
    apiKey: string | undefined,
    sessionId: string | undefined,
    meta: CompressedResult['meta'],
    summarizer?: string | SummarizerOptions,
  ): Promise<CompressedResult> {
    const total = messages.length
    const requestedHeadCount = Math.min(this.config.headMessageCount, total)
    const requestedTailCount = this.config.tailMessageCount
    const canKeepProtectedWindows = total > requestedHeadCount + requestedTailCount
    const headCount = canKeepProtectedWindows ? requestedHeadCount : 0
    const tailCount = canKeepProtectedWindows ? requestedTailCount : 0

    const tailStart = total - tailCount
    const head = messages.slice(0, headCount)
    const toCompress = messages.slice(headCount, tailStart)
    const tail = messages.slice(tailStart)

    // Nothing to summarize (empty history): skip the LLM call and the snapshot write.
    if (toCompress.length === 0) {
      return { messages, meta }
    }

    logger.info(
      '[context-compressor] [full-llm] compressing messages %d-%d, keeping first %d and last %d',
      headCount, tailStart - 1, head.length, tail.length,
    )

    const contentToSummarize = serializeForSummary(toCompress)
    const prompt = buildFullPrompt(contentToSummarize, this.config.summaryBudget)

    let summary: string | null = null
    try {
      const t0 = Date.now()
      summary = await callSummarizer(upstream, apiKey, prompt, [], this.config.summarizationTimeoutMs, undefined, summarizer)
      logger.info('[context-compressor] full-llm done in %dms, %d chars', Date.now() - t0, summary.length)
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err)
      logger.warn('[context-compressor] full-llm failed: %s', reason)
    }

    if (!summary) {
      return { messages: pruneFallbackToolResults(messages, this.config.tailMessageCount), meta }
    }

    const lastCompressedIndex = tailStart - 1
    if (sessionId) {
      saveCompressionSnapshot(sessionId, summary, lastCompressedIndex, total)
    }

    const result: ChatMessage[] = [
      ...head,
      { role: 'user', content: SUMMARY_PREFIX + '\n\n' + summary },
      ...tail,
    ]
    const budgetedResult = enforceCompressedBudget(result, this.config.triggerTokens, head.length)

    return {
      messages: budgetedResult,
      meta: {
        ...meta,
        compressed: true,
        llmCompressed: true,
        summaryTokenEstimate: countTokens(SUMMARY_PREFIX + summary),
        verbatimCount: budgetedResult.length === result.length ? head.length + tail.length : 0,
        compressedStartIndex: lastCompressedIndex,
      },
    }
  }

  /** Remove snapshot for a session (e.g. when session is deleted) */
  static invalidateSnapshot(sessionId: string): void {
    deleteCompressionSnapshot(sessionId)
  }
}

/**
 * Incrementally decodes a byte stream as Server-Sent Events and yields each
 * frame that carries data.
 *
 * Frames are separated by a blank line. Both LF (`\n\n`) and CRLF
 * (`\r\n\r\n`) separators are recognised, matching the line endings that
 * parseSseFrame accepts; otherwise a CRLF-delimited stream would be buffered
 * in full and merged into a single frame at end of stream. Whatever remains
 * in the buffer when the stream ends is parsed as a final frame.
 *
 * The reader lock is released when iteration finishes or is abandoned.
 *
 * @param stream Byte stream containing UTF-8 encoded SSE text.
 * @yields Parsed frames whose `data` is non-empty.
 */
async function* readSseFrames(stream: ReadableStream<Uint8Array>): AsyncGenerator<{ event?: string; data: string }> {
  const decoder = new TextDecoder()
  const reader = stream.getReader()
  // Blank-line frame separator with either LF or CRLF line endings.
  const frameBoundary = /\r?\n\r?\n/
  let buffer = ''

  try {
    while (true) {
      const { done, value } = await reader.read()
      if (done) break
      buffer += decoder.decode(value, { stream: true })

      let match = frameBoundary.exec(buffer)
      while (match) {
        const raw = buffer.slice(0, match.index)
        buffer = buffer.slice(match.index + match[0].length)
        const frame = parseSseFrame(raw)
        if (frame?.data) yield frame
        match = frameBoundary.exec(buffer)
      }
    }

    // Flush any bytes still held by the decoder, then the trailing frame.
    buffer += decoder.decode()
    const frame = parseSseFrame(buffer)
    if (frame?.data) yield frame
  } finally {
    reader.releaseLock()
  }
}

/**
 * Parses the text of one Server-Sent Events frame.
 *
 * Blank lines and comment lines (starting with `:`) are ignored. An `event:`
 * line sets the event name (trimmed). Each `data:` line contributes one line
 * of payload; multiple data lines are joined with `\n`. Other fields are
 * ignored.
 *
 * Only a single leading space is removed from a data value, as the SSE
 * format prescribes, so payloads that begin with meaningful whitespace
 * (for example indented text chunks) are preserved.
 *
 * @param raw Frame text without the terminating blank line.
 * @returns The parsed frame, or `null` when the frame has no `data:` line.
 */
function parseSseFrame(raw: string): { event?: string; data: string } | null {
  let event: string | undefined
  const data: string[] = []
  for (const line of raw.split(/\r?\n/)) {
    if (!line || line.startsWith(':')) continue
    if (line.startsWith('event:')) {
      event = line.slice(6).trim()
    } else if (line.startsWith('data:')) {
      const value = line.slice(5)
      data.push(value.startsWith(' ') ? value.slice(1) : value)
    }
  }
  if (data.length === 0) return null
  return { event, data: data.join('\n') }
}

/**
 * Pulls the assistant text out of a response object.
 *
 * Collects the `text` of every `output_text` / `text` part found in the
 * `content` of `message` items under `response.output` and concatenates
 * them. When no such part exists, falls back to `response.output_text` if it
 * is a string, and to the empty string otherwise.
 *
 * @param response Response payload of unknown shape.
 * @returns The extracted text, possibly empty.
 */
function extractResponseText(response: any): string {
  const items: any[] = Array.isArray(response?.output) ? response.output : []

  const texts: string[] = items
    .filter(item => item.type === 'message')
    .flatMap(item => (Array.isArray(item.content) ? item.content : []))
    .filter(part => part.type === 'output_text' || part.type === 'text')
    .map(part => part.text || '')

  if (texts.length === 0) {
    return typeof response?.output_text === 'string' ? response.output_text : ''
  }
  return texts.join('')
}