From 377fa4144d99c2ae628840640d782a513e4fda32 Mon Sep 17 00:00:00 2001 From: ekko <152005280+EKKOLearnAI@users.noreply.github.com> Date: Sun, 10 May 2026 19:50:54 +0800 Subject: [PATCH] feat: convert image uploads to base64 multimodal format (#610) * fix: lower context compression message threshold from 200 to 150 Reduce the message count threshold that triggers LLM-based context compression to avoid excessively long histories before compression kicks in. Co-Authored-By: Claude Opus 4.7 * feat: convert image uploads to base64 multimodal format for API Images sent by users are now read from disk, converted to base64 data URLs, and sent as input_image parts in the /v1/responses API request instead of being replaced with text placeholders. File attachments remain as text mentions. - convertContentBlocks returns multimodal array instead of plain text - Input is wrapped in [{role:"user", content:[...]}] format for gateway - History conversion extracts text only (no base64 in conversation_history) - Add debug logging for request input preview Co-Authored-By: Claude Opus 4.7 * chore: remove debug console.log from chat-run-socket Co-Authored-By: Claude Opus 4.7 --------- Co-authored-by: Claude Opus 4.7 --- .../src/services/hermes/chat-run-socket.ts | 46 +++++++++++++------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/packages/server/src/services/hermes/chat-run-socket.ts b/packages/server/src/services/hermes/chat-run-socket.ts index bba8dd5..31e4d9d 100644 --- a/packages/server/src/services/hermes/chat-run-socket.ts +++ b/packages/server/src/services/hermes/chat-run-socket.ts @@ -65,25 +65,36 @@ function isContentBlockArray(input: any): input is ContentBlock[] { } /** - * Convert file/image blocks with path to base64 format for upstream API + * Convert ContentBlock[] to multimodal format for /v1/responses API. * - * Converts images to base64 data URLs for Anthropic/OpenAI API compatibility. - * File attachments are converted to text mentions. + * - text → { type: "input_text", text } + * - image → { type: "input_image", image_url: "data:image/...;base64,..." } + * - file → text mention [File: name] */ -async function convertContentBlocks(blocks: ContentBlock[]): Promise { - let contentStr = '' +async function convertContentBlocks(blocks: ContentBlock[]): Promise> { + const parts: Array<{ type: string; text?: string; image_url?: string }> = [] + const fs = await import('fs/promises') + const path = await import('path') for (const block of blocks) { if (block.type === 'text') { - contentStr += block.text + parts.push({ type: 'input_text', text: block.text }) } else if (block.type === 'image') { - contentStr += `[Image: ${block.path}]` + try { + const buf = await fs.readFile(block.path) + const ext = path.extname(block.path).toLowerCase().replace('.', '') + const mime = ext === 'jpg' ? 'jpeg' : ext || 'png' + const base64 = buf.toString('base64') + parts.push({ type: 'input_image', image_url: `data:image/${mime};base64,${base64}` }) + } catch { + parts.push({ type: 'input_text', text: `[Image: ${block.path}]` }) + } } else if (block.type === 'file') { - contentStr += `[File: ${block.path}]` + parts.push({ type: 'input_text', text: `[File: ${block.name || block.path}]` }) } } - return contentStr + return parts } const compressor = new ChatContextCompressor() @@ -112,8 +123,12 @@ function convertHistoryFormat(messages: any[]): any[] { if (typeof content === 'string') { result.push({ role: 'user', content: content }) } else if (Array.isArray(content)) { - // Already in array format, assume it's correct - result.push({ role: 'user', content: convertContentBlocks(content) }) + // Extract text from content blocks for history + const textParts = content + .filter((b: any) => b.type === 'text') + .map((b: any) => b.text) + .join('\n') + result.push({ role: 'user', content: textParts || JSON.stringify(content) }) } continue } @@ -660,7 +675,7 @@ export class ChatRunSocket { logger.info('[context-compress] session=%s: snapshot at %d, %d new messages, assembled ~%d tokens (threshold %d)', session_id, snapshot.lastMessageIndex, newMessages.length, totalTokens, triggerTokens) // triggerTokens - if (totalTokens <= triggerTokens && newMessages.length <= 200) { + if (totalTokens <= triggerTokens && newMessages.length <= 150) { // Under threshold — use assembled context directly, no LLM call needed history = [ { role: 'user', content: SUMMARY_PREFIX + '\n\n' + snapshot.summary }, @@ -766,7 +781,7 @@ export class ChatRunSocket { } else if (history.length > 4) { // No snapshot — check if raw history exceeds threshold - if (totalTokens <= triggerTokens && history.length <= 200) { + if (totalTokens <= triggerTokens && history.length <= 150) { // Under threshold — use raw history as-is logger.info('[context-compress] session=%s: %d messages, ~%d tokens — under threshold, skip', session_id, history.length, totalTokens) } else { @@ -880,9 +895,10 @@ export class ChatRunSocket { const headers: Record = { 'Content-Type': 'application/json' } if (apiKey) headers['Authorization'] = `Bearer ${apiKey}` - // Convert input from ContentBlock[] to Anthropic format (with base64 images) + // Convert input from ContentBlock[] to multimodal message format for /v1/responses if (isContentBlockArray(input)) { - body.input = await convertContentBlocks(input) + const parts = await convertContentBlocks(input) + body.input = [{ role: 'user', content: parts }] } // Debug: write history to JSON file for analysis (before conversion)