From 377fa4144d99c2ae628840640d782a513e4fda32 Mon Sep 17 00:00:00 2001
From: ekko <152005280+EKKOLearnAI@users.noreply.github.com>
Date: Sun, 10 May 2026 19:50:54 +0800
Subject: [PATCH] feat: convert image uploads to base64 multimodal format
 (#610)

* fix: lower context compression message threshold from 200 to 150

Reduce the message count threshold that triggers LLM-based context
compression to avoid excessively long histories before compression kicks in.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

* feat: convert image uploads to base64 multimodal format for API

Images sent by users are now read from disk, converted to base64 data
URLs, and sent as input_image parts in the /v1/responses API request
instead of being replaced with text placeholders. File attachments remain
as text mentions.

- convertContentBlocks returns multimodal array instead of plain text
- Input is wrapped in [{role:"user", content:[...]}] format for gateway
- History conversion extracts text only (no base64 in conversation_history)
- Add debug logging for request input preview

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

* chore: remove debug console.log from chat-run-socket

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/services/hermes/chat-run-socket.ts    | 46 +++++++++++++------
 1 file changed, 31 insertions(+), 15 deletions(-)
diff --git a/packages/server/src/services/hermes/chat-run-socket.ts b/packages/server/src/services/hermes/chat-run-socket.ts
index bba8dd5..31e4d9d 100644
--- a/packages/server/src/services/hermes/chat-run-socket.ts
+++ b/packages/server/src/services/hermes/chat-run-socket.ts
@@ -65,25 +65,36 @@ function isContentBlockArray(input: any): input is ContentBlock[] {
 }
 
 /**
- * Convert file/image blocks with path to base64 format for upstream API
+ * Convert ContentBlock[] to multimodal format for /v1/responses API.
  *
- * Converts images to base64 data URLs for Anthropic/OpenAI API compatibility.
- * File attachments are converted to text mentions.
+ * - text → { type: "input_text", text }
+ * - image → { type: "input_image", image_url: "data:image/...;base64,..." }, or an input_text "[Image: path]" part if the file cannot be read
+ * - file → { type: "input_text", text: "[File: name]" } (falls back to the path when name is missing)
  */
-async function convertContentBlocks(blocks: ContentBlock[]): Promise<string> {
-  let contentStr = ''
+async function convertContentBlocks(blocks: ContentBlock[]): Promise<Array<{ type: string; text?: string; image_url?: string }>> {
+  const parts: Array<{ type: string; text?: string; image_url?: string }> = []
+  const fs = await import('fs/promises')
+  const path = await import('path')
 
   for (const block of blocks) {
     if (block.type === 'text') {
-      contentStr += block.text
+      parts.push({ type: 'input_text', text: block.text })
     } else if (block.type === 'image') {
-      contentStr += `[Image: ${block.path}]`
+      try {
+        const buf = await fs.readFile(block.path)
+        const ext = path.extname(block.path).toLowerCase().replace('.', '')
+        const mime = ext === 'jpg' ? 'jpeg' : ext || 'png'
+        const base64 = buf.toString('base64')
+        parts.push({ type: 'input_image', image_url: `data:image/${mime};base64,${base64}` })
+      } catch {
+        parts.push({ type: 'input_text', text: `[Image: ${block.path}]` })
+      }
     } else if (block.type === 'file') {
-      contentStr += `[File: ${block.path}]`
+      parts.push({ type: 'input_text', text: `[File: ${block.name || block.path}]` })
     }
   }
 
-  return contentStr
+  return parts
 }
 
 const compressor = new ChatContextCompressor()
@@ -112,8 +123,12 @@ function convertHistoryFormat(messages: any[]): any[] {
       if (typeof content === 'string') {
         result.push({ role: 'user', content: content })
       } else if (Array.isArray(content)) {
-        // Already in array format, assume it's correct
-        result.push({ role: 'user', content: convertContentBlocks(content) })
+        // Extract text from content blocks for history
+        const textParts = content
+          .filter((b: any) => b.type === 'text')
+          .map((b: any) => b.text)
+          .join('\n')
+        result.push({ role: 'user', content: textParts || JSON.stringify(content) })
       }
       continue
     }
@@ -660,7 +675,7 @@ export class ChatRunSocket {
               logger.info('[context-compress] session=%s: snapshot at %d, %d new messages, assembled ~%d tokens (threshold %d)',
                 session_id, snapshot.lastMessageIndex, newMessages.length, totalTokens, triggerTokens)
               // triggerTokens
-              if (totalTokens <= triggerTokens && newMessages.length <= 200) {
+              if (totalTokens <= triggerTokens && newMessages.length <= 150) {
                 // Under threshold — use assembled context directly, no LLM call needed
                 history = [
                   { role: 'user', content: SUMMARY_PREFIX + '\n\n' + snapshot.summary },
@@ -766,7 +781,7 @@ export class ChatRunSocket {
             } else if (history.length > 4) {
               // No snapshot — check if raw history exceeds threshold
 
-              if (totalTokens <= triggerTokens && history.length <= 200) {
+              if (totalTokens <= triggerTokens && history.length <= 150) {
                 // Under threshold — use raw history as-is
                 logger.info('[context-compress] session=%s: %d messages, ~%d tokens — under threshold, skip', session_id, history.length, totalTokens)
               } else {
@@ -880,9 +895,10 @@ export class ChatRunSocket {
 
       const headers: Record<string, string> = { 'Content-Type': 'application/json' }
       if (apiKey) headers['Authorization'] = `Bearer ${apiKey}`
-      // Convert input from ContentBlock[] to Anthropic format (with base64 images)
+      // Convert input from ContentBlock[] to multimodal message format for /v1/responses
       if (isContentBlockArray(input)) {
-        body.input = await convertContentBlocks(input)
+        const parts = await convertContentBlocks(input)
+        body.input = [{ role: 'user', content: parts }]
       }
 
       // Debug: write history to JSON file for analysis (before conversion)