feat: convert image uploads to base64 multimodal format (#610)

* fix: lower context compression message threshold from 200 to 150

Reduce the message count threshold that triggers LLM-based context
compression to avoid excessively long histories before compression kicks in.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

* feat: convert image uploads to base64 multimodal format for API

Images sent by users are now read from disk, converted to base64 data
URLs, and sent as input_image parts in the /v1/responses API request
instead of being replaced with text placeholders. File attachments remain
as text mentions.

- convertContentBlocks returns multimodal array instead of plain text
- Input is wrapped in [{role:"user", content:[...]}] format for gateway
- History conversion extracts text only (no base64 in conversation_history)
- Add debug logging for request input preview

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

* chore: remove debug console.log from chat-run-socket

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
ekko
2026-05-10 19:50:54 +08:00
committed by GitHub
parent 89f0127da6
commit 377fa4144d
@@ -65,25 +65,36 @@ function isContentBlockArray(input: any): input is ContentBlock[] {
}
/**
* Convert file/image blocks with path to base64 format for upstream API
* Convert ContentBlock[] to multimodal format for /v1/responses API.
*
* Converts images to base64 data URLs for Anthropic/OpenAI API compatibility.
* File attachments are converted to text mentions.
* - text → { type: "input_text", text }
* - image → { type: "input_image", image_url: "data:image/...;base64,..." }
* - file → text mention [File: name]
*/
async function convertContentBlocks(blocks: ContentBlock[]): Promise<string> {
let contentStr = ''
async function convertContentBlocks(blocks: ContentBlock[]): Promise<Array<{ type: string; text?: string; image_url?: string }>> {
const parts: Array<{ type: string; text?: string; image_url?: string }> = []
const fs = await import('fs/promises')
const path = await import('path')
for (const block of blocks) {
if (block.type === 'text') {
contentStr += block.text
parts.push({ type: 'input_text', text: block.text })
} else if (block.type === 'image') {
contentStr += `[Image: ${block.path}]`
try {
const buf = await fs.readFile(block.path)
const ext = path.extname(block.path).toLowerCase().replace('.', '')
const mime = ext === 'jpg' ? 'jpeg' : ext || 'png'
const base64 = buf.toString('base64')
parts.push({ type: 'input_image', image_url: `data:image/${mime};base64,${base64}` })
} catch {
parts.push({ type: 'input_text', text: `[Image: ${block.path}]` })
}
} else if (block.type === 'file') {
contentStr += `[File: ${block.path}]`
parts.push({ type: 'input_text', text: `[File: ${block.name || block.path}]` })
}
}
return contentStr
return parts
}
const compressor = new ChatContextCompressor()
@@ -112,8 +123,12 @@ function convertHistoryFormat(messages: any[]): any[] {
if (typeof content === 'string') {
result.push({ role: 'user', content: content })
} else if (Array.isArray(content)) {
// Already in array format, assume it's correct
result.push({ role: 'user', content: convertContentBlocks(content) })
// Extract text from content blocks for history
const textParts = content
.filter((b: any) => b.type === 'text')
.map((b: any) => b.text)
.join('\n')
result.push({ role: 'user', content: textParts || JSON.stringify(content) })
}
continue
}
@@ -660,7 +675,7 @@ export class ChatRunSocket {
logger.info('[context-compress] session=%s: snapshot at %d, %d new messages, assembled ~%d tokens (threshold %d)',
session_id, snapshot.lastMessageIndex, newMessages.length, totalTokens, triggerTokens)
// triggerTokens
if (totalTokens <= triggerTokens && newMessages.length <= 200) {
if (totalTokens <= triggerTokens && newMessages.length <= 150) {
// Under threshold — use assembled context directly, no LLM call needed
history = [
{ role: 'user', content: SUMMARY_PREFIX + '\n\n' + snapshot.summary },
@@ -766,7 +781,7 @@ export class ChatRunSocket {
} else if (history.length > 4) {
// No snapshot — check if raw history exceeds threshold
if (totalTokens <= triggerTokens && history.length <= 200) {
if (totalTokens <= triggerTokens && history.length <= 150) {
// Under threshold — use raw history as-is
logger.info('[context-compress] session=%s: %d messages, ~%d tokens — under threshold, skip', session_id, history.length, totalTokens)
} else {
@@ -880,9 +895,10 @@ export class ChatRunSocket {
const headers: Record<string, string> = { 'Content-Type': 'application/json' }
if (apiKey) headers['Authorization'] = `Bearer ${apiKey}`
// Convert input from ContentBlock[] to Anthropic format (with base64 images)
// Convert input from ContentBlock[] to multimodal message format for /v1/responses
if (isContentBlockArray(input)) {
body.input = await convertContentBlocks(input)
const parts = await convertContentBlocks(input)
body.input = [{ role: 'user', content: parts }]
}
// Debug: write history to JSON file for analysis (before conversion)