From 4d8976784728726320f14f6d28872652cf615014 Mon Sep 17 00:00:00 2001
From: ekko <152005280+EKKOLearnAI@users.noreply.github.com>
Date: Thu, 21 May 2026 20:19:06 +0800
Subject: [PATCH] Clean multimodal tool results before storage (#911)

---
Review changes: restored the `Record<string, unknown>` type arguments, dropped the
unused `normalizeMessageContentForStorage` import from group-chat, and made group-chat
clean content with the same `role || 'user'` fallback it stores. The `index` blob ids
below are unchanged from the original patch and no longer match the new contents.

 .../server/src/db/hermes/message-content.ts   | 147 ++++++++++++++++++
 .../server/src/db/hermes/session-store.ts     |   5 +-
 .../src/services/hermes/group-chat/index.ts   |   9 +-
 .../message-content-normalization.test.ts     |  62 +++++++
 4 files changed, 220 insertions(+), 3 deletions(-)
 create mode 100644 packages/server/src/db/hermes/message-content.ts
 create mode 100644 tests/server/message-content-normalization.test.ts

diff --git a/packages/server/src/db/hermes/message-content.ts b/packages/server/src/db/hermes/message-content.ts
new file mode 100644
index 0000000..8b0b377
--- /dev/null
+++ b/packages/server/src/db/hermes/message-content.ts
@@ -0,0 +1,147 @@
+/** Content-part `type` values that carry an image; each is stored as `[screenshot]`. */
+const IMAGE_PART_TYPES = new Set(['image', 'image_url', 'input_image'])
+/** Matches inline base64 `data:image/...` URIs, including payloads broken by CR/LF. */
+const DATA_IMAGE_RE = /data:image\/[a-zA-Z0-9.+-]+;base64,[A-Za-z0-9+/=\r\n]+/g
+
+/** Narrows `value` to a non-null, non-array object. */
+function isPlainRecord(value: unknown): value is Record<string, unknown> {
+  return value !== null && typeof value === 'object' && !Array.isArray(value)
+}
+
+/** Narrows `value` to a record that has a string `type` field. */
+function isContentPart(value: unknown): value is Record<string, unknown> {
+  return isPlainRecord(value) && typeof value.type === 'string'
+}
+
+/**
+ * Flattens a content-part array into newline-joined text, replacing image parts with
+ * `[screenshot]`. Entries that are neither text nor image parts are skipped.
+ *
+ * @returns The summary, or `null` when the array holds no text or image part.
+ */
+function summarizeContentParts(parts: unknown[]): string | null {
+  let sawContentPart = false
+  const text: string[] = []
+
+  for (const part of parts) {
+    if (!isContentPart(part)) continue
+    const type = String(part.type)
+    if (type === 'text') {
+      sawContentPart = true
+      const value = part.text
+      if (value != null) text.push(String(value))
+    } else if (IMAGE_PART_TYPES.has(type)) {
+      sawContentPart = true
+      text.push('[screenshot]')
+    }
+  }
+
+  // Empty text fragments are dropped so they do not produce blank lines.
+  return sawContentPart ? text.filter(Boolean).join('\n') : null
+}
+
+/**
+ * Summarizes an object whose `content` is a non-empty part array (with or without the
+ * `_multimodal` flag). Other keys of the object are not carried into the summary.
+ *
+ * @returns The summary, or `null` when `value` has no such array or it holds no text or image part.
+ */
+function summarizeMultimodalEnvelope(value: Record<string, unknown>): string | null {
+  if (value._multimodal !== true && !Array.isArray(value.content)) return null
+  const parts = Array.isArray(value.content) ? value.content : []
+  if (!parts.length) return null
+  return summarizeContentParts(parts)
+}
+
+/** Recursively copies arrays and objects, replacing inline base64 image URIs in strings with `[screenshot]`. */
+function redactDataImages(value: unknown): unknown {
+  if (typeof value === 'string') return value.replace(DATA_IMAGE_RE, '[screenshot]')
+  if (Array.isArray(value)) return value.map(redactDataImages)
+  if (!isPlainRecord(value)) return value
+
+  const cleaned: Record<string, unknown> = {}
+  for (const [key, child] of Object.entries(value)) {
+    cleaned[key] = redactDataImages(child)
+  }
+  return cleaned
+}
+
+/** Summarizes a content-part array or a multimodal envelope; returns `null` for any other shape. */
+function summarizeKnownMultimodalContent(value: unknown): string | null {
+  if (Array.isArray(value)) {
+    return summarizeContentParts(value)
+  }
+
+  if (isPlainRecord(value)) {
+    return summarizeMultimodalEnvelope(value)
+  }
+
+  return null
+}
+
+/**
+ * Converts an array or object to its stored form: the multimodal summary when one exists,
+ * otherwise JSON with inline images redacted.
+ *
+ * @returns The serialized content, or `null` when `value` is neither an array nor an object.
+ */
+function serializeStructuredMessageContent(value: unknown): string | null {
+  const summary = summarizeKnownMultimodalContent(value)
+  if (summary != null) return summary
+  if (Array.isArray(value) || isPlainRecord(value)) return JSON.stringify(redactDataImages(value))
+  return null
+}
+
+/**
+ * Cheap pre-check so that only JSON-looking strings that mention an image marker are
+ * parsed; every other string skips `JSON.parse` and is never re-serialized.
+ */
+function shouldTryParseStructuredString(value: string): boolean {
+  const trimmed = value.trim()
+  if (!trimmed || (!trimmed.startsWith('{') && !trimmed.startsWith('['))) return false
+  if (trimmed.includes('_multimodal') || trimmed.includes('data:image/')) return true
+  return (
+    trimmed.includes('"image_url"') ||
+    trimmed.includes('"input_image"') ||
+    trimmed.includes('"type":"image"') ||
+    trimmed.includes('"type": "image"')
+  )
+}
+
+/**
+ * Produces the string persisted for a message's content.
+ *
+ * Known multimodal shapes become plain text with `[screenshot]` in place of each image;
+ * other structured values are stored as JSON with inline base64 images redacted; plain
+ * strings only have inline base64 image URIs replaced.
+ *
+ * @param content - Raw content: a string (possibly JSON-encoded), array, object, or other value.
+ * @returns The storage-safe string; `null` and `undefined` become `''`.
+ */
+export function normalizeMessageContentForStorage(content: unknown): string {
+  if (typeof content === 'string') {
+    if (shouldTryParseStructuredString(content)) {
+      try {
+        const parsed: unknown = JSON.parse(content.trim())
+        const summary = summarizeKnownMultimodalContent(parsed)
+        if (summary != null) return summary
+        return JSON.stringify(redactDataImages(parsed))
+      } catch {
+        // Fall back to direct redaction below.
+      }
+    }
+    return content.replace(DATA_IMAGE_RE, '[screenshot]')
+  }
+
+  const normalized = serializeStructuredMessageContent(content)
+  if (normalized != null) return normalized
+  return String(content ?? '')
+}
+
+/**
+ * Role-aware wrapper: content of `user` messages is returned unchanged; every other role
+ * (including a missing one) is normalized.
+ */
+export function normalizeMessageContentForStorageRole(role: string | undefined | null, content: string): string {
+  return role === 'user' ? content : normalizeMessageContentForStorage(content)
+}
diff --git a/packages/server/src/db/hermes/session-store.ts b/packages/server/src/db/hermes/session-store.ts
index 53d5f22..613b212 100644
--- a/packages/server/src/db/hermes/session-store.ts
+++ b/packages/server/src/db/hermes/session-store.ts
@@ -4,6 +4,7 @@
  */
 import { isSqliteAvailable, getDb } from '../index'
 import { SESSIONS_TABLE, MESSAGES_TABLE } from './schemas'
+import { normalizeMessageContentForStorageRole } from './message-content'
 
 // Re-export types for compatibility with sessions-db.ts consumers
 export interface HermesSessionRow {
@@ -377,7 +378,7 @@ export function addMessage(msg: {
     `INSERT INTO ${MESSAGES_TABLE} (session_id, role, content, tool_call_id, tool_calls, tool_name, timestamp, token_count, finish_reason, reasoning, reasoning_details, reasoning_content)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
   ).run(
-    msg.session_id, msg.role, msg.content,
+    msg.session_id, msg.role, normalizeMessageContentForStorageRole(msg.role, msg.content),
     msg.tool_call_id ?? null, toolCallsJson, msg.tool_name ?? null,
     msg.timestamp ?? Math.floor(Date.now() / 1000),
     msg.token_count ?? null, msg.finish_reason ?? null,
@@ -412,7 +413,7 @@ export function addMessages(msgs: Array<{
     for (const msg of msgs) {
       const toolCallsJson = msg.tool_calls ? JSON.stringify(msg.tool_calls) : null
       insert.run(
-        msg.session_id, msg.role, msg.content,
+        msg.session_id, msg.role, normalizeMessageContentForStorageRole(msg.role, msg.content),
         msg.tool_call_id ?? null, toolCallsJson, msg.tool_name ?? null,
         msg.timestamp ?? Math.floor(Date.now() / 1000),
         msg.token_count ?? null, msg.finish_reason ?? null,
diff --git a/packages/server/src/services/hermes/group-chat/index.ts b/packages/server/src/services/hermes/group-chat/index.ts
index 2b858f8..8265f69 100644
--- a/packages/server/src/services/hermes/group-chat/index.ts
+++ b/packages/server/src/services/hermes/group-chat/index.ts
@@ -3,6 +3,7 @@ import type { Server as HttpServer } from 'http'
 import { getToken } from '../../../services/auth'
 import { logger } from '../../../services/logger'
 import { getDb } from '../../../db'
+import { normalizeMessageContentForStorageRole } from '../../../db/hermes/message-content'
 import { AgentClients, GROUP_CHAT_AGENT_SOCKET_SECRET } from './agent-clients'
 import { ContextEngine } from '../context-engine/compressor'
 import { SessionDeleter } from '../session-deleter'
@@ -34,6 +35,12 @@ function contentToStorageString(content: unknown): string {
   return JSON.stringify(content ?? '')
 }
 
+// ChatStorage persists `msg.role || 'user'`, so apply the same fallback here; otherwise a
+// role-less message would be stored as a user row with its content rewritten.
+function messageContentForStorage(role: string | undefined, content: string): string {
+  return normalizeMessageContentForStorageRole(role || 'user', content)
+}
+
 function contentToText(content: unknown): string {
   if (typeof content === 'string') {
     const trimmed = content.trim()
@@ -406,7 +413,7 @@ class ChatStorage {
                 reasoning_details = excluded.reasoning_details,
                 reasoning_content = excluded.reasoning_content`
         ).run(
-            msg.id, msg.roomId, msg.senderId, msg.senderName, msg.content, msg.timestamp,
+            msg.id, msg.roomId, msg.senderId, msg.senderName, messageContentForStorage(msg.role, msg.content), msg.timestamp,
             msg.role || 'user',
             msg.tool_call_id ?? null,
             toolCallsJson,
diff --git a/tests/server/message-content-normalization.test.ts b/tests/server/message-content-normalization.test.ts
new file mode 100644
index 0000000..ba09851
--- /dev/null
+++ b/tests/server/message-content-normalization.test.ts
@@ -0,0 +1,62 @@
+import { describe, expect, it } from 'vitest'
+
+import {
+  normalizeMessageContentForStorage,
+  normalizeMessageContentForStorageRole,
+} from '../../packages/server/src/db/hermes/message-content'
+
+describe('message content normalization', () => {
+  it('summarizes multimodal envelopes without persisting base64 images', () => {
+    const content = {
+      _multimodal: true,
+      content: [
+        { type: 'text', text: 'Image loaded into context.' },
+        { type: 'image_url', image_url: { url: 'data:image/png;base64,AAAA' } },
+      ],
+    }
+
+    const normalized = normalizeMessageContentForStorage(JSON.stringify(content))
+
+    expect(normalized).toBe('Image loaded into context.\n[screenshot]')
+    expect(normalized).not.toContain('data:image/')
+    expect(normalized).not.toContain('AAAA')
+  })
+
+  it('summarizes OpenAI-style content part arrays', () => {
+    const normalized = normalizeMessageContentForStorage([
+      { type: 'text', text: 'Question: what is shown?' },
+      { type: 'input_image', image_url: 'data:image/jpeg;base64,BBBB' },
+    ])
+
+    expect(normalized).toBe('Question: what is shown?\n[screenshot]')
+  })
+
+  it('redacts nested data images in non-envelope JSON without dropping other fields', () => {
+    const normalized = normalizeMessageContentForStorage(JSON.stringify({
+      output: {
+        url: 'data:image/png;base64,CCCC',
+        status: 'ok',
+      },
+    }))
+
+    expect(JSON.parse(normalized)).toEqual({
+      output: {
+        url: '[screenshot]',
+        status: 'ok',
+      },
+    })
+  })
+
+  it('does not parse or rewrite unrelated JSON strings', () => {
+    const content = '{\n "type": "event",\n "payload": "ok"\n}'
+
+    expect(normalizeMessageContentForStorage(content)).toBe(content)
+  })
+
+  it('keeps user-authored image data untouched and only cleans non-user messages', () => {
+    const content = '{"content":[{"type":"image_url","image_url":{"url":"data:image/png;base64,DDDD"}}]}'
+
+    expect(normalizeMessageContentForStorageRole('user', content)).toBe(content)
+    expect(normalizeMessageContentForStorageRole('tool', content)).not.toContain('data:image/')
+  })
+})