feat: add robust LLM JSON parser and fix Group Chat schema (#388)

Add robust LLM JSON parsing utilities to handle unreliable model output:
- Parse tool arguments with tolerance for Python format (single quotes, trailing commas)
- Extract text from Anthropic-style content arrays in streaming events
- Normalize tool_result content to string format per Hermes spec
- Parse message.delta and run.completed output to avoid displaying JSON strings

Fix Group Chat database schema errors:
- Add id column as PRIMARY KEY to gc_room_agents and gc_room_members tables
- Change from composite primary keys to single-column id keys
- Update tests to match new schema structure

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
ekko
2026-05-02 08:58:14 +08:00
committed by GitHub
parent 9325aa5482
commit 969c7c0e1a
5 changed files with 457 additions and 135 deletions
+267
View File
@@ -0,0 +1,267 @@
/**
* LLM JSON Parsing Utilities
*
* Handles unreliable JSON output from large language models.
* Provides extraction, tolerant parsing, and validation.
*
* Based on production-grade patterns for handling LLM JSON:
* - Extract JSON from text (code blocks, plain objects)
* - Fix common LLM mistakes (single quotes, missing quotes, trailing commas)
* - Validate against schema (zod)
* - Retry on failure
*/
/**
 * Extract the JSON portion of free-form LLM output.
 *
 * Resolution order:
 * 1. The contents of the first fenced code block (``` or ```json).
 * 2. The span from the first `[` to the last `]`, but only when that span
 *    encloses the outermost `{...}` span (i.e. an array of objects).
 * 3. The span from the first `{` to the last `}`.
 * 4. The span from the first `[` to the last `]`.
 *
 * The result is only a candidate string; nothing is parsed or validated here.
 *
 * @param text - Raw model output.
 * @returns The extracted JSON candidate.
 * @throws Error when `text` is empty or not a string, or when no code block,
 *   object or array can be located.
 */
export function extractJSON(text: string): string {
  if (!text || typeof text !== 'string') {
    throw new Error('Invalid text: must be non-empty string')
  }
  const trimmed = text.trim()

  // A fenced code block wins over anything else in the text.
  const codeBlockMatch = trimmed.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
  if (codeBlockMatch) {
    return codeBlockMatch[1].trim()
  }

  // Outermost spans: first opener to last closer of each bracket kind.
  const objectStart = trimmed.indexOf('{')
  const objectEnd = trimmed.lastIndexOf('}')
  const arrayStart = trimmed.indexOf('[')
  const arrayEnd = trimmed.lastIndexOf(']')
  const hasObject = objectStart >= 0 && objectEnd > objectStart
  const hasArray = arrayStart >= 0 && arrayEnd > arrayStart

  // An array wrapping the objects must be returned whole. Cutting from the
  // first `{` to the last `}` would yield `{...},{...}`, which is not JSON.
  if (hasArray && hasObject && arrayStart < objectStart && arrayEnd > objectEnd) {
    return trimmed.slice(arrayStart, arrayEnd + 1)
  }
  if (hasObject) {
    return trimmed.slice(objectStart, objectEnd + 1)
  }
  if (hasArray) {
    return trimmed.slice(arrayStart, arrayEnd + 1)
  }
  throw new Error('No JSON found in text (no code blocks, objects, or arrays detected)')
}
/**
 * Repair common LLM / Python-repr deviations from strict JSON.
 *
 * The input is scanned once, character by character, so that repairs are
 * only applied OUTSIDE string literals (string contents such as URLs,
 * apostrophes or the word "True" are left untouched):
 * - leading prose before the first `{` or `[` is skipped, and scanning stops
 *   once that first value's brackets are balanced (trailing prose dropped);
 * - single-quoted strings become double-quoted (inner `"` is escaped, `\'`
 *   is unescaped);
 * - bare words followed by `:` are quoted as object keys;
 * - Python literals `True` / `False` / `None` become `true` / `false` / `null`;
 * - trailing commas before `}` or `]` are removed.
 *
 * The result is not guaranteed to be valid JSON; callers still need to
 * `JSON.parse` it and handle failure.
 *
 * @param jsonStr - Candidate JSON text.
 * @returns The repaired text. When the input contains no `{` or `[`, the
 *   whole input is repaired and returned.
 * @throws Error when `jsonStr` is empty or not a string.
 */
export function fixLLMJSON(jsonStr: string): string {
  if (!jsonStr || typeof jsonStr !== 'string') {
    throw new Error('Invalid JSON string')
  }

  const pythonLiterals = new Map<string, string>([
    ['True', 'true'],
    ['False', 'false'],
    ['None', 'null'],
  ])
  const wordChar = /\w/
  const whitespace = /\s/
  const length = jsonStr.length

  // First non-whitespace character at or after `from` ('' when past the end).
  const nextSignificant = (from: number): string => {
    let k = from
    while (k < length && whitespace.test(jsonStr.charAt(k))) k++
    return jsonStr.charAt(k)
  }

  // Start at the first opener so quotes in leading prose ("Here's ...")
  // are never mistaken for string delimiters.
  const firstOpener = jsonStr.search(/[{\[]/)
  let i = firstOpener >= 0 ? firstOpener : 0
  let depth = 0
  let fixed = ''

  while (i < length) {
    const ch = jsonStr.charAt(i)

    // String literal: copy through to the matching closing quote.
    if (ch === '"' || ch === "'") {
      let j = i + 1
      let body = ''
      while (j < length && jsonStr.charAt(j) !== ch) {
        const c = jsonStr.charAt(j)
        if (c === '\\' && j + 1 < length) {
          const escaped = jsonStr.charAt(j + 1)
          // `\'` is not a legal JSON escape; every other escape is kept as-is.
          body += escaped === "'" ? "'" : c + escaped
          j += 2
        } else {
          // A bare `"` can only occur here inside a single-quoted string.
          body += c === '"' ? '\\"' : c
          j++
        }
      }
      // Only emit the closing quote if the source string was terminated.
      fixed += j < length ? `"${body}"` : `"${body}`
      i = j + 1
      continue
    }

    // Bare word: unquoted key, Python literal, number or JSON literal.
    if (wordChar.test(ch)) {
      let j = i + 1
      while (j < length && wordChar.test(jsonStr.charAt(j))) j++
      const word = jsonStr.slice(i, j)
      if (nextSignificant(j) === ':') {
        fixed += `"${word}"`
      } else {
        fixed += pythonLiterals.get(word) ?? word
      }
      i = j
      continue
    }

    // Trailing comma: drop it when the next significant char closes a container.
    if (ch === ',') {
      const next = nextSignificant(i + 1)
      if (next !== '}' && next !== ']') fixed += ch
      i++
      continue
    }

    fixed += ch
    i++
    if (ch === '{' || ch === '[') {
      depth++
    } else if (ch === '}' || ch === ']') {
      depth--
      // The first top-level value is complete; ignore any trailing text.
      if (depth === 0) break
    }
  }

  return fixed
}
/**
 * Parse LLM output as JSON, falling back to progressively more tolerant
 * strategies:
 * 1. `JSON.parse(text)` as-is;
 * 2. `extractJSON(text)` followed by `fixLLMJSON`;
 * 3. `fixLLMJSON(text)` on the whole input.
 *
 * Every strategy is deterministic, so each one is run at most once;
 * repeating them cannot change the outcome.
 *
 * @param text - Raw model output.
 * @param retries - Kept for backward compatibility. A value of 0 or less
 *   disables the tolerant fallbacks (only the direct parse is attempted);
 *   any positive value enables them.
 * @returns The parsed value (unvalidated).
 * @throws Error when every attempted strategy fails; `cause` holds the
 *   individual errors in the order they occurred.
 */
export function parseLLMJSON(text: string, retries = 3): any {
  const errors: Error[] = []

  // Strategy 1: the text may already be valid JSON.
  try {
    return JSON.parse(text)
  } catch (e) {
    errors.push(e as Error)
  }

  if (retries > 0) {
    // Each entry builds a repaired candidate string; order is significant.
    const strategies: Array<() => string> = [
      () => fixLLMJSON(extractJSON(text)),
      () => fixLLMJSON(text),
    ]
    for (const buildCandidate of strategies) {
      try {
        return JSON.parse(buildCandidate())
      } catch (e) {
        errors.push(e as Error)
      }
    }
  }

  // All attempts failed
  const error = new Error(`Failed to parse LLM JSON after ${errors.length} attempts`)
  error.cause = errors
  throw error
}
/**
 * Parse LLM output with `parseLLMJSON`, then run the result through a
 * schema object exposing a `parse` method (zod-compatible).
 *
 * @param text - Raw model output.
 * @param schema - Validator whose `parse` returns the typed value or throws.
 * @param retries - Forwarded unchanged to `parseLLMJSON`.
 * @returns A promise resolving to the value returned by `schema.parse`.
 * @throws Rejects with the `parseLLMJSON` error when parsing fails, or with
 *   an `Error('LLM JSON schema validation failed')` whose `cause` is the
 *   error thrown by `schema.parse`.
 */
export async function parseLLMJSONWithSchema<T>(
  text: string,
  schema: { parse: (data: any) => T },
  retries = 3
): Promise<T> {
  // Parsing happens outside the try so its errors are not re-labelled
  // as validation failures.
  const candidate = parseLLMJSON(text, retries)

  let validated: T
  try {
    validated = schema.parse(candidate)
  } catch (validationFailure) {
    const wrapped = new Error('LLM JSON schema validation failed')
    wrapped.cause = validationFailure
    throw wrapped
  }
  return validated
}
/**
 * Non-throwing variant of `parseLLMJSON`.
 *
 * Calls `parseLLMJSON(text, 1)` and converts any thrown error into `null`.
 * Note that a successfully parsed JSON `null` is indistinguishable from a
 * failure.
 *
 * @param text - Raw model output.
 * @returns The parsed value, or `null` when parsing throws.
 */
export function safeParseLLMJSON(text: string): any | null {
  let outcome: any = null
  try {
    outcome = parseLLMJSON(text, 1)
  } catch {
    // Parsing failed: leave the default null in place.
  }
  return outcome
}
/**
 * Normalize the `arguments` payload of a tool call.
 *
 * - Values whose `typeof` is `'object'` are returned untouched (this
 *   includes `null`, since `typeof null === 'object'`).
 * - An empty or whitespace-only string is treated as "no arguments" and
 *   yields `{}` instead of failing to parse.
 * - `'{}'` and `'[]'` short-circuit to fresh empty containers.
 * - Any other string is parsed with `JSON.parse`, falling back to
 *   `parseLLMJSON(trimmed, 2)` for non-strict output.
 *
 * @param args - Tool arguments as a raw string or an already-parsed object.
 * @returns The parsed arguments (unvalidated).
 * @throws Error when `args` is neither a string nor an object, or when the
 *   string cannot be parsed even by the tolerant fallback.
 */
export function parseToolArguments(args: string | object): any {
  if (typeof args === 'object') {
    return args // Already parsed
  }
  if (typeof args !== 'string') {
    throw new Error('Invalid arguments: must be string or object')
  }

  const trimmed = args.trim()

  // A blank arguments string means the tool was called with no arguments.
  if (trimmed === '' || trimmed === '{}') {
    return {}
  }
  if (trimmed === '[]') {
    return []
  }

  try {
    // Try direct parse first
    return JSON.parse(trimmed)
  } catch {
    // Fall back to LLM JSON parsing
    return parseLLMJSON(trimmed, 2)
  }
}
/**
 * Parse a stringified array of content blocks (Anthropic-style
 * thinking / text / tool_use entries), tolerating Python-repr formatting.
 *
 * Steps:
 * 1. Empty or non-string input yields `[]`.
 * 2. If the trimmed text is wrapped in double quotes it is treated as
 *    double-serialized: it is decoded as a JSON string literal when
 *    possible (so inner `\"` escapes are resolved); otherwise only the
 *    outer quotes are stripped.
 * 3. The remaining text must start with `[` and end with `]`.
 * 4. Parsing is attempted in order of increasing invasiveness, and the
 *    first attempt that yields an array wins:
 *    a. strict `JSON.parse` (keeps apostrophes and words such as "None"
 *       inside text blocks intact);
 *    b. `fixLLMJSON` repair;
 *    c. blanket Python-to-JSON replacement (`'` to `"`, `True` / `False` /
 *       `None` to JSON literals) as a last resort.
 *
 * Elements are returned as parsed; their shape is not validated.
 *
 * @param content - Serialized content array.
 * @returns The parsed array of blocks.
 * @throws Error('Content is not an array') when the text is not bracketed
 *   by `[` and `]`; otherwise the error from the last failed attempt.
 */
export function parseAnthropicContentArray(content: string): Array<{
  type: string
  text?: string
  thinking?: string
  id?: string
  name?: string
  input?: any
}> {
  if (!content || typeof content !== 'string') {
    return []
  }
  const trimmed = content.trim()

  // Handle double-serialized content: "\"[{...}]\"" -> "[{...}]"
  let contentToParse = trimmed
  if (trimmed.length >= 2 && trimmed.startsWith('"') && trimmed.endsWith('"')) {
    contentToParse = trimmed.slice(1, -1)
    try {
      // Decoding the literal also resolves escapes such as \" and \\.
      const decoded: unknown = JSON.parse(trimmed)
      if (typeof decoded === 'string') {
        contentToParse = decoded.trim()
      }
    } catch {
      // Not a valid JSON string literal; keep the quote-stripped text.
    }
  }

  if (!contentToParse.startsWith('[') || !contentToParse.endsWith(']')) {
    throw new Error('Content is not an array')
  }

  // Candidate builders, least invasive first.
  const candidates: Array<() => string> = [
    () => contentToParse,
    () => fixLLMJSON(contentToParse),
    () =>
      contentToParse
        .replace(/'/g, '"') // Python single quotes
        .replace(/\bTrue\b/g, 'true')
        .replace(/\bFalse\b/g, 'false')
        .replace(/\bNone\b/g, 'null'),
  ]

  let lastError: unknown = new Error('Parsed content is not an array')
  for (const buildCandidate of candidates) {
    try {
      const parsed: unknown = JSON.parse(buildCandidate())
      if (Array.isArray(parsed)) {
        return parsed
      }
      lastError = new Error('Parsed content is not an array')
    } catch (e) {
      lastError = e
    }
  }
  throw lastError
}