/**
 * LLM JSON Parsing Utilities
 *
 * Handles unreliable JSON output from large language models.
 * Provides extraction, tolerant parsing, and validation.
 *
 * Based on production-grade patterns for handling LLM JSON:
 * - Extract JSON from text (code blocks, plain objects)
 * - Fix common LLM mistakes (single quotes, missing quotes, trailing commas)
 * - Validate against schema (zod)
 * - Retry on failure
 */
|
||
|
|
|
||
|
|
/**
 * Extract the JSON portion of an LLM text response.
 *
 * Lookup order:
 * 1. The body of the first fenced code block (``` or ```json).
 * 2. The outermost balanced structure: an array when `[` opens before the
 *    first `{` and encloses it (e.g. `[{...}, {...}]`), otherwise the first
 *    balanced `{...}` object, otherwise the first balanced `[...]` array.
 * 3. For unbalanced input, a greedy first-opener-to-last-closer match.
 *
 * @param text - Raw model output.
 * @returns The extracted JSON-like substring (not yet validated or repaired).
 * @throws Error when `text` is empty / not a string, or contains no JSON.
 */
export function extractJSON(text: string): string {
  if (!text || typeof text !== 'string') {
    throw new Error('Invalid text: must be non-empty string')
  }

  const trimmed = text.trim()

  // Extract from ```json ... ``` code block
  const codeBlockMatch = trimmed.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
  if (codeBlockMatch) {
    return (codeBlockMatch[1] ?? '').trim()
  }

  const firstBrace = trimmed.indexOf('{')
  const firstBracket = trimmed.indexOf('[')

  // An array that opens first AND encloses the first object is the real
  // payload. A bracket that closes before the first `{` is treated as prose
  // (e.g. a "[1]" citation) and skipped in favour of the object.
  if (firstBracket >= 0 && (firstBrace < 0 || firstBracket < firstBrace)) {
    const arrayEnd = findBalancedJSONEnd(trimmed, firstBracket)
    if (arrayEnd > 0 && (firstBrace < 0 || firstBrace < arrayEnd)) {
      return trimmed.slice(firstBracket, arrayEnd)
    }
  }

  if (firstBrace >= 0) {
    const objectEnd = findBalancedJSONEnd(trimmed, firstBrace)
    if (objectEnd > 0) {
      return trimmed.slice(firstBrace, objectEnd)
    }
  }

  // Unbalanced input: fall back to greedy matching so downstream repair
  // still gets a chance to fix it.
  const objectMatch = trimmed.match(/\{[\s\S]*\}/)
  if (objectMatch) {
    return objectMatch[0]
  }

  const arrayMatch = trimmed.match(/\[[\s\S]*\]/)
  if (arrayMatch) {
    return arrayMatch[0]
  }

  throw new Error('No JSON found in text (no code blocks, objects, or arrays detected)')
}

/**
 * Return the index just past the bracket that closes the structure opened at
 * `start`, or -1 when the structure never closes. Brackets inside single- or
 * double-quoted strings are ignored, and backslash escapes are honoured.
 */
function findBalancedJSONEnd(source: string, start: number): number {
  let depth = 0
  let quote = ''

  for (let i = start; i < source.length; i++) {
    const ch = source.charAt(i)

    if (quote !== '') {
      if (ch === '\\') {
        i++ // skip the escaped character
      } else if (ch === quote) {
        quote = ''
      }
      continue
    }

    if (ch === '"' || ch === "'") {
      quote = ch
    } else if (ch === '{' || ch === '[') {
      depth++
    } else if (ch === '}' || ch === ']') {
      depth--
      if (depth === 0) {
        return i + 1
      }
    }
  }

  return -1
}
|
||
|
|
|
||
|
|
/**
 * Fix common LLM JSON mistakes before parsing.
 *
 * Repairs are applied only OUTSIDE string literals, so string values such as
 * URLs, times, apostrophes or the words "True"/"None" are left untouched:
 * - Python literals `True` / `False` / `None` -> `true` / `false` / `null`
 * - single-quoted strings -> double-quoted strings
 * - unquoted object keys (`{name: 1}` -> `{"name": 1}`)
 * - trailing commas before `}` or `]`
 * - text before/after the first balanced `{...}` / `[...]` is dropped
 *
 * When the input has no opener, or the first structure never closes, the
 * repairs are applied to the whole input and nothing is trimmed.
 *
 * @param jsonStr - JSON-like text to repair.
 * @returns The repaired text (not guaranteed to be valid JSON).
 * @throws Error when `jsonStr` is empty or not a string.
 */
export function fixLLMJSON(jsonStr: string): string {
  if (!jsonStr || typeof jsonStr !== 'string') {
    throw new Error('Invalid JSON string')
  }

  // Start at the first opener so apostrophes in leading prose cannot be
  // mistaken for string delimiters.
  const firstOpener = jsonStr.search(/[{[]/)
  if (firstOpener >= 0) {
    const trimmedAttempt = rewriteJSONLike(jsonStr, firstOpener, true)
    if (trimmedAttempt.balanced) {
      return trimmedAttempt.text
    }
  }

  return rewriteJSONLike(jsonStr, 0, false).text
}

/**
 * Walk `source` from `from`, copying string literals (normalised to double
 * quotes) and repairing the text between them. With `stopWhenBalanced`, the
 * walk ends at the bracket that closes the first opened structure.
 */
function rewriteJSONLike(
  source: string,
  from: number,
  stopWhenBalanced: boolean
): { text: string; balanced: boolean } {
  let output = ''
  let pending = '' // text outside strings, repaired when flushed
  let depth = 0
  let i = from

  while (i < source.length) {
    const ch = source.charAt(i)

    if (ch === '"' || ch === "'") {
      const close = findClosingQuoteIndex(source, i)
      if (close >= 0) {
        output += repairBareJSONText(pending)
        pending = ''
        output += toDoubleQuotedLiteral(source.slice(i + 1, close), ch)
        i = close + 1
        continue
      }
      // Unterminated quote: keep it as plain text and move on.
    }

    pending += ch
    i++

    if (ch === '{' || ch === '[') {
      depth++
    } else if (ch === '}' || ch === ']') {
      depth--
      if (stopWhenBalanced && depth === 0) {
        return { text: output + repairBareJSONText(pending), balanced: true }
      }
    }
  }

  return { text: output + repairBareJSONText(pending), balanced: false }
}

/**
 * Find the quote that closes the literal opened at `open`, or -1.
 * A double quote closes at the first unescaped `"`. A single quote only
 * closes when followed by `,`, `:`, `}`, `]` or end of input, so apostrophes
 * inside Python-style strings ('it's') do not end the literal early.
 */
function findClosingQuoteIndex(source: string, open: number): number {
  const quote = source.charAt(open)
  const afterSingleQuote = /\s*(?:[,:}\]]|$)/y

  for (let i = open + 1; i < source.length; i++) {
    const ch = source.charAt(i)
    if (ch === '\\') {
      i++ // skip the escaped character
      continue
    }
    if (ch !== quote) continue
    if (quote === '"') return i

    afterSingleQuote.lastIndex = i + 1
    if (afterSingleQuote.test(source)) return i
  }

  return -1
}

/**
 * Re-emit a string body as a double-quoted literal. Double-quoted input is
 * copied verbatim; single-quoted input has `\'` unescaped and bare `"` escaped.
 */
function toDoubleQuotedLiteral(body: string, quote: string): string {
  if (quote === '"') {
    return '"' + body + '"'
  }

  let literal = '"'
  for (let i = 0; i < body.length; i++) {
    const ch = body.charAt(i)
    if (ch === '\\' && i + 1 < body.length) {
      const next = body.charAt(i + 1)
      literal += next === "'" ? "'" : ch + next
      i++
    } else if (ch === '"') {
      literal += '\\"'
    } else {
      literal += ch
    }
  }
  return literal + '"'
}

/** Apply the literal, unquoted-key and trailing-comma repairs to non-string text. */
function repairBareJSONText(bare: string): string {
  return bare
    .replace(/\bTrue\b/g, 'true')
    .replace(/\bFalse\b/g, 'false')
    .replace(/\bNone\b/g, 'null')
    // A key is a bare word right after `{` or `,` and right before `:`.
    .replace(/([{,]\s*)([\w$][\w$-]*)(\s*:)/g, '$1"$2"$3')
    .replace(/,\s*([}\]])/g, '$1')
}
|
||
|
|
|
||
|
|
/**
 * Parse LLM JSON with fallback attempts.
 *
 * Strategies, in order:
 * 1. `JSON.parse(text)` as-is.
 * 2. `extractJSON` -> `fixLLMJSON` -> `JSON.parse`.
 * 3. `fixLLMJSON` on the whole text -> `JSON.parse`.
 *
 * Every strategy is deterministic, so each one runs at most once; repeating
 * them could never change the outcome.
 *
 * @param text - Raw model output.
 * @param retries - Kept for backward compatibility. Any value > 0 enables the
 *   repair strategies (2 and 3); 0 or less limits parsing to the direct
 *   attempt. The thrown message still reports `retries + 1`, as before.
 * @returns The parsed value.
 * @throws Error when every strategy fails; `cause` holds the collected errors
 *   in the order the strategies ran.
 */
export function parseLLMJSON(text: string, retries = 3): any {
  const errors: unknown[] = []

  // Attempt 1: Direct parse (already valid JSON)
  try {
    return JSON.parse(text)
  } catch (e) {
    errors.push(e)
  }

  if (retries > 0) {
    // Attempt 2: Extract and fix
    try {
      return JSON.parse(fixLLMJSON(extractJSON(text)))
    } catch (e) {
      errors.push(e)
    }

    // Attempt 3: extraction did not help, so try fixing the whole text
    try {
      return JSON.parse(fixLLMJSON(text))
    } catch (e) {
      errors.push(e)
    }
  }

  // All attempts failed
  const error = new Error(`Failed to parse LLM JSON after ${retries + 1} attempts`)
  // Object.assign keeps this type-correct on lib targets older than ES2022,
  // where `Error.prototype.cause` is not declared.
  Object.assign(error, { cause: errors })
  throw error
}
|
||
|
|
|
||
|
|
/**
 * Parse LLM output with `parseLLMJSON`, then validate it against a schema.
 *
 * @param text - Raw model output.
 * @param schema - Any object exposing a zod-style `parse(data)` method.
 * @param retries - Forwarded unchanged to `parseLLMJSON`.
 * @returns Whatever `schema.parse` returns for the parsed data.
 * @throws The `parseLLMJSON` error when the text cannot be parsed, or an
 *   Error('LLM JSON schema validation failed') whose `cause` is the error
 *   thrown by `schema.parse`.
 */
export async function parseLLMJSONWithSchema<T>(
  text: string,
  schema: { parse: (data: any) => T },
  retries = 3
): Promise<T> {
  const candidate = parseLLMJSON(text, retries)

  let validated: T
  try {
    validated = schema.parse(candidate)
  } catch (validationError) {
    // Wrap the validator's error so callers see one stable message.
    throw Object.assign(new Error('LLM JSON schema validation failed'), {
      cause: validationError,
    })
  }

  return validated
}
|
||
|
|
|
||
|
|
/**
 * Non-throwing variant of `parseLLMJSON`.
 *
 * Calls `parseLLMJSON(text, 1)` and converts any thrown error into `null`.
 * Note that a literal JSON `null` in the input also yields `null`.
 *
 * @param text - Raw model output.
 * @returns The parsed value, or `null` when parsing throws.
 */
export function safeParseLLMJSON(text: string): any | null {
  let result: any = null

  try {
    result = parseLLMJSON(text, 1)
  } catch {
    // Parsing failed: keep the null default.
  }

  return result
}
|
||
|
|
|
||
|
|
/**
 * Parse tool_call arguments from LLM output.
 * Specifically optimized for OpenAI-style tool calls.
 *
 * @param args - Either an already-parsed value (returned untouched) or a
 *   JSON / JSON-like string.
 * @returns The parsed arguments.
 * @throws Error('Invalid arguments: must be string or object') for anything
 *   that is neither a string nor a non-null object; otherwise whatever
 *   `parseLLMJSON` throws when the string cannot be repaired.
 */
export function parseToolArguments(args: string | object): any {
  // `typeof null` is 'object', so null must be excluded explicitly or it
  // would be handed back as if it were parsed arguments.
  if (args !== null && typeof args === 'object') {
    return args // Already parsed
  }

  if (typeof args !== 'string') {
    throw new Error('Invalid arguments: must be string or object')
  }

  const trimmed = args.trim()

  // Handle empty object / array without invoking the parser
  if (trimmed === '{}' || trimmed === '[]') {
    return trimmed === '{}' ? {} : []
  }

  try {
    // Try direct parse first
    return JSON.parse(trimmed)
  } catch {
    // Fall back to LLM JSON parsing
    return parseLLMJSON(trimmed, 2)
  }
}
|
||
|
|
|
||
|
|
/**
 * Parse array content from LLM (common in Anthropic-style messages).
 * Handles JSON arrays, Python-style arrays, and double-serialized arrays.
 *
 * Parsing order:
 * 1. Strict `JSON.parse`, so valid JSON is never altered.
 * 2. Python-to-JSON conversion (single quotes, `True` / `False` / `None`).
 * 3. `fixLLMJSON` repair followed by `JSON.parse`.
 *
 * @param content - Serialized content array.
 * @returns The parsed blocks, or `[]` when `content` is empty / not a string.
 * @throws Error('Content is not an array') when the text is not wrapped in
 *   `[` ... `]`; otherwise the error of the final fallback when nothing parses.
 */
export function parseAnthropicContentArray(content: string): Array<{
  type: string
  text?: string
  thinking?: string
  id?: string
  name?: string
  input?: any
}> {
  if (!content || typeof content !== 'string') {
    return []
  }

  const trimmed = content.trim()

  // Handle double-serialized content: "[{...}]" -> [{...}]
  let contentToParse = trimmed
  if (trimmed.startsWith('"') && trimmed.endsWith('"') && trimmed.length >= 2) {
    // Decode it as a JSON string first so inner escapes (\" and \\) are
    // resolved; if it is not a valid JSON string, just strip the outer quotes.
    let decoded: unknown
    try {
      decoded = JSON.parse(trimmed)
    } catch {
      decoded = undefined
    }
    contentToParse = typeof decoded === 'string' ? decoded : trimmed.slice(1, -1)
  }

  if (!contentToParse.startsWith('[') || !contentToParse.endsWith(']')) {
    throw new Error('Content is not an array')
  }

  const candidates: Array<() => string> = [
    // Already valid JSON: must not be rewritten.
    () => contentToParse,
    // Python repr: word boundaries keep words like "Trueman" intact.
    () =>
      contentToParse
        .replace(/'/g, '"') // Python single quotes
        .replace(/\bTrue\b/g, 'true')
        .replace(/\bFalse\b/g, 'false')
        .replace(/\bNone\b/g, 'null'),
  ]

  for (const buildCandidate of candidates) {
    try {
      const parsed = JSON.parse(buildCandidate())
      if (Array.isArray(parsed)) {
        return parsed
      }
    } catch {
      // Not parseable this way: try the next strategy.
    }
  }

  // Fall back to full LLM JSON parsing
  const parsed = JSON.parse(fixLLMJSON(contentToParse))

  if (!Array.isArray(parsed)) {
    throw new Error('Parsed content is not an array')
  }

  return parsed
}
|