From 0d48b71b27ea4554de65d5b2809fbbffcb3da770 Mon Sep 17 00:00:00 2001
From: Simon <10131203+gaomeng1900@users.noreply.github.com>
Date: Mon, 20 Oct 2025 22:03:09 +0800
Subject: [PATCH] feat(llm): auto fixing known llm format errors

---
 ROADMAP.md                      |   2 +-
 src/config/index.ts             |   2 +-
 src/llms/OpenAIClient.ts        |   2 +-
 src/llms/OpenAILenientClient.ts | 139 +++++++++++++++++++++++++
 src/llms/index.ts               |   2 +-
 src/llms/types.ts               |   4 +-
 src/llms/utils.ts               | 173 ++++++++++++++++++++++++++++++++
 7 files changed, 318 insertions(+), 6 deletions(-)
 create mode 100644 src/llms/OpenAILenientClient.ts
diff --git a/ROADMAP.md b/ROADMAP.md
index c8b0a08..de5d612 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -11,7 +11,7 @@ The development progress and future plans for PageAgent.
 - [x] **UI with HITL** - Human-in-the-loop user interface
 - [x] **Landing and doc pages**
 - [x] **Remove ai-sdk** - Only one function is being used
-- [ ] **Robust LLM output**
+- [x] **Robust LLM output**
 - [ ] **Working homepage with live LLM API**
 - [ ] **Hooks for Task and HITL**
 - [ ] **Hijacking `page_open` event**
diff --git a/src/config/index.ts b/src/config/index.ts
index 2907474..3c1cf6e 100644
--- a/src/config/index.ts
+++ b/src/config/index.ts
@@ -24,7 +24,7 @@ export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> {
 		baseURL: config.baseURL ?? DEFAULT_BASE_URL,
 		apiKey: config.apiKey ?? DEFAULT_API_KEY,
 		modelName: config.modelName ?? DEFAULT_MODEL_NAME,
-		temperature: config.temperature ?? 0.5, // higher randomness helps auto-recovery
+		temperature: config.temperature ?? 0.7, // higher randomness helps auto-recovery
 		maxTokens: config.maxTokens ?? 4096,
 		maxRetries: config.maxRetries ?? LLM_MAX_RETRIES,
 	}
diff --git a/src/llms/OpenAIClient.ts b/src/llms/OpenAIClient.ts
index 7bd389b..bf3f72e 100644
--- a/src/llms/OpenAIClient.ts
+++ b/src/llms/OpenAIClient.ts
@@ -180,7 +180,7 @@ export class OpenAIClient implements LLMClient {
 		// 9. Return result (including cache tokens)
 		return {
 			toolCall: {
-				id: toolCall.id,
+				// id: toolCall.id,
 				name: toolName,
 				args: validation.data as Record<string, unknown>,
 			},
diff --git a/src/llms/OpenAILenientClient.ts b/src/llms/OpenAILenientClient.ts
new file mode 100644
index 0000000..655c44c
--- /dev/null
+++ b/src/llms/OpenAILenientClient.ts
@@ -0,0 +1,139 @@
+/**
+ * Lenient OpenAI-compatible client: responses are parsed with lenientParseMacroToolCall
+ */
+import type { MacroToolInput } from '@/PageAgent'
+
+import { InvokeError, InvokeErrorType } from './errors'
+import type { InvokeResult, LLMClient, Message, OpenAIClientConfig, Tool } from './types'
+import { lenientParseMacroToolCall, zodToOpenAITool } from './utils'
+
+// Claude's OpenAI-compatible API uses a different format for some fields (spread last into the request body)
+const CLAUDE_PATCH = {
+	tool_choice: { type: 'tool', name: 'AgentOutput' },
+	thinking: { type: 'disabled' },
+}
+
+export class OpenAIClient implements LLMClient {
+	config: OpenAIClientConfig
+
+	constructor(config: OpenAIClientConfig) {
+		this.config = config
+	}
+
+	/** Requests one forced AgentOutput tool call, parses it leniently, executes it and returns the result with usage. */
+	async invoke(
+		messages: Message[],
+		tools: { AgentOutput: Tool<MacroToolInput> },
+		abortSignal?: AbortSignal
+	): Promise<InvokeResult> {
+		// 1. Convert tools to OpenAI format
+		const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool))
+
+		// 2. Detect if Claude (auto-compatibility)
+		// TODO: Gemini also uses slightly different format than OpenAI
+		const isClaude = this.config.model.toLowerCase().startsWith('claude')
+
+		// 3. Call API
+		let response: Response
+		try {
+			response = await fetch(`${this.config.baseURL}/chat/completions`, {
+				method: 'POST',
+				headers: {
+					'Content-Type': 'application/json',
+					Authorization: `Bearer ${this.config.apiKey}`,
+				},
+				body: JSON.stringify({
+					model: this.config.model,
+					temperature: this.config.temperature,
+					max_tokens: this.config.maxTokens,
+					messages,
+
+					tools: openaiTools,
+					// tool_choice: 'required',
+					tool_choice: { type: 'function', function: { name: 'AgentOutput' } },
+
+					// model specific params
+					// reasoning_effort: 'minimal',
+					// verbosity: 'low',
+					parallel_tool_calls: false,
+
+					...(isClaude ? CLAUDE_PATCH : {}),
+				}),
+				signal: abortSignal,
+			})
+		} catch (error: unknown) {
+			// Network error (NOTE(review): an aborted request presumably also lands here — confirm)
+			throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error)
+		}
+
+		// 4. Handle HTTP errors
+		if (!response.ok) {
+			const errorData = await response.json().catch(() => ({}))
+			const errorMessage =
+				(errorData as { error?: { message?: string } }).error?.message || response.statusText
+
+			if (response.status === 401 || response.status === 403) {
+				throw new InvokeError(
+					InvokeErrorType.AUTH_ERROR,
+					`Authentication failed: ${errorMessage}`,
+					errorData
+				)
+			}
+			if (response.status === 429) {
+				throw new InvokeError(
+					InvokeErrorType.RATE_LIMIT,
+					`Rate limit exceeded: ${errorMessage}`,
+					errorData
+				)
+			}
+			if (response.status >= 500) {
+				throw new InvokeError(
+					InvokeErrorType.SERVER_ERROR,
+					`Server error: ${errorMessage}`,
+					errorData
+				)
+			}
+			throw new InvokeError(
+				InvokeErrorType.UNKNOWN,
+				`HTTP ${response.status}: ${errorMessage}`,
+				errorData
+			)
+		}
+
+		// 5. Read the body (NOTE(review): this json() call is not wrapped in InvokeError) and parse it leniently
+		const data = await response.json()
+		const tool = tools.AgentOutput
+
+		const macroToolInput = lenientParseMacroToolCall(data, tool.inputSchema as any)
+
+		// 6. Execute tool
+		let toolResult: unknown
+		try {
+			toolResult = await tool.execute(macroToolInput)
+		} catch (e) {
+			throw new InvokeError(
+				InvokeErrorType.TOOL_EXECUTION_ERROR,
+				`Tool execution failed: ${(e as Error).message}`,
+				e
+			)
+		}
+
+		// 7. Return result (including cache tokens)
+		return {
+			toolCall: {
+				// id: toolCall.id,
+				name: 'AgentOutput',
+				args: macroToolInput,
+			},
+			toolResult,
+			usage: {
+				promptTokens: data.usage?.prompt_tokens ?? 0,
+				completionTokens: data.usage?.completion_tokens ?? 0,
+				totalTokens: data.usage?.total_tokens ?? 0,
+				cachedTokens: data.usage?.prompt_tokens_details?.cached_tokens,
+				reasoningTokens: data.usage?.completion_tokens_details?.reasoning_tokens,
+			},
+			rawResponse: data,
+		}
+	}
+}
diff --git a/src/llms/index.ts b/src/llms/index.ts
index e549012..3ba7a62 100644
--- a/src/llms/index.ts
+++ b/src/llms/index.ts
@@ -35,7 +35,7 @@ import type { LLMConfig } from '@/config'
 import { parseLLMConfig } from '@/config'
 import { EventBus, getEventBus } from '@/utils/bus'
 
-import { OpenAIClient } from './OpenAIClient'
+import { OpenAIClient } from './OpenAILenientClient'
 import { InvokeError } from './errors'
 import type { InvokeResult, LLMClient, Message, Tool } from './types'
 
diff --git a/src/llms/types.ts b/src/llms/types.ts
index dd07112..fff8200 100644
--- a/src/llms/types.ts
+++ b/src/llms/types.ts
@@ -49,9 +49,9 @@ export interface LLMClient {
  */
 export interface InvokeResult<TResult = unknown> {
 	toolCall: {
-		id?: string // OpenAI's tool_call_id
+		// id?: string // OpenAI's tool_call_id
 		name: string
-		args: Record<string, unknown>
+		args: any
 	}
 	toolResult: TResult // Supports generics, but defaults to unknown
 	usage: {
diff --git a/src/llms/utils.ts b/src/llms/utils.ts
index ce332b8..86486db 100644
--- a/src/llms/utils.ts
+++ b/src/llms/utils.ts
@@ -1,8 +1,12 @@
 /**
  * Utility functions for LLM integration
  */
+import chalk from 'chalk'
 import { z } from 'zod'
 
+import type { MacroToolInput } from '@/PageAgent'
+
+import { InvokeError, InvokeErrorType } from './errors'
 import type { Tool } from './types'
 
 /**
@@ -19,3 +23,172 @@ export function zodToOpenAITool(name: string, tool: Tool) {
 		},
 	}
 }
+
+/**
+ * Although we require tool calls to be returned following the specified format,
+ * some models cannot guarantee correctness:
+ * - Don't return tool calls at all but instead return tool call parameters as a JSON string in the message.
+ * - Returned tool calls or messages don't follow the correct nested MacroToolInput format.
+ *
+ * @param responseData - Parsed response body; only `choices[0]` is inspected.
+ * @param inputSchema - Zod object schema of the macro tool; a plain Error is thrown if it lacks `action`.
+ * @returns The repaired arguments once they pass `inputSchema` validation.
+ * @throws InvokeError if no usable payload is found, it is not a JSON object, or validation fails.
+ */
+export function lenientParseMacroToolCall(
+	responseData: any,
+	inputSchema: z.ZodObject<MacroToolInput & Record<string, any>>
+): MacroToolInput {
+	// only the first choice is consumed
+	const choice = responseData.choices?.[0]
+	if (!choice) {
+		throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', responseData)
+	}
+
+	// reject responses that did not finish normally
+	switch (choice.finish_reason) {
+		case 'tool_calls':
+		case 'stop': // will try a robust parse
+			// ✅ Normal
+			break
+		case 'length':
+			// ⚠️ Token limit reached
+			throw new InvokeError(
+				InvokeErrorType.CONTEXT_LENGTH,
+				'Response truncated: max tokens reached'
+			)
+		case 'content_filter':
+			// ❌ Content filtered
+			throw new InvokeError(InvokeErrorType.CONTENT_FILTER, 'Content filtered by safety system')
+		default:
+			throw new InvokeError(
+				InvokeErrorType.UNKNOWN,
+				`Unexpected finish_reason: ${choice.finish_reason}`
+			)
+	}
+
+	// Extract action schema from MacroToolInput schema
+	const actionSchema = inputSchema.shape.action
+	if (!actionSchema) {
+		throw new Error('inputSchema must have an "action" field')
+	}
+
+	// try to use tool call
+	const toolCall = choice.message?.tool_calls?.[0]?.function
+	// raw JSON text of the arguments; filled from the message content below when absent
+	let arg: string | null = toolCall?.arguments ?? null
+
+	if (arg && toolCall.name !== 'AgentOutput') {
+		// case: instead of AgentOutput, the model returned a action name as tool call
+		console.log(chalk.yellow('lenientParseMacroToolCall: #1 fixing incorrect tool call'))
+		let tmpArg
+		try {
+			tmpArg = JSON.parse(arg)
+		} catch (error) {
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				'Failed to parse tool arguments as JSON',
+				error
+			)
+		}
+		arg = JSON.stringify({ action: { [toolCall.name]: tmpArg } })
+	}
+
+	if (!arg) {
+		// try to use message content as JSON; it may be null or absent, so only strings are trimmed
+		const content = choice.message?.content
+		arg = (typeof content === 'string' && content.trim()) || null
+	}
+
+	if (!arg) {
+		throw new InvokeError(
+			InvokeErrorType.NO_TOOL_CALL,
+			'No tool call or content found in response',
+			responseData
+		)
+	}
+
+	// make sure is valid JSON
+	let parsedArgs: any
+	try {
+		parsedArgs = JSON.parse(arg)
+	} catch (error) {
+		throw new InvokeError(
+			InvokeErrorType.INVALID_TOOL_ARGS,
+			'Failed to parse tool arguments as JSON',
+			error
+		)
+	}
+
+	// valid JSON may still be null or a primitive; reject it before reading properties
+	if (parsedArgs === null || typeof parsedArgs !== 'object') {
+		throw new InvokeError(InvokeErrorType.INVALID_TOOL_ARGS, 'Tool arguments must be an object')
+	}
+
+	// patch incomplete formats
+	if (parsedArgs.action || parsedArgs.evaluation_previous_goal || parsedArgs.next_goal) {
+		// case: nested MacroToolInput format (correct format)
+		// some models may give a empty action (they may think reasoning and action should be separate)
+		if (!parsedArgs.action) {
+			console.log(chalk.yellow('lenientParseMacroToolCall: #2 fixing incorrect tool call'))
+			parsedArgs.action = {
+				wait: { seconds: 1 },
+			}
+		}
+	} else if (parsedArgs.type && parsedArgs.function) {
+		// case: upper level function call format provided. only keep its arguments
+		if (parsedArgs.function.name !== 'AgentOutput')
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				`Expected function name "AgentOutput", got "${parsedArgs.function.name}"`,
+				null
+			)
+
+		console.log(chalk.yellow('lenientParseMacroToolCall: #3 fixing incorrect tool call'))
+		parsedArgs = parsedArgs.function.arguments
+	} else if (parsedArgs.name && parsedArgs.arguments) {
+		// case: upper level function call format provided. only keep its arguments
+		if (parsedArgs.name !== 'AgentOutput')
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				`Expected function name "AgentOutput", got "${parsedArgs.name}"`,
+				null
+			)
+
+		console.log(chalk.yellow('lenientParseMacroToolCall: #4 fixing incorrect tool call'))
+		parsedArgs = parsedArgs.arguments
+	} else {
+		// case: only action parameters provided, wrap into MacroToolInput
+		console.log(chalk.yellow('lenientParseMacroToolCall: #5 fixing incorrect tool call'))
+		parsedArgs = { action: parsedArgs } as MacroToolInput
+	}
+
+	// make sure it's not wrapped as string
+	if (typeof parsedArgs === 'string') {
+		console.log(chalk.yellow('lenientParseMacroToolCall: #6 fixing incorrect tool call'))
+		try {
+			parsedArgs = JSON.parse(parsedArgs)
+		} catch (error) {
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				'Failed to parse nested tool arguments as JSON',
+				error
+			)
+		}
+	}
+
+	const validation = inputSchema.safeParse(parsedArgs)
+	if (validation.success) {
+		return validation.data as unknown as MacroToolInput
+	} else {
+		const action = parsedArgs?.action ?? {} // parsedArgs can be null/undefined after unwrapping
+		const actionName = Object.keys(action)[0] || 'unknown'
+		const actionArgs = JSON.stringify(action[actionName] || 'unknown')
+
+		throw new InvokeError(
+			InvokeErrorType.INVALID_TOOL_ARGS,
+			`Tool arguments validation failed: action "${actionName}" with args ${actionArgs}`,
+			validation.error
+		)
+	}
+}