From 0d48b71b27ea4554de65d5b2809fbbffcb3da770 Mon Sep 17 00:00:00 2001
From: Simon <10131203+gaomeng1900@users.noreply.github.com>
Date: Mon, 20 Oct 2025 22:03:09 +0800
Subject: [PATCH] feat(llm): auto fixing known llm format errors

---
 ROADMAP.md                      |   2 +-
 src/config/index.ts             |   2 +-
 src/llms/OpenAIClient.ts        |   2 +-
 src/llms/OpenAILenientClient.ts | 139 +++++++++++++++++++++++++
 src/llms/index.ts               |   2 +-
 src/llms/types.ts               |   4 +-
 src/llms/utils.ts               | 178 ++++++++++++++++++++++++++++++++
 7 files changed, 323 insertions(+), 6 deletions(-)
 create mode 100644 src/llms/OpenAILenientClient.ts

diff --git a/ROADMAP.md b/ROADMAP.md
index c8b0a08..de5d612 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -11,7 +11,7 @@ The development progress and future plans for PageAgent.
 - [x] **UI with HITL** - Human-in-the-loop user interface
 - [x] **Landing and doc pages**
 - [x] **Remove ai-sdk** - Only one function is being used
-- [ ] **Robust LLM output**
+- [x] **Robust LLM output**
 - [ ] **Working homepage with live LLM API**
 - [ ] **Hooks for Task and HITL**
 - [ ] **Hijacking `page_open` event**
diff --git a/src/config/index.ts b/src/config/index.ts
index 2907474..3c1cf6e 100644
--- a/src/config/index.ts
+++ b/src/config/index.ts
@@ -24,7 +24,7 @@ export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> {
 		baseURL: config.baseURL ?? DEFAULT_BASE_URL,
 		apiKey: config.apiKey ?? DEFAULT_API_KEY,
 		modelName: config.modelName ?? DEFAULT_MODEL_NAME,
-		temperature: config.temperature ?? 0.5, // higher randomness helps auto-recovery
+		temperature: config.temperature ?? 0.7, // higher randomness helps auto-recovery
 		maxTokens: config.maxTokens ?? 4096,
 		maxRetries: config.maxRetries ?? LLM_MAX_RETRIES,
 	}
diff --git a/src/llms/OpenAIClient.ts b/src/llms/OpenAIClient.ts
index 7bd389b..bf3f72e 100644
--- a/src/llms/OpenAIClient.ts
+++ b/src/llms/OpenAIClient.ts
@@ -180,7 +180,7 @@ export class OpenAIClient implements LLMClient {
 		// 9. Return result (including cache tokens)
 		return {
 			toolCall: {
-				id: toolCall.id,
+				// id: toolCall.id,
 				name: toolName,
 				args: validation.data as Record<string, unknown>,
 			},
diff --git a/src/llms/OpenAILenientClient.ts b/src/llms/OpenAILenientClient.ts
new file mode 100644
index 0000000..655c44c
--- /dev/null
+++ b/src/llms/OpenAILenientClient.ts
@@ -0,0 +1,139 @@
+/**
+ * Lenient OpenAI client: repairs known tool-call format errors in the model output
+ */
+import type { MacroToolInput } from '@/PageAgent'
+
+import { InvokeError, InvokeErrorType } from './errors'
+import type { InvokeResult, LLMClient, Message, OpenAIClientConfig, Tool } from './types'
+import { lenientParseMacroToolCall, zodToOpenAITool } from './utils'
+
+// Claude's openAI-API has different format for some fields
+const CLAUDE_PATCH = {
+	tool_choice: { type: 'tool', name: 'AgentOutput' },
+	thinking: { type: 'disabled' },
+}
+
+export class OpenAIClient implements LLMClient {
+	config: OpenAIClientConfig
+
+	constructor(config: OpenAIClientConfig) {
+		this.config = config
+	}
+
+	async invoke(
+		messages: Message[],
+		tools: { AgentOutput: Tool<MacroToolInput> },
+		abortSignal?: AbortSignal
+	): Promise<InvokeResult> {
+		// 1. Convert tools to OpenAI format
+		const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool))
+
+		// 2. Detect if Claude (auto-compatibility)
+		// TODO: Gemini also uses slightly different format than OpenAI
+		const isClaude = this.config.model.toLowerCase().startsWith('claude')
+
+		// 3. Call API
+		let response: Response
+		try {
+			response = await fetch(`${this.config.baseURL}/chat/completions`, {
+				method: 'POST',
+				headers: {
+					'Content-Type': 'application/json',
+					Authorization: `Bearer ${this.config.apiKey}`,
+				},
+				body: JSON.stringify({
+					model: this.config.model,
+					temperature: this.config.temperature,
+					max_tokens: this.config.maxTokens,
+					messages,
+
+					tools: openaiTools,
+					// tool_choice: 'required',
+					tool_choice: { type: 'function', function: { name: 'AgentOutput' } },
+
+					// model specific params
+
+					// reasoning_effort: 'minimal',
+					// verbosity: 'low',
+					parallel_tool_calls: false,
+
+					...(isClaude ? CLAUDE_PATCH : {}),
+				}),
+				signal: abortSignal,
+			})
+		} catch (error: unknown) {
+			// Network error
+			throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error)
+		}
+
+		// 4. Handle HTTP errors
+		if (!response.ok) {
+			const errorData = await response.json().catch(() => ({}))
+			const errorMessage =
+				(errorData as { error?: { message?: string } }).error?.message || response.statusText
+
+			if (response.status === 401 || response.status === 403) {
+				throw new InvokeError(
+					InvokeErrorType.AUTH_ERROR,
+					`Authentication failed: ${errorMessage}`,
+					errorData
+				)
+			}
+			if (response.status === 429) {
+				throw new InvokeError(
+					InvokeErrorType.RATE_LIMIT,
+					`Rate limit exceeded: ${errorMessage}`,
+					errorData
+				)
+			}
+			if (response.status >= 500) {
+				throw new InvokeError(
+					InvokeErrorType.SERVER_ERROR,
+					`Server error: ${errorMessage}`,
+					errorData
+				)
+			}
+			throw new InvokeError(
+				InvokeErrorType.UNKNOWN,
+				`HTTP ${response.status}: ${errorMessage}`,
+				errorData
+			)
+		}
+
+		const data = await response.json()
+
+		const tool = tools.AgentOutput
+
+		const macroToolInput = lenientParseMacroToolCall(data, tool.inputSchema as any)
+
+		// 5. Execute tool
+		let toolResult: unknown
+		try {
+			toolResult = await tool.execute(macroToolInput)
+		} catch (e) {
+			throw new InvokeError(
+				InvokeErrorType.TOOL_EXECUTION_ERROR,
+				`Tool execution failed: ${(e as Error).message}`,
+				e
+			)
+		}
+
+		// 6. Return result (including cache tokens)
+		return {
+			toolCall: {
+				// id: toolCall.id,
+				name: 'AgentOutput',
+				args: macroToolInput,
+			},
+			toolResult,
+			usage: {
+				promptTokens: data.usage?.prompt_tokens ?? 0,
+				completionTokens: data.usage?.completion_tokens ?? 0,
+				totalTokens: data.usage?.total_tokens ?? 0,
+				cachedTokens: data.usage?.prompt_tokens_details?.cached_tokens,
+				reasoningTokens: data.usage?.completion_tokens_details?.reasoning_tokens,
+			},
+			rawResponse: data,
+		}
+	}
+}
diff --git a/src/llms/index.ts b/src/llms/index.ts
index e549012..3ba7a62 100644
--- a/src/llms/index.ts
+++ b/src/llms/index.ts
@@ -35,7 +35,7 @@ import type { LLMConfig } from '@/config'
 import { parseLLMConfig } from '@/config'
 import { EventBus, getEventBus } from '@/utils/bus'
 
-import { OpenAIClient } from './OpenAIClient'
+import { OpenAIClient } from './OpenAILenientClient'
 import { InvokeError } from './errors'
 import type { InvokeResult, LLMClient, Message, Tool } from './types'
 
diff --git a/src/llms/types.ts b/src/llms/types.ts
index dd07112..fff8200 100644
--- a/src/llms/types.ts
+++ b/src/llms/types.ts
@@ -49,9 +49,9 @@ export interface LLMClient {
  */
 export interface InvokeResult<TResult = unknown> {
 	toolCall: {
-		id?: string // OpenAI's tool_call_id
+		// id?: string // OpenAI's tool_call_id
 		name: string
-		args: Record<string, unknown>
+		args: any
 	}
 	toolResult: TResult // Supports generics, but defaults to unknown
 	usage: {
diff --git a/src/llms/utils.ts b/src/llms/utils.ts
index ce332b8..86486db 100644
--- a/src/llms/utils.ts
+++ b/src/llms/utils.ts
@@ -1,8 +1,12 @@
 /**
  * Utility functions for LLM integration
  */
+import chalk from 'chalk'
 import { z } from 'zod'
 
+import type { MacroToolInput } from '@/PageAgent'
+
+import { InvokeError, InvokeErrorType } from './errors'
 import type { Tool } from './types'
 
 /**
@@ -19,3 +23,177 @@ export function zodToOpenAITool(name: string, tool: Tool) {
 		},
 	}
 }
+
+/**
+ * Although we require tool calls to be returned following the specified format,
+ * some models cannot guarantee correctness:
+ * - Don't return tool calls at all but instead return tool call parameters as a JSON string in the message.
+ * - Returned tool calls or messages don't follow the correct nested MacroToolInput format.
+ *
+ * @param responseData Raw JSON body of the chat-completions response.
+ * @param inputSchema Zod schema of MacroToolInput; it must have an `action` field.
+ * @returns The repaired tool arguments, validated against `inputSchema`.
+ * @throws InvokeError when no usable arguments are found, they are not valid JSON, or validation fails.
+ */
+export function lenientParseMacroToolCall(
+	responseData: any,
+	inputSchema: z.ZodObject<Record<string, z.ZodType>>
+): MacroToolInput {
+	// check
+	const choice = responseData.choices?.[0]
+	if (!choice) {
+		throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', responseData)
+	}
+
+	// check
+	switch (choice.finish_reason) {
+		case 'tool_calls':
+		case 'stop': // will try a robust parse
+			// ✅ Normal
+			break
+		case 'length':
+			// ⚠️ Token limit reached
+			throw new InvokeError(
+				InvokeErrorType.CONTEXT_LENGTH,
+				'Response truncated: max tokens reached'
+			)
+		case 'content_filter':
+			// ❌ Content filtered
+			throw new InvokeError(InvokeErrorType.CONTENT_FILTER, 'Content filtered by safety system')
+		default:
+			throw new InvokeError(
+				InvokeErrorType.UNKNOWN,
+				`Unexpected finish_reason: ${choice.finish_reason}`
+			)
+	}
+
+	// Extract action schema from MacroToolInput schema
+	const actionSchema = inputSchema.shape.action
+	if (!actionSchema) {
+		throw new Error('inputSchema must have an "action" field')
+	}
+
+	// locate the raw argument string: tool call first, message content as fallback
+
+	let arg: string | null = null
+
+	// try to use tool call
+	const toolCall = choice.message?.tool_calls?.[0]?.function
+	arg = toolCall?.arguments ?? null
+
+	if (arg && toolCall.name !== 'AgentOutput') {
+		// throw new InvokeError(
+		// 	InvokeErrorType.INVALID_TOOL_ARGS,
+		// 	`Expected function name "AgentOutput", got "${toolCall.name}"`,
+		// 	null
+		// )
+		// case: instead of AgentOutput, the model returned an action name as tool call
+		console.log(chalk.yellow('lenientParseMacroToolCall: #1 fixing incorrect tool call'))
+		let tmpArg
+		try {
+			tmpArg = JSON.parse(arg)
+		} catch (error) {
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				'Failed to parse tool arguments as JSON',
+				error
+			)
+		}
+		arg = JSON.stringify({ action: { [toolCall.name]: tmpArg } })
+	}
+
+	if (!arg) {
+		// try to use message content as JSON (content may be null, so the trim must be guarded)
+		arg = choice.message?.content?.trim() || null
+	}
+
+	if (!arg) {
+		throw new InvokeError(
+			InvokeErrorType.NO_TOOL_CALL,
+			'No tool call or content found in response',
+			responseData
+		)
+	}
+
+	// make sure is valid JSON
+
+	let parsedArgs: any
+	try {
+		parsedArgs = JSON.parse(arg)
+	} catch (error) {
+		throw new InvokeError(
+			InvokeErrorType.INVALID_TOOL_ARGS,
+			'Failed to parse tool arguments as JSON',
+			error
+		)
+	}
+
+	// patch incomplete formats (`?.` because JSON.parse may legally return null or a primitive)
+
+	if (parsedArgs?.action || parsedArgs?.evaluation_previous_goal || parsedArgs?.next_goal) {
+		// case: nested MacroToolInput format (correct format)
+
+		// some models may give an empty action (they may think reasoning and action should be separate)
+		if (!parsedArgs.action) {
+			console.log(chalk.yellow('lenientParseMacroToolCall: #2 fixing incorrect tool call'))
+			parsedArgs.action = {
+				wait: { seconds: 1 },
+			}
+		}
+	} else if (parsedArgs?.type && parsedArgs?.function) {
+		// case: upper level function call format provided. only keep its arguments
+		if (parsedArgs.function.name !== 'AgentOutput')
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				`Expected function name "AgentOutput", got "${parsedArgs.function.name}"`,
+				null
+			)
+
+		console.log(chalk.yellow('lenientParseMacroToolCall: #3 fixing incorrect tool call'))
+		parsedArgs = parsedArgs.function.arguments
+	} else if (parsedArgs?.name && parsedArgs?.arguments) {
+		// case: upper level function call format provided. only keep its arguments
+		if (parsedArgs.name !== 'AgentOutput')
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				`Expected function name "AgentOutput", got "${parsedArgs.name}"`,
+				null
+			)
+
+		console.log(chalk.yellow('lenientParseMacroToolCall: #4 fixing incorrect tool call'))
+		parsedArgs = parsedArgs.arguments
+	} else {
+		// case: only action parameters provided, wrap into MacroToolInput
+		console.log(chalk.yellow('lenientParseMacroToolCall: #5 fixing incorrect tool call'))
+		parsedArgs = { action: parsedArgs } as MacroToolInput
+	}
+
+	// make sure it's not wrapped as string
+	if (typeof parsedArgs === 'string') {
+		console.log(chalk.yellow('lenientParseMacroToolCall: #6 fixing incorrect tool call'))
+		try {
+			parsedArgs = JSON.parse(parsedArgs)
+		} catch (error) {
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				'Failed to parse nested tool arguments as JSON',
+				error
+			)
+		}
+	}
+
+	const validation = inputSchema.safeParse(parsedArgs)
+	if (validation.success) {
+		return validation.data as unknown as MacroToolInput
+	} else {
+		const action = parsedArgs?.action ?? {} // parsedArgs may be null/undefined after unwrapping
+		const actionName = Object.keys(action)[0] || 'unknown'
+		const actionArgs = JSON.stringify(action[actionName] || 'unknown')
+
+		throw new InvokeError(
+			InvokeErrorType.INVALID_TOOL_ARGS,
+			`Tool arguments validation failed: action "${actionName}" with args ${actionArgs}`,
+			validation.error
+		)
+	}
+}