From 14974c0257136dc3bc1dbcb86b882c29d4f73591 Mon Sep 17 00:00:00 2001 From: Simon <10131203+gaomeng1900@users.noreply.github.com> Date: Tue, 13 Jan 2026 13:49:19 +0800 Subject: [PATCH] feat!: mv brain from llms to agent; redo toolCall auto fixer --- packages/llms/src/OpenAILenientClient.ts | 135 ++++++++++++--- packages/llms/src/index.ts | 27 +-- packages/llms/src/types.ts | 54 +++--- packages/llms/src/utils.ts | 188 ++------------------- packages/page-agent/src/PageAgent.ts | 54 ++++-- packages/page-agent/src/utils/index.ts | 2 + packages/page-agent/src/utils/normalize.ts | 154 +++++++++++++++++ 7 files changed, 341 insertions(+), 273 deletions(-) create mode 100644 packages/page-agent/src/utils/normalize.ts diff --git a/packages/llms/src/OpenAILenientClient.ts b/packages/llms/src/OpenAILenientClient.ts index d94b5b7..d0c437b 100644 --- a/packages/llms/src/OpenAILenientClient.ts +++ b/packages/llms/src/OpenAILenientClient.ts @@ -2,8 +2,8 @@ * OpenAI Client implementation */ import { InvokeError, InvokeErrorType } from './errors' -import type { InvokeResult, LLMClient, LLMConfig, MacroToolInput, Message, Tool } from './types' -import { lenientParseMacroToolCall, modelPatch, zodToOpenAITool } from './utils' +import type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types' +import { modelPatch, zodToOpenAITool } from './utils' export class OpenAIClient implements LLMClient { config: Required @@ -16,11 +16,25 @@ export class OpenAIClient implements LLMClient { async invoke( messages: Message[], - tools: { AgentOutput: Tool }, - abortSignal?: AbortSignal + tools: Record, + abortSignal?: AbortSignal, + options?: InvokeOptions ): Promise { // 1. 
Convert tools to OpenAI format - const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool)) + const openaiTools = Object.entries(tools).map(([name, t]) => zodToOpenAITool(name, t)) + + // Build request body + const requestBody: Record = { + model: this.config.model, + temperature: this.config.temperature, + messages, + tools: openaiTools, + parallel_tool_calls: false, + // Require tool call: specific tool if provided, otherwise any tool + tool_choice: options?.toolChoiceName + ? { type: 'function', function: { name: options.toolChoiceName } } + : 'required', + } // 2. Call API let response: Response @@ -31,22 +45,10 @@ export class OpenAIClient implements LLMClient { 'Content-Type': 'application/json', Authorization: `Bearer ${this.config.apiKey}`, }, - body: JSON.stringify( - modelPatch({ - model: this.config.model, - temperature: this.config.temperature, - messages, - - tools: openaiTools, - // tool_choice: 'required', - tool_choice: { type: 'function', function: { name: 'AgentOutput' } }, - parallel_tool_calls: false, - }) - ), + body: JSON.stringify(modelPatch(requestBody)), signal: abortSignal, }) } catch (error: unknown) { - // Network error console.error(error) throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error) } @@ -85,16 +87,94 @@ export class OpenAIClient implements LLMClient { ) } - // parse response - + // 4. 
Parse and validate response const data = await response.json() - const tool = tools.AgentOutput - const macroToolInput = lenientParseMacroToolCall(data, tool.inputSchema as any) - // Execute tool + // Basic validation before normalize (these are structural issues, not format issues) + const choice = data.choices?.[0] + if (!choice) { + throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', data) + } + + // Check finish_reason + switch (choice.finish_reason) { + case 'tool_calls': + case 'function_call': // gemini + case 'stop': // some models use this even with tool calls + break + case 'length': + throw new InvokeError( + InvokeErrorType.CONTEXT_LENGTH, + 'Response truncated: max tokens reached' + ) + case 'content_filter': + throw new InvokeError(InvokeErrorType.CONTENT_FILTER, 'Content filtered by safety system') + default: + throw new InvokeError( + InvokeErrorType.UNKNOWN, + `Unexpected finish_reason: ${choice.finish_reason}` + ) + } + + // Apply normalizeResponse if provided (for fixing format issues like wrong tool name) + const normalizedData = options?.normalizeResponse ? 
options.normalizeResponse(data) : data + const normalizedChoice = (normalizedData as any).choices?.[0] + + // Get tool name from response + const toolCallName = normalizedChoice?.message?.tool_calls?.[0]?.function?.name + if (!toolCallName) { + throw new InvokeError( + InvokeErrorType.NO_TOOL_CALL, + 'No tool call found in response', + normalizedData + ) + } + + const tool = tools[toolCallName] + if (!tool) { + throw new InvokeError( + InvokeErrorType.UNKNOWN, + `Tool "${toolCallName}" not found in tools`, + normalizedData + ) + } + + // Extract and parse tool arguments + const argString = normalizedChoice.message?.tool_calls?.[0]?.function?.arguments + if (!argString) { + throw new InvokeError( + InvokeErrorType.NO_TOOL_CALL, + 'No tool call arguments found', + normalizedData + ) + } + + let parsedArgs: unknown + try { + parsedArgs = JSON.parse(argString) + } catch (error) { + throw new InvokeError( + InvokeErrorType.INVALID_TOOL_ARGS, + 'Failed to parse tool arguments as JSON', + error + ) + } + + // Validate with schema + const validation = tool.inputSchema.safeParse(parsedArgs) + if (!validation.success) { + throw new InvokeError( + InvokeErrorType.INVALID_TOOL_ARGS, + 'Tool arguments validation failed', + validation.error + ) + } + const toolInput = validation.data + + // 5. 
Execute tool let toolResult: unknown try { - toolResult = await tool.execute(macroToolInput) + toolResult = await tool.execute(toolInput) } catch (e) { throw new InvokeError( InvokeErrorType.TOOL_EXECUTION_ERROR, @@ -103,12 +183,11 @@ export class OpenAIClient implements LLMClient { ) } - // Return result (including cache tokens) + // Return result return { toolCall: { - // id: toolCall.id, - name: 'AgentOutput', - args: macroToolInput, + name: toolCallName, + args: toolInput, }, toolResult, usage: { diff --git a/packages/llms/src/index.ts b/packages/llms/src/index.ts index b747ebc..3c46f0b 100644 --- a/packages/llms/src/index.ts +++ b/packages/llms/src/index.ts @@ -40,27 +40,9 @@ import { LLM_MAX_RETRIES, } from './constants' import { InvokeError } from './errors' -import type { - AgentBrain, - InvokeResult, - LLMClient, - LLMConfig, - MacroToolInput, - MacroToolResult, - Message, - Tool, -} from './types' +import type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types' -export type { - AgentBrain, - InvokeResult, - LLMClient, - LLMConfig, - MacroToolInput, - MacroToolResult, - Message, - Tool, -} +export type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool } export function parseLLMConfig(config: LLMConfig): Required { return { @@ -93,11 +75,12 @@ export class LLM extends EventTarget { async invoke( messages: Message[], tools: Record, - abortSignal: AbortSignal + abortSignal: AbortSignal, + options?: InvokeOptions ): Promise { return await withRetry( async () => { - const result = await this.client.invoke(messages, tools, abortSignal) + const result = await this.client.invoke(messages, tools, abortSignal, options) return result }, diff --git a/packages/llms/src/types.ts b/packages/llms/src/types.ts index 543985b..f51bd45 100644 --- a/packages/llms/src/types.ts +++ b/packages/llms/src/types.ts @@ -32,6 +32,24 @@ export interface Tool { execute: (args: TParams) => Promise } +/** + * Invoke options for LLM call + */ 
+export interface InvokeOptions { + /** + * Force LLM to call a specific tool by name. + * If provided: tool_choice = { type: 'function', function: { name: toolChoiceName } } + * If not provided: tool_choice = 'required' (must call some tool, but model chooses which) + */ + toolChoiceName?: string + /** + * Response normalization function. + * Called before parsing the response. + * Used to fix various response format errors from the model. + */ + normalizeResponse?: (response: any) => any +} + /** * LLM Client interface * Note: Does not use generics because each tool in the tools array has different types @@ -40,7 +58,8 @@ export interface LLMClient { invoke( messages: Message[], tools: Record, - abortSignal?: AbortSignal + abortSignal?: AbortSignal, + options?: InvokeOptions ): Promise } @@ -82,36 +101,3 @@ export interface LLMConfig { */ customFetch?: typeof globalThis.fetch } - -/** - * Agent brain state - the reflection-before-action model - * - * Every tool call must first reflect on: - * - evaluation_previous_goal: How well did the previous action achieve its goal? - * - memory: Key information to remember for future steps - * - next_goal: What should be accomplished in the next action? - */ -export interface AgentBrain { - // thinking?: string - evaluation_previous_goal: string - memory: string - next_goal: string -} - -/** - * MacroTool input structure - * - * This is the core abstraction that enforces the "reflection-before-action" mental model. - * Before executing any action, the LLM must output its reasoning state. 
- */ -export interface MacroToolInput extends AgentBrain { - action: Record -} - -/** - * MacroTool output structure - */ -export interface MacroToolResult { - input: MacroToolInput - output: string -} diff --git a/packages/llms/src/utils.ts b/packages/llms/src/utils.ts index d5d34f5..9b192d7 100644 --- a/packages/llms/src/utils.ts +++ b/packages/llms/src/utils.ts @@ -4,8 +4,7 @@ import chalk from 'chalk' import { z } from 'zod' -import { InvokeError, InvokeErrorType } from './errors' -import type { MacroToolInput, Tool } from './types' +import type { Tool } from './types' function debug(message: string) { console.debug(chalk.gray('[LLM]'), message) @@ -26,176 +25,6 @@ export function zodToOpenAITool(name: string, tool: Tool) { } } -/** - * Although some models cannot guarantee correct response. Common issues are fixable: - * - Instead of returning a proper tool call. Return the tool call parameters in the message content. - * - Returned tool calls or messages don't follow the nested MacroToolInput format. 
- */ -export function lenientParseMacroToolCall( - responseData: any, - inputSchema: z.ZodObject> -): MacroToolInput { - // check - const choice = responseData.choices?.[0] - if (!choice) { - throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', responseData) - } - - // check - switch (choice.finish_reason) { - case 'tool_calls': - case 'function_call': // gemini - case 'stop': // will try a robust parse - // ✅ Normal - break - case 'length': - // ⚠️ Token limit reached - throw new InvokeError( - InvokeErrorType.CONTEXT_LENGTH, - 'Response truncated: max tokens reached' - ) - case 'content_filter': - // ❌ Content filtered - throw new InvokeError(InvokeErrorType.CONTENT_FILTER, 'Content filtered by safety system') - default: - throw new InvokeError( - InvokeErrorType.UNKNOWN, - `Unexpected finish_reason: ${choice.finish_reason}` - ) - } - - // Extract action schema from MacroToolInput schema - const actionSchema = inputSchema.shape.action - if (!actionSchema) { - throw new Error('inputSchema must have an "action" field') - } - - // patch stopReason mis-format - - let arg: string | null = null - - // try to use tool call - const toolCall = choice.message?.tool_calls?.[0]?.function - arg = toolCall?.arguments ?? 
null - - if (arg && toolCall.name !== 'AgentOutput') { - // TODO: check if toolCall.name is a valid action name - // case: instead of AgentOutput, the model returned a action name as tool call - console.log(chalk.yellow('lenientParseMacroToolCall: #1 fixing incorrect tool call')) - let tmpArg - try { - tmpArg = JSON.parse(arg) - } catch (error) { - throw new InvokeError( - InvokeErrorType.INVALID_TOOL_ARGS, - 'Failed to parse tool arguments as JSON', - error - ) - } - arg = JSON.stringify({ action: { [toolCall.name]: tmpArg } }) - } - - if (!arg) { - // try to use message content as JSON - arg = choice.message?.content.trim() || null - } - - if (!arg) { - throw new InvokeError( - InvokeErrorType.NO_TOOL_CALL, - 'No tool call or content found in response', - responseData - ) - } - - // make sure is valid JSON - - let parsedArgs: any - try { - parsedArgs = JSON.parse(arg) - } catch (error) { - throw new InvokeError( - InvokeErrorType.INVALID_TOOL_ARGS, - 'Failed to parse tool arguments as JSON', - error - ) - } - - // patch incomplete formats - - if (parsedArgs.action || parsedArgs.evaluation_previous_goal || parsedArgs.next_goal) { - // case: nested MacroToolInput format (correct format) - - // some models may give a empty action (they may think reasoning and action should be separate) - if (!parsedArgs.action) { - console.log(chalk.yellow('lenientParseMacroToolCall: #2 fixing incorrect tool call')) - parsedArgs.action = { - wait: { seconds: 1 }, - } - } - } else if (parsedArgs.type && parsedArgs.function) { - // case: upper level function call format provided. 
only keep its arguments - // TODO: check if function name is a valid action name - if (parsedArgs.function.name !== 'AgentOutput') - throw new InvokeError( - InvokeErrorType.INVALID_TOOL_ARGS, - `Expected function name "AgentOutput", got "${parsedArgs.function.name}"`, - null - ) - - console.log(chalk.yellow('lenientParseMacroToolCall: #3 fixing incorrect tool call')) - parsedArgs = parsedArgs.function.arguments - } else if (parsedArgs.name && parsedArgs.arguments) { - // case: upper level function call format provided. only keep its arguments - // TODO: check if function name is a valid action name - if (parsedArgs.name !== 'AgentOutput') - throw new InvokeError( - InvokeErrorType.INVALID_TOOL_ARGS, - `Expected function name "AgentOutput", got "${parsedArgs.name}"`, - null - ) - - console.log(chalk.yellow('lenientParseMacroToolCall: #4 fixing incorrect tool call')) - parsedArgs = parsedArgs.arguments - } else { - // case: only action parameters provided, wrap into MacroToolInput - // TODO: check if action name is valid - console.log(chalk.yellow('lenientParseMacroToolCall: #5 fixing incorrect tool call')) - parsedArgs = { action: parsedArgs } as MacroToolInput - } - - // make sure it's not wrapped as string - if (typeof parsedArgs === 'string') { - console.log(chalk.yellow('lenientParseMacroToolCall: #6 fixing incorrect tool call')) - try { - parsedArgs = JSON.parse(parsedArgs) - } catch (error) { - throw new InvokeError( - InvokeErrorType.INVALID_TOOL_ARGS, - 'Failed to parse nested tool arguments as JSON', - error - ) - } - } - - const validation = inputSchema.safeParse(parsedArgs) - if (validation.success) { - return validation.data as unknown as MacroToolInput - } else { - const action = parsedArgs.action ?? {} - const actionName = Object.keys(action)[0] || 'unknown' - const actionArgs = JSON.stringify(action[actionName] || 'unknown') - - // TODO: check if action name is valid. 
give a readable error message - - throw new InvokeError( - InvokeErrorType.INVALID_TOOL_ARGS, - `Tool arguments validation failed: action "${actionName}" with args ${actionArgs}`, - validation.error - ) - } -} - /** * Patch model specific parameters */ @@ -206,10 +35,19 @@ export function modelPatch(body: Record) { const modelName = normalizeModelName(model) if (modelName.startsWith('claude')) { - debug('Applying Claude patch: change tool_choice and disable thinking') - body.tool_choice = { type: 'tool', name: 'AgentOutput' } + debug('Applying Claude patch: disable thinking') body.thinking = { type: 'disabled' } - // body.reasoning = { enabled: 'disabled' } + + // Convert tool_choice to Claude format + if (body.tool_choice === 'required') { + // 'required' -> { type: 'any' } (must call some tool) + debug('Applying Claude patch: convert tool_choice "required" to { type: "any" }') + body.tool_choice = { type: 'any' } + } else if (body.tool_choice?.function?.name) { + // { type: 'function', function: { name: '...' } } -> { type: 'tool', name: '...' } + debug('Applying Claude patch: convert tool_choice format') + body.tool_choice = { type: 'tool', name: body.tool_choice.function.name } + } } if (modelName.startsWith('grok')) { diff --git a/packages/page-agent/src/PageAgent.ts b/packages/page-agent/src/PageAgent.ts index 8b039f4..bfdf33a 100644 --- a/packages/page-agent/src/PageAgent.ts +++ b/packages/page-agent/src/PageAgent.ts @@ -2,13 +2,7 @@ * Copyright (C) 2025 Alibaba Group Holding Limited * All rights reserved. 
*/ -import { - type AgentBrain, - LLM, - type MacroToolInput, - type MacroToolResult, - type Tool, -} from '@page-agent/llms' +import { LLM, type Tool } from '@page-agent/llms' import { PageController } from '@page-agent/page-controller' import { Panel, SimulatorMask } from '@page-agent/ui' import chalk from 'chalk' @@ -18,15 +12,46 @@ import type { PageAgentConfig } from './config' import { MAX_STEPS } from './config/constants' import SYSTEM_PROMPT from './prompts/system_prompt.md?raw' import { tools } from './tools' -import { trimLines, uid, waitUntil } from './utils' +import { normalizeResponse, trimLines, uid, waitUntil } from './utils' import { assert } from './utils/assert' +/** + * Agent brain state - the reflection-before-action model + * + * Every tool call must first reflect on: + * - evaluation_previous_goal: How well did the previous action achieve its goal? + * - memory: Key information to remember for future steps + * - next_goal: What should be accomplished in the next action? + */ +export interface AgentReflection { + evaluation_previous_goal: string + memory: string + next_goal: string +} + +/** + * MacroTool input structure + * + * This is the core abstraction that enforces the "reflection-before-action" mental model. + * Before executing any action, the LLM must output its reasoning state. + */ +export interface MacroToolInput extends Partial { + action: Record +} + +/** + * MacroTool output structure + */ +export interface MacroToolResult { + input: MacroToolInput + output: string +} + export type { PageAgentConfig } export { tool, type PageAgentTool } from './tools' -export type { AgentBrain, MacroToolInput, MacroToolResult } export interface AgentHistory { - brain: AgentBrain + brain: AgentReflection action: { name: string input: any @@ -124,9 +149,6 @@ export class PageAgent extends EventTarget { window.addEventListener('beforeunload', this.#beforeUnloadListener) } - /** - * @todo maybe return something? 
- */ async execute(task: string): Promise { if (!task) throw new Error('Task is required') this.task = task @@ -183,7 +205,11 @@ export class PageAgent extends EventTarget { }, ], { AgentOutput: this.#packMacroTool() }, - this.#abortController.signal + this.#abortController.signal, + { + toolChoiceName: 'AgentOutput', + normalizeResponse, + } ) const macroResult = result.toolResult as MacroToolResult diff --git a/packages/page-agent/src/utils/index.ts b/packages/page-agent/src/utils/index.ts index f24db17..1c6c0b1 100644 --- a/packages/page-agent/src/utils/index.ts +++ b/packages/page-agent/src/utils/index.ts @@ -1,3 +1,5 @@ +export { normalizeResponse } from './normalize' + /** * Wait until condition becomes true * @returns Returns when condition becomes true, throws otherwise diff --git a/packages/page-agent/src/utils/normalize.ts b/packages/page-agent/src/utils/normalize.ts new file mode 100644 index 0000000..a8b88ea --- /dev/null +++ b/packages/page-agent/src/utils/normalize.ts @@ -0,0 +1,154 @@ +import chalk from 'chalk' + +/** + * Normalize LLM response to fix common format issues. + * + * Handles: + * - No tool_calls but JSON in message.content (fallback) + * - Model returns action name as tool call instead of AgentOutput + * - Arguments wrapped as double JSON string + * - Nested function call format + * - Missing action field (fallback to wait) + * - etc. 
/** Name of the single macro tool every normalized response must call. */
const AGENT_OUTPUT = 'AgentOutput'

/**
 * Normalize an OpenAI-style chat-completion response so that its first choice
 * carries exactly one `AgentOutput` tool call whose `arguments` is a JSON string
 * encoding an object with an `action` field.
 *
 * Repairs applied (numbers match the log messages):
 * - #1: the tool call is named after an action instead of `AgentOutput`;
 *   its arguments are wrapped as `{ action: { [toolName]: args } }`.
 * - #2 / #3: there is no tool call and the JSON found in `message.content` is
 *   wrapped in `{ name: 'AgentOutput', arguments }` and/or
 *   `{ type: 'function', function: { arguments } }`; only the arguments are kept.
 * - #4: there is no tool call and the JSON in `message.content` has neither
 *   `action` nor any reflection field; it is treated as the action itself.
 * - Arguments that were stringified twice are parsed again.
 * - #5: `action` is missing; falls back to `{ wait: { seconds: 1 } }`.
 *
 * NOTE(review): #1 and #5 assume the AgentOutput schema keys `action` by action
 * name (`{ [actionName]: input }`), which is the shape the previous lenient parser
 * in `@page-agent/llms` produced — confirm against the macro tool schema.
 *
 * @param response - Raw response body. Only `choices[0]` is inspected.
 * @returns A shallow copy of `response` whose `choices` holds only the repaired first choice.
 * @throws Error if there is no choice or message, if neither a tool call nor JSON
 *   content is present, or if the arguments do not resolve to a JSON object.
 */
export function normalizeResponse(response: any): any {
	const choice = (response as { choices?: Choice[] } | null | undefined)?.choices?.[0]
	if (!choice) throw new Error('No choices in response')

	const message = choice.message
	if (!message) throw new Error('No message in choice')

	const toolCall = message.tool_calls?.[0]
	let resolvedArguments: any

	// fix level and location of arguments
	if (toolCall?.function?.arguments) {
		resolvedArguments = safeJsonParse(toolCall.function.arguments)

		// case: the model called an action directly instead of AgentOutput.
		// Keep the tool name as the action key, otherwise the action is lost.
		const toolName = toolCall.function.name
		if (toolName && toolName !== AGENT_OUTPUT) {
			console.log(chalk.yellow(`[normalizeResponse] #1: fixing tool_call`))
			resolvedArguments = { action: { [toolName]: safeJsonParse(resolvedArguments) } }
		}
	} else {
		// case: sometimes the model returns json in content instead of tool_calls
		resolvedArguments = resolveArgumentsFromContent(message.content)
	}

	// fix double stringified arguments
	resolvedArguments = safeJsonParse(resolvedArguments)

	// Anything that is still not an object cannot carry an `action`. Fail with a
	// readable message instead of a TypeError from assigning onto a primitive.
	if (!isJsonObject(resolvedArguments)) {
		throw new Error('Tool call arguments are not a JSON object')
	}

	// fix incomplete formats
	if (!resolvedArguments.action) {
		console.log(chalk.yellow(`[normalizeResponse] #5: fixing tool_call`))
		resolvedArguments.action = { wait: { seconds: 1 } }
	}

	// pack back to standard format
	return {
		...response,
		choices: [
			{
				...choice,
				message: {
					...message,
					tool_calls: [
						{
							...toolCall,
							function: {
								...toolCall?.function,
								name: AGENT_OUTPUT,
								arguments: JSON.stringify(resolvedArguments),
							},
						},
					],
				},
			},
		],
	}
}

/**
 * Recover tool arguments from `message.content` when the model emitted no tool call.
 * Strips up to two levels of function-call wrapping and, if what remains has
 * neither `action` nor a reflection field, treats it as the action itself.
 */
function resolveArgumentsFromContent(content: string | null | undefined): any {
	if (!content) throw new Error('No tool_call nor message content is present')

	const jsonInContent = retrieveJsonFromString(content.trim())
	if (!jsonInContent) {
		throw new Error('No tool_call and message content does not contain valid JSON')
	}

	let resolved = safeJsonParse(jsonInContent)

	// case: sometimes the content json includes upper level wrapper
	if (resolved?.name === AGENT_OUTPUT) {
		console.log(chalk.yellow(`[normalizeResponse] #2: fixing tool_call`))
		resolved = safeJsonParse(resolved.arguments)
	}

	// case: sometimes even 2-levels of wrapping
	if (resolved?.type === 'function') {
		console.log(chalk.yellow(`[normalizeResponse] #3: fixing tool_call`))
		resolved = safeJsonParse(resolved.function?.arguments)
	}

	// Unwrap double-stringified arguments before inspecting their shape, so a
	// JSON string is not mistaken for an action-level payload below.
	resolved = safeJsonParse(resolved)

	// case: and sometimes action level only
	// todo: needs better detection logic
	if (
		!resolved?.action &&
		!resolved?.evaluation_previous_goal &&
		!resolved?.memory &&
		!resolved?.next_goal &&
		!resolved?.thinking
	) {
		console.log(chalk.yellow(`[normalizeResponse] #4: fixing tool_call`))
		resolved = { action: resolved }
	}

	return resolved
}

/** True for non-null, non-array objects (the only values that can hold `action`). */
function isJsonObject(value: unknown): value is Record<string, unknown> {
	return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/**
 * Safely parse JSON, return original input if not json.
 * Non-string inputs are returned untouched.
 */
function safeJsonParse(input: any): any {
	if (typeof input !== 'string') return input
	try {
		return JSON.parse(input.trim())
	} catch {
		return input
	}
}

/**
 * Retrieve the JSON part from a string.
 * - treat content between the first `{` and the last `}` as JSON.
 * - try to parse as JSON, return the parsed result if successful, otherwise return null.
 */
function retrieveJsonFromString(str: string): any {
	const match = /{[\s\S]*}/.exec(str)
	if (!match) return null
	try {
		return JSON.parse(match[0])
	} catch {
		return null
	}
}

/**
 * Minimal shape of one entry of `choices`. Every field is optional because the
 * payload comes from models that do not reliably follow the format.
 */
interface Choice {
	message?: {
		role?: string
		content?: string | null
		tool_calls?: {
			id?: string
			type?: string
			function?: {
				name?: string
				arguments?: string
			}
		}[]
	}
	index?: number
	finish_reason?: string
}