feat!: mv brain from llms to agent; redo toolCall auto fixer

This commit is contained in:
Simon
2026-01-13 13:49:19 +08:00
parent e70ae40096
commit 14974c0257
7 changed files with 341 additions and 273 deletions

View File

@@ -2,8 +2,8 @@
* OpenAI Client implementation * OpenAI Client implementation
*/ */
import { InvokeError, InvokeErrorType } from './errors' import { InvokeError, InvokeErrorType } from './errors'
import type { InvokeResult, LLMClient, LLMConfig, MacroToolInput, Message, Tool } from './types' import type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types'
import { lenientParseMacroToolCall, modelPatch, zodToOpenAITool } from './utils' import { modelPatch, zodToOpenAITool } from './utils'
export class OpenAIClient implements LLMClient { export class OpenAIClient implements LLMClient {
config: Required<LLMConfig> config: Required<LLMConfig>
@@ -16,11 +16,25 @@ export class OpenAIClient implements LLMClient {
async invoke( async invoke(
messages: Message[], messages: Message[],
tools: { AgentOutput: Tool<MacroToolInput> }, tools: Record<string, Tool>,
abortSignal?: AbortSignal abortSignal?: AbortSignal,
options?: InvokeOptions
): Promise<InvokeResult> { ): Promise<InvokeResult> {
// 1. Convert tools to OpenAI format // 1. Convert tools to OpenAI format
const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool)) const openaiTools = Object.entries(tools).map(([name, t]) => zodToOpenAITool(name, t))
// Build request body
const requestBody: Record<string, unknown> = {
model: this.config.model,
temperature: this.config.temperature,
messages,
tools: openaiTools,
parallel_tool_calls: false,
// Require tool call: specific tool if provided, otherwise any tool
tool_choice: options?.toolChoiceName
? { type: 'function', function: { name: options.toolChoiceName } }
: 'required',
}
// 2. Call API // 2. Call API
let response: Response let response: Response
@@ -31,22 +45,10 @@ export class OpenAIClient implements LLMClient {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
Authorization: `Bearer ${this.config.apiKey}`, Authorization: `Bearer ${this.config.apiKey}`,
}, },
body: JSON.stringify( body: JSON.stringify(modelPatch(requestBody)),
modelPatch({
model: this.config.model,
temperature: this.config.temperature,
messages,
tools: openaiTools,
// tool_choice: 'required',
tool_choice: { type: 'function', function: { name: 'AgentOutput' } },
parallel_tool_calls: false,
})
),
signal: abortSignal, signal: abortSignal,
}) })
} catch (error: unknown) { } catch (error: unknown) {
// Network error
console.error(error) console.error(error)
throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error) throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error)
} }
@@ -85,16 +87,94 @@ export class OpenAIClient implements LLMClient {
) )
} }
// parse response // 4. Parse and validate response
const data = await response.json() const data = await response.json()
const tool = tools.AgentOutput
const macroToolInput = lenientParseMacroToolCall(data, tool.inputSchema as any)
// Execute tool // Basic validation before normalize (these are structural issues, not format issues)
const choice = data.choices?.[0]
if (!choice) {
throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', data)
}
// Check finish_reason
switch (choice.finish_reason) {
case 'tool_calls':
case 'function_call': // gemini
case 'stop': // some models use this even with tool calls
break
case 'length':
throw new InvokeError(
InvokeErrorType.CONTEXT_LENGTH,
'Response truncated: max tokens reached'
)
case 'content_filter':
throw new InvokeError(InvokeErrorType.CONTENT_FILTER, 'Content filtered by safety system')
default:
throw new InvokeError(
InvokeErrorType.UNKNOWN,
`Unexpected finish_reason: ${choice.finish_reason}`
)
}
// Apply normalizeResponse if provided (for fixing format issues like wrong tool name)
const normalizedData = options?.normalizeResponse ? options.normalizeResponse(data) : data
const normalizedChoice = (normalizedData as any).choices?.[0]
// Get tool name from response
const toolCallName = normalizedChoice?.message?.tool_calls?.[0]?.function?.name
if (!toolCallName) {
throw new InvokeError(
InvokeErrorType.NO_TOOL_CALL,
'No tool call found in response',
normalizedData
)
}
const tool = tools[toolCallName]
if (!tool) {
throw new InvokeError(
InvokeErrorType.UNKNOWN,
`Tool "${toolCallName}" not found in tools`,
normalizedData
)
}
// Extract and parse tool arguments
const argString = normalizedChoice.message?.tool_calls?.[0]?.function?.arguments
if (!argString) {
throw new InvokeError(
InvokeErrorType.NO_TOOL_CALL,
'No tool call arguments found',
normalizedData
)
}
let parsedArgs: unknown
try {
parsedArgs = JSON.parse(argString)
} catch (error) {
throw new InvokeError(
InvokeErrorType.INVALID_TOOL_ARGS,
'Failed to parse tool arguments as JSON',
error
)
}
// Validate with schema
const validation = tool.inputSchema.safeParse(parsedArgs)
if (!validation.success) {
throw new InvokeError(
InvokeErrorType.INVALID_TOOL_ARGS,
'Tool arguments validation failed',
validation.error
)
}
const toolInput = validation.data
// 5. Execute tool
let toolResult: unknown let toolResult: unknown
try { try {
toolResult = await tool.execute(macroToolInput) toolResult = await tool.execute(toolInput)
} catch (e) { } catch (e) {
throw new InvokeError( throw new InvokeError(
InvokeErrorType.TOOL_EXECUTION_ERROR, InvokeErrorType.TOOL_EXECUTION_ERROR,
@@ -103,12 +183,11 @@ export class OpenAIClient implements LLMClient {
) )
} }
// Return result (including cache tokens) // Return result
return { return {
toolCall: { toolCall: {
// id: toolCall.id, name: toolCallName,
name: 'AgentOutput', args: toolInput,
args: macroToolInput,
}, },
toolResult, toolResult,
usage: { usage: {

View File

@@ -40,27 +40,9 @@ import {
LLM_MAX_RETRIES, LLM_MAX_RETRIES,
} from './constants' } from './constants'
import { InvokeError } from './errors' import { InvokeError } from './errors'
import type { import type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types'
AgentBrain,
InvokeResult,
LLMClient,
LLMConfig,
MacroToolInput,
MacroToolResult,
Message,
Tool,
} from './types'
export type { export type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool }
AgentBrain,
InvokeResult,
LLMClient,
LLMConfig,
MacroToolInput,
MacroToolResult,
Message,
Tool,
}
export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> { export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> {
return { return {
@@ -93,11 +75,12 @@ export class LLM extends EventTarget {
async invoke( async invoke(
messages: Message[], messages: Message[],
tools: Record<string, Tool>, tools: Record<string, Tool>,
abortSignal: AbortSignal abortSignal: AbortSignal,
options?: InvokeOptions
): Promise<InvokeResult> { ): Promise<InvokeResult> {
return await withRetry( return await withRetry(
async () => { async () => {
const result = await this.client.invoke(messages, tools, abortSignal) const result = await this.client.invoke(messages, tools, abortSignal, options)
return result return result
}, },

View File

@@ -32,6 +32,24 @@ export interface Tool<TParams = any, TResult = any> {
execute: (args: TParams) => Promise<TResult> execute: (args: TParams) => Promise<TResult>
} }
/**
 * Per-call options accepted by an LLM `invoke`.
 */
export interface InvokeOptions {
  /**
   * Hook that rewrites the raw response.
   * It runs before the response is parsed, so that the various
   * format errors produced by the model can be repaired first.
   */
  normalizeResponse?: (response: any) => any
  /**
   * Name of the one tool the LLM is forced to call.
   * - set:     tool_choice = { type: 'function', function: { name: toolChoiceName } }
   * - omitted: tool_choice = 'required' (some tool must be called, the model chooses which)
   */
  toolChoiceName?: string
}
/** /**
* LLM Client interface * LLM Client interface
* Note: Does not use generics because each tool in the tools array has different types * Note: Does not use generics because each tool in the tools array has different types
@@ -40,7 +58,8 @@ export interface LLMClient {
invoke( invoke(
messages: Message[], messages: Message[],
tools: Record<string, Tool>, tools: Record<string, Tool>,
abortSignal?: AbortSignal abortSignal?: AbortSignal,
options?: InvokeOptions
): Promise<InvokeResult> ): Promise<InvokeResult>
} }
@@ -82,36 +101,3 @@ export interface LLMConfig {
*/ */
customFetch?: typeof globalThis.fetch customFetch?: typeof globalThis.fetch
} }
/**
 * Agent brain state: the "reflection-before-action" model.
 *
 * Every tool call must first carry the three reflections below.
 */
export interface AgentBrain {
  // thinking?: string
  /** How well did the previous action achieve its goal? */
  evaluation_previous_goal: string
  /** Key information to remember for future steps. */
  memory: string
  /** What should be accomplished in the next action? */
  next_goal: string
}
/**
 * Input structure of the MacroTool.
 *
 * Core abstraction behind the "reflection-before-action" mental model:
 * the LLM has to output its reasoning state (the inherited AgentBrain fields)
 * together with the action it wants to execute.
 */
export interface MacroToolInput extends AgentBrain {
  /** The action to execute. */
  action: Record<string, any>
}
/**
 * Output structure of the MacroTool.
 */
export interface MacroToolResult {
  /** The MacroTool input this result belongs to. */
  input: MacroToolInput
  /** Output text of the MacroTool. */
  output: string
}

View File

@@ -4,8 +4,7 @@
import chalk from 'chalk' import chalk from 'chalk'
import { z } from 'zod' import { z } from 'zod'
import { InvokeError, InvokeErrorType } from './errors' import type { Tool } from './types'
import type { MacroToolInput, Tool } from './types'
function debug(message: string) { function debug(message: string) {
console.debug(chalk.gray('[LLM]'), message) console.debug(chalk.gray('[LLM]'), message)
@@ -26,176 +25,6 @@ export function zodToOpenAITool(name: string, tool: Tool) {
} }
} }
/**
 * Leniently extract a `MacroToolInput` from a chat-completion response.
 *
 * Some models cannot guarantee a well-formed response, but the common mistakes are fixable:
 * - The tool call parameters are returned in the message content instead of a proper tool call.
 * - The returned tool call or message does not follow the nested MacroToolInput format.
 *
 * @param responseData - Raw response body; only `choices[0]` is inspected.
 * @param inputSchema - Schema of the macro tool input; it must contain an `action` field.
 * @returns The arguments after they passed `inputSchema` validation.
 * @throws InvokeError when there is no choice, the finish_reason is unusable, neither a tool
 *   call nor message content is present, the JSON cannot be parsed, or validation fails.
 * @throws Error when `inputSchema` has no `action` field.
 */
export function lenientParseMacroToolCall(
  responseData: any,
  inputSchema: z.ZodObject<MacroToolInput & Record<string, any>>
): MacroToolInput {
  // check
  const choice = responseData.choices?.[0]
  if (!choice) {
    throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', responseData)
  }

  // check
  switch (choice.finish_reason) {
    case 'tool_calls':
    case 'function_call': // gemini
    case 'stop': // will try a robust parse
      // ✅ Normal
      break
    case 'length':
      // ⚠️ Token limit reached
      throw new InvokeError(
        InvokeErrorType.CONTEXT_LENGTH,
        'Response truncated: max tokens reached'
      )
    case 'content_filter':
      // ❌ Content filtered
      throw new InvokeError(InvokeErrorType.CONTENT_FILTER, 'Content filtered by safety system')
    default:
      throw new InvokeError(
        InvokeErrorType.UNKNOWN,
        `Unexpected finish_reason: ${choice.finish_reason}`
      )
  }

  // Extract action schema from MacroToolInput schema
  const actionSchema = inputSchema.shape.action
  if (!actionSchema) {
    throw new Error('inputSchema must have an "action" field')
  }

  // patch stopReason mis-format
  let arg: string | null = null

  // try to use tool call
  const toolCall = choice.message?.tool_calls?.[0]?.function
  arg = toolCall?.arguments ?? null

  if (arg && toolCall.name !== 'AgentOutput') {
    // TODO: check if toolCall.name is a valid action name
    // case: instead of AgentOutput, the model returned an action name as tool call
    console.log(chalk.yellow('lenientParseMacroToolCall: #1 fixing incorrect tool call'))
    let tmpArg
    try {
      tmpArg = JSON.parse(arg)
    } catch (error) {
      throw new InvokeError(
        InvokeErrorType.INVALID_TOOL_ARGS,
        'Failed to parse tool arguments as JSON',
        error
      )
    }
    arg = JSON.stringify({ action: { [toolCall.name]: tmpArg } })
  }

  if (!arg) {
    // try to use message content as JSON
    // `content` may be null/undefined; optional chaining keeps this on the
    // NO_TOOL_CALL path below instead of crashing with a TypeError.
    arg = choice.message?.content?.trim() || null
  }

  if (!arg) {
    throw new InvokeError(
      InvokeErrorType.NO_TOOL_CALL,
      'No tool call or content found in response',
      responseData
    )
  }

  // make sure is valid JSON
  let parsedArgs: any
  try {
    parsedArgs = JSON.parse(arg)
  } catch (error) {
    throw new InvokeError(
      InvokeErrorType.INVALID_TOOL_ARGS,
      'Failed to parse tool arguments as JSON',
      error
    )
  }

  // JSON `null` is valid JSON but has no properties to inspect below
  if (parsedArgs === null) {
    throw new InvokeError(
      InvokeErrorType.INVALID_TOOL_ARGS,
      'Tool arguments must be a JSON object, got null',
      null
    )
  }

  // patch incomplete formats
  if (parsedArgs.action || parsedArgs.evaluation_previous_goal || parsedArgs.next_goal) {
    // case: nested MacroToolInput format (correct format)

    // some models may give an empty action (they may think reasoning and action should be separate)
    if (!parsedArgs.action) {
      console.log(chalk.yellow('lenientParseMacroToolCall: #2 fixing incorrect tool call'))
      parsedArgs.action = {
        wait: { seconds: 1 },
      }
    }
  } else if (parsedArgs.type && parsedArgs.function) {
    // case: upper level function call format provided. only keep its arguments
    // TODO: check if function name is a valid action name
    if (parsedArgs.function.name !== 'AgentOutput')
      throw new InvokeError(
        InvokeErrorType.INVALID_TOOL_ARGS,
        `Expected function name "AgentOutput", got "${parsedArgs.function.name}"`,
        null
      )

    console.log(chalk.yellow('lenientParseMacroToolCall: #3 fixing incorrect tool call'))
    parsedArgs = parsedArgs.function.arguments
  } else if (parsedArgs.name && parsedArgs.arguments) {
    // case: upper level function call format provided. only keep its arguments
    // TODO: check if function name is a valid action name
    if (parsedArgs.name !== 'AgentOutput')
      throw new InvokeError(
        InvokeErrorType.INVALID_TOOL_ARGS,
        `Expected function name "AgentOutput", got "${parsedArgs.name}"`,
        null
      )

    console.log(chalk.yellow('lenientParseMacroToolCall: #4 fixing incorrect tool call'))
    parsedArgs = parsedArgs.arguments
  } else {
    // case: only action parameters provided, wrap into MacroToolInput
    // TODO: check if action name is valid
    console.log(chalk.yellow('lenientParseMacroToolCall: #5 fixing incorrect tool call'))
    parsedArgs = { action: parsedArgs } as MacroToolInput
  }

  // make sure it's not wrapped as string
  if (typeof parsedArgs === 'string') {
    console.log(chalk.yellow('lenientParseMacroToolCall: #6 fixing incorrect tool call'))
    try {
      parsedArgs = JSON.parse(parsedArgs)
    } catch (error) {
      throw new InvokeError(
        InvokeErrorType.INVALID_TOOL_ARGS,
        'Failed to parse nested tool arguments as JSON',
        error
      )
    }
  }

  const validation = inputSchema.safeParse(parsedArgs)
  if (validation.success) {
    return validation.data as unknown as MacroToolInput
  } else {
    // `parsedArgs` can be null here (e.g. a nested "null" string was unwrapped above),
    // so guard the lookup used to build the readable error message.
    const action = parsedArgs?.action ?? {}
    const actionName = Object.keys(action)[0] || 'unknown'
    const actionArgs = JSON.stringify(action[actionName] || 'unknown')

    // TODO: check if action name is valid. give a readable error message
    throw new InvokeError(
      InvokeErrorType.INVALID_TOOL_ARGS,
      `Tool arguments validation failed: action "${actionName}" with args ${actionArgs}`,
      validation.error
    )
  }
}
/** /**
* Patch model specific parameters * Patch model specific parameters
*/ */
@@ -206,10 +35,19 @@ export function modelPatch(body: Record<string, any>) {
const modelName = normalizeModelName(model) const modelName = normalizeModelName(model)
if (modelName.startsWith('claude')) { if (modelName.startsWith('claude')) {
debug('Applying Claude patch: change tool_choice and disable thinking') debug('Applying Claude patch: disable thinking')
body.tool_choice = { type: 'tool', name: 'AgentOutput' }
body.thinking = { type: 'disabled' } body.thinking = { type: 'disabled' }
// body.reasoning = { enabled: 'disabled' }
// Convert tool_choice to Claude format
if (body.tool_choice === 'required') {
// 'required' -> { type: 'any' } (must call some tool)
debug('Applying Claude patch: convert tool_choice "required" to { type: "any" }')
body.tool_choice = { type: 'any' }
} else if (body.tool_choice?.function?.name) {
// { type: 'function', function: { name: '...' } } -> { type: 'tool', name: '...' }
debug('Applying Claude patch: convert tool_choice format')
body.tool_choice = { type: 'tool', name: body.tool_choice.function.name }
}
} }
if (modelName.startsWith('grok')) { if (modelName.startsWith('grok')) {

View File

@@ -2,13 +2,7 @@
* Copyright (C) 2025 Alibaba Group Holding Limited * Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved. * All rights reserved.
*/ */
import { import { LLM, type Tool } from '@page-agent/llms'
type AgentBrain,
LLM,
type MacroToolInput,
type MacroToolResult,
type Tool,
} from '@page-agent/llms'
import { PageController } from '@page-agent/page-controller' import { PageController } from '@page-agent/page-controller'
import { Panel, SimulatorMask } from '@page-agent/ui' import { Panel, SimulatorMask } from '@page-agent/ui'
import chalk from 'chalk' import chalk from 'chalk'
@@ -18,15 +12,46 @@ import type { PageAgentConfig } from './config'
import { MAX_STEPS } from './config/constants' import { MAX_STEPS } from './config/constants'
import SYSTEM_PROMPT from './prompts/system_prompt.md?raw' import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
import { tools } from './tools' import { tools } from './tools'
import { trimLines, uid, waitUntil } from './utils' import { normalizeResponse, trimLines, uid, waitUntil } from './utils'
import { assert } from './utils/assert' import { assert } from './utils/assert'
/**
 * Agent brain state: the "reflection-before-action" model.
 *
 * Every tool call must first carry the three reflections below.
 */
export interface AgentReflection {
  /** How well did the previous action achieve its goal? */
  evaluation_previous_goal: string
  /** Key information to remember for future steps. */
  memory: string
  /** What should be accomplished in the next action? */
  next_goal: string
}
/**
 * Input structure of the MacroTool.
 *
 * Core abstraction behind the "reflection-before-action" mental model:
 * the LLM outputs its reasoning state together with the action to execute.
 * The reflection fields are inherited as optional (`Partial<AgentReflection>`).
 */
export interface MacroToolInput extends Partial<AgentReflection> {
  /** The action to execute. */
  action: Record<string, any>
}
/**
 * Output structure of the MacroTool.
 */
export interface MacroToolResult {
  /** The MacroTool input this result belongs to. */
  input: MacroToolInput
  /** Output text of the MacroTool. */
  output: string
}
export type { PageAgentConfig } export type { PageAgentConfig }
export { tool, type PageAgentTool } from './tools' export { tool, type PageAgentTool } from './tools'
export type { AgentBrain, MacroToolInput, MacroToolResult }
export interface AgentHistory { export interface AgentHistory {
brain: AgentBrain brain: AgentReflection
action: { action: {
name: string name: string
input: any input: any
@@ -124,9 +149,6 @@ export class PageAgent extends EventTarget {
window.addEventListener('beforeunload', this.#beforeUnloadListener) window.addEventListener('beforeunload', this.#beforeUnloadListener)
} }
/**
* @todo maybe return something?
*/
async execute(task: string): Promise<ExecutionResult> { async execute(task: string): Promise<ExecutionResult> {
if (!task) throw new Error('Task is required') if (!task) throw new Error('Task is required')
this.task = task this.task = task
@@ -183,7 +205,11 @@ export class PageAgent extends EventTarget {
}, },
], ],
{ AgentOutput: this.#packMacroTool() }, { AgentOutput: this.#packMacroTool() },
this.#abortController.signal this.#abortController.signal,
{
toolChoiceName: 'AgentOutput',
normalizeResponse,
}
) )
const macroResult = result.toolResult as MacroToolResult const macroResult = result.toolResult as MacroToolResult

View File

@@ -1,3 +1,5 @@
export { normalizeResponse } from './normalize'
/** /**
* Wait until condition becomes true * Wait until condition becomes true
* @returns Returns when condition becomes true, throws otherwise * @returns Returns when condition becomes true, throws otherwise

View File

@@ -0,0 +1,154 @@
import chalk from 'chalk'
/**
 * Normalize an LLM response so that its first choice carries exactly one
 * `AgentOutput` tool call whose `arguments` is a JSON-encoded object.
 *
 * Handles:
 * - No tool_calls but JSON in message.content (fallback)
 * - Model returns action name as tool call instead of AgentOutput
 * - Arguments wrapped as double JSON string
 * - Nested function call format
 * - Missing action field (fallback to a 1-second `wait` action)
 * - etc.
 *
 * @param response - Raw chat-completion response. Only `choices[0]` and the first
 *   entry of its `tool_calls` are inspected.
 * @returns A shallow copy of `response` whose `choices` holds the single repaired choice.
 * @throws Error when there is no choice or message, when neither tool-call arguments nor
 *   JSON content is present, or when the arguments do not resolve to a JSON object.
 */
export function normalizeResponse(response: any): any {
  let resolvedArguments = null as any

  // Optional chaining on `response` itself: a null/undefined body is reported
  // as "No choices" rather than crashing with a TypeError.
  const choice = (response as { choices?: Choice[] } | null | undefined)?.choices?.[0]
  if (!choice) throw new Error('No choices in response')

  const message = choice.message
  if (!message) throw new Error('No message in choice')

  const toolCall = message.tool_calls?.[0]

  // fix level and location of arguments

  if (toolCall?.function?.arguments) {
    resolvedArguments = safeJsonParse(toolCall.function.arguments)

    // case: sometimes the model only returns the action level
    // NOTE(review): the tool name is dropped here; if models call an action by its own
    // name with bare inputs, the action may need to be `{ [name]: args }` — confirm.
    if (toolCall.function.name && toolCall.function.name !== 'AgentOutput') {
      console.log(chalk.yellow(`[normalizeResponse] #1: fixing tool_call`))
      resolvedArguments = { action: safeJsonParse(resolvedArguments) }
    }
  } else {
    // case: sometimes the model returns json in content instead of tool_calls
    if (!message.content) {
      throw new Error('No tool_call nor message content is present')
    }

    const jsonInContent = retrieveJsonFromString(message.content.trim())
    if (!jsonInContent) {
      throw new Error('No tool_call and message content does not contain valid JSON')
    }

    resolvedArguments = safeJsonParse(jsonInContent)

    // case: sometimes the content json includes upper level wrapper
    if (resolvedArguments?.name === 'AgentOutput') {
      console.log(chalk.yellow(`[normalizeResponse] #2: fixing tool_call`))
      resolvedArguments = safeJsonParse(resolvedArguments.arguments)
    }

    // case: sometimes even 2-levels of wrapping
    if (resolvedArguments?.type === 'function') {
      console.log(chalk.yellow(`[normalizeResponse] #3: fixing tool_call`))
      // `function` may be missing on a malformed wrapper; do not crash on it
      resolvedArguments = safeJsonParse(resolvedArguments.function?.arguments)
    }

    // case: and sometimes action level only
    // todo: needs better detection logic
    if (
      !resolvedArguments?.action &&
      !resolvedArguments?.evaluation_previous_goal &&
      !resolvedArguments?.memory &&
      !resolvedArguments?.next_goal &&
      !resolvedArguments?.thinking
    ) {
      console.log(chalk.yellow(`[normalizeResponse] #4: fixing tool_call`))
      resolvedArguments = { action: safeJsonParse(resolvedArguments) }
    }
  }

  // fix double stringified arguments
  resolvedArguments = safeJsonParse(resolvedArguments)

  // Unparsable text, JSON `null` or a bare primitive cannot carry an `action`;
  // fail with a readable message instead of a TypeError on the property access below.
  if (resolvedArguments === null || typeof resolvedArguments !== 'object') {
    throw new Error('Tool call arguments are not a JSON object')
  }

  // fix incomplete formats
  if (!resolvedArguments.action) {
    console.log(chalk.yellow(`[normalizeResponse] #5: fixing tool_call`))
    // An action is keyed by its name: { [actionName]: input }
    resolvedArguments.action = { wait: { seconds: 1 } }
  }

  // pack back to standard format
  return {
    ...response,
    choices: [
      {
        ...choice,
        message: {
          ...message,
          tool_calls: [
            {
              ...(toolCall || {}),
              function: {
                ...(toolCall?.function || {}),
                name: 'AgentOutput',
                arguments: JSON.stringify(resolvedArguments),
              },
            },
          ],
        },
      },
    ],
  }
}
/**
 * Best-effort JSON decoding.
 * A string is trimmed and parsed; if parsing fails, or the input is not a
 * string at all, the input is handed back untouched.
 */
function safeJsonParse(input: any): any {
  if (typeof input !== 'string') return input

  let decoded: any = input
  try {
    decoded = JSON.parse(input.trim())
  } catch {
    // not JSON: keep the original (untrimmed) string
  }
  return decoded
}
/**
 * Pull the JSON part out of a string.
 * - everything from the first `{` up to the last `}` is taken as the candidate.
 * - the candidate is parsed; the parsed value is returned on success, `null` otherwise
 *   (also when there is no such `{`…`}` span).
 */
function retrieveJsonFromString(str: string): any {
  try {
    const firstBrace = str.indexOf('{')
    const lastBrace = str.lastIndexOf('}')
    const hasSpan = firstBrace !== -1 && lastBrace > firstBrace
    if (!hasSpan) {
      return null
    }
    const candidate = str.slice(firstBrace, lastBrace + 1)
    return JSON.parse(candidate)
  } catch {
    return null
  }
}
/**
 * Minimal shape of one `choices` entry of an OpenAI-compatible chat-completion response.
 * Every field is optional because the response is inspected defensively before repair.
 */
interface Choice {
  message?: {
    role?: 'assistant'
    /** May be null, e.g. when the model answered with a tool call only. */
    content?: string | null
    tool_calls?: {
      id?: string
      type?: 'function'
      function?: {
        name?: string
        /** JSON-encoded arguments as produced by the model. */
        arguments?: string
      }
    }[]
  }
  /** Position of the choice in the response (not restricted to the first one). */
  index?: number
  /** e.g. 'tool_calls', 'stop', 'length', 'content_filter', 'function_call'. */
  finish_reason?: string
}