feat!: mv brain from llms to agent; redo toolCall auto fixer

This commit is contained in:
Simon
2026-01-13 13:49:19 +08:00
parent e70ae40096
commit 14974c0257
7 changed files with 341 additions and 273 deletions

View File

@@ -2,13 +2,7 @@
* Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved.
*/
import {
type AgentBrain,
LLM,
type MacroToolInput,
type MacroToolResult,
type Tool,
} from '@page-agent/llms'
import { LLM, type Tool } from '@page-agent/llms'
import { PageController } from '@page-agent/page-controller'
import { Panel, SimulatorMask } from '@page-agent/ui'
import chalk from 'chalk'
@@ -18,15 +12,46 @@ import type { PageAgentConfig } from './config'
import { MAX_STEPS } from './config/constants'
import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
import { tools } from './tools'
import { trimLines, uid, waitUntil } from './utils'
import { normalizeResponse, trimLines, uid, waitUntil } from './utils'
import { assert } from './utils/assert'
/**
* Agent brain state - the reflection-before-action model
*
* Every tool call must first reflect on:
* - evaluation_previous_goal: How well did the previous action achieve its goal?
* - memory: Key information to remember for future steps
* - next_goal: What should be accomplished in the next action?
*/
export interface AgentReflection {
/** How well the previous action achieved its goal. */
evaluation_previous_goal: string
/** Key information to remember for future steps. */
memory: string
/** What should be accomplished in the next action. */
next_goal: string
}
/**
* MacroTool input structure
*
* This is the core abstraction that enforces the "reflection-before-action" mental model.
* Before executing any action, the LLM must output its reasoning state.
*/
export interface MacroToolInput extends Partial<AgentReflection> {
// The reflection fields are inherited through Partial<>, so each of them is optional here.
/** The action to execute; typed only as an open key/value record at this level. */
action: Record<string, any>
}
/**
* MacroTool output structure
*/
export interface MacroToolResult {
/** The macro tool input this result belongs to. */
input: MacroToolInput
/** Textual output; presumably the result of running the action — confirm against the producer. */
output: string
}
export type { PageAgentConfig }
export { tool, type PageAgentTool } from './tools'
export type { AgentBrain, MacroToolInput, MacroToolResult }
export interface AgentHistory {
brain: AgentBrain
brain: AgentReflection
action: {
name: string
input: any
@@ -124,9 +149,6 @@ export class PageAgent extends EventTarget {
window.addEventListener('beforeunload', this.#beforeUnloadListener)
}
/**
* @todo maybe return something?
*/
async execute(task: string): Promise<ExecutionResult> {
if (!task) throw new Error('Task is required')
this.task = task
@@ -183,7 +205,11 @@ export class PageAgent extends EventTarget {
},
],
{ AgentOutput: this.#packMacroTool() },
this.#abortController.signal
this.#abortController.signal,
{
toolChoiceName: 'AgentOutput',
normalizeResponse,
}
)
const macroResult = result.toolResult as MacroToolResult

View File

@@ -1,3 +1,5 @@
export { normalizeResponse } from './normalize'
/**
* Wait until condition becomes true
* @returns Returns when condition becomes true, throws otherwise

View File

@@ -0,0 +1,154 @@
import chalk from 'chalk'
/**
 * Normalize an LLM response so that it always carries a single `AgentOutput`
 * tool call whose `arguments` is a JSON-encoded object with an `action` field.
 *
 * Handles:
 * - No tool_calls but JSON in message.content (fallback)
 * - Model returns action name as tool call instead of AgentOutput
 * - Arguments wrapped as (multiply) stringified JSON
 * - Nested function call format
 * - Missing action field (fallback to wait)
 *
 * Only `choices[0]` and its first tool call are inspected. The returned object
 * spreads the original response, but its `choices` array holds just that first
 * choice and its `tool_calls` array holds just the rewritten tool call.
 *
 * @param response - Raw response object.
 * @returns A new response object in the standard tool-call format.
 * @throws Error when there is no choice, no message, neither tool-call arguments
 *   nor JSON in the message content, or when the tool-call arguments cannot be
 *   resolved to a JSON object.
 */
export function normalizeResponse(response: any): any {
	let resolvedArguments = null as any

	// Optional chaining on `response` itself so that a null/undefined response
	// reports the same error as a response without choices.
	const choice = (response as { choices?: Choice[] } | null | undefined)?.choices?.[0]
	if (!choice) throw new Error('No choices in response')

	const message = choice.message
	if (!message) throw new Error('No message in choice')

	const toolCall = message.tool_calls?.[0]

	// fix level and location of arguments
	if (toolCall?.function?.arguments) {
		resolvedArguments = unwrapJsonString(toolCall.function.arguments)

		// case: sometimes the model only returns the action level
		// NOTE(review): the tool name itself is not carried into `action` — confirm
		// that downstream consumers do not need it.
		if (toolCall.function.name && toolCall.function.name !== 'AgentOutput') {
			console.log(chalk.yellow(`[normalizeResponse] #1: fixing tool_call`))
			resolvedArguments = { action: resolvedArguments }
		}
	} else {
		// case: sometimes the model returns json in content instead of tool_calls
		if (!message.content) {
			throw new Error('No tool_call nor message content is present')
		}

		const jsonInContent = retrieveJsonFromString(message.content.trim())
		if (!jsonInContent) {
			throw new Error('No tool_call and message content does not contain valid JSON')
		}

		resolvedArguments = unwrapJsonString(jsonInContent)

		// case: sometimes the content json includes upper level wrapper
		if (resolvedArguments?.name === 'AgentOutput') {
			console.log(chalk.yellow(`[normalizeResponse] #2: fixing tool_call`))
			// Unwrap fully here: a still-stringified payload would otherwise be
			// mistaken for an action-level object by the shape check below.
			resolvedArguments = unwrapJsonString(resolvedArguments.arguments)
		}

		// case: sometimes even 2-levels of wrapping
		if (resolvedArguments?.type === 'function') {
			console.log(chalk.yellow(`[normalizeResponse] #3: fixing tool_call`))
			// `function` may be absent in a malformed wrapper; do not crash on it.
			resolvedArguments = unwrapJsonString(resolvedArguments.function?.arguments)
		}

		// case: and sometimes action level only
		// todo: needs better detection logic
		if (
			!resolvedArguments?.action &&
			!resolvedArguments?.evaluation_previous_goal &&
			!resolvedArguments?.memory &&
			!resolvedArguments?.next_goal &&
			!resolvedArguments?.thinking
		) {
			console.log(chalk.yellow(`[normalizeResponse] #4: fixing tool_call`))
			resolvedArguments = { action: unwrapJsonString(resolvedArguments) }
		}
	}

	// fix double stringified arguments
	resolvedArguments = unwrapJsonString(resolvedArguments)

	// Anything that is not a plain JSON object at this point (unparseable string,
	// null, number, array) cannot receive an `action`; fail with a clear message
	// instead of an opaque TypeError or a silently dropped property.
	if (
		resolvedArguments === null ||
		typeof resolvedArguments !== 'object' ||
		Array.isArray(resolvedArguments)
	) {
		throw new Error('Tool call arguments could not be resolved to a JSON object')
	}

	// fix incomplete formats
	if (!resolvedArguments.action) {
		console.log(chalk.yellow(`[normalizeResponse] #5: fixing tool_call`))
		resolvedArguments.action = { name: 'wait', input: { seconds: 1 } }
	}

	// pack back to standard format
	return {
		...response,
		choices: [
			{
				...choice,
				message: {
					...message,
					tool_calls: [
						{
							...(toolCall || {}),
							function: {
								...(toolCall?.function || {}),
								name: 'AgentOutput',
								arguments: JSON.stringify(resolvedArguments),
							},
						},
					],
				},
			},
		],
	}
}

/**
 * Repeatedly JSON-parse a value for as long as it is a string that parses,
 * so any depth of stringification is removed. Non-strings and strings that
 * are not valid JSON are returned as they are.
 */
function unwrapJsonString(input: any): any {
	let value = input
	while (typeof value === 'string') {
		const parsed = safeJsonParse(value)
		// safeJsonParse hands back the very same string when parsing fails.
		if (parsed === value) break
		value = parsed
	}
	return value
}
/**
 * Best-effort JSON parse.
 *
 * Strings are trimmed and parsed; when parsing fails the original (untrimmed)
 * string is handed back. Any non-string input is returned untouched.
 */
function safeJsonParse(input: any): any {
	if (typeof input !== 'string') return input

	let parsed: any
	try {
		parsed = JSON.parse(input.trim())
	} catch {
		return input
	}
	return parsed
}
/**
 * Pull a JSON object out of free-form text.
 * - the candidate is everything from the first `{` up to and including the last `}`.
 * - the candidate is parsed as JSON; the parsed value is returned on success,
 *   `null` when there is no candidate or it does not parse.
 */
function retrieveJsonFromString(str: string): any {
	try {
		const firstBrace = str.indexOf('{')
		const lastBrace = str.lastIndexOf('}')
		// No opening brace, or no closing brace after it: nothing to extract.
		if (firstBrace === -1 || lastBrace < firstBrace) {
			return null
		}
		const candidate = str.slice(firstBrace, lastBrace + 1)
		return JSON.parse(candidate)
	} catch {
		return null
	}
}
/**
 * Minimal shape of one entry of `response.choices`, as read by `normalizeResponse`.
 * Every field is optional because the response is treated as untrusted model output.
 */
interface Choice {
message?: {
role?: 'assistant'
/** Text content; searched for embedded JSON when no tool-call arguments are present. */
content?: string
/** Tool calls returned by the model; only the first entry is inspected. */
tool_calls?: {
id?: string
type?: 'function'
function?: {
/** Tool name; anything other than 'AgentOutput' is treated as an action-level call. */
name?: string
/** Arguments as a JSON string (possibly stringified more than once). */
arguments?: string
}
}[]
}
// NOTE(review): the literal types below look narrower than what real responses
// may contain; neither field is read by the code in this file — confirm.
index?: 0
finish_reason?: 'tool_calls'
}