feat!: mv brain from llms to agent; redo toolCall auto fixer

2026-01-13 13:49:19 +08:00
parent e70ae40096
commit 14974c0257
7 changed files with 341 additions and 273 deletions
--- a/packages/page-agent/src/PageAgent.ts
+++ b/packages/page-agent/src/PageAgent.ts
@@ -2,13 +2,7 @@
 * Copyright (C) 2025 Alibaba Group Holding Limited
 * All rights reserved.
 */
-import {
-	type AgentBrain,
-	LLM,
-	type MacroToolInput,
-	type MacroToolResult,
-	type Tool,
-} from '@page-agent/llms'
+import { LLM, type Tool } from '@page-agent/llms'
 import { PageController } from '@page-agent/page-controller'
 import { Panel, SimulatorMask } from '@page-agent/ui'
 import chalk from 'chalk'
@@ -18,15 +12,46 @@ import type { PageAgentConfig } from './config'
 import { MAX_STEPS } from './config/constants'
 import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
 import { tools } from './tools'
-import { trimLines, uid, waitUntil } from './utils'
+import { normalizeResponse, trimLines, uid, waitUntil } from './utils'
 import { assert } from './utils/assert'

+/**
+ * Agent brain state - the reflection-before-action model
+ * (used as the type of `AgentHistory.brain`).
+ * Every tool call must first reflect on:
+ * - evaluation_previous_goal: How well did the previous action achieve its goal?
+ * - memory: Key information to remember for future steps
+ * - next_goal: What should be accomplished in the next action?
+ */
+export interface AgentReflection {
+	evaluation_previous_goal: string
+	memory: string
+	next_goal: string
+}
+
+/**
+ * MacroTool input structure
+ * This is the core abstraction that enforces the "reflection-before-action" mental model:
+ * the LLM is expected to output its reasoning state together with the action to execute.
+ * Note: the reflection fields are optional at the type level (`Partial`); only `action` is required.
+ */
+export interface MacroToolInput extends Partial<AgentReflection> {
+	action: Record<string, any>
+}
+
+/**
+ * MacroTool output structure (`PageAgent.execute` casts the LLM call's `result.toolResult` to this type)
+ */
+export interface MacroToolResult {
+	input: MacroToolInput
+	output: string
+}
+
 export type { PageAgentConfig }
 export { tool, type PageAgentTool } from './tools'
-export type { AgentBrain, MacroToolInput, MacroToolResult }

 export interface AgentHistory {
-	brain: AgentBrain
+	brain: AgentReflection
 	action: {
 		name: string
 		input: any
@@ -124,9 +149,6 @@ export class PageAgent extends EventTarget {
 		window.addEventListener('beforeunload', this.#beforeUnloadListener)
 	}

-	/**
-	 * @todo maybe return something?
-	 */
 	async execute(task: string): Promise<ExecutionResult> {
 		if (!task) throw new Error('Task is required')
 		this.task = task
@@ -183,7 +205,11 @@ export class PageAgent extends EventTarget {
 						},
 					],
 					{ AgentOutput: this.#packMacroTool() },
-					this.#abortController.signal
+					this.#abortController.signal,
+					{
+						toolChoiceName: 'AgentOutput',
+						normalizeResponse,
+					}
 				)

 				const macroResult = result.toolResult as MacroToolResult
--- a/packages/page-agent/src/utils/index.ts
+++ b/packages/page-agent/src/utils/index.ts
@@ -1,3 +1,5 @@
+export { normalizeResponse } from './normalize'
+
 /**
 * Wait until condition becomes true
 * @returns Returns when condition becomes true, throws otherwise
--- a/packages/page-agent/src/utils/normalize.ts
+++ b/packages/page-agent/src/utils/normalize.ts
@@ -0,0 +1,154 @@
+import chalk from 'chalk'
+
+/**
+ * Normalize an LLM chat-completion response so that `choices[0].message.tool_calls` holds
+ * exactly one `AgentOutput` call whose `arguments` is a JSON string of an object with `action`.
+ *
+ * Repairs applied (each logged with its `#n` tag):
+ * - #1 tool call named something other than AgentOutput: its arguments become `action`
+ *   (the tool name itself is not carried over - NOTE(review): confirm this is intended)
+ * - #2/#3 no tool-call arguments, but `message.content` embeds JSON inside one or two call wrappers
+ * - #4 content JSON with none of the known top-level fields is treated as the `action`
+ * - #5 missing `action` falls back to `{ name: 'wait', input: { seconds: 1 } }`
+ *
+ * @param response - raw response object; only `choices[0]` is inspected
+ * @returns copy of `response` whose `choices` contains only the repaired first choice
+ * @throws Error if there is no choice or message, or neither tool-call arguments nor content JSON
+ * @throws TypeError if the resolved arguments are not a JSON object (null, string, number, array)
+ */
+export function normalizeResponse(response: any): any {
+	let resolvedArguments = null as any
+
+	const choice = (response as { choices?: Choice[] }).choices?.[0]
+	if (!choice) throw new Error('No choices in response')
+
+	const message = choice.message
+	if (!message) throw new Error('No message in choice')
+
+	const toolCall = message.tool_calls?.[0]
+
+	// fix level and location of arguments
+	if (toolCall?.function?.arguments) {
+		resolvedArguments = safeJsonParse(toolCall.function.arguments)
+
+		// case: sometimes the model only returns the action level
+		if (toolCall.function.name && toolCall.function.name !== 'AgentOutput') {
+			console.log(chalk.yellow(`[normalizeResponse] #1: fixing tool_call`))
+			resolvedArguments = { action: safeJsonParse(resolvedArguments) }
+		}
+	} else {
+		// case: sometimes the model returns json in content instead of tool_calls
+		if (message.content) {
+			const content = message.content.trim()
+			const jsonInContent = retrieveJsonFromString(content)
+			if (jsonInContent) {
+				resolvedArguments = safeJsonParse(jsonInContent)
+
+				// case: sometimes the content json includes upper level wrapper
+				if (resolvedArguments?.name === 'AgentOutput') {
+					console.log(chalk.yellow(`[normalizeResponse] #2: fixing tool_call`))
+					resolvedArguments = safeJsonParse(resolvedArguments.arguments)
+				}
+
+				// case: sometimes even 2-levels of wrapping (skipped when the wrapper has no `function`)
+				if (resolvedArguments?.type === 'function' && resolvedArguments.function) {
+					console.log(chalk.yellow(`[normalizeResponse] #3: fixing tool_call`))
+					resolvedArguments = safeJsonParse(resolvedArguments.function.arguments)
+				}
+
+				// case: and sometimes action level only
+				// todo: needs better detection logic
+				if (
+					!resolvedArguments?.action &&
+					!resolvedArguments?.evaluation_previous_goal &&
+					!resolvedArguments?.memory &&
+					!resolvedArguments?.next_goal &&
+					!resolvedArguments?.thinking
+				) {
+					console.log(chalk.yellow(`[normalizeResponse] #4: fixing tool_call`))
+					resolvedArguments = { action: safeJsonParse(resolvedArguments) }
+				}
+			} else {
+				throw new Error('No tool_call and message content does not contain valid JSON')
+			}
+		} else {
+			throw new Error('No tool_call nor message content is present')
+		}
+	}
+
+	// fix double stringified arguments
+	resolvedArguments = safeJsonParse(resolvedArguments)
+	// only a plain object can carry `action`; reject null, primitives and arrays explicitly
+	const isObject = typeof resolvedArguments === 'object' && resolvedArguments !== null
+	if (!isObject || Array.isArray(resolvedArguments)) {
+		throw new TypeError('Tool call arguments are not a JSON object')
+	}
+
+	// fix incomplete formats
+	if (!resolvedArguments.action) {
+		console.log(chalk.yellow(`[normalizeResponse] #5: fixing tool_call`))
+		resolvedArguments.action = { name: 'wait', input: { seconds: 1 } }
+	}
+
+	// pack back to standard format
+	const repairedCall = {
+		...toolCall,
+		function: {
+			...toolCall?.function,
+			name: 'AgentOutput',
+			arguments: JSON.stringify(resolvedArguments),
+		},
+	}
+	return {
+		...response,
+		choices: [{ ...choice, message: { ...message, tool_calls: [repairedCall] } }],
+	}
+}
+
+/**
+ * Parse `input` as JSON when it is a string; a non-string, or a string that does
+ * not parse, is handed back unchanged.
+ */
+function safeJsonParse(input: any): any {
+	// only strings can hold JSON text, everything else passes straight through
+	if (typeof input !== 'string') return input
+	try {
+		return JSON.parse(input.trim())
+	} catch {
+		return input
+	}
+}
+
+/**
+ * Extract a JSON object embedded in free-form text.
+ * - the candidate is the span from the first `{` through the last `}`.
+ * - yields the parsed value, or null when there is no such span or it is not valid JSON.
+ */
+function retrieveJsonFromString(str: string): any {
+	// greedy match: everything from the first opening brace to the last closing brace
+	const candidate = /({[\s\S]*})/.exec(str)
+	if (candidate === null) return null
+
+	try {
+		return JSON.parse(candidate[0]!)
+	} catch {
+		return null
+	}
+}
+
+interface Choice { // loose shape of one `response.choices` entry as read by normalizeResponse; every field is optional
+	message?: {
+		role?: 'assistant'
+		content?: string // searched for embedded JSON when no tool-call arguments are present
+		tool_calls?: {
+			id?: string
+			type?: 'function'
+			function?: {
+				name?: string
+				arguments?: string // JSON text; normalizeResponse parses it with safeJsonParse
+			}
+		}[]
+	}
+	index?: 0 // NOTE(review): literal type; only choices[0] is read in this file
+	finish_reason?: 'tool_calls' // NOTE(review): literal type; content-only replies presumably report another value - confirm
+}