Merge pull request #147 from alibaba/feat/robust-llm-toolcall

2026-02-25 17:25:05 +08:00
parent 22fe448d95 b4377ed152
commit b45a2af9bf
4 changed files with 66 additions and 7 deletions
--- a/packages/core/src/PageAgentCore.ts
+++ b/packages/core/src/PageAgentCore.ts
@@ -248,16 +248,16 @@ export class PageAgentCore extends EventTarget {
 					{ role: 'user' as const, content: await this.#assembleUserPrompt() },
 				]

-				const tools = { AgentOutput: this.#packMacroTool() }
+				const macroTool = { AgentOutput: this.#packMacroTool() }

 				// invoke LLM

 				console.log(chalk.blue.bold('🧠 Thinking...'))
 				this.#emitActivity({ type: 'thinking' })

-				const result = await this.#llm.invoke(messages, tools, this.#abortController.signal, {
+				const result = await this.#llm.invoke(messages, macroTool, this.#abortController.signal, {
 					toolChoiceName: 'AgentOutput',
-					normalizeResponse,
+					normalizeResponse: (res) => normalizeResponse(res, this.tools),
 				})

 				// assemble history
--- a/packages/core/src/utils/autoFixer.ts
+++ b/packages/core/src/utils/autoFixer.ts
@@ -1,4 +1,8 @@
+import { InvokeError, InvokeErrorType } from '@page-agent/llms'
 import chalk from 'chalk'
+import * as z from 'zod'
+
+import type { PageAgentTool } from '../tools'

 /**
 * Normalize LLM response and fix common format issues.
@@ -9,9 +13,10 @@ import chalk from 'chalk'
 * - Arguments wrapped as double JSON string
 * - Nested function call format
 * - Missing action field (fallback to wait)
+ * - Primitive action input for single-field tools (e.g. `{"click_element_by_index": 2}`)
 * - etc.
 */
-export function normalizeResponse(response: any): any {
+export function normalizeResponse(response: any, tools?: Map<string, PageAgentTool>): any {
 	let resolvedArguments = null as any

 	const choice = (response as { choices?: Choice[] }).choices?.[0]
@@ -78,6 +83,11 @@ export function normalizeResponse(response: any): any {
 		resolvedArguments.action = safeJsonParse(resolvedArguments.action)
 	}

+	// validate and fix action input using tool schemas
+	if (resolvedArguments.action && tools) {
+		resolvedArguments.action = validateAction(resolvedArguments.action, tools)
+	}
+
 	// fix incomplete formats
 	if (!resolvedArguments.action) {
 		console.log(chalk.yellow(`[normalizeResponse] #5: fixing tool_call`))
@@ -108,6 +118,55 @@ export function normalizeResponse(response: any): any {
 	}
 }

+/**
+ * Validate action against tool schemas. Throws InvokeError (INVALID_TOOL_ARGS) with a clear
+ * message instead of letting the union schema produce unreadable errors. Only the first
+ * key of `action` is used as the tool name; the returned object contains just that key.
+ * Also coerces primitive inputs for tools with exactly one required field:
+ * e.g. `{"click_element_by_index": 2}` → `{"click_element_by_index": {"index": 2}}`
+ */
+function validateAction(action: any, tools: Map<string, PageAgentTool>): any {
+	if (typeof action !== 'object' || action === null) return action // not an action object
+
+	const toolName = Object.keys(action)[0]
+	if (!toolName) return action // empty object: returned unchanged
+
+	const tool = tools.get(toolName)
+	if (!tool) {
+		const available = Array.from(tools.keys()).join(', ')
+		throw new InvokeError(
+			InvokeErrorType.INVALID_TOOL_ARGS,
+			`Unknown action "${toolName}". Available: ${available}`
+		)
+	}
+
+	let value = action[toolName]
+	const schema = tool.inputSchema
+
+	// coerce a primitive only when exactly one field is required (else it can never validate)
+	if (schema instanceof z.ZodObject && value !== null && typeof value !== 'object') {
+		const shape = schema.shape as Record<string, z.ZodType>
+		const required = Object.keys(shape).filter((k) => !shape[k].safeParse(undefined).success)
+		const requiredKey = required.length === 1 ? required[0] : undefined
+		if (requiredKey) {
+			console.log(
+				chalk.yellow(`[normalizeResponse] coercing primitive action input for "${toolName}"`)
+			)
+			value = { [requiredKey]: value }
+		}
+	}
+
+	const result = schema.safeParse(value)
+	if (!result.success) {
+		throw new InvokeError(
+			InvokeErrorType.INVALID_TOOL_ARGS,
+			`Invalid input for action "${toolName}": ${z.prettifyError(result.error)}`
+		)
+	}
+
+	return { [toolName]: result.data }
+}
+
 /**
 * Safely parse JSON, return original input if not json.
 */
--- a/packages/llms/src/env.d.ts
+++ b/packages/llms/src/env.d.ts
@@ -1 +0,0 @@
-/// <reference types="vite/client" />
--- a/packages/llms/src/index.ts
+++ b/packages/llms/src/index.ts
@@ -1,9 +1,10 @@
 import { OpenAIClient } from './OpenAIClient'
 import { DEFAULT_TEMPERATURE, LLM_MAX_RETRIES } from './constants'
-import { InvokeError } from './errors'
+import { InvokeError, InvokeErrorType } from './errors'
 import type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types'

-export type { InvokeError, InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool }
+export { InvokeError, InvokeErrorType }
+export type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool }

 export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> {
 	// Runtime validation as defensive programming (types already guarantee these)