Merge pull request #104 from alibaba/refactor/mv-brain-outof-llms

refactor: move agent logic out of `llms`
This commit is contained in:
Simon
2026-01-13 14:24:29 +08:00
committed by GitHub
8 changed files with 322 additions and 477 deletions

View File

@@ -1,60 +1,58 @@
/** /**
* OpenAI Client implementation * OpenAI Client implementation
* @note This client is only for demonstrating how to implement a LLM client.
* @note Use OpenAILenientClient instead.
*/ */
import { InvokeError, InvokeErrorType } from './errors' import { InvokeError, InvokeErrorType } from './errors'
import type { InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types' import type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types'
import { modelPatch, zodToOpenAITool } from './utils' import { modelPatch, zodToOpenAITool } from './utils'
/** /**
* @deprecated Use OpenAILenientClient instead. * Client for OpenAI compatible APIs
*/ */
export class OpenAIClient implements LLMClient { export class OpenAIClient implements LLMClient {
config: LLMConfig config: Required<LLMConfig>
private fetch: typeof globalThis.fetch
constructor(config: LLMConfig) { constructor(config: Required<LLMConfig>) {
this.config = config this.config = config
this.fetch = config.customFetch
} }
async invoke( async invoke(
messages: Message[], messages: Message[],
tools: Record<string, Tool>, tools: Record<string, Tool>,
abortSignal?: AbortSignal abortSignal?: AbortSignal,
options?: InvokeOptions
): Promise<InvokeResult> { ): Promise<InvokeResult> {
// 1. Convert tools to OpenAI format // 1. Convert tools to OpenAI format
const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool)) const openaiTools = Object.entries(tools).map(([name, t]) => zodToOpenAITool(name, t))
// Build request body
const requestBody: Record<string, unknown> = {
model: this.config.model,
temperature: this.config.temperature,
messages,
tools: openaiTools,
parallel_tool_calls: false,
// Require tool call: specific tool if provided, otherwise any tool
tool_choice: options?.toolChoiceName
? { type: 'function', function: { name: options.toolChoiceName } }
: 'required',
}
// 2. Call API // 2. Call API
let response: Response let response: Response
try { try {
response = await fetch(`${this.config.baseURL}/chat/completions`, { response = await this.fetch(`${this.config.baseURL}/chat/completions`, {
method: 'POST', method: 'POST',
headers: { headers: {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
Authorization: `Bearer ${this.config.apiKey}`, Authorization: `Bearer ${this.config.apiKey}`,
}, },
body: JSON.stringify( body: JSON.stringify(modelPatch(requestBody)),
modelPatch({
model: this.config.model,
temperature: this.config.temperature,
messages,
tools: openaiTools,
// tool_choice: 'required',
tool_choice: { type: 'function', function: { name: 'AgentOutput' } },
// model specific params
// reasoning_effort: 'minimal',
// verbosity: 'low',
parallel_tool_calls: false,
})
),
signal: abortSignal, signal: abortSignal,
}) })
} catch (error: unknown) { } catch (error: unknown) {
// Network error console.error(error)
throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error) throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error)
} }
@@ -92,77 +90,93 @@ export class OpenAIClient implements LLMClient {
) )
} }
// 4. Parse and validate response
const data = await response.json() const data = await response.json()
// 4. Check finish_reason
const choice = data.choices?.[0] const choice = data.choices?.[0]
if (!choice) { if (!choice) {
throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', data) throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', data)
} }
// Check finish_reason
switch (choice.finish_reason) { switch (choice.finish_reason) {
case 'tool_calls': case 'tool_calls':
// ✅ Normal case 'function_call': // gemini
case 'stop': // some models use this even with tool calls
break break
case 'length': case 'length':
// ⚠️ Token limit reached
throw new InvokeError( throw new InvokeError(
InvokeErrorType.CONTEXT_LENGTH, InvokeErrorType.CONTEXT_LENGTH,
'Response truncated: max tokens reached', 'Response truncated: max tokens reached'
data
) )
case 'content_filter': case 'content_filter':
// ❌ Content filtered throw new InvokeError(InvokeErrorType.CONTENT_FILTER, 'Content filtered by safety system')
throw new InvokeError(
InvokeErrorType.CONTENT_FILTER,
'Content filtered by safety system',
data
)
case 'stop':
// ❌ Did not call tool (we require tool call)
throw new InvokeError(InvokeErrorType.NO_TOOL_CALL, 'Model did not call any tool', data)
default: default:
throw new InvokeError( throw new InvokeError(
InvokeErrorType.UNKNOWN, InvokeErrorType.UNKNOWN,
`Unexpected finish_reason: ${choice.finish_reason}`, `Unexpected finish_reason: ${choice.finish_reason}`
data
) )
} }
// 5. Parse tool call // Apply normalizeResponse if provided (for fixing format issues automatically)
const toolCall = choice.message?.tool_calls?.[0] const normalizedData = options?.normalizeResponse ? options.normalizeResponse(data) : data
if (!toolCall) { const normalizedChoice = (normalizedData as any).choices?.[0]
throw new InvokeError(InvokeErrorType.NO_TOOL_CALL, 'No tool call found in response', data)
}
const toolName = toolCall.function.name // Get tool name from response
const tool = tools[toolName] const toolCallName = normalizedChoice?.message?.tool_calls?.[0]?.function?.name
if (!tool) { if (!toolCallName) {
throw new InvokeError(InvokeErrorType.UNKNOWN, `Tool ${toolName} not found`, data)
}
// 6. Parse and validate arguments
let toolArgs: unknown
try {
toolArgs = JSON.parse(toolCall.function.arguments)
} catch (e) {
throw new InvokeError(InvokeErrorType.INVALID_TOOL_ARGS, 'Invalid JSON in tool arguments', e)
}
// Validate against zod schema
const validation = tool.inputSchema.safeParse(toolArgs)
if (!validation.success) {
throw new InvokeError( throw new InvokeError(
InvokeErrorType.INVALID_TOOL_ARGS, InvokeErrorType.NO_TOOL_CALL,
`Tool arguments validation failed: ${validation.error.message}`, 'No tool call found in response',
validation.error normalizedData
) )
} }
// 7. Execute tool const tool = tools[toolCallName]
if (!tool) {
throw new InvokeError(
InvokeErrorType.UNKNOWN,
`Tool "${toolCallName}" not found in tools`,
normalizedData
)
}
// Extract and parse tool arguments
const argString = normalizedChoice.message?.tool_calls?.[0]?.function?.arguments
if (!argString) {
throw new InvokeError(
InvokeErrorType.INVALID_TOOL_ARGS,
'No tool call arguments found',
normalizedData
)
}
let parsedArgs: unknown
try {
parsedArgs = JSON.parse(argString)
} catch (error) {
throw new InvokeError(
InvokeErrorType.INVALID_TOOL_ARGS,
'Failed to parse tool arguments as JSON',
error
)
}
// Validate with schema
const validation = tool.inputSchema.safeParse(parsedArgs)
if (!validation.success) {
throw new InvokeError(
InvokeErrorType.INVALID_TOOL_ARGS,
'Tool arguments validation failed',
validation.error
)
}
const toolInput = validation.data
// 5. Execute tool
let toolResult: unknown let toolResult: unknown
try { try {
toolResult = await tool.execute(validation.data) toolResult = await tool.execute(toolInput)
} catch (e) { } catch (e) {
throw new InvokeError( throw new InvokeError(
InvokeErrorType.TOOL_EXECUTION_ERROR, InvokeErrorType.TOOL_EXECUTION_ERROR,
@@ -171,12 +185,11 @@ export class OpenAIClient implements LLMClient {
) )
} }
// 8. Return result (including cache tokens) // Return result
return { return {
toolCall: { toolCall: {
// id: toolCall.id, name: toolCallName,
name: toolName, args: toolInput,
args: validation.data as Record<string, unknown>,
}, },
toolResult, toolResult,
usage: { usage: {

View File

@@ -1,124 +0,0 @@
/**
* OpenAI Client implementation
*/
import { InvokeError, InvokeErrorType } from './errors'
import type { InvokeResult, LLMClient, LLMConfig, MacroToolInput, Message, Tool } from './types'
import { lenientParseMacroToolCall, modelPatch, zodToOpenAITool } from './utils'
/**
 * Lenient client for OpenAI-compatible chat-completions endpoints.
 *
 * One `invoke` call performs a single round: it sends the messages with the `AgentOutput`
 * tool forced via `tool_choice`, parses the reply with `lenientParseMacroToolCall`,
 * executes the tool and returns the tool result together with token usage.
 */
export class OpenAIClient implements LLMClient {
  config: Required<LLMConfig>
  private fetch: typeof globalThis.fetch

  /**
   * @param config Fully-populated configuration; `config.customFetch` is used for every request.
   */
  constructor(config: Required<LLMConfig>) {
    this.config = config
    this.fetch = config.customFetch
  }

  /**
   * Run one LLM round-trip and execute the `AgentOutput` tool with the parsed arguments.
   *
   * @param messages Conversation to send.
   * @param tools Must contain the `AgentOutput` macro tool.
   * @param abortSignal Optional signal forwarded to the HTTP request.
   * @returns Tool call (name + args), tool result, token usage and the raw response JSON.
   * @throws InvokeError NETWORK_ERROR when the request itself fails; AUTH_ERROR (401/403),
   *   RATE_LIMIT (429), SERVER_ERROR (>= 500) or UNKNOWN for other non-OK statuses;
   *   TOOL_EXECUTION_ERROR when the tool throws. Errors raised by
   *   `lenientParseMacroToolCall` propagate unchanged.
   */
  async invoke(
    messages: Message[],
    tools: { AgentOutput: Tool<MacroToolInput> },
    abortSignal?: AbortSignal
  ): Promise<InvokeResult> {
    // 1. Convert tools to OpenAI format
    const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool))

    // 2. Call API
    let response: Response
    try {
      response = await this.fetch(`${this.config.baseURL}/chat/completions`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          Authorization: `Bearer ${this.config.apiKey}`,
        },
        body: JSON.stringify(
          modelPatch({
            model: this.config.model,
            temperature: this.config.temperature,
            messages,
            tools: openaiTools,
            // tool_choice: 'required',
            tool_choice: { type: 'function', function: { name: 'AgentOutput' } },
            parallel_tool_calls: false,
          })
        ),
        signal: abortSignal,
      })
    } catch (error: unknown) {
      // Network error
      console.error(error)
      throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error)
    }

    // 3. Handle HTTP errors
    if (!response.ok) {
      // The error body is not guaranteed to be JSON (e.g. an HTML page from a proxy).
      // A bare `.catch()` would re-reject, so swallow the parse failure explicitly and
      // fall back to the status text below.
      const errorData = await response.json().catch(() => undefined)
      const errorMessage =
        (errorData as { error?: { message?: string } } | null | undefined)?.error?.message ||
        response.statusText

      if (response.status === 401 || response.status === 403) {
        throw new InvokeError(
          InvokeErrorType.AUTH_ERROR,
          `Authentication failed: ${errorMessage}`,
          errorData
        )
      }
      if (response.status === 429) {
        throw new InvokeError(
          InvokeErrorType.RATE_LIMIT,
          `Rate limit exceeded: ${errorMessage}`,
          errorData
        )
      }
      if (response.status >= 500) {
        throw new InvokeError(
          InvokeErrorType.SERVER_ERROR,
          `Server error: ${errorMessage}`,
          errorData
        )
      }
      throw new InvokeError(
        InvokeErrorType.UNKNOWN,
        `HTTP ${response.status}: ${errorMessage}`,
        errorData
      )
    }

    // parse response
    const data = await response.json()
    const tool = tools.AgentOutput
    const macroToolInput = lenientParseMacroToolCall(data, tool.inputSchema as any)

    // Execute tool
    let toolResult: unknown
    try {
      toolResult = await tool.execute(macroToolInput)
    } catch (e) {
      throw new InvokeError(
        InvokeErrorType.TOOL_EXECUTION_ERROR,
        `Tool execution failed: ${(e as Error).message}`,
        e
      )
    }

    // Return result (including cache tokens)
    return {
      toolCall: {
        // id: toolCall.id,
        name: 'AgentOutput',
        args: macroToolInput,
      },
      toolResult,
      usage: {
        promptTokens: data.usage?.prompt_tokens ?? 0,
        completionTokens: data.usage?.completion_tokens ?? 0,
        totalTokens: data.usage?.total_tokens ?? 0,
        cachedTokens: data.usage?.prompt_tokens_details?.cached_tokens,
        reasoningTokens: data.usage?.completion_tokens_details?.reasoning_tokens,
      },
      rawResponse: data,
    }
  }
}

View File

@@ -1,37 +1,4 @@
/** import { OpenAIClient } from './OpenAIClient'
* @topic LLM 与主流程的隔离
* @reasoning
* 将 llm 的调用和主流程分开是复杂的,
* 因为 agent 的 tool call 通常集成在 llm 模块中,而而先得到 llm 返回,然后处理工具调用
* tools 和 llm 调用的逻辑不可避免地耦合在一起tool 的执行又和主流程耦合在一起
* 而 history 的维护和更新逻辑,又必须嵌入多轮 tool call 中
* @reasoning
* - 放弃框架提供的自动的多轮调用,每轮调用都由主流程发起
* - 理想情况下llm 调用应该获得 structured output然后由额外的模块触发 tool call目前模型和框架都无法实现
* - 当前只能将 llm api 和 本地 tool call 耦合在一起,不关心其中的衔接方式
* @conclusion
* - @llm responsibility boundary:
* - call llm api with given messages and tools
* - invoke tool call and get the result of the tool
* - return the result to main loop
* - @main_loop responsibility boundary:
* - maintain all behaviors of an **agent**
* @conclusion
* - 这里的 llm 模块不是 agent只负责一轮 llm 调用和工具调用,无状态
*/
/**
* @topic 结构化输出
* @facts
* - 几乎所有模型都支持 tool call schema
* - 几乎所有模型都支持返回 json
* - 只有 openAI/grok/gemini 支持 schema 并保证格式
* - 主流模型都支持 tool_choice: required
* - 除了 qwen 必须指定一个函数名 (9月上新后支持)
* @conclusion
* - 永远使用 tool call 来返回结构化数据,禁止模型直接返回(视为出错)
* - 不能假设 tool 参数合法,必须有修复机制,而且修复也应该使用 tool call 返回
*/
import { OpenAIClient } from './OpenAILenientClient'
import { import {
DEFAULT_API_KEY, DEFAULT_API_KEY,
DEFAULT_BASE_URL, DEFAULT_BASE_URL,
@@ -40,27 +7,9 @@ import {
LLM_MAX_RETRIES, LLM_MAX_RETRIES,
} from './constants' } from './constants'
import { InvokeError } from './errors' import { InvokeError } from './errors'
import type { import type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types'
AgentBrain,
InvokeResult,
LLMClient,
LLMConfig,
MacroToolInput,
MacroToolResult,
Message,
Tool,
} from './types'
export type { export type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool }
AgentBrain,
InvokeResult,
LLMClient,
LLMConfig,
MacroToolInput,
MacroToolResult,
Message,
Tool,
}
export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> { export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> {
return { return {
@@ -93,11 +42,12 @@ export class LLM extends EventTarget {
async invoke( async invoke(
messages: Message[], messages: Message[],
tools: Record<string, Tool>, tools: Record<string, Tool>,
abortSignal: AbortSignal abortSignal: AbortSignal,
options?: InvokeOptions
): Promise<InvokeResult> { ): Promise<InvokeResult> {
return await withRetry( return await withRetry(
async () => { async () => {
const result = await this.client.invoke(messages, tools, abortSignal) const result = await this.client.invoke(messages, tools, abortSignal, options)
return result return result
}, },

View File

@@ -32,6 +32,24 @@ export interface Tool<TParams = any, TResult = any> {
execute: (args: TParams) => Promise<TResult> execute: (args: TParams) => Promise<TResult>
} }
/**
* Invoke options for LLM call.
* Passed as the optional fourth argument of `invoke`; every field is optional.
*/
export interface InvokeOptions {
/**
* Force LLM to call a specific tool by name.
* If provided: tool_choice = { type: 'function', function: { name: toolChoiceName } }
* If not provided: tool_choice = 'required' (must call some tool, but model chooses which)
*/
toolChoiceName?: string
/**
* Response normalization function.
* Called with the parsed response JSON before the tool call is extracted; the client reads
* `choices[0].message.tool_calls[0].function` (name and arguments) from the returned value.
* Used to fix various response format errors from the model.
*/
normalizeResponse?: (response: any) => any
}
/** /**
* LLM Client interface * LLM Client interface
* Note: Does not use generics because each tool in the tools array has different types * Note: Does not use generics because each tool in the tools array has different types
@@ -40,7 +58,8 @@ export interface LLMClient {
invoke( invoke(
messages: Message[], messages: Message[],
tools: Record<string, Tool>, tools: Record<string, Tool>,
abortSignal?: AbortSignal abortSignal?: AbortSignal,
options?: InvokeOptions
): Promise<InvokeResult> ): Promise<InvokeResult>
} }
@@ -82,36 +101,3 @@ export interface LLMConfig {
*/ */
customFetch?: typeof globalThis.fetch customFetch?: typeof globalThis.fetch
} }
/**
* Agent brain state - the reflection-before-action model
*
* Every tool call must first reflect on:
* - evaluation_previous_goal: How well did the previous action achieve its goal?
* - memory: Key information to remember for future steps
* - next_goal: What should be accomplished in the next action?
*
* All three fields are required strings.
*/
export interface AgentBrain {
// thinking?: string
/** How well the previous action achieved its goal. */
evaluation_previous_goal: string
/** Key information to remember for future steps. */
memory: string
/** What should be accomplished in the next action. */
next_goal: string
}
/**
* MacroTool input structure
*
* This is the core abstraction that enforces the "reflection-before-action" mental model.
* Before executing any action, the LLM must output its reasoning state.
*
* Combines the required `AgentBrain` reflection fields with the action to run.
*/
export interface MacroToolInput extends AgentBrain {
/**
* Action payload. `lenientParseMacroToolCall` builds it as an object keyed by the action
* name, e.g. `{ wait: { seconds: 1 } }`.
*/
action: Record<string, any>
}
/**
* MacroTool output structure
*/
export interface MacroToolResult {
/** The macro tool input this result belongs to. */
input: MacroToolInput
/** Textual output — presumably produced by the executed action; confirm against the tool implementation. */
output: string
}

View File

@@ -4,8 +4,7 @@
import chalk from 'chalk' import chalk from 'chalk'
import { z } from 'zod' import { z } from 'zod'
import { InvokeError, InvokeErrorType } from './errors' import type { Tool } from './types'
import type { MacroToolInput, Tool } from './types'
function debug(message: string) { function debug(message: string) {
console.debug(chalk.gray('[LLM]'), message) console.debug(chalk.gray('[LLM]'), message)
@@ -26,176 +25,6 @@ export function zodToOpenAITool(name: string, tool: Tool) {
} }
} }
/**
 * Leniently extract a `MacroToolInput` from a chat-completions response.
 *
 * Some models cannot guarantee a correct response, but common issues are fixable:
 * - Instead of returning a proper tool call, the tool call parameters are returned in the
 *   message content.
 * - Returned tool calls or messages don't follow the nested MacroToolInput format.
 *
 * @param responseData Parsed response JSON of the chat-completions call.
 * @param inputSchema Zod object schema of the macro tool; must have an `action` field.
 * @returns The repaired input, validated against `inputSchema`.
 * @throws InvokeError UNKNOWN (no choice / unexpected finish_reason), CONTEXT_LENGTH,
 *   CONTENT_FILTER, NO_TOOL_CALL (neither tool call nor content) or INVALID_TOOL_ARGS
 *   (arguments are not valid JSON or fail schema validation).
 * @throws Error when `inputSchema` has no `action` field.
 */
export function lenientParseMacroToolCall(
  responseData: any,
  inputSchema: z.ZodObject<MacroToolInput & Record<string, any>>
): MacroToolInput {
  // check
  const choice = responseData.choices?.[0]
  if (!choice) {
    throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', responseData)
  }

  // check
  switch (choice.finish_reason) {
    case 'tool_calls':
    case 'function_call': // gemini
    case 'stop': // will try a robust parse
      // ✅ Normal
      break
    case 'length':
      // ⚠️ Token limit reached
      throw new InvokeError(
        InvokeErrorType.CONTEXT_LENGTH,
        'Response truncated: max tokens reached'
      )
    case 'content_filter':
      // ❌ Content filtered
      throw new InvokeError(InvokeErrorType.CONTENT_FILTER, 'Content filtered by safety system')
    default:
      throw new InvokeError(
        InvokeErrorType.UNKNOWN,
        `Unexpected finish_reason: ${choice.finish_reason}`
      )
  }

  // Extract action schema from MacroToolInput schema
  const actionSchema = inputSchema.shape.action
  if (!actionSchema) {
    throw new Error('inputSchema must have an "action" field')
  }

  // patch stopReason mis-format
  let arg: string | null = null

  // try to use tool call
  const toolCall = choice.message?.tool_calls?.[0]?.function
  arg = toolCall?.arguments ?? null

  if (arg && toolCall.name !== 'AgentOutput') {
    // TODO: check if toolCall.name is a valid action name
    // case: instead of AgentOutput, the model returned a action name as tool call
    console.log(chalk.yellow('lenientParseMacroToolCall: #1 fixing incorrect tool call'))
    let tmpArg
    try {
      tmpArg = JSON.parse(arg)
    } catch (error) {
      throw new InvokeError(
        InvokeErrorType.INVALID_TOOL_ARGS,
        'Failed to parse tool arguments as JSON',
        error
      )
    }
    arg = JSON.stringify({ action: { [toolCall.name]: tmpArg } })
  }

  if (!arg) {
    // try to use message content as JSON
    // `content` may be null/absent (typical for tool-call-only messages); chain optionally so
    // the NO_TOOL_CALL error below is reached instead of a TypeError on `.trim()`.
    arg = choice.message?.content?.trim() || null
  }

  if (!arg) {
    throw new InvokeError(
      InvokeErrorType.NO_TOOL_CALL,
      'No tool call or content found in response',
      responseData
    )
  }

  // make sure is valid JSON
  let parsedArgs: any
  try {
    parsedArgs = JSON.parse(arg)
  } catch (error) {
    throw new InvokeError(
      InvokeErrorType.INVALID_TOOL_ARGS,
      'Failed to parse tool arguments as JSON',
      error
    )
  }

  // patch incomplete formats
  // `parsedArgs` can be null (JSON literal `null`); optional chaining sends it to the
  // wrap-as-action branch, so it fails schema validation with INVALID_TOOL_ARGS like any
  // other primitive instead of crashing here.
  if (parsedArgs?.action || parsedArgs?.evaluation_previous_goal || parsedArgs?.next_goal) {
    // case: nested MacroToolInput format (correct format)

    // some models may give a empty action (they may think reasoning and action should be separate)
    if (!parsedArgs.action) {
      console.log(chalk.yellow('lenientParseMacroToolCall: #2 fixing incorrect tool call'))
      parsedArgs.action = {
        wait: { seconds: 1 },
      }
    }
  } else if (parsedArgs?.type && parsedArgs.function) {
    // case: upper level function call format provided. only keep its arguments
    // TODO: check if function name is a valid action name
    if (parsedArgs.function.name !== 'AgentOutput')
      throw new InvokeError(
        InvokeErrorType.INVALID_TOOL_ARGS,
        `Expected function name "AgentOutput", got "${parsedArgs.function.name}"`,
        null
      )

    console.log(chalk.yellow('lenientParseMacroToolCall: #3 fixing incorrect tool call'))
    parsedArgs = parsedArgs.function.arguments
  } else if (parsedArgs?.name && parsedArgs.arguments) {
    // case: upper level function call format provided. only keep its arguments
    // TODO: check if function name is a valid action name
    if (parsedArgs.name !== 'AgentOutput')
      throw new InvokeError(
        InvokeErrorType.INVALID_TOOL_ARGS,
        `Expected function name "AgentOutput", got "${parsedArgs.name}"`,
        null
      )

    console.log(chalk.yellow('lenientParseMacroToolCall: #4 fixing incorrect tool call'))
    parsedArgs = parsedArgs.arguments
  } else {
    // case: only action parameters provided, wrap into MacroToolInput
    // TODO: check if action name is valid
    console.log(chalk.yellow('lenientParseMacroToolCall: #5 fixing incorrect tool call'))
    parsedArgs = { action: parsedArgs } as MacroToolInput
  }

  // make sure it's not wrapped as string
  if (typeof parsedArgs === 'string') {
    console.log(chalk.yellow('lenientParseMacroToolCall: #6 fixing incorrect tool call'))
    try {
      parsedArgs = JSON.parse(parsedArgs)
    } catch (error) {
      throw new InvokeError(
        InvokeErrorType.INVALID_TOOL_ARGS,
        'Failed to parse nested tool arguments as JSON',
        error
      )
    }
  }

  const validation = inputSchema.safeParse(parsedArgs)
  if (validation.success) {
    return validation.data as unknown as MacroToolInput
  } else {
    // `parsedArgs` may be null/undefined here (e.g. wrapper without `arguments`), so build
    // the error message defensively.
    const action = parsedArgs?.action ?? {}
    const actionName = Object.keys(action)[0] || 'unknown'
    const actionArgs = JSON.stringify(action[actionName] || 'unknown')

    // TODO: check if action name is valid. give a readable error message
    throw new InvokeError(
      InvokeErrorType.INVALID_TOOL_ARGS,
      `Tool arguments validation failed: action "${actionName}" with args ${actionArgs}`,
      validation.error
    )
  }
}
/** /**
* Patch model specific parameters * Patch model specific parameters
*/ */
@@ -206,10 +35,19 @@ export function modelPatch(body: Record<string, any>) {
const modelName = normalizeModelName(model) const modelName = normalizeModelName(model)
if (modelName.startsWith('claude')) { if (modelName.startsWith('claude')) {
debug('Applying Claude patch: change tool_choice and disable thinking') debug('Applying Claude patch: disable thinking')
body.tool_choice = { type: 'tool', name: 'AgentOutput' }
body.thinking = { type: 'disabled' } body.thinking = { type: 'disabled' }
// body.reasoning = { enabled: 'disabled' }
// Convert tool_choice to Claude format
if (body.tool_choice === 'required') {
// 'required' -> { type: 'any' } (must call some tool)
debug('Applying Claude patch: convert tool_choice "required" to { type: "any" }')
body.tool_choice = { type: 'any' }
} else if (body.tool_choice?.function?.name) {
// { type: 'function', function: { name: '...' } } -> { type: 'tool', name: '...' }
debug('Applying Claude patch: convert tool_choice format')
body.tool_choice = { type: 'tool', name: body.tool_choice.function.name }
}
} }
if (modelName.startsWith('grok')) { if (modelName.startsWith('grok')) {

View File

@@ -2,13 +2,7 @@
* Copyright (C) 2025 Alibaba Group Holding Limited * Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved. * All rights reserved.
*/ */
import { import { LLM, type Tool } from '@page-agent/llms'
type AgentBrain,
LLM,
type MacroToolInput,
type MacroToolResult,
type Tool,
} from '@page-agent/llms'
import { PageController } from '@page-agent/page-controller' import { PageController } from '@page-agent/page-controller'
import { Panel, SimulatorMask } from '@page-agent/ui' import { Panel, SimulatorMask } from '@page-agent/ui'
import chalk from 'chalk' import chalk from 'chalk'
@@ -18,15 +12,46 @@ import type { PageAgentConfig } from './config'
import { MAX_STEPS } from './config/constants' import { MAX_STEPS } from './config/constants'
import SYSTEM_PROMPT from './prompts/system_prompt.md?raw' import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
import { tools } from './tools' import { tools } from './tools'
import { trimLines, uid, waitUntil } from './utils' import { normalizeResponse, trimLines, uid, waitUntil } from './utils'
import { assert } from './utils/assert' import { assert } from './utils/assert'
/**
* Agent brain state - the reflection-before-action model
*
* Every tool call must first reflect on:
* - evaluation_previous_goal: How well did the previous action achieve its goal?
* - memory: Key information to remember for future steps
* - next_goal: What should be accomplished in the next action?
*
* Consumers in this file use it as `Partial<AgentReflection>`, so individual fields may be absent.
*/
export interface AgentReflection {
/** How well the previous action achieved its goal. */
evaluation_previous_goal: string
/** Key information to remember for future steps. */
memory: string
/** What should be accomplished in the next action. */
next_goal: string
}
/**
* MacroTool input structure
*
* This is the core abstraction that enforces the "reflection-before-action" mental model.
* Before executing any action, the LLM must output its reasoning state.
*
* The reflection fields are inherited as optional (`Partial<AgentReflection>`); only `action`
* is required by this type.
*/
export interface MacroToolInput extends Partial<AgentReflection> {
/** Action payload. NOTE(review): exact shape is defined by the AgentOutput tool schema — confirm there. */
action: Record<string, any>
}
/**
* MacroTool output structure.
* PageAgent casts `result.toolResult` of the `AgentOutput` invocation to this type.
*/
export interface MacroToolResult {
/** The macro tool input this result belongs to. */
input: MacroToolInput
/** Textual output — presumably produced by the executed action; confirm against #packMacroTool. */
output: string
}
export type { PageAgentConfig } export type { PageAgentConfig }
export { tool, type PageAgentTool } from './tools' export { tool, type PageAgentTool } from './tools'
export type { AgentBrain, MacroToolInput, MacroToolResult }
export interface AgentHistory { export interface AgentHistory {
brain: AgentBrain brain: Partial<AgentReflection>
action: { action: {
name: string name: string
input: any input: any
@@ -124,9 +149,6 @@ export class PageAgent extends EventTarget {
window.addEventListener('beforeunload', this.#beforeUnloadListener) window.addEventListener('beforeunload', this.#beforeUnloadListener)
} }
/**
* @todo maybe return something?
*/
async execute(task: string): Promise<ExecutionResult> { async execute(task: string): Promise<ExecutionResult> {
if (!task) throw new Error('Task is required') if (!task) throw new Error('Task is required')
this.task = task this.task = task
@@ -183,7 +205,11 @@ export class PageAgent extends EventTarget {
}, },
], ],
{ AgentOutput: this.#packMacroTool() }, { AgentOutput: this.#packMacroTool() },
this.#abortController.signal this.#abortController.signal,
{
toolChoiceName: 'AgentOutput',
normalizeResponse,
}
) )
const macroResult = result.toolResult as MacroToolResult const macroResult = result.toolResult as MacroToolResult

View File

@@ -0,0 +1,154 @@
import chalk from 'chalk'
/**
 * Normalize an LLM chat-completion response and fix common format issues, so that the first
 * choice always carries a single `AgentOutput` tool call whose arguments are a JSON string.
 *
 * Handles:
 * - No tool_calls but JSON in message.content (fallback)
 * - Model returns action name as tool call instead of AgentOutput
 * - Arguments wrapped as double JSON string
 * - Nested function call format
 * - Missing action field (fallback to wait)
 * - etc.
 *
 * @param response Parsed response JSON of the chat-completions call.
 * @returns A copy of `response` whose `choices` contains only the repaired first choice.
 * @throws Error when there is no choice or message, when neither a tool call nor JSON content
 *   is present, or when the resolved arguments are not a JSON object.
 */
export function normalizeResponse(response: any): any {
  let resolvedArguments = null as any

  // `?.` on `response` so a null/undefined body yields the descriptive error below.
  const choice = (response as { choices?: Choice[] } | null | undefined)?.choices?.[0]
  if (!choice) throw new Error('No choices in response')

  const message = choice.message
  if (!message) throw new Error('No message in choice')

  const toolCall = message.tool_calls?.[0]

  // fix level and location of arguments
  if (toolCall?.function?.arguments) {
    resolvedArguments = safeJsonParse(toolCall.function.arguments)

    // case: sometimes the model only returns the action level
    if (toolCall.function.name && toolCall.function.name !== 'AgentOutput') {
      console.log(chalk.yellow(`[normalizeResponse] #1: fixing tool_call`))
      resolvedArguments = { action: safeJsonParse(resolvedArguments) }
    }
  } else {
    // case: sometimes the model returns json in content instead of tool_calls
    if (message.content) {
      const content = message.content.trim()
      const jsonInContent = retrieveJsonFromString(content)
      if (jsonInContent) {
        resolvedArguments = safeJsonParse(jsonInContent)

        // case: sometimes the content json includes upper level wrapper
        if (resolvedArguments?.name === 'AgentOutput') {
          console.log(chalk.yellow(`[normalizeResponse] #2: fixing tool_call`))
          resolvedArguments = safeJsonParse(resolvedArguments.arguments)
        }

        // case: sometimes even 2-levels of wrapping
        if (resolvedArguments?.type === 'function') {
          console.log(chalk.yellow(`[normalizeResponse] #3: fixing tool_call`))
          resolvedArguments = safeJsonParse(resolvedArguments.function.arguments)
        }

        // case: and sometimes action level only
        // todo: needs better detection logic
        if (
          !resolvedArguments?.action &&
          !resolvedArguments?.evaluation_previous_goal &&
          !resolvedArguments?.memory &&
          !resolvedArguments?.next_goal &&
          !resolvedArguments?.thinking
        ) {
          console.log(chalk.yellow(`[normalizeResponse] #4: fixing tool_call`))
          resolvedArguments = { action: safeJsonParse(resolvedArguments) }
        }
      } else {
        throw new Error('No tool_call and the message content does not contain valid JSON')
      }
    } else {
      throw new Error('No tool_call nor message content is present')
    }
  }

  // fix double stringified arguments
  resolvedArguments = safeJsonParse(resolvedArguments)

  // Arguments that are still not an object at this point (unparseable string, `null`,
  // number, ...) cannot carry an `action`. Reading or assigning `.action` on them would
  // throw an opaque TypeError, so fail with an explicit message instead.
  if (resolvedArguments === null || typeof resolvedArguments !== 'object') {
    throw new Error('Tool call arguments are not a valid JSON object')
  }

  // fix incomplete formats
  if (!resolvedArguments.action) {
    console.log(chalk.yellow(`[normalizeResponse] #5: fixing tool_call`))
    // NOTE(review): confirm this fallback shape matches the AgentOutput action schema.
    resolvedArguments.action = { name: 'wait', input: { seconds: 1 } }
  }

  // pack back to standard format
  return {
    ...response,
    choices: [
      {
        ...choice,
        message: {
          ...message,
          tool_calls: [
            {
              ...(toolCall || {}),
              function: {
                ...(toolCall?.function || {}),
                name: 'AgentOutput',
                arguments: JSON.stringify(resolvedArguments),
              },
            },
          ],
        },
      },
    ],
  }
}
/**
 * Best-effort JSON decoding.
 * Non-string values are handed back untouched; strings are trimmed and parsed, and the
 * original (untrimmed) string is returned when parsing fails.
 */
function safeJsonParse(input: any): any {
  if (typeof input !== 'string') return input

  let decoded: any = input
  try {
    decoded = JSON.parse(input.trim())
  } catch {
    // not JSON: keep the original string
  }
  return decoded
}
/**
 * Pull a JSON object out of free-form text.
 * - The candidate is everything from the first `{` to the last `}` (inclusive).
 * - Returns the parsed value when that candidate is valid JSON, otherwise null
 *   (also when no such brace pair exists).
 */
function retrieveJsonFromString(str: string): any {
  try {
    const open = str.indexOf('{')
    const close = str.lastIndexOf('}')
    if (open === -1 || close < open) {
      return null
    }
    const candidate = str.slice(open, close + 1)
    return JSON.parse(candidate)
  } catch {
    return null
  }
}
/** Name and raw JSON-string arguments of a tool call. */
interface ChoiceToolCallFunction {
  name?: string
  arguments?: string
}

/** One entry of `message.tool_calls`. */
interface ChoiceToolCall {
  id?: string
  type?: 'function'
  function?: ChoiceToolCallFunction
}

/** Assistant message carried by a choice. */
interface ChoiceMessage {
  role?: 'assistant'
  content?: string
  tool_calls?: ChoiceToolCall[]
}

/** Minimal shape of one element of `choices` as read by normalizeResponse. */
interface Choice {
  message?: ChoiceMessage
  index?: 0
  finish_reason?: 'tool_calls'
}

View File

@@ -1,3 +1,5 @@
export { normalizeResponse } from './autoFixer'
/** /**
* Wait until condition becomes true * Wait until condition becomes true
* @returns Returns when condition becomes true, throws otherwise * @returns Returns when condition becomes true, throws otherwise