Merge pull request #104 from alibaba/refactor/mv-brain-outof-llms
refactor: move agent logic out of `llms`
This commit is contained in:
@@ -1,60 +1,58 @@
|
||||
/**
|
||||
* OpenAI Client implementation
|
||||
* @note This client is only for demonstrating how to implement a LLM client.
|
||||
* @note Use OpenAILenientClient instead.
|
||||
*/
|
||||
import { InvokeError, InvokeErrorType } from './errors'
|
||||
import type { InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types'
|
||||
import type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types'
|
||||
import { modelPatch, zodToOpenAITool } from './utils'
|
||||
|
||||
/**
|
||||
* @deprecated Use OpenAILenientClient instead.
|
||||
* Client for OpenAI compatible APIs
|
||||
*/
|
||||
export class OpenAIClient implements LLMClient {
|
||||
config: LLMConfig
|
||||
config: Required<LLMConfig>
|
||||
private fetch: typeof globalThis.fetch
|
||||
|
||||
constructor(config: LLMConfig) {
|
||||
constructor(config: Required<LLMConfig>) {
|
||||
this.config = config
|
||||
this.fetch = config.customFetch
|
||||
}
|
||||
|
||||
async invoke(
|
||||
messages: Message[],
|
||||
tools: Record<string, Tool>,
|
||||
abortSignal?: AbortSignal
|
||||
abortSignal?: AbortSignal,
|
||||
options?: InvokeOptions
|
||||
): Promise<InvokeResult> {
|
||||
// 1. Convert tools to OpenAI format
|
||||
const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool))
|
||||
const openaiTools = Object.entries(tools).map(([name, t]) => zodToOpenAITool(name, t))
|
||||
|
||||
// Build request body
|
||||
const requestBody: Record<string, unknown> = {
|
||||
model: this.config.model,
|
||||
temperature: this.config.temperature,
|
||||
messages,
|
||||
tools: openaiTools,
|
||||
parallel_tool_calls: false,
|
||||
// Require tool call: specific tool if provided, otherwise any tool
|
||||
tool_choice: options?.toolChoiceName
|
||||
? { type: 'function', function: { name: options.toolChoiceName } }
|
||||
: 'required',
|
||||
}
|
||||
|
||||
// 2. Call API
|
||||
let response: Response
|
||||
try {
|
||||
response = await fetch(`${this.config.baseURL}/chat/completions`, {
|
||||
response = await this.fetch(`${this.config.baseURL}/chat/completions`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
Authorization: `Bearer ${this.config.apiKey}`,
|
||||
},
|
||||
body: JSON.stringify(
|
||||
modelPatch({
|
||||
model: this.config.model,
|
||||
temperature: this.config.temperature,
|
||||
messages,
|
||||
|
||||
tools: openaiTools,
|
||||
// tool_choice: 'required',
|
||||
tool_choice: { type: 'function', function: { name: 'AgentOutput' } },
|
||||
|
||||
// model specific params
|
||||
|
||||
// reasoning_effort: 'minimal',
|
||||
// verbosity: 'low',
|
||||
parallel_tool_calls: false,
|
||||
})
|
||||
),
|
||||
body: JSON.stringify(modelPatch(requestBody)),
|
||||
signal: abortSignal,
|
||||
})
|
||||
} catch (error: unknown) {
|
||||
// Network error
|
||||
console.error(error)
|
||||
throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error)
|
||||
}
|
||||
|
||||
@@ -92,77 +90,93 @@ export class OpenAIClient implements LLMClient {
|
||||
)
|
||||
}
|
||||
|
||||
// 4. Parse and validate response
|
||||
const data = await response.json()
|
||||
|
||||
// 4. Check finish_reason
|
||||
const choice = data.choices?.[0]
|
||||
if (!choice) {
|
||||
throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', data)
|
||||
}
|
||||
|
||||
// Check finish_reason
|
||||
switch (choice.finish_reason) {
|
||||
case 'tool_calls':
|
||||
// ✅ Normal
|
||||
case 'function_call': // gemini
|
||||
case 'stop': // some models use this even with tool calls
|
||||
break
|
||||
case 'length':
|
||||
// ⚠️ Token limit reached
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.CONTEXT_LENGTH,
|
||||
'Response truncated: max tokens reached',
|
||||
data
|
||||
'Response truncated: max tokens reached'
|
||||
)
|
||||
case 'content_filter':
|
||||
// ❌ Content filtered
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.CONTENT_FILTER,
|
||||
'Content filtered by safety system',
|
||||
data
|
||||
)
|
||||
case 'stop':
|
||||
// ❌ Did not call tool (we require tool call)
|
||||
throw new InvokeError(InvokeErrorType.NO_TOOL_CALL, 'Model did not call any tool', data)
|
||||
throw new InvokeError(InvokeErrorType.CONTENT_FILTER, 'Content filtered by safety system')
|
||||
default:
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.UNKNOWN,
|
||||
`Unexpected finish_reason: ${choice.finish_reason}`,
|
||||
data
|
||||
`Unexpected finish_reason: ${choice.finish_reason}`
|
||||
)
|
||||
}
|
||||
|
||||
// 5. Parse tool call
|
||||
const toolCall = choice.message?.tool_calls?.[0]
|
||||
if (!toolCall) {
|
||||
throw new InvokeError(InvokeErrorType.NO_TOOL_CALL, 'No tool call found in response', data)
|
||||
}
|
||||
// Apply normalizeResponse if provided (for fixing format issues automatically)
|
||||
const normalizedData = options?.normalizeResponse ? options.normalizeResponse(data) : data
|
||||
const normalizedChoice = (normalizedData as any).choices?.[0]
|
||||
|
||||
const toolName = toolCall.function.name
|
||||
const tool = tools[toolName]
|
||||
if (!tool) {
|
||||
throw new InvokeError(InvokeErrorType.UNKNOWN, `Tool ${toolName} not found`, data)
|
||||
}
|
||||
|
||||
// 6. Parse and validate arguments
|
||||
let toolArgs: unknown
|
||||
try {
|
||||
toolArgs = JSON.parse(toolCall.function.arguments)
|
||||
} catch (e) {
|
||||
throw new InvokeError(InvokeErrorType.INVALID_TOOL_ARGS, 'Invalid JSON in tool arguments', e)
|
||||
}
|
||||
|
||||
// Validate against zod schema
|
||||
const validation = tool.inputSchema.safeParse(toolArgs)
|
||||
if (!validation.success) {
|
||||
// Get tool name from response
|
||||
const toolCallName = normalizedChoice?.message?.tool_calls?.[0]?.function?.name
|
||||
if (!toolCallName) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.INVALID_TOOL_ARGS,
|
||||
`Tool arguments validation failed: ${validation.error.message}`,
|
||||
validation.error
|
||||
InvokeErrorType.NO_TOOL_CALL,
|
||||
'No tool call found in response',
|
||||
normalizedData
|
||||
)
|
||||
}
|
||||
|
||||
// 7. Execute tool
|
||||
const tool = tools[toolCallName]
|
||||
if (!tool) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.UNKNOWN,
|
||||
`Tool "${toolCallName}" not found in tools`,
|
||||
normalizedData
|
||||
)
|
||||
}
|
||||
|
||||
// Extract and parse tool arguments
|
||||
const argString = normalizedChoice.message?.tool_calls?.[0]?.function?.arguments
|
||||
if (!argString) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.INVALID_TOOL_ARGS,
|
||||
'No tool call arguments found',
|
||||
normalizedData
|
||||
)
|
||||
}
|
||||
|
||||
let parsedArgs: unknown
|
||||
try {
|
||||
parsedArgs = JSON.parse(argString)
|
||||
} catch (error) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.INVALID_TOOL_ARGS,
|
||||
'Failed to parse tool arguments as JSON',
|
||||
error
|
||||
)
|
||||
}
|
||||
|
||||
// Validate with schema
|
||||
const validation = tool.inputSchema.safeParse(parsedArgs)
|
||||
if (!validation.success) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.INVALID_TOOL_ARGS,
|
||||
'Tool arguments validation failed',
|
||||
validation.error
|
||||
)
|
||||
}
|
||||
const toolInput = validation.data
|
||||
|
||||
// 5. Execute tool
|
||||
let toolResult: unknown
|
||||
try {
|
||||
toolResult = await tool.execute(validation.data)
|
||||
toolResult = await tool.execute(toolInput)
|
||||
} catch (e) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.TOOL_EXECUTION_ERROR,
|
||||
@@ -171,12 +185,11 @@ export class OpenAIClient implements LLMClient {
|
||||
)
|
||||
}
|
||||
|
||||
// 8. Return result (including cache tokens)
|
||||
// Return result
|
||||
return {
|
||||
toolCall: {
|
||||
// id: toolCall.id,
|
||||
name: toolName,
|
||||
args: validation.data as Record<string, unknown>,
|
||||
name: toolCallName,
|
||||
args: toolInput,
|
||||
},
|
||||
toolResult,
|
||||
usage: {
|
||||
|
||||
@@ -1,124 +0,0 @@
|
||||
/**
|
||||
* OpenAI Client implementation
|
||||
*/
|
||||
import { InvokeError, InvokeErrorType } from './errors'
|
||||
import type { InvokeResult, LLMClient, LLMConfig, MacroToolInput, Message, Tool } from './types'
|
||||
import { lenientParseMacroToolCall, modelPatch, zodToOpenAITool } from './utils'
|
||||
|
||||
export class OpenAIClient implements LLMClient {
|
||||
config: Required<LLMConfig>
|
||||
private fetch: typeof globalThis.fetch
|
||||
|
||||
constructor(config: Required<LLMConfig>) {
|
||||
this.config = config
|
||||
this.fetch = config.customFetch
|
||||
}
|
||||
|
||||
async invoke(
|
||||
messages: Message[],
|
||||
tools: { AgentOutput: Tool<MacroToolInput> },
|
||||
abortSignal?: AbortSignal
|
||||
): Promise<InvokeResult> {
|
||||
// 1. Convert tools to OpenAI format
|
||||
const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool))
|
||||
|
||||
// 2. Call API
|
||||
let response: Response
|
||||
try {
|
||||
response = await this.fetch(`${this.config.baseURL}/chat/completions`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
Authorization: `Bearer ${this.config.apiKey}`,
|
||||
},
|
||||
body: JSON.stringify(
|
||||
modelPatch({
|
||||
model: this.config.model,
|
||||
temperature: this.config.temperature,
|
||||
messages,
|
||||
|
||||
tools: openaiTools,
|
||||
// tool_choice: 'required',
|
||||
tool_choice: { type: 'function', function: { name: 'AgentOutput' } },
|
||||
parallel_tool_calls: false,
|
||||
})
|
||||
),
|
||||
signal: abortSignal,
|
||||
})
|
||||
} catch (error: unknown) {
|
||||
// Network error
|
||||
console.error(error)
|
||||
throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error)
|
||||
}
|
||||
|
||||
// 3. Handle HTTP errors
|
||||
if (!response.ok) {
|
||||
const errorData = await response.json().catch()
|
||||
const errorMessage =
|
||||
(errorData as { error?: { message?: string } }).error?.message || response.statusText
|
||||
|
||||
if (response.status === 401 || response.status === 403) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.AUTH_ERROR,
|
||||
`Authentication failed: ${errorMessage}`,
|
||||
errorData
|
||||
)
|
||||
}
|
||||
if (response.status === 429) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.RATE_LIMIT,
|
||||
`Rate limit exceeded: ${errorMessage}`,
|
||||
errorData
|
||||
)
|
||||
}
|
||||
if (response.status >= 500) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.SERVER_ERROR,
|
||||
`Server error: ${errorMessage}`,
|
||||
errorData
|
||||
)
|
||||
}
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.UNKNOWN,
|
||||
`HTTP ${response.status}: ${errorMessage}`,
|
||||
errorData
|
||||
)
|
||||
}
|
||||
|
||||
// parse response
|
||||
|
||||
const data = await response.json()
|
||||
const tool = tools.AgentOutput
|
||||
const macroToolInput = lenientParseMacroToolCall(data, tool.inputSchema as any)
|
||||
|
||||
// Execute tool
|
||||
let toolResult: unknown
|
||||
try {
|
||||
toolResult = await tool.execute(macroToolInput)
|
||||
} catch (e) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.TOOL_EXECUTION_ERROR,
|
||||
`Tool execution failed: ${(e as Error).message}`,
|
||||
e
|
||||
)
|
||||
}
|
||||
|
||||
// Return result (including cache tokens)
|
||||
return {
|
||||
toolCall: {
|
||||
// id: toolCall.id,
|
||||
name: 'AgentOutput',
|
||||
args: macroToolInput,
|
||||
},
|
||||
toolResult,
|
||||
usage: {
|
||||
promptTokens: data.usage?.prompt_tokens ?? 0,
|
||||
completionTokens: data.usage?.completion_tokens ?? 0,
|
||||
totalTokens: data.usage?.total_tokens ?? 0,
|
||||
cachedTokens: data.usage?.prompt_tokens_details?.cached_tokens,
|
||||
reasoningTokens: data.usage?.completion_tokens_details?.reasoning_tokens,
|
||||
},
|
||||
rawResponse: data,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,37 +1,4 @@
|
||||
/**
|
||||
* @topic LLM 与主流程的隔离
|
||||
* @reasoning
|
||||
* 将 llm 的调用和主流程分开是复杂的,
|
||||
* 因为 agent 的 tool call 通常集成在 llm 模块中,而而先得到 llm 返回,然后处理工具调用
|
||||
* tools 和 llm 调用的逻辑不可避免地耦合在一起,tool 的执行又和主流程耦合在一起
|
||||
* 而 history 的维护和更新逻辑,又必须嵌入多轮 tool call 中
|
||||
* @reasoning
|
||||
* - 放弃框架提供的自动的多轮调用,每轮调用都由主流程发起
|
||||
* - 理想情况下,llm 调用应该获得 structured output,然后由额外的模块触发 tool call,目前模型和框架都无法实现
|
||||
* - 当前只能将 llm api 和 本地 tool call 耦合在一起,不关心其中的衔接方式
|
||||
* @conclusion
|
||||
* - @llm responsibility boundary:
|
||||
* - call llm api with given messages and tools
|
||||
* - invoke tool call and get the result of the tool
|
||||
* - return the result to main loop
|
||||
* - @main_loop responsibility boundary:
|
||||
* - maintain all behaviors of an **agent**
|
||||
* @conclusion
|
||||
* - 这里的 llm 模块不是 agent,只负责一轮 llm 调用和工具调用,无状态
|
||||
*/
|
||||
/**
|
||||
* @topic 结构化输出
|
||||
* @facts
|
||||
* - 几乎所有模型都支持 tool call schema
|
||||
* - 几乎所有模型都支持返回 json
|
||||
* - 只有 openAI/grok/gemini 支持 schema 并保证格式
|
||||
* - 主流模型都支持 tool_choice: required
|
||||
* - 除了 qwen 必须指定一个函数名 (9月上新后支持)
|
||||
* @conclusion
|
||||
* - 永远使用 tool call 来返回结构化数据,禁止模型直接返回(视为出错)
|
||||
* - 不能假设 tool 参数合法,必须有修复机制,而且修复也应该使用 tool call 返回
|
||||
*/
|
||||
import { OpenAIClient } from './OpenAILenientClient'
|
||||
import { OpenAIClient } from './OpenAIClient'
|
||||
import {
|
||||
DEFAULT_API_KEY,
|
||||
DEFAULT_BASE_URL,
|
||||
@@ -40,27 +7,9 @@ import {
|
||||
LLM_MAX_RETRIES,
|
||||
} from './constants'
|
||||
import { InvokeError } from './errors'
|
||||
import type {
|
||||
AgentBrain,
|
||||
InvokeResult,
|
||||
LLMClient,
|
||||
LLMConfig,
|
||||
MacroToolInput,
|
||||
MacroToolResult,
|
||||
Message,
|
||||
Tool,
|
||||
} from './types'
|
||||
import type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types'
|
||||
|
||||
export type {
|
||||
AgentBrain,
|
||||
InvokeResult,
|
||||
LLMClient,
|
||||
LLMConfig,
|
||||
MacroToolInput,
|
||||
MacroToolResult,
|
||||
Message,
|
||||
Tool,
|
||||
}
|
||||
export type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool }
|
||||
|
||||
export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> {
|
||||
return {
|
||||
@@ -93,11 +42,12 @@ export class LLM extends EventTarget {
|
||||
async invoke(
|
||||
messages: Message[],
|
||||
tools: Record<string, Tool>,
|
||||
abortSignal: AbortSignal
|
||||
abortSignal: AbortSignal,
|
||||
options?: InvokeOptions
|
||||
): Promise<InvokeResult> {
|
||||
return await withRetry(
|
||||
async () => {
|
||||
const result = await this.client.invoke(messages, tools, abortSignal)
|
||||
const result = await this.client.invoke(messages, tools, abortSignal, options)
|
||||
|
||||
return result
|
||||
},
|
||||
|
||||
@@ -32,6 +32,24 @@ export interface Tool<TParams = any, TResult = any> {
|
||||
execute: (args: TParams) => Promise<TResult>
|
||||
}
|
||||
|
||||
/**
|
||||
* Invoke options for LLM call
|
||||
*/
|
||||
export interface InvokeOptions {
|
||||
/**
|
||||
* Force LLM to call a specific tool by name.
|
||||
* If provided: tool_choice = { type: 'function', function: { name: toolChoiceName } }
|
||||
* If not provided: tool_choice = 'required' (must call some tool, but model chooses which)
|
||||
*/
|
||||
toolChoiceName?: string
|
||||
/**
|
||||
* Response normalization function.
|
||||
* Called before parsing the response.
|
||||
* Used to fix various response format errors from the model.
|
||||
*/
|
||||
normalizeResponse?: (response: any) => any
|
||||
}
|
||||
|
||||
/**
|
||||
* LLM Client interface
|
||||
* Note: Does not use generics because each tool in the tools array has different types
|
||||
@@ -40,7 +58,8 @@ export interface LLMClient {
|
||||
invoke(
|
||||
messages: Message[],
|
||||
tools: Record<string, Tool>,
|
||||
abortSignal?: AbortSignal
|
||||
abortSignal?: AbortSignal,
|
||||
options?: InvokeOptions
|
||||
): Promise<InvokeResult>
|
||||
}
|
||||
|
||||
@@ -82,36 +101,3 @@ export interface LLMConfig {
|
||||
*/
|
||||
customFetch?: typeof globalThis.fetch
|
||||
}
|
||||
|
||||
/**
|
||||
* Agent brain state - the reflection-before-action model
|
||||
*
|
||||
* Every tool call must first reflect on:
|
||||
* - evaluation_previous_goal: How well did the previous action achieve its goal?
|
||||
* - memory: Key information to remember for future steps
|
||||
* - next_goal: What should be accomplished in the next action?
|
||||
*/
|
||||
export interface AgentBrain {
|
||||
// thinking?: string
|
||||
evaluation_previous_goal: string
|
||||
memory: string
|
||||
next_goal: string
|
||||
}
|
||||
|
||||
/**
|
||||
* MacroTool input structure
|
||||
*
|
||||
* This is the core abstraction that enforces the "reflection-before-action" mental model.
|
||||
* Before executing any action, the LLM must output its reasoning state.
|
||||
*/
|
||||
export interface MacroToolInput extends AgentBrain {
|
||||
action: Record<string, any>
|
||||
}
|
||||
|
||||
/**
|
||||
* MacroTool output structure
|
||||
*/
|
||||
export interface MacroToolResult {
|
||||
input: MacroToolInput
|
||||
output: string
|
||||
}
|
||||
|
||||
@@ -4,8 +4,7 @@
|
||||
import chalk from 'chalk'
|
||||
import { z } from 'zod'
|
||||
|
||||
import { InvokeError, InvokeErrorType } from './errors'
|
||||
import type { MacroToolInput, Tool } from './types'
|
||||
import type { Tool } from './types'
|
||||
|
||||
function debug(message: string) {
|
||||
console.debug(chalk.gray('[LLM]'), message)
|
||||
@@ -26,176 +25,6 @@ export function zodToOpenAITool(name: string, tool: Tool) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Although some models cannot guarantee correct response. Common issues are fixable:
|
||||
* - Instead of returning a proper tool call. Return the tool call parameters in the message content.
|
||||
* - Returned tool calls or messages don't follow the nested MacroToolInput format.
|
||||
*/
|
||||
export function lenientParseMacroToolCall(
|
||||
responseData: any,
|
||||
inputSchema: z.ZodObject<MacroToolInput & Record<string, any>>
|
||||
): MacroToolInput {
|
||||
// check
|
||||
const choice = responseData.choices?.[0]
|
||||
if (!choice) {
|
||||
throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', responseData)
|
||||
}
|
||||
|
||||
// check
|
||||
switch (choice.finish_reason) {
|
||||
case 'tool_calls':
|
||||
case 'function_call': // gemini
|
||||
case 'stop': // will try a robust parse
|
||||
// ✅ Normal
|
||||
break
|
||||
case 'length':
|
||||
// ⚠️ Token limit reached
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.CONTEXT_LENGTH,
|
||||
'Response truncated: max tokens reached'
|
||||
)
|
||||
case 'content_filter':
|
||||
// ❌ Content filtered
|
||||
throw new InvokeError(InvokeErrorType.CONTENT_FILTER, 'Content filtered by safety system')
|
||||
default:
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.UNKNOWN,
|
||||
`Unexpected finish_reason: ${choice.finish_reason}`
|
||||
)
|
||||
}
|
||||
|
||||
// Extract action schema from MacroToolInput schema
|
||||
const actionSchema = inputSchema.shape.action
|
||||
if (!actionSchema) {
|
||||
throw new Error('inputSchema must have an "action" field')
|
||||
}
|
||||
|
||||
// patch stopReason mis-format
|
||||
|
||||
let arg: string | null = null
|
||||
|
||||
// try to use tool call
|
||||
const toolCall = choice.message?.tool_calls?.[0]?.function
|
||||
arg = toolCall?.arguments ?? null
|
||||
|
||||
if (arg && toolCall.name !== 'AgentOutput') {
|
||||
// TODO: check if toolCall.name is a valid action name
|
||||
// case: instead of AgentOutput, the model returned a action name as tool call
|
||||
console.log(chalk.yellow('lenientParseMacroToolCall: #1 fixing incorrect tool call'))
|
||||
let tmpArg
|
||||
try {
|
||||
tmpArg = JSON.parse(arg)
|
||||
} catch (error) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.INVALID_TOOL_ARGS,
|
||||
'Failed to parse tool arguments as JSON',
|
||||
error
|
||||
)
|
||||
}
|
||||
arg = JSON.stringify({ action: { [toolCall.name]: tmpArg } })
|
||||
}
|
||||
|
||||
if (!arg) {
|
||||
// try to use message content as JSON
|
||||
arg = choice.message?.content.trim() || null
|
||||
}
|
||||
|
||||
if (!arg) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.NO_TOOL_CALL,
|
||||
'No tool call or content found in response',
|
||||
responseData
|
||||
)
|
||||
}
|
||||
|
||||
// make sure is valid JSON
|
||||
|
||||
let parsedArgs: any
|
||||
try {
|
||||
parsedArgs = JSON.parse(arg)
|
||||
} catch (error) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.INVALID_TOOL_ARGS,
|
||||
'Failed to parse tool arguments as JSON',
|
||||
error
|
||||
)
|
||||
}
|
||||
|
||||
// patch incomplete formats
|
||||
|
||||
if (parsedArgs.action || parsedArgs.evaluation_previous_goal || parsedArgs.next_goal) {
|
||||
// case: nested MacroToolInput format (correct format)
|
||||
|
||||
// some models may give a empty action (they may think reasoning and action should be separate)
|
||||
if (!parsedArgs.action) {
|
||||
console.log(chalk.yellow('lenientParseMacroToolCall: #2 fixing incorrect tool call'))
|
||||
parsedArgs.action = {
|
||||
wait: { seconds: 1 },
|
||||
}
|
||||
}
|
||||
} else if (parsedArgs.type && parsedArgs.function) {
|
||||
// case: upper level function call format provided. only keep its arguments
|
||||
// TODO: check if function name is a valid action name
|
||||
if (parsedArgs.function.name !== 'AgentOutput')
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.INVALID_TOOL_ARGS,
|
||||
`Expected function name "AgentOutput", got "${parsedArgs.function.name}"`,
|
||||
null
|
||||
)
|
||||
|
||||
console.log(chalk.yellow('lenientParseMacroToolCall: #3 fixing incorrect tool call'))
|
||||
parsedArgs = parsedArgs.function.arguments
|
||||
} else if (parsedArgs.name && parsedArgs.arguments) {
|
||||
// case: upper level function call format provided. only keep its arguments
|
||||
// TODO: check if function name is a valid action name
|
||||
if (parsedArgs.name !== 'AgentOutput')
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.INVALID_TOOL_ARGS,
|
||||
`Expected function name "AgentOutput", got "${parsedArgs.name}"`,
|
||||
null
|
||||
)
|
||||
|
||||
console.log(chalk.yellow('lenientParseMacroToolCall: #4 fixing incorrect tool call'))
|
||||
parsedArgs = parsedArgs.arguments
|
||||
} else {
|
||||
// case: only action parameters provided, wrap into MacroToolInput
|
||||
// TODO: check if action name is valid
|
||||
console.log(chalk.yellow('lenientParseMacroToolCall: #5 fixing incorrect tool call'))
|
||||
parsedArgs = { action: parsedArgs } as MacroToolInput
|
||||
}
|
||||
|
||||
// make sure it's not wrapped as string
|
||||
if (typeof parsedArgs === 'string') {
|
||||
console.log(chalk.yellow('lenientParseMacroToolCall: #6 fixing incorrect tool call'))
|
||||
try {
|
||||
parsedArgs = JSON.parse(parsedArgs)
|
||||
} catch (error) {
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.INVALID_TOOL_ARGS,
|
||||
'Failed to parse nested tool arguments as JSON',
|
||||
error
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
const validation = inputSchema.safeParse(parsedArgs)
|
||||
if (validation.success) {
|
||||
return validation.data as unknown as MacroToolInput
|
||||
} else {
|
||||
const action = parsedArgs.action ?? {}
|
||||
const actionName = Object.keys(action)[0] || 'unknown'
|
||||
const actionArgs = JSON.stringify(action[actionName] || 'unknown')
|
||||
|
||||
// TODO: check if action name is valid. give a readable error message
|
||||
|
||||
throw new InvokeError(
|
||||
InvokeErrorType.INVALID_TOOL_ARGS,
|
||||
`Tool arguments validation failed: action "${actionName}" with args ${actionArgs}`,
|
||||
validation.error
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Patch model specific parameters
|
||||
*/
|
||||
@@ -206,10 +35,19 @@ export function modelPatch(body: Record<string, any>) {
|
||||
const modelName = normalizeModelName(model)
|
||||
|
||||
if (modelName.startsWith('claude')) {
|
||||
debug('Applying Claude patch: change tool_choice and disable thinking')
|
||||
body.tool_choice = { type: 'tool', name: 'AgentOutput' }
|
||||
debug('Applying Claude patch: disable thinking')
|
||||
body.thinking = { type: 'disabled' }
|
||||
// body.reasoning = { enabled: 'disabled' }
|
||||
|
||||
// Convert tool_choice to Claude format
|
||||
if (body.tool_choice === 'required') {
|
||||
// 'required' -> { type: 'any' } (must call some tool)
|
||||
debug('Applying Claude patch: convert tool_choice "required" to { type: "any" }')
|
||||
body.tool_choice = { type: 'any' }
|
||||
} else if (body.tool_choice?.function?.name) {
|
||||
// { type: 'function', function: { name: '...' } } -> { type: 'tool', name: '...' }
|
||||
debug('Applying Claude patch: convert tool_choice format')
|
||||
body.tool_choice = { type: 'tool', name: body.tool_choice.function.name }
|
||||
}
|
||||
}
|
||||
|
||||
if (modelName.startsWith('grok')) {
|
||||
|
||||
@@ -2,13 +2,7 @@
|
||||
* Copyright (C) 2025 Alibaba Group Holding Limited
|
||||
* All rights reserved.
|
||||
*/
|
||||
import {
|
||||
type AgentBrain,
|
||||
LLM,
|
||||
type MacroToolInput,
|
||||
type MacroToolResult,
|
||||
type Tool,
|
||||
} from '@page-agent/llms'
|
||||
import { LLM, type Tool } from '@page-agent/llms'
|
||||
import { PageController } from '@page-agent/page-controller'
|
||||
import { Panel, SimulatorMask } from '@page-agent/ui'
|
||||
import chalk from 'chalk'
|
||||
@@ -18,15 +12,46 @@ import type { PageAgentConfig } from './config'
|
||||
import { MAX_STEPS } from './config/constants'
|
||||
import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
|
||||
import { tools } from './tools'
|
||||
import { trimLines, uid, waitUntil } from './utils'
|
||||
import { normalizeResponse, trimLines, uid, waitUntil } from './utils'
|
||||
import { assert } from './utils/assert'
|
||||
|
||||
/**
|
||||
* Agent brain state - the reflection-before-action model
|
||||
*
|
||||
* Every tool call must first reflect on:
|
||||
* - evaluation_previous_goal: How well did the previous action achieve its goal?
|
||||
* - memory: Key information to remember for future steps
|
||||
* - next_goal: What should be accomplished in the next action?
|
||||
*/
|
||||
export interface AgentReflection {
|
||||
evaluation_previous_goal: string
|
||||
memory: string
|
||||
next_goal: string
|
||||
}
|
||||
|
||||
/**
|
||||
* MacroTool input structure
|
||||
*
|
||||
* This is the core abstraction that enforces the "reflection-before-action" mental model.
|
||||
* Before executing any action, the LLM must output its reasoning state.
|
||||
*/
|
||||
export interface MacroToolInput extends Partial<AgentReflection> {
|
||||
action: Record<string, any>
|
||||
}
|
||||
|
||||
/**
|
||||
* MacroTool output structure
|
||||
*/
|
||||
export interface MacroToolResult {
|
||||
input: MacroToolInput
|
||||
output: string
|
||||
}
|
||||
|
||||
export type { PageAgentConfig }
|
||||
export { tool, type PageAgentTool } from './tools'
|
||||
export type { AgentBrain, MacroToolInput, MacroToolResult }
|
||||
|
||||
export interface AgentHistory {
|
||||
brain: AgentBrain
|
||||
brain: Partial<AgentReflection>
|
||||
action: {
|
||||
name: string
|
||||
input: any
|
||||
@@ -124,9 +149,6 @@ export class PageAgent extends EventTarget {
|
||||
window.addEventListener('beforeunload', this.#beforeUnloadListener)
|
||||
}
|
||||
|
||||
/**
|
||||
* @todo maybe return something?
|
||||
*/
|
||||
async execute(task: string): Promise<ExecutionResult> {
|
||||
if (!task) throw new Error('Task is required')
|
||||
this.task = task
|
||||
@@ -183,7 +205,11 @@ export class PageAgent extends EventTarget {
|
||||
},
|
||||
],
|
||||
{ AgentOutput: this.#packMacroTool() },
|
||||
this.#abortController.signal
|
||||
this.#abortController.signal,
|
||||
{
|
||||
toolChoiceName: 'AgentOutput',
|
||||
normalizeResponse,
|
||||
}
|
||||
)
|
||||
|
||||
const macroResult = result.toolResult as MacroToolResult
|
||||
|
||||
154
packages/page-agent/src/utils/autoFixer.ts
Normal file
154
packages/page-agent/src/utils/autoFixer.ts
Normal file
@@ -0,0 +1,154 @@
|
||||
import chalk from 'chalk'
|
||||
|
||||
/**
|
||||
* Normalize LLM response and fix common format issues.
|
||||
*
|
||||
* Handles:
|
||||
* - No tool_calls but JSON in message.content (fallback)
|
||||
* - Model returns action name as tool call instead of AgentOutput
|
||||
* - Arguments wrapped as double JSON string
|
||||
* - Nested function call format
|
||||
* - Missing action field (fallback to wait)
|
||||
* - etc.
|
||||
*/
|
||||
export function normalizeResponse(response: any): any {
|
||||
let resolvedArguments = null as any
|
||||
|
||||
const choice = (response as { choices?: Choice[] }).choices?.[0]
|
||||
if (!choice) throw new Error('No choices in response')
|
||||
|
||||
const message = choice.message
|
||||
if (!message) throw new Error('No message in choice')
|
||||
|
||||
const toolCall = message.tool_calls?.[0]
|
||||
|
||||
// fix level and location of arguments
|
||||
|
||||
if (toolCall?.function?.arguments) {
|
||||
resolvedArguments = safeJsonParse(toolCall.function.arguments)
|
||||
|
||||
// case: sometimes the model only returns the action level
|
||||
if (toolCall.function.name && toolCall.function.name !== 'AgentOutput') {
|
||||
console.log(chalk.yellow(`[normalizeResponse] #1: fixing tool_call`))
|
||||
resolvedArguments = { action: safeJsonParse(resolvedArguments) }
|
||||
}
|
||||
} else {
|
||||
// case: sometimes the model returns json in content instead of tool_calls
|
||||
if (message.content) {
|
||||
const content = message.content.trim()
|
||||
const jsonInContent = retrieveJsonFromString(content)
|
||||
if (jsonInContent) {
|
||||
resolvedArguments = safeJsonParse(jsonInContent)
|
||||
|
||||
// case: sometimes the content json includes upper level wrapper
|
||||
if (resolvedArguments?.name === 'AgentOutput') {
|
||||
console.log(chalk.yellow(`[normalizeResponse] #2: fixing tool_call`))
|
||||
resolvedArguments = safeJsonParse(resolvedArguments.arguments)
|
||||
}
|
||||
|
||||
// case: sometimes even 2-levels of wrapping
|
||||
if (resolvedArguments?.type === 'function') {
|
||||
console.log(chalk.yellow(`[normalizeResponse] #3: fixing tool_call`))
|
||||
resolvedArguments = safeJsonParse(resolvedArguments.function.arguments)
|
||||
}
|
||||
|
||||
// case: and sometimes action level only
|
||||
// todo: needs better detection logic
|
||||
if (
|
||||
!resolvedArguments?.action &&
|
||||
!resolvedArguments?.evaluation_previous_goal &&
|
||||
!resolvedArguments?.memory &&
|
||||
!resolvedArguments?.next_goal &&
|
||||
!resolvedArguments?.thinking
|
||||
) {
|
||||
console.log(chalk.yellow(`[normalizeResponse] #4: fixing tool_call`))
|
||||
resolvedArguments = { action: safeJsonParse(resolvedArguments) }
|
||||
}
|
||||
} else {
|
||||
throw new Error('No tool_call and the message content does not contain valid JSON')
|
||||
}
|
||||
} else {
|
||||
throw new Error('No tool_call nor message content is present')
|
||||
}
|
||||
}
|
||||
|
||||
// fix double stringified arguments
|
||||
resolvedArguments = safeJsonParse(resolvedArguments)
|
||||
|
||||
// fix incomplete formats
|
||||
if (!resolvedArguments.action) {
|
||||
console.log(chalk.yellow(`[normalizeResponse] #5: fixing tool_call`))
|
||||
resolvedArguments.action = { name: 'wait', input: { seconds: 1 } }
|
||||
}
|
||||
|
||||
// pack back to standard format
|
||||
return {
|
||||
...response,
|
||||
choices: [
|
||||
{
|
||||
...choice,
|
||||
message: {
|
||||
...message,
|
||||
tool_calls: [
|
||||
{
|
||||
...(toolCall || {}),
|
||||
function: {
|
||||
...(toolCall?.function || {}),
|
||||
name: 'AgentOutput',
|
||||
arguments: JSON.stringify(resolvedArguments),
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Safely parse JSON, return original input if not json.
|
||||
*/
|
||||
function safeJsonParse(input: any): any {
|
||||
if (typeof input === 'string') {
|
||||
try {
|
||||
return JSON.parse(input.trim())
|
||||
} catch {
|
||||
return input
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract and parse JSON from a string.
|
||||
* - Treat content between the first `{` and the last `}` as JSON.
|
||||
* - Try to parse that content as JSON and return the parsed value (object/array/primitive) if successful, otherwise return null.
|
||||
*/
|
||||
function retrieveJsonFromString(str: string): any {
|
||||
try {
|
||||
const json = /({[\s\S]*})/.exec(str) ?? []
|
||||
if (json.length === 0) {
|
||||
return null
|
||||
}
|
||||
return JSON.parse(json[0]!)
|
||||
} catch {
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
interface Choice {
|
||||
message?: {
|
||||
role?: 'assistant'
|
||||
content?: string
|
||||
tool_calls?: {
|
||||
id?: string
|
||||
type?: 'function'
|
||||
function?: {
|
||||
name?: string
|
||||
arguments?: string
|
||||
}
|
||||
}[]
|
||||
}
|
||||
index?: 0
|
||||
finish_reason?: 'tool_calls'
|
||||
}
|
||||
@@ -1,3 +1,5 @@
|
||||
export { normalizeResponse } from './autoFixer'
|
||||
|
||||
/**
|
||||
* Wait until condition becomes true
|
||||
* @returns Returns when condition becomes true, throws otherwise
|
||||
|
||||
Reference in New Issue
Block a user