From 6f332aa24a85ee2dca50b7a76f0f0a9773456883 Mon Sep 17 00:00:00 2001 From: Simon <10131203+gaomeng1900@users.noreply.github.com> Date: Fri, 17 Oct 2025 18:43:41 +0800 Subject: [PATCH] refactor: remove ai-sdk --- ROADMAP.md | 7 +- package-lock.json | 121 ---------------------- package.json | 2 - src/PageAgent.ts | 213 ++++++++++++++++++++------------------- src/config/constants.ts | 1 - src/config/index.ts | 12 ++- src/llms/OpenAIClient.ts | 188 ++++++++++++++++++++++++++++++++++ src/llms/errors.ts | 50 +++++++++ src/llms/index.ts | 131 +++++------------------- src/llms/types.ts | 77 ++++++++++++++ src/llms/utils.ts | 21 ++++ src/tools/index.ts | 27 +++-- 12 files changed, 510 insertions(+), 340 deletions(-) create mode 100644 src/llms/OpenAIClient.ts create mode 100644 src/llms/errors.ts create mode 100644 src/llms/types.ts create mode 100644 src/llms/utils.ts diff --git a/ROADMAP.md b/ROADMAP.md index 7b0d7de..8958c7e 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -10,18 +10,19 @@ The development progress and future plans for PageAgent. - [x] **Multi model provider integration and testing** - [x] **UI with HITL** - Human-in-the-loop user interface - [x] **Landing and doc pages** +- [x] **Remove ai-sdk** - Only one function is being used - [ ] **Robust LLM output** +- [ ] **Hooks for Task and HITL** +- [ ] **Hijacking `page_open` event** - [ ] **Custom knowledge base and instructions** - [ ] **Black/white-list safeguard** - [ ] **Data-masking** -- [ ] **Custom actions and hooks** +- [ ] **Custom actions** - [ ] **Optimize for popular UI frameworks** -- [ ] **Handling page open event** - [ ] **Free evaluation plan?** - [ ] **Working homepage with live LLM API** - [ ] **free CDN** - [ ] **Testing suits** -- [ ] **Remove ai-sdk** - Only one function is being used - [ ] **Support custom llm fetch** - [ ] **Refactor: Separate Agent and Page-Controller** - Agent can run w/o dom diff --git a/package-lock.json b/package-lock.json index 52105ba..8b12332 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,8 +9,6 @@ "version": "0.0.0", "license": "MIT", "dependencies": { - "@ai-sdk/openai": "^2.0.49", - "ai": "^5.0.68", "ai-motion": "^0.4.7", "chalk": "^5.6.2", "zod": "^4.1.12" @@ -50,68 +48,6 @@ "node": ">=20.0.0" } }, - "node_modules/@ai-sdk/gateway": { - "version": "1.0.39", - "resolved": "https://registry.npmjs.org/@ai-sdk/gateway/-/gateway-1.0.39.tgz", - "integrity": "sha512-ijYCKG2sbn2RBVfIgaXNXvzHAf2HpFXxQODtjMI+T7Z4CLryflytchsZZ9qrGtsjiQVopKOV6m6kj4lq5fnbsg==", - "license": "Apache-2.0", - "dependencies": { - "@ai-sdk/provider": "2.0.0", - "@ai-sdk/provider-utils": "3.0.12", - "@vercel/oidc": "3.0.2" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "zod": "^3.25.76 || ^4.1.8" - } - }, - "node_modules/@ai-sdk/openai": { - "version": "2.0.49", - "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-2.0.49.tgz", - "integrity": "sha512-BkeTl+gfeJAEap0srZEMK36t9IuQDGYWJyvQTWEgmWn8zoT70LAuO5EKWgAaJYUfjLY5OsM6QlYkjF1XfxCRgw==", - "license": "Apache-2.0", - "dependencies": { - "@ai-sdk/provider": "2.0.0", - "@ai-sdk/provider-utils": "3.0.12" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "zod": "^3.25.76 || ^4.1.8" - } - }, - "node_modules/@ai-sdk/provider": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-2.0.0.tgz", - "integrity": "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA==", - "license": "Apache-2.0", - "dependencies": { - "json-schema": "^0.4.0" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/@ai-sdk/provider-utils": { - "version": "3.0.12", - "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-3.0.12.tgz", - "integrity": "sha512-ZtbdvYxdMoria+2SlNarEk6Hlgyf+zzcznlD55EAl+7VZvJaSg2sqPvwArY7L6TfDEDJsnCq0fdhBSkYo0Xqdg==", - "license": "Apache-2.0", - "dependencies": { - "@ai-sdk/provider": "2.0.0", - "@standard-schema/spec": "^1.0.0", - "eventsource-parser": "^3.0.5" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "zod": "^3.25.76 || ^4.1.8" - } - }, "node_modules/@babel/code-frame": { "version": "7.27.1", "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz", @@ -1663,15 +1599,6 @@ "node": ">= 8" } }, - "node_modules/@opentelemetry/api": { - "version": "1.9.0", - "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", - "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", - "license": "Apache-2.0", - "engines": { - "node": ">=8.0.0" - } - }, "node_modules/@rolldown/pluginutils": { "version": "1.0.0-beta.35", "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.35.tgz", @@ -2166,12 +2093,6 @@ "sprintf-js": "~1.0.2" } }, - "node_modules/@standard-schema/spec": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.0.0.tgz", - "integrity": "sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA==", - "license": "MIT" - }, "node_modules/@swc/core": { "version": "1.13.20", "resolved": "https://registry.npmjs.org/@swc/core/-/core-1.13.20.tgz", @@ -3089,15 +3010,6 @@ "url": "https://opencollective.com/typescript-eslint" } }, - "node_modules/@vercel/oidc": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/@vercel/oidc/-/oidc-3.0.2.tgz", - "integrity": "sha512-JekxQ0RApo4gS4un/iMGsIL1/k4KUBe3HmnGcDvzHuFBdQdudEJgTqcsJC7y6Ul4Yw5CeykgvQbX2XeEJd0+DA==", - "license": "Apache-2.0", - "engines": { - "node": ">= 20" - } - }, "node_modules/@vitejs/plugin-react-swc": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/@vitejs/plugin-react-swc/-/plugin-react-swc-4.1.0.tgz", @@ -3167,24 +3079,6 @@ "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" } }, - "node_modules/ai": { - "version": "5.0.68", - "resolved": "https://registry.npmjs.org/ai/-/ai-5.0.68.tgz", - "integrity": "sha512-SB6r+4TkKVlSg2ozGBSfuf6Is5hrcX/bpGBzOoyHIN3b4ILGhaly0IHEvP8+3GGIHXqtkPVEUmR6V05jKdjNlg==", - "license": "Apache-2.0", - "dependencies": { - "@ai-sdk/gateway": "1.0.39", - "@ai-sdk/provider": "2.0.0", - "@ai-sdk/provider-utils": "3.0.12", - "@opentelemetry/api": "1.9.0" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "zod": "^3.25.76 || ^4.1.8" - } - }, "node_modules/ai-motion": { "version": "0.4.7", "resolved": "https://registry.npmjs.org/ai-motion/-/ai-motion-0.4.7.tgz", @@ -4369,15 +4263,6 @@ "dev": true, "license": "MIT" }, - "node_modules/eventsource-parser": { - "version": "3.0.6", - "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.6.tgz", - "integrity": "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==", - "license": "MIT", - "engines": { - "node": ">=18.0.0" - } - }, "node_modules/exsolve": { "version": "1.0.7", "resolved": "https://registry.npmjs.org/exsolve/-/exsolve-1.0.7.tgz", @@ -4998,12 +4883,6 @@ "dev": true, "license": "MIT" }, - "node_modules/json-schema": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz", - "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==", - "license": "(AFL-2.1 OR BSD-3-Clause)" - }, "node_modules/json-schema-traverse": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", diff --git a/package.json b/package.json index 9bdd221..fceb3d5 100644 --- a/package.json +++ b/package.json @@ -50,8 +50,6 @@ "prepare": "husky" }, "dependencies": { - "@ai-sdk/openai": "^2.0.49", - "ai": "^5.0.68", "ai-motion": "^0.4.7", "chalk": "^5.6.2", "zod": "^4.1.12" diff --git a/src/PageAgent.ts b/src/PageAgent.ts index e37dafb..822afb4 100644 --- a/src/PageAgent.ts +++ b/src/PageAgent.ts @@ -2,21 +2,19 @@ * Copyright (C) 2025 Alibaba Group Holding Limited * All rights reserved. */ -import { tool } from 'ai' -import type { LanguageModelUsage, ToolSet } from 'ai' import chalk from 'chalk' import zod from 'zod' import type { PageAgentConfig } from './config' -import { MACRO_TOOL_NAME, MAX_STEPS, VIEWPORT_EXPANSION } from './config/constants' +import { MAX_STEPS, VIEWPORT_EXPANSION } from './config/constants' import * as dom from './dom' import { FlatDomTree, InteractiveElementDomNode } from './dom/dom_tree/type' import { getPageInfo } from './dom/getPageInfo' import { I18n } from './i18n' -import { LLM } from './llms' +import { type InvokeResult, LLM, type Message, type Tool } from './llms' import { patchReact } from './patches/react' import SYSTEM_PROMPT from './prompts/system_prompt.md?raw' -import { tools } from './tools' +import { type PageAgentTool, tools } from './tools' import { Panel, getToolCompletedText, getToolExecutingText } from './ui/Panel' import { SimulatorMask } from './ui/SimulatorMask' import { trimLines, uid, waitUntil } from './utils' @@ -32,14 +30,38 @@ export interface AgentBrain { next_goal: string } +/** + * MacroTool input structure + */ +export interface MacroToolInput { + evaluation_previous_goal?: string + memory?: string + next_goal?: string + action: Record +} + +/** + * MacroTool output structure + */ +export interface MacroToolResult { + input: MacroToolInput + output: string +} + export interface AgentHistory { brain: AgentBrain action: { name: string input: any - output: any + output: string + } + usage: { + promptTokens: number + completionTokens: number + totalTokens: number + cachedTokens?: number + reasoningTokens?: number } - usage: LanguageModelUsage } export interface ExecutionResult { @@ -148,19 +170,17 @@ export class PageAgent extends EventTarget { content: this.#assembleUserPrompt(), }, ], - // tools, - this.#packMacroTool(), + { AgentOutput: this.#packMacroTool() }, this.#abortController.signal ) - const toolResult = result.toolResult - const input = toolResult.input - const output = toolResult.output + const macroResult = result.toolResult as MacroToolResult + const input = macroResult.input + const output = macroResult.output const brain = { - thinking: input.thinking, - evaluation_previous_goal: input.evaluation_previous_goal, - memory: input.memory, - next_goal: input.next_goal, + evaluation_previous_goal: input.evaluation_previous_goal || '', + memory: input.memory || '', + next_goal: input.next_goal || '', } const actionName = Object.keys(input.action)[0] const action = { @@ -188,8 +208,8 @@ export class PageAgent extends EventTarget { } } if (actionName === 'done') { - const success = action.input.success || false - const text = action.input.text || 'no text provided' + const success = action.input?.success ?? false + const text = action.input?.text || 'no text provided' console.log(chalk.green.bold('Task completed'), success, text) this.#onDone(text, success) return { @@ -219,24 +239,8 @@ export class PageAgent extends EventTarget { * - action: { toolName: toolInput } * where action must be selected from tools defined in this.tools */ - #packMacroTool(): ToolSet { + #packMacroTool(): Tool { const tools = this.tools - // discriminated version - // @note Success rate ~0, model seems unable to understand discriminated union - - // // Create discriminated union schemas from tools - // const actionSchemas = Array.from(tools.entries()).map(([toolName, tool]) => { - // return zod.object({ - // name: zod.literal(toolName), - // input: tool.inputSchema, - // }) - // }) - - // // Ensure at least one tool exists - // assert(actionSchemas.length, 'No tools available to create macro tool') - - // const actionSchema = zod.discriminatedUnion('name', actionSchemas as any) - // union version const actionSchemas = Array.from(tools.entries()).map(([toolName, tool]) => { return zod.object({ @@ -244,89 +248,96 @@ export class PageAgent extends EventTarget { }) }) - const actionSchema = zod.union(actionSchemas) + const actionSchema = zod.union( + actionSchemas as unknown as [zod.ZodType, zod.ZodType, ...zod.ZodType[]] + ) + + const macroToolSchema = zod.object({ + // thinking: zod.string().optional(), + evaluation_previous_goal: zod.string().optional(), + memory: zod.string().optional(), + next_goal: zod.string().optional(), + action: actionSchema, + }) return { - [MACRO_TOOL_NAME]: tool({ - // description: 'Output the result of the agent', - inputSchema: zod.object({ - // thinking: zod.string().optional(), - evaluation_previous_goal: zod.string().optional(), - memory: zod.string().optional(), - next_goal: zod.string().optional(), - action: actionSchema, - }), - execute: async (input, options) => { - // abort - if (this.#abortController.signal.aborted) throw new Error('AbortError') - // pause - await waitUntil(() => !this.paused) + // name: MACRO_TOOL_NAME, + // description: 'Execute agent action', // @todo remote + inputSchema: macroToolSchema as zod.ZodType, + execute: async (input: MacroToolInput): Promise => { + // abort + if (this.#abortController.signal.aborted) throw new Error('AbortError') + // pause + await waitUntil(() => !this.paused) - console.log(chalk.blue.bold('MacroTool execute'), input) - const action = input.action! + console.log(chalk.blue.bold('MacroTool execute'), input) + const action = input.action - const toolName = Object.keys(action)[0] - const toolInput = action[toolName] - const brain = trimLines(`✅: ${input.evaluation_previous_goal} + const toolName = Object.keys(action)[0] + const toolInput = action[toolName] + const brain = trimLines(`✅: ${input.evaluation_previous_goal} 💾: ${input.memory} 🎯: ${input.next_goal} `) - console.log(brain) - this.bus.emit('panel:update', { - type: 'thinking', - displayText: brain, - }) + console.log(brain) + this.bus.emit('panel:update', { + type: 'thinking', + displayText: brain, + }) - // Find the corresponding tool - const tool = tools.get(toolName) - assert(tool, `Tool ${toolName} not found. (@note should have been caught before this!!!)`) + // Find the corresponding tool + const tool = tools.get(toolName) + assert(tool, `Tool ${toolName} not found. (@note should have been caught before this!!!)`) - console.log(chalk.blue.bold(`Executing tool: ${toolName}`), toolInput, options) + console.log(chalk.blue.bold(`Executing tool: ${toolName}`), toolInput) + this.bus.emit('panel:update', { + type: 'tool_executing', + toolName, + toolArgs: toolInput, + displayText: getToolExecutingText(toolName, toolInput, this.i18n), + }) + + const startTime = Date.now() + + // Execute tool, bind `this` to PageAgent + let result = await tool.execute.bind(this)(toolInput) + + const duration = Date.now() - startTime + console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result) + + if (toolName === 'wait') { + this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000) + result += `\n You have waited ${this.#totalWaitTime} seconds accumulatively.` + if (this.#totalWaitTime >= 3) + result += '\nDo NOT wait any longer unless you have a good reason.\n' + result += '' + } else { + // For other tools, reset wait time + this.#totalWaitTime = 0 + } + + // Briefly display execution result + const displayResult = getToolCompletedText(toolName, toolInput, this.i18n) + if (displayResult) this.bus.emit('panel:update', { type: 'tool_executing', toolName, toolArgs: toolInput, - displayText: getToolExecutingText(toolName, toolInput, this.i18n), + toolResult: result, + displayText: displayResult, + duration, }) - const startTime = Date.now() + // Wait a moment to let user see the result + await new Promise((resolve) => setTimeout(resolve, 100)) - // Execute tool, passing options parameter - let result = await tool.execute!.bind(this)(toolInput, options) - - const duration = Date.now() - startTime - console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result) - - if (toolName === 'wait') { - this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000) - result += `\n You have waited ${this.#totalWaitTime} seconds accumulatively.` - if (this.#totalWaitTime >= 3) - result += '\nDo NOT wait any longer unless you have a good reason.\n' - result += '' - } else { - // For other tools, reset wait time - this.#totalWaitTime = 0 - } - - // Briefly display execution result - const displayResult = getToolCompletedText(toolName, toolInput, this.i18n) - if (displayResult) - this.bus.emit('panel:update', { - type: 'tool_executing', - toolName, - toolArgs: toolInput, - toolResult: result, - displayText: displayResult, - duration, - }) - - // Wait a moment to let user see the result - await new Promise((resolve) => setTimeout(resolve, 100)) - - return result - }, - }), + // Return structured result + return { + input, + output: result, + } + }, } } diff --git a/src/config/constants.ts b/src/config/constants.ts index 6e01253..0360e85 100644 --- a/src/config/constants.ts +++ b/src/config/constants.ts @@ -48,6 +48,5 @@ export const DEFAULT_BASE_URL: string = // internal -export const MACRO_TOOL_NAME = 'AgentOutput' as const export const LLM_MAX_RETRIES = 2 export const MAX_STEPS = 20 diff --git a/src/config/index.ts b/src/config/index.ts index 860776b..5ffcf97 100644 --- a/src/config/index.ts +++ b/src/config/index.ts @@ -1,9 +1,17 @@ import type { DomConfig } from '@/dom' import type { SupportedLanguage } from '@/i18n' -import type { LLMConfig } from '@/llms' import { DEFAULT_API_KEY, DEFAULT_BASE_URL, DEFAULT_MODEL_NAME, LLM_MAX_RETRIES } from './constants' +export interface LLMConfig { + baseURL?: string + apiKey?: string + modelName?: string + temperature?: number + maxTokens?: number + maxRetries?: number +} + export interface UIConfig { // theme?: 'light' | 'dark' language?: SupportedLanguage @@ -16,6 +24,8 @@ export function parseLLMConfig(config: LLMConfig): Required { baseURL: config.baseURL ?? DEFAULT_BASE_URL, apiKey: config.apiKey ?? DEFAULT_API_KEY, modelName: config.modelName ?? DEFAULT_MODEL_NAME, + temperature: config.temperature ?? 0.0, + maxTokens: config.maxTokens ?? 4096, maxRetries: config.maxRetries ?? LLM_MAX_RETRIES, } } diff --git a/src/llms/OpenAIClient.ts b/src/llms/OpenAIClient.ts new file mode 100644 index 0000000..95df2da --- /dev/null +++ b/src/llms/OpenAIClient.ts @@ -0,0 +1,188 @@ +/** + * OpenAI Client implementation + */ +import { InvokeError, InvokeErrorType } from './errors' +import type { InvokeResult, LLMClient, Message, OpenAIClientConfig, Tool } from './types' +import { zodToOpenAITool } from './utils' + +export class OpenAIClient implements LLMClient { + config: OpenAIClientConfig + + constructor(config: OpenAIClientConfig) { + this.config = config + } + + async invoke( + messages: Message[], + tools: Record, + abortSignal?: AbortSignal + ): Promise { + // 1. Convert tools to OpenAI format + const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool)) + + // 2. Detect if Claude (auto-compatibility) + // const isClaude = this.config.model.toLowerCase().includes('claude') + + // 3. Call API + let response: Response + try { + response = await fetch(`${this.config.baseURL}/chat/completions`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${this.config.apiKey}`, + }, + body: JSON.stringify({ + model: this.config.model, + messages, + parallel_tool_calls: false, + tools: openaiTools, + tool_choice: 'required', + // tool_choice: { type: 'function', function: { name: 'my_function' } }, + // reasoning_effort: 'minimal', + // verbosity: 'low', + + // Claude doesn't support tool_choice: 'required', auto-omit + // ...(isClaude ? {} : { tool_choice: 'required' }), + temperature: this.config.temperature, + max_tokens: this.config.maxTokens, + }), + signal: abortSignal, + }) + } catch (error: unknown) { + // Network error + throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error) + } + + // 4. Handle HTTP errors + if (!response.ok) { + const errorData = await response.json().catch(() => ({})) + const errorMessage = + (errorData as { error?: { message?: string } }).error?.message || response.statusText + + if (response.status === 401 || response.status === 403) { + throw new InvokeError( + InvokeErrorType.AUTH_ERROR, + `Authentication failed: ${errorMessage}`, + errorData + ) + } + if (response.status === 429) { + throw new InvokeError( + InvokeErrorType.RATE_LIMIT, + `Rate limit exceeded: ${errorMessage}`, + errorData + ) + } + if (response.status >= 500) { + throw new InvokeError( + InvokeErrorType.SERVER_ERROR, + `Server error: ${errorMessage}`, + errorData + ) + } + throw new InvokeError( + InvokeErrorType.UNKNOWN, + `HTTP ${response.status}: ${errorMessage}`, + errorData + ) + } + + const data = await response.json() + + // 5. Check finish_reason + const choice = data.choices?.[0] + if (!choice) { + throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', data) + } + + switch (choice.finish_reason) { + case 'tool_calls': + // ✅ Normal + break + case 'length': + // ⚠️ Token limit reached + throw new InvokeError( + InvokeErrorType.CONTEXT_LENGTH, + 'Response truncated: max tokens reached', + data + ) + case 'content_filter': + // ❌ Content filtered + throw new InvokeError( + InvokeErrorType.CONTENT_FILTER, + 'Content filtered by safety system', + data + ) + case 'stop': + // ❌ Did not call tool (we require tool call) + throw new InvokeError(InvokeErrorType.NO_TOOL_CALL, 'Model did not call any tool', data) + default: + throw new InvokeError( + InvokeErrorType.UNKNOWN, + `Unexpected finish_reason: ${choice.finish_reason}`, + data + ) + } + + // 6. Parse tool call + const toolCall = choice.message?.tool_calls?.[0] + if (!toolCall) { + throw new InvokeError(InvokeErrorType.NO_TOOL_CALL, 'No tool call found in response', data) + } + + const toolName = toolCall.function.name + const tool = tools[toolName] + if (!tool) { + throw new InvokeError(InvokeErrorType.UNKNOWN, `Tool ${toolName} not found`, data) + } + + // 7. Parse and validate arguments + let toolArgs: unknown + try { + toolArgs = JSON.parse(toolCall.function.arguments) + } catch (e) { + throw new InvokeError(InvokeErrorType.INVALID_TOOL_ARGS, 'Invalid JSON in tool arguments', e) + } + + // Validate against zod schema + const validation = tool.inputSchema.safeParse(toolArgs) + if (!validation.success) { + throw new InvokeError( + InvokeErrorType.INVALID_TOOL_ARGS, + `Tool arguments validation failed: ${validation.error.message}`, + validation.error + ) + } + + // 8. Execute tool + let toolResult: unknown + try { + toolResult = await tool.execute(validation.data) + } catch (e) { + throw new InvokeError( + InvokeErrorType.TOOL_EXECUTION_ERROR, + `Tool execution failed: ${(e as Error).message}`, + e + ) + } + + // 9. Return result (including cache tokens) + return { + toolCall: { + id: toolCall.id, + name: toolName, + args: validation.data as Record, + }, + toolResult, + usage: { + promptTokens: data.usage?.prompt_tokens ?? 0, + completionTokens: data.usage?.completion_tokens ?? 0, + totalTokens: data.usage?.total_tokens ?? 0, + cachedTokens: data.usage?.prompt_tokens_details?.cached_tokens, + reasoningTokens: data.usage?.completion_tokens_details?.reasoning_tokens, + }, + rawResponse: data, + } + } +} diff --git a/src/llms/errors.ts b/src/llms/errors.ts new file mode 100644 index 0000000..5aa05e1 --- /dev/null +++ b/src/llms/errors.ts @@ -0,0 +1,50 @@ +/** + * Error types and error handling for LLM invocations + */ + +export const InvokeErrorType = { + // Retryable + NETWORK_ERROR: 'network_error', // Network error, retry + RATE_LIMIT: 'rate_limit', // Rate limit, retry + SERVER_ERROR: 'server_error', // 5xx, retry + NO_TOOL_CALL: 'no_tool_call', // Model did not call tool + INVALID_TOOL_ARGS: 'invalid_tool_args', // Tool args don't match schema + TOOL_EXECUTION_ERROR: 'tool_execution_error', // Tool execution error + + UNKNOWN: 'unknown', + + // Non-retryable + AUTH_ERROR: 'auth_error', // Authentication failed + CONTEXT_LENGTH: 'context_length', // Prompt too long + CONTENT_FILTER: 'content_filter', // Content filtered +} as const + +export type InvokeErrorType = (typeof InvokeErrorType)[keyof typeof InvokeErrorType] + +export class InvokeError extends Error { + type: InvokeErrorType + retryable: boolean + statusCode?: number + rawError?: unknown + + constructor(type: InvokeErrorType, message: string, rawError?: unknown) { + super(message) + this.name = 'InvokeError' + this.type = type + this.retryable = this.isRetryable(type) + this.rawError = rawError + } + + private isRetryable(type: InvokeErrorType): boolean { + const retryableTypes: InvokeErrorType[] = [ + InvokeErrorType.NETWORK_ERROR, + InvokeErrorType.RATE_LIMIT, + InvokeErrorType.SERVER_ERROR, + InvokeErrorType.NO_TOOL_CALL, + InvokeErrorType.INVALID_TOOL_ARGS, + InvokeErrorType.TOOL_EXECUTION_ERROR, + InvokeErrorType.UNKNOWN, + ] + return retryableTypes.includes(type) + } +} diff --git a/src/llms/index.ts b/src/llms/index.ts index 1ca8de6..6e6c9c5 100644 --- a/src/llms/index.ts +++ b/src/llms/index.ts @@ -31,29 +31,22 @@ * - 永远使用 tool call 来返回结构化数据,禁止模型直接返回(视为出错) * - 不能假设 tool 参数合法,必须有修复机制,而且修复也应该使用 tool call 返回 */ -import { OpenAIProvider, OpenAIResponsesProviderOptions, createOpenAI } from '@ai-sdk/openai' -import type { LanguageModelV2, LanguageModelV2ToolCall } from '@ai-sdk/provider' -import type { LanguageModelUsage, ModelMessage, TypedToolCall, TypedToolResult } from 'ai' -import { ToolSet, generateText, stepCountIs } from 'ai' import chalk from 'chalk' +import type { LLMConfig } from '@/config' import { parseLLMConfig } from '@/config' -import { MACRO_TOOL_NAME } from '@/config/constants' -import { assert } from '@/utils/assert' import { EventBus, getEventBus } from '@/utils/bus' -export interface LLMConfig { - baseURL?: string - apiKey?: string - modelName?: string - maxRetries?: number -} +import { OpenAIClient } from './OpenAIClient' +import { InvokeError } from './errors' +import type { InvokeResult, LLMClient, Message, Tool } from './types' + +export type { Message, Tool, InvokeResult, LLMClient } export class LLM { config: Required id: string - #openai: OpenAIProvider - #model: LanguageModelV2 + client: LLMClient #bus: EventBus constructor(config: LLMConfig, id: string) { @@ -62,11 +55,14 @@ export class LLM { this.#bus = getEventBus(id) - this.#openai = createOpenAI({ baseURL: this.config.baseURL, apiKey: this.config.apiKey }) - this.#model = this.#openai.chat(this.config.modelName) - - // @note Will throw JSON parsing error - // this.#model = this.#openai.responses(modelName) + // Default to OpenAI client + this.client = new OpenAIClient({ + model: this.config.modelName, + apiKey: this.config.apiKey, + baseURL: this.config.baseURL, + temperature: this.config.temperature, + maxTokens: this.config.maxTokens, + }) } /** @@ -74,96 +70,18 @@ export class LLM { * - invoke tool call *once* * - return the result of the tool */ - async invoke( - messages: ModelMessage[], - tools: T, + async invoke( + messages: Message[], + tools: Record, abortSignal: AbortSignal - ): Promise<{ - toolCall: TypedToolCall - toolResult: TypedToolResult - usage: LanguageModelUsage - }> { - const isClaude = this.config.modelName.slice(0, 8).includes('claude') - // const isQwen = this.config.modelName.slice(0, 6).includes('qwen') - // const isGPT = this.config.modelName.slice(0, 5).includes('gpt') - + ): Promise { return await withRetry( async () => { - const result = await generateText({ - model: this.#model, - messages, - tools, - abortSignal, - /** - * 文档中没有说明,从源码看,@facts - * - 只会重试被识别为 retryable 的 API_CALL_ERROR - * - 返回无法解析的 json 应该不会重试 - * - experimental_repairToolCall 只会执行一次,不算作重试 - * @facts - * - 许多 proxy 过的 openAI 兼容接口返回的错误格式并不规范,通常不会被识别为 retryable - * @conclusion - * - 看起来并不实用,不如完全手工控制粗粒度重试 - */ - // maxRetries: this.config.maxRetries, - maxRetries: 0, - // toolChoice: 'required', - // @note incompatible to Claude - toolChoice: isClaude ? undefined : { type: 'tool', toolName: MACRO_TOOL_NAME as any }, - /** - * controlled by main loop. our method only call api once - */ - // stopWhen: [hasToolCall('done'), stepCountIs(100)], - stopWhen: [stepCountIs(1)], - // stopWhen: [hasToolCall('AgentOutput')], - providerOptions: { - openai: { - // @note this one needs all fields in tool schema must be `required` - // strictJsonSchema: true, - // This way only at most one tool can be called at a time - parallelToolCalls: false, - reasoningEffort: 'minimal', - // @note not working - // serviceTier: 'priority', - textVerbosity: 'low', - // @note Optimize OpenAI model caching, should be unique per user, currently has no effect - promptCacheKey: 'page-agent:' + this.id, - } as OpenAIResponsesProviderOptions, - }, - /** - * schema 出错时执行一次,不确定是否计入重试 - * 目前看起来像是会直接抛错,被 withRetry 处理 - * @note - * 如果不提供,则 ai-sdk 会把 tool-error 加入 message 中重新调用一次, - * 配合 stepCountIs 或者 hasToolCall 都会导致错误被 silent,toolResults 永远为 0 - * 遗憾的是,这里没有办法抛错(抛错后回到默认逻辑),只要这里 repair 不好,就会导致 silent error - * 更糟糕的是,只要传入了 tools,无论 stopWhen 如何设置,都会被当作 multi-step, - * 本质上就和我们 single step 的逻辑冲突 - * 长远来看必须删掉 ai-sdk,直接用 openAI API 实现 - */ - // experimental_repairToolCall: (options): Promise => { - // console.error('hahhah', options) - // throw options.error - // }, - }) + const result = await this.client.invoke(messages, tools, abortSignal) console.log(chalk.blue.bold('LLM:invoke finished'), result) - const toolError: any = result.content.find((part) => part.type === 'tool-error') - if (toolError) throw toolError.error - - assert(!result.text, 'Model returned text without calling done tool', true) - assert(result.toolCalls.length === 1, 'Model must call exactly one tool', true) - assert(result.toolResults.length === 1, 'Step must have exactly one tool result', true) - - const toolCall = result.toolCalls[0] - const toolResult = result.toolResults[0] - const usage = result.totalUsage - - return { - toolCall, - toolResult, - usage, - } + return result }, // retry settings { @@ -203,12 +121,15 @@ async function withRetry( try { return await fn() - } catch (error: any) { + } catch (error: unknown) { console.error(error) settings.onError(error as Error, retries < settings.maxRetries) // do not retry if aborted by user - if (error?.name === 'AbortError') throw error + if ((error as { name?: string })?.name === 'AbortError') throw error + + // do not retry if error is not retryable (InvokeError) + if (error instanceof InvokeError && !error.retryable) throw error lastError = error as Error retries++ diff --git a/src/llms/types.ts b/src/llms/types.ts new file mode 100644 index 0000000..dd07112 --- /dev/null +++ b/src/llms/types.ts @@ -0,0 +1,77 @@ +/** + * Core types for LLM integration + */ +import type { z } from 'zod' + +/** + * Message format - OpenAI standard (industry standard) + */ +export interface Message { + role: 'system' | 'user' | 'assistant' | 'tool' + content?: string | null + tool_calls?: { + id: string + type: 'function' + function: { + name: string + arguments: string // JSON string + } + }[] + tool_call_id?: string + name?: string +} + +/** + * Tool definition - uses Zod schema (LLM-agnostic) + * Supports generics for type-safe parameters and return values + */ +export interface Tool { + // name: string + description?: string + inputSchema: z.ZodType + execute: (args: TParams) => Promise +} + +/** + * LLM Client interface + * Note: Does not use generics because each tool in the tools array has different types + */ +export interface LLMClient { + invoke( + messages: Message[], + tools: Record, + abortSignal?: AbortSignal + ): Promise +} + +/** + * Invoke result (strict typing, supports generics) + */ +export interface InvokeResult { + toolCall: { + id?: string // OpenAI's tool_call_id + name: string + args: Record + } + toolResult: TResult // Supports generics, but defaults to unknown + usage: { + promptTokens: number + completionTokens: number + totalTokens: number + cachedTokens?: number // Prompt cache hits + reasoningTokens?: number // OpenAI o1 series reasoning tokens + } + rawResponse?: unknown // Raw response for debugging +} + +/** + * OpenAI Client config + */ +export interface OpenAIClientConfig { + model: string + apiKey: string + baseURL: string + temperature?: number + maxTokens?: number + maxRetries?: number +} diff --git a/src/llms/utils.ts b/src/llms/utils.ts new file mode 100644 index 0000000..ce332b8 --- /dev/null +++ b/src/llms/utils.ts @@ -0,0 +1,21 @@ +/** + * Utility functions for LLM integration + */ +import { z } from 'zod' + +import type { Tool } from './types' + +/** + * Convert Zod schema to OpenAI tool format + * Uses Zod 4 native z.toJSONSchema() + */ +export function zodToOpenAITool(name: string, tool: Tool) { + return { + type: 'function' as const, + function: { + name, + description: tool.description, + parameters: z.toJSONSchema(tool.inputSchema, { target: 'openapi-3.0' }), + }, + } +} diff --git a/src/tools/index.ts b/src/tools/index.ts index 3c2e9dc..1267166 100644 --- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -2,8 +2,7 @@ * Internal tools for PageAgent. * @note Adapted from browser-use */ -import { Tool, tool } from 'ai' -import zod from 'zod' +import zod, { type z } from 'zod' import type { PageAgent } from '@/PageAgent' @@ -24,9 +23,24 @@ import * as utils from './actions' window.utils = utils /** - * Internal tools for PageAgent. + * Internal tool definition that has access to PageAgent `this` context */ -export const tools = new Map() +export interface PageAgentTool { + // name: string + description: string + inputSchema: z.ZodType + execute: (this: PageAgent, args: TParams) => Promise +} + +export function tool(options: PageAgentTool): PageAgentTool { + return options +} + +/** + * Internal tools for PageAgent. + * Note: Using any to allow different parameter types for each tool + */ +export const tools = new Map() // tools.set( // 'get_current_html', @@ -49,9 +63,10 @@ tools.set( text: zod.string(), success: zod.boolean().default(true), }), - execute: function (this: PageAgent, input) { + execute: async function (this: PageAgent, input) { // @note main loop will handle this one // this.onDone(input.text, input.success) + return Promise.resolve('Task completed') }, }) ) @@ -143,7 +158,7 @@ tools.set( execute: async function (this: PageAgent, input) { const element = getElementByIndex(this, input.index) const elemText = this.elementTextMap.get(input.index) - await selectOptionElement(element as any, input.text) + await selectOptionElement(element as HTMLSelectElement, input.text) return ( `✅ Selected option (${input.text}) in element (${elemText ? elemText : input.index}).` + (await getSystemInfo())