refactor: remove ai-sdk

2025-10-17 18:43:41 +08:00
parent 16694ee86a
commit 6f332aa24a
12 changed files with 510 additions and 340 deletions
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -10,18 +10,19 @@ The development progress and future plans for PageAgent.
 - [x] **Multi model provider integration and testing**
 - [x] **UI with HITL** - Human-in-the-loop user interface
 - [x] **Landing and doc pages**
 - [x] **Remove ai-sdk** - Only one function is being used
 - [ ] **Robust LLM output**
 - [ ] **Hooks for Task and HITL**
 - [ ] **Hijacking `page_open` event**
 - [ ] **Custom knowledge base and instructions**
 - [ ] **Black/white-list safeguard**
 - [ ] **Data-masking**
- [ ] **Custom actions and hooks**
+- [ ] **Custom actions**
 - [ ] **Optimize for popular UI frameworks**
 - [ ] **Handling page open event**
 - [ ] **Free evaluation plan?**
 - [ ] **Working homepage with live LLM API**
 - [ ] **free CDN**
 - [ ] **Testing suites**
 - [ ] **Remove ai-sdk** - Only one function is being used
 - [ ] **Support custom llm fetch**
 - [ ] **Refactor: Separate Agent and Page-Controller** - Agent can run w/o dom
--- a/package-lock.json
+++ b/package-lock.json
@@ -9,8 +9,6 @@
 			"version": "0.0.0",
 			"license": "MIT",
 			"dependencies": {
 				"@ai-sdk/openai": "^2.0.49",
 				"ai": "^5.0.68",
 				"ai-motion": "^0.4.7",
 				"chalk": "^5.6.2",
 				"zod": "^4.1.12"
@@ -50,68 +48,6 @@
 				"node": ">=20.0.0"
 			}
 		},
 		"node_modules/@ai-sdk/gateway": {
 			"version": "1.0.39",
 			"resolved": "https://registry.npmjs.org/@ai-sdk/gateway/-/gateway-1.0.39.tgz",
 			"integrity": "sha512-ijYCKG2sbn2RBVfIgaXNXvzHAf2HpFXxQODtjMI+T7Z4CLryflytchsZZ9qrGtsjiQVopKOV6m6kj4lq5fnbsg==",
 			"license": "Apache-2.0",
 			"dependencies": {
 				"@ai-sdk/provider": "2.0.0",
 				"@ai-sdk/provider-utils": "3.0.12",
 				"@vercel/oidc": "3.0.2"
 			},
 			"engines": {
 				"node": ">=18"
 			},
 			"peerDependencies": {
 				"zod": "^3.25.76 || ^4.1.8"
 			}
 		},
 		"node_modules/@ai-sdk/openai": {
 			"version": "2.0.49",
 			"resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-2.0.49.tgz",
 			"integrity": "sha512-BkeTl+gfeJAEap0srZEMK36t9IuQDGYWJyvQTWEgmWn8zoT70LAuO5EKWgAaJYUfjLY5OsM6QlYkjF1XfxCRgw==",
 			"license": "Apache-2.0",
 			"dependencies": {
 				"@ai-sdk/provider": "2.0.0",
 				"@ai-sdk/provider-utils": "3.0.12"
 			},
 			"engines": {
 				"node": ">=18"
 			},
 			"peerDependencies": {
 				"zod": "^3.25.76 || ^4.1.8"
 			}
 		},
 		"node_modules/@ai-sdk/provider": {
 			"version": "2.0.0",
 			"resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-2.0.0.tgz",
 			"integrity": "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA==",
 			"license": "Apache-2.0",
 			"dependencies": {
 				"json-schema": "^0.4.0"
 			},
 			"engines": {
 				"node": ">=18"
 			}
 		},
 		"node_modules/@ai-sdk/provider-utils": {
 			"version": "3.0.12",
 			"resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-3.0.12.tgz",
 			"integrity": "sha512-ZtbdvYxdMoria+2SlNarEk6Hlgyf+zzcznlD55EAl+7VZvJaSg2sqPvwArY7L6TfDEDJsnCq0fdhBSkYo0Xqdg==",
 			"license": "Apache-2.0",
 			"dependencies": {
 				"@ai-sdk/provider": "2.0.0",
 				"@standard-schema/spec": "^1.0.0",
 				"eventsource-parser": "^3.0.5"
 			},
 			"engines": {
 				"node": ">=18"
 			},
 			"peerDependencies": {
 				"zod": "^3.25.76 || ^4.1.8"
 			}
 		},
 		"node_modules/@babel/code-frame": {
 			"version": "7.27.1",
 			"resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz",
@@ -1663,15 +1599,6 @@
 				"node": ">= 8"
 			}
 		},
 		"node_modules/@opentelemetry/api": {
 			"version": "1.9.0",
 			"resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz",
 			"integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==",
 			"license": "Apache-2.0",
 			"engines": {
 				"node": ">=8.0.0"
 			}
 		},
 		"node_modules/@rolldown/pluginutils": {
 			"version": "1.0.0-beta.35",
 			"resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.35.tgz",
@@ -2166,12 +2093,6 @@
 				"sprintf-js": "~1.0.2"
 			}
 		},
 		"node_modules/@standard-schema/spec": {
 			"version": "1.0.0",
 			"resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.0.0.tgz",
 			"integrity": "sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA==",
 			"license": "MIT"
 		},
 		"node_modules/@swc/core": {
 			"version": "1.13.20",
 			"resolved": "https://registry.npmjs.org/@swc/core/-/core-1.13.20.tgz",
@@ -3089,15 +3010,6 @@
 				"url": "https://opencollective.com/typescript-eslint"
 			}
 		},
 		"node_modules/@vercel/oidc": {
 			"version": "3.0.2",
 			"resolved": "https://registry.npmjs.org/@vercel/oidc/-/oidc-3.0.2.tgz",
 			"integrity": "sha512-JekxQ0RApo4gS4un/iMGsIL1/k4KUBe3HmnGcDvzHuFBdQdudEJgTqcsJC7y6Ul4Yw5CeykgvQbX2XeEJd0+DA==",
 			"license": "Apache-2.0",
 			"engines": {
 				"node": ">= 20"
 			}
 		},
 		"node_modules/@vitejs/plugin-react-swc": {
 			"version": "4.1.0",
 			"resolved": "https://registry.npmjs.org/@vitejs/plugin-react-swc/-/plugin-react-swc-4.1.0.tgz",
@@ -3167,24 +3079,6 @@
 				"acorn": "^6.0.0 || ^7.0.0 || ^8.0.0"
 			}
 		},
 		"node_modules/ai": {
 			"version": "5.0.68",
 			"resolved": "https://registry.npmjs.org/ai/-/ai-5.0.68.tgz",
 			"integrity": "sha512-SB6r+4TkKVlSg2ozGBSfuf6Is5hrcX/bpGBzOoyHIN3b4ILGhaly0IHEvP8+3GGIHXqtkPVEUmR6V05jKdjNlg==",
 			"license": "Apache-2.0",
 			"dependencies": {
 				"@ai-sdk/gateway": "1.0.39",
 				"@ai-sdk/provider": "2.0.0",
 				"@ai-sdk/provider-utils": "3.0.12",
 				"@opentelemetry/api": "1.9.0"
 			},
 			"engines": {
 				"node": ">=18"
 			},
 			"peerDependencies": {
 				"zod": "^3.25.76 || ^4.1.8"
 			}
 		},
 		"node_modules/ai-motion": {
 			"version": "0.4.7",
 			"resolved": "https://registry.npmjs.org/ai-motion/-/ai-motion-0.4.7.tgz",
@@ -4369,15 +4263,6 @@
 			"dev": true,
 			"license": "MIT"
 		},
 		"node_modules/eventsource-parser": {
 			"version": "3.0.6",
 			"resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.6.tgz",
 			"integrity": "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==",
 			"license": "MIT",
 			"engines": {
 				"node": ">=18.0.0"
 			}
 		},
 		"node_modules/exsolve": {
 			"version": "1.0.7",
 			"resolved": "https://registry.npmjs.org/exsolve/-/exsolve-1.0.7.tgz",
@@ -4998,12 +4883,6 @@
 			"dev": true,
 			"license": "MIT"
 		},
 		"node_modules/json-schema": {
 			"version": "0.4.0",
 			"resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz",
 			"integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==",
 			"license": "(AFL-2.1 OR BSD-3-Clause)"
 		},
 		"node_modules/json-schema-traverse": {
 			"version": "1.0.0",
 			"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
--- a/package.json
+++ b/package.json
@@ -50,8 +50,6 @@
 		"prepare": "husky"
 	},
 	"dependencies": {
 		"@ai-sdk/openai": "^2.0.49",
 		"ai": "^5.0.68",
 		"ai-motion": "^0.4.7",
 		"chalk": "^5.6.2",
 		"zod": "^4.1.12"
--- a/src/PageAgent.ts
+++ b/src/PageAgent.ts
@@ -2,21 +2,19 @@
 * Copyright (C) 2025 Alibaba Group Holding Limited
 * All rights reserved.
 */
 import { tool } from 'ai'
 import type { LanguageModelUsage, ToolSet } from 'ai'
 import chalk from 'chalk'
 import zod from 'zod'
 import type { PageAgentConfig } from './config'
-import { MACRO_TOOL_NAME, MAX_STEPS, VIEWPORT_EXPANSION } from './config/constants'
+import { MAX_STEPS, VIEWPORT_EXPANSION } from './config/constants'
 import * as dom from './dom'
 import { FlatDomTree, InteractiveElementDomNode } from './dom/dom_tree/type'
 import { getPageInfo } from './dom/getPageInfo'
 import { I18n } from './i18n'
-import { LLM } from './llms'
+import { type InvokeResult, LLM, type Message, type Tool } from './llms'
 import { patchReact } from './patches/react'
 import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
-import { tools } from './tools'
+import { type PageAgentTool, tools } from './tools'
 import { Panel, getToolCompletedText, getToolExecutingText } from './ui/Panel'
 import { SimulatorMask } from './ui/SimulatorMask'
 import { trimLines, uid, waitUntil } from './utils'
@@ -32,14 +30,38 @@ export interface AgentBrain {
 	next_goal: string
 }
 /**
 * MacroTool input structure
 */
 export interface MacroToolInput {
 	// Model's assessment of the previous step; optional in the macro tool schema
 	evaluation_previous_goal?: string
 	// Free-form notes the model carries between steps; optional in the schema
 	memory?: string
 	// What the model intends to achieve with this step; optional in the schema
 	next_goal?: string
 	// Shaped as `{ toolName: toolInput }`; the first key is taken as the tool to run
 	action: Record<string, any>
 }
 /**
 * MacroTool output structure
 */
 export interface MacroToolResult {
 	// The macro tool input that was executed, echoed back to the caller
 	input: MacroToolInput
 	// Text returned by the selected tool (may carry appended `<sys>` notes)
 	output: string
 }
 export interface AgentHistory {
 	brain: AgentBrain
 	action: {
 		name: string
 		input: any
-		output: any
+		output: string
 	}
 	usage: {
 		promptTokens: number
 		completionTokens: number
 		totalTokens: number
 		cachedTokens?: number
 		reasoningTokens?: number
 	}
 	usage: LanguageModelUsage
 }
 export interface ExecutionResult {
@@ -148,19 +170,17 @@ export class PageAgent extends EventTarget {
 							content: this.#assembleUserPrompt(),
 						},
 					],
-					// tools,
+					{ AgentOutput: this.#packMacroTool() },
 					this.#packMacroTool(),
 					this.#abortController.signal
 				)
-				const toolResult = result.toolResult
+				const macroResult = result.toolResult as MacroToolResult
-				const input = toolResult.input
+				const input = macroResult.input
-				const output = toolResult.output
+				const output = macroResult.output
 				const brain = {
-					thinking: input.thinking,
+					evaluation_previous_goal: input.evaluation_previous_goal || '',
-					evaluation_previous_goal: input.evaluation_previous_goal,
+					memory: input.memory || '',
-					memory: input.memory,
+					next_goal: input.next_goal || '',
 					next_goal: input.next_goal,
 				}
 				const actionName = Object.keys(input.action)[0]
 				const action = {
@@ -188,8 +208,8 @@ export class PageAgent extends EventTarget {
 					}
 				}
 				if (actionName === 'done') {
-					const success = action.input.success || false
+					const success = action.input?.success ?? false
-					const text = action.input.text || 'no text provided'
+					const text = action.input?.text || 'no text provided'
 					console.log(chalk.green.bold('Task completed'), success, text)
 					this.#onDone(text, success)
 					return {
@@ -219,24 +239,8 @@ export class PageAgent extends EventTarget {
 	 * - action: { toolName: toolInput }
 	 * where action must be selected from tools defined in this.tools
 	 */
-	#packMacroTool(): ToolSet {
+	#packMacroTool(): Tool<MacroToolInput, MacroToolResult> {
 		const tools = this.tools
 		// discriminated version
 		// @note Success rate ~0, model seems unable to understand discriminated union
 		// // Create discriminated union schemas from tools
 		// const actionSchemas = Array.from(tools.entries()).map(([toolName, tool]) => {
 		// 	return zod.object({
 		// 		name: zod.literal(toolName),
 		// 		input: tool.inputSchema,
 		// 	})
 		// })
 		// // Ensure at least one tool exists
 		// assert(actionSchemas.length, 'No tools available to create macro tool')
 		// const actionSchema = zod.discriminatedUnion('name', actionSchemas as any)
 		// union version
 		const actionSchemas = Array.from(tools.entries()).map(([toolName, tool]) => {
 			return zod.object({
@@ -244,89 +248,96 @@ export class PageAgent extends EventTarget {
 			})
 		})
-		const actionSchema = zod.union(actionSchemas)
+		const actionSchema = zod.union(
 			actionSchemas as unknown as [zod.ZodType, zod.ZodType, ...zod.ZodType[]]
 		)
 		const macroToolSchema = zod.object({
 			// thinking: zod.string().optional(),
 			evaluation_previous_goal: zod.string().optional(),
 			memory: zod.string().optional(),
 			next_goal: zod.string().optional(),
 			action: actionSchema,
 		})
 		return {
-			[MACRO_TOOL_NAME]: tool({
+			// name: MACRO_TOOL_NAME,
-				// description: 'Output the result of the agent',
+			// description: 'Execute agent action', // @todo remote
-				inputSchema: zod.object({
+			inputSchema: macroToolSchema as zod.ZodType<MacroToolInput>,
-					// thinking: zod.string().optional(),
+			execute: async (input: MacroToolInput): Promise<MacroToolResult> => {
-					evaluation_previous_goal: zod.string().optional(),
+				// abort
-					memory: zod.string().optional(),
+				if (this.#abortController.signal.aborted) throw new Error('AbortError')
-					next_goal: zod.string().optional(),
+				// pause
-					action: actionSchema,
+				await waitUntil(() => !this.paused)
 				}),
 				execute: async (input, options) => {
 					// abort
 					if (this.#abortController.signal.aborted) throw new Error('AbortError')
 					// pause
 					await waitUntil(() => !this.paused)
-					console.log(chalk.blue.bold('MacroTool execute'), input)
+				console.log(chalk.blue.bold('MacroTool execute'), input)
-					const action = input.action!
+				const action = input.action
-					const toolName = Object.keys(action)[0]
+				const toolName = Object.keys(action)[0]
-					const toolInput = action[toolName]
+				const toolInput = action[toolName]
-					const brain = trimLines(`✅: ${input.evaluation_previous_goal}
+				const brain = trimLines(`✅: ${input.evaluation_previous_goal}
 						💾: ${input.memory}
 						🎯: ${input.next_goal}
 					`)
-					console.log(brain)
+				console.log(brain)
-					this.bus.emit('panel:update', {
+				this.bus.emit('panel:update', {
-						type: 'thinking',
+					type: 'thinking',
-						displayText: brain,
+					displayText: brain,
-					})
+				})
-					// Find the corresponding tool
+				// Find the corresponding tool
-					const tool = tools.get(toolName)
+				const tool = tools.get(toolName)
-					assert(tool, `Tool ${toolName} not found. (@note should have been caught before this!!!)`)
+				assert(tool, `Tool ${toolName} not found. (@note should have been caught before this!!!)`)
-					console.log(chalk.blue.bold(`Executing tool: ${toolName}`), toolInput, options)
+				console.log(chalk.blue.bold(`Executing tool: ${toolName}`), toolInput)
 				this.bus.emit('panel:update', {
 					type: 'tool_executing',
 					toolName,
 					toolArgs: toolInput,
 					displayText: getToolExecutingText(toolName, toolInput, this.i18n),
 				})
 				const startTime = Date.now()
 				// Execute tool, bind `this` to PageAgent
 				let result = await tool.execute.bind(this)(toolInput)
 				const duration = Date.now() - startTime
 				console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result)
 				if (toolName === 'wait') {
 					this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000)
 					result += `\n<sys> You have waited ${this.#totalWaitTime} seconds accumulatively.`
 					if (this.#totalWaitTime >= 3)
 						result += '\nDo NOT wait any longer unless you have a good reason.\n'
 					result += '</sys>'
 				} else {
 					// For other tools, reset wait time
 					this.#totalWaitTime = 0
 				}
 				// Briefly display execution result
 				const displayResult = getToolCompletedText(toolName, toolInput, this.i18n)
 				if (displayResult)
 					this.bus.emit('panel:update', {
 						type: 'tool_executing',
 						toolName,
 						toolArgs: toolInput,
-						displayText: getToolExecutingText(toolName, toolInput, this.i18n),
+						toolResult: result,
 						displayText: displayResult,
 						duration,
 					})
-					const startTime = Date.now()
+				// Wait a moment to let user see the result
 				await new Promise((resolve) => setTimeout(resolve, 100))
-					// Execute tool, passing options parameter
+				// Return structured result
-					let result = await tool.execute!.bind(this)(toolInput, options)
+				return {
-
+					input,
-					const duration = Date.now() - startTime
+					output: result,
-					console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result)
+				}
-
+			},
 					if (toolName === 'wait') {
 						this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000)
 						result += `\n<sys> You have waited ${this.#totalWaitTime} seconds accumulatively.`
 						if (this.#totalWaitTime >= 3)
 							result += '\nDo NOT wait any longer unless you have a good reason.\n'
 						result += '</sys>'
 					} else {
 						// For other tools, reset wait time
 						this.#totalWaitTime = 0
 					}
 					// Briefly display execution result
 					const displayResult = getToolCompletedText(toolName, toolInput, this.i18n)
 					if (displayResult)
 						this.bus.emit('panel:update', {
 							type: 'tool_executing',
 							toolName,
 							toolArgs: toolInput,
 							toolResult: result,
 							displayText: displayResult,
 							duration,
 						})
 					// Wait a moment to let user see the result
 					await new Promise((resolve) => setTimeout(resolve, 100))
 					return result
 				},
 			}),
 		}
 	}
--- a/src/config/constants.ts
+++ b/src/config/constants.ts
@@ -48,6 +48,5 @@ export const DEFAULT_BASE_URL: string =
 // internal
 export const MACRO_TOOL_NAME = 'AgentOutput' as const
 export const LLM_MAX_RETRIES = 2
 export const MAX_STEPS = 20
--- a/src/config/index.ts
+++ b/src/config/index.ts
@@ -1,9 +1,17 @@
 import type { DomConfig } from '@/dom'
 import type { SupportedLanguage } from '@/i18n'
 import type { LLMConfig } from '@/llms'
 import { DEFAULT_API_KEY, DEFAULT_BASE_URL, DEFAULT_MODEL_NAME, LLM_MAX_RETRIES } from './constants'
 export interface LLMConfig {
 	// API base URL; parseLLMConfig falls back to DEFAULT_BASE_URL
 	baseURL?: string
 	// API key; parseLLMConfig falls back to DEFAULT_API_KEY
 	apiKey?: string
 	// Model identifier; parseLLMConfig falls back to DEFAULT_MODEL_NAME
 	modelName?: string
 	// Sampling temperature; parseLLMConfig falls back to 0.0
 	temperature?: number
 	// Completion token limit; parseLLMConfig falls back to 4096
 	maxTokens?: number
 	// Retry budget; parseLLMConfig falls back to LLM_MAX_RETRIES
 	maxRetries?: number
 }
 export interface UIConfig {
 	// theme?: 'light' | 'dark'
 	language?: SupportedLanguage
@@ -16,6 +24,8 @@ export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> {
 		baseURL: config.baseURL ?? DEFAULT_BASE_URL,
 		apiKey: config.apiKey ?? DEFAULT_API_KEY,
 		modelName: config.modelName ?? DEFAULT_MODEL_NAME,
 		temperature: config.temperature ?? 0.0,
 		maxTokens: config.maxTokens ?? 4096,
 		maxRetries: config.maxRetries ?? LLM_MAX_RETRIES,
 	}
 }
--- a/src/llms/OpenAIClient.ts
+++ b/src/llms/OpenAIClient.ts
@@ -0,0 +1,188 @@
 /**
 * OpenAI Client implementation
 */
 import { InvokeError, InvokeErrorType } from './errors'
 import type { InvokeResult, LLMClient, Message, OpenAIClientConfig, Tool } from './types'
 import { zodToOpenAITool } from './utils'
 export class OpenAIClient implements LLMClient {
 	config: OpenAIClientConfig
 	constructor(config: OpenAIClientConfig) {
 		this.config = config
 	}
 	/**
 	 * Send one chat-completions request that must end in a tool call, validate the
 	 * call's arguments against the tool's zod schema, execute the tool and return
 	 * its result together with token usage.
 	 *
 	 * @param messages - Conversation in OpenAI message format.
 	 * @param tools - Available tools, keyed by the function name exposed to the model.
 	 * @param abortSignal - Optional signal forwarded to `fetch` to cancel the request.
 	 * @returns The parsed tool call, the tool's return value, usage and the raw response.
 	 * @throws InvokeError for network, HTTP, finish_reason, argument and tool failures.
 	 * @throws The original `AbortError` (unwrapped) when the request is cancelled, so
 	 *   callers checking `name === 'AbortError'` do not treat it as a retryable failure.
 	 */
 	async invoke(
 		messages: Message[],
 		tools: Record<string, Tool>,
 		abortSignal?: AbortSignal
 	): Promise<InvokeResult> {
 		// 1. Convert tools to OpenAI format
 		const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool))
 		// 2. Call API
 		let response: Response
 		try {
 			response = await fetch(`${this.config.baseURL}/chat/completions`, {
 				method: 'POST',
 				headers: {
 					'Content-Type': 'application/json',
 					Authorization: `Bearer ${this.config.apiKey}`,
 				},
 				body: JSON.stringify({
 					model: this.config.model,
 					messages,
 					parallel_tool_calls: false,
 					tools: openaiTools,
 					// @note Claude reportedly rejects tool_choice: 'required'; revisit if Claude is targeted
 					tool_choice: 'required',
 					temperature: this.config.temperature,
 					max_tokens: this.config.maxTokens,
 				}),
 				signal: abortSignal,
 			})
 		} catch (error: unknown) {
 			// A cancellation is not a network failure: keep it unwrapped so it is never retried
 			if (this.isAbortError(error)) throw error
 			// Network error
 			throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error)
 		}
 		// 3. Handle HTTP errors
 		if (!response.ok) throw await this.toHttpError(response)
 		// 4. Read the body; a malformed body must surface as an InvokeError, not a raw SyntaxError
 		// eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped provider payload
 		let data: any
 		try {
 			data = await response.json()
 		} catch (error: unknown) {
 			// The signal can also fire while the body is still streaming
 			if (this.isAbortError(error)) throw error
 			throw new InvokeError(InvokeErrorType.UNKNOWN, 'Invalid JSON in response body', error)
 		}
 		// 5. Check finish_reason
 		const choice = data.choices?.[0]
 		if (!choice) {
 			throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', data)
 		}
 		switch (choice.finish_reason) {
 			case 'tool_calls':
 				// ✅ Normal
 				break
 			case 'length':
 				// ⚠️ Token limit reached
 				throw new InvokeError(
 					InvokeErrorType.CONTEXT_LENGTH,
 					'Response truncated: max tokens reached',
 					data
 				)
 			case 'content_filter':
 				// ❌ Content filtered
 				throw new InvokeError(
 					InvokeErrorType.CONTENT_FILTER,
 					'Content filtered by safety system',
 					data
 				)
 			case 'stop':
 				// ❌ Did not call tool (we require tool call)
 				throw new InvokeError(InvokeErrorType.NO_TOOL_CALL, 'Model did not call any tool', data)
 			default:
 				throw new InvokeError(
 					InvokeErrorType.UNKNOWN,
 					`Unexpected finish_reason: ${choice.finish_reason}`,
 					data
 				)
 		}
 		// 6. Parse tool call (only the first one is used; parallel calls are disabled above)
 		const toolCall = choice.message?.tool_calls?.[0]
 		if (!toolCall) {
 			throw new InvokeError(InvokeErrorType.NO_TOOL_CALL, 'No tool call found in response', data)
 		}
 		const toolName = toolCall.function.name
 		const tool = tools[toolName]
 		if (!tool) {
 			throw new InvokeError(InvokeErrorType.UNKNOWN, `Tool ${toolName} not found`, data)
 		}
 		// 7. Parse and validate arguments
 		let toolArgs: unknown
 		try {
 			toolArgs = JSON.parse(toolCall.function.arguments)
 		} catch (e) {
 			throw new InvokeError(InvokeErrorType.INVALID_TOOL_ARGS, 'Invalid JSON in tool arguments', e)
 		}
 		// Validate against zod schema
 		const validation = tool.inputSchema.safeParse(toolArgs)
 		if (!validation.success) {
 			throw new InvokeError(
 				InvokeErrorType.INVALID_TOOL_ARGS,
 				`Tool arguments validation failed: ${validation.error.message}`,
 				validation.error
 			)
 		}
 		// 8. Execute tool
 		let toolResult: unknown
 		try {
 			toolResult = await tool.execute(validation.data)
 		} catch (e) {
 			throw new InvokeError(
 				InvokeErrorType.TOOL_EXECUTION_ERROR,
 				`Tool execution failed: ${(e as Error).message}`,
 				e
 			)
 		}
 		// 9. Return result (including cache tokens)
 		return {
 			toolCall: {
 				id: toolCall.id,
 				name: toolName,
 				args: validation.data as Record<string, unknown>,
 			},
 			toolResult,
 			usage: {
 				promptTokens: data.usage?.prompt_tokens ?? 0,
 				completionTokens: data.usage?.completion_tokens ?? 0,
 				totalTokens: data.usage?.total_tokens ?? 0,
 				cachedTokens: data.usage?.prompt_tokens_details?.cached_tokens,
 				reasoningTokens: data.usage?.completion_tokens_details?.reasoning_tokens,
 			},
 			rawResponse: data,
 		}
 	}
 	/** True when `error` carries the `AbortError` name used for cancelled requests. */
 	private isAbortError(error: unknown): boolean {
 		return (error as { name?: string } | null | undefined)?.name === 'AbortError'
 	}
 	/**
 	 * Map a non-2xx response to an InvokeError and record the HTTP status on it.
 	 * The body is parsed best-effort; an unreadable body degrades to `statusText`.
 	 */
 	private async toHttpError(response: Response): Promise<InvokeError> {
 		const errorData = await response.json().catch(() => ({}))
 		const errorMessage =
 			(errorData as { error?: { message?: string } }).error?.message || response.statusText
 		const status = response.status
 		let error: InvokeError
 		if (status === 401 || status === 403) {
 			error = new InvokeError(
 				InvokeErrorType.AUTH_ERROR,
 				`Authentication failed: ${errorMessage}`,
 				errorData
 			)
 		} else if (status === 429) {
 			error = new InvokeError(
 				InvokeErrorType.RATE_LIMIT,
 				`Rate limit exceeded: ${errorMessage}`,
 				errorData
 			)
 		} else if (status >= 500) {
 			error = new InvokeError(
 				InvokeErrorType.SERVER_ERROR,
 				`Server error: ${errorMessage}`,
 				errorData
 			)
 		} else {
 			error = new InvokeError(
 				InvokeErrorType.UNKNOWN,
 				`HTTP ${status}: ${errorMessage}`,
 				errorData
 			)
 		}
 		// InvokeError declares statusCode; populate it so callers can inspect the HTTP status
 		error.statusCode = status
 		return error
 	}
 }
--- a/src/llms/errors.ts
+++ b/src/llms/errors.ts
@@ -0,0 +1,50 @@
 /**
 * Error types and error handling for LLM invocations
 */
 export const InvokeErrorType = {
 	// --- Worth retrying ---
 	NETWORK_ERROR: 'network_error', // Request never completed
 	RATE_LIMIT: 'rate_limit', // Provider throttled the request
 	SERVER_ERROR: 'server_error', // HTTP 5xx from the provider
 	NO_TOOL_CALL: 'no_tool_call', // Model answered without calling a tool
 	INVALID_TOOL_ARGS: 'invalid_tool_args', // Arguments were not valid JSON or failed the schema
 	TOOL_EXECUTION_ERROR: 'tool_execution_error', // The tool itself threw
 	UNKNOWN: 'unknown',
 	// --- Retrying will not help ---
 	AUTH_ERROR: 'auth_error', // Credentials rejected
 	CONTEXT_LENGTH: 'context_length', // Token limit reached
 	CONTENT_FILTER: 'content_filter', // Blocked by the provider's safety filter
 } as const
 /** Union of every value in the `InvokeErrorType` constant. */
 export type InvokeErrorType = (typeof InvokeErrorType)[keyof typeof InvokeErrorType]
 /**
  * Error raised by an LLM invocation.
  * Carries a category (`type`), whether a retry is worthwhile (`retryable`),
  * and the underlying error or response payload (`rawError`) for debugging.
  */
 export class InvokeError extends Error {
 	type: InvokeErrorType
 	retryable: boolean
 	statusCode?: number
 	rawError?: unknown
 	/**
 	 * @param type - Failure category; determines `retryable`.
 	 * @param message - Human-readable description.
 	 * @param rawError - Optional original error or response payload.
 	 */
 	constructor(type: InvokeErrorType, message: string, rawError?: unknown) {
 		super(message)
 		this.name = 'InvokeError'
 		this.type = type
 		this.retryable = this.isRetryable(type)
 		this.rawError = rawError
 	}
 	/** Only the categories listed here are retryable; anything else is not. */
 	private isRetryable(type: InvokeErrorType): boolean {
 		switch (type) {
 			case InvokeErrorType.NETWORK_ERROR:
 			case InvokeErrorType.RATE_LIMIT:
 			case InvokeErrorType.SERVER_ERROR:
 			case InvokeErrorType.NO_TOOL_CALL:
 			case InvokeErrorType.INVALID_TOOL_ARGS:
 			case InvokeErrorType.TOOL_EXECUTION_ERROR:
 			case InvokeErrorType.UNKNOWN:
 				return true
 			default:
 				return false
 		}
 	}
 }
--- a/src/llms/index.ts
+++ b/src/llms/index.ts
@@ -31,29 +31,22 @@
 * - 永远使用 tool call 来返回结构化数据，禁止模型直接返回（视为出错）
 * - 不能假设 tool 参数合法，必须有修复机制，而且修复也应该使用 tool call 返回
 */
 import { OpenAIProvider, OpenAIResponsesProviderOptions, createOpenAI } from '@ai-sdk/openai'
 import type { LanguageModelV2, LanguageModelV2ToolCall } from '@ai-sdk/provider'
 import type { LanguageModelUsage, ModelMessage, TypedToolCall, TypedToolResult } from 'ai'
 import { ToolSet, generateText, stepCountIs } from 'ai'
 import chalk from 'chalk'
 import type { LLMConfig } from '@/config'
 import { parseLLMConfig } from '@/config'
 import { MACRO_TOOL_NAME } from '@/config/constants'
 import { assert } from '@/utils/assert'
 import { EventBus, getEventBus } from '@/utils/bus'
-export interface LLMConfig {
+import { OpenAIClient } from './OpenAIClient'
-	baseURL?: string
+import { InvokeError } from './errors'
-	apiKey?: string
+import type { InvokeResult, LLMClient, Message, Tool } from './types'
-	modelName?: string
+
-	maxRetries?: number
+export type { Message, Tool, InvokeResult, LLMClient }
 }
 export class LLM {
 	config: Required<LLMConfig>
 	id: string
-	#openai: OpenAIProvider
+	client: LLMClient
 	#model: LanguageModelV2
 	#bus: EventBus
 	constructor(config: LLMConfig, id: string) {
@@ -62,11 +55,14 @@ export class LLM {
 		this.#bus = getEventBus(id)
-		this.#openai = createOpenAI({ baseURL: this.config.baseURL, apiKey: this.config.apiKey })
+		// Default to OpenAI client
-		this.#model = this.#openai.chat(this.config.modelName)
+		this.client = new OpenAIClient({
-
+			model: this.config.modelName,
-		// @note Will throw JSON parsing error
+			apiKey: this.config.apiKey,
-		// this.#model = this.#openai.responses(modelName)
+			baseURL: this.config.baseURL,
 			temperature: this.config.temperature,
 			maxTokens: this.config.maxTokens,
 		})
 	}
 	/**
@@ -74,96 +70,18 @@ export class LLM {
 	 * - invoke tool call *once*
 	 * - return the result of the tool
 	 */
-	async invoke<T extends ToolSet>(
+	async invoke(
-		messages: ModelMessage[],
+		messages: Message[],
-		tools: T,
+		tools: Record<string, Tool>,
 		abortSignal: AbortSignal
-	): Promise<{
+	): Promise<InvokeResult> {
 		toolCall: TypedToolCall<T>
 		toolResult: TypedToolResult<T>
 		usage: LanguageModelUsage
 	}> {
 		const isClaude = this.config.modelName.slice(0, 8).includes('claude')
 		// const isQwen = this.config.modelName.slice(0, 6).includes('qwen')
 		// const isGPT = this.config.modelName.slice(0, 5).includes('gpt')
 		return await withRetry(
 			async () => {
-				const result = await generateText({
+				const result = await this.client.invoke(messages, tools, abortSignal)
 					model: this.#model,
 					messages,
 					tools,
 					abortSignal,
 					/**
 					 * 文档中没有说明，从源码看，@facts
 					 * - 只会重试被识别为 retryable 的 API_CALL_ERROR
 					 * - 返回无法解析的 json 应该不会重试
 					 * - experimental_repairToolCall 只会执行一次，不算作重试
 					 * @facts
 					 * - 许多 proxy 过的 openAI 兼容接口返回的错误格式并不规范，通常不会被识别为 retryable
 					 * @conclusion
 					 * - 看起来并不实用，不如完全手工控制粗粒度重试
 					 */
 					// maxRetries: this.config.maxRetries,
 					maxRetries: 0,
 					// toolChoice: 'required',
 					// @note incompatible to Claude
 					toolChoice: isClaude ? undefined : { type: 'tool', toolName: MACRO_TOOL_NAME as any },
 					/**
 					 * controlled by main loop. our method only call api once
 					 */
 					// stopWhen: [hasToolCall('done'), stepCountIs(100)],
 					stopWhen: [stepCountIs(1)],
 					// stopWhen: [hasToolCall('AgentOutput')],
 					providerOptions: {
 						openai: {
 							// @note this one needs all fields in tool schema must be `required`
 							// strictJsonSchema: true,
 							// This way only at most one tool can be called at a time
 							parallelToolCalls: false,
 							reasoningEffort: 'minimal',
 							// @note not working
 							// serviceTier: 'priority',
 							textVerbosity: 'low',
 							// @note Optimize OpenAI model caching, should be unique per user, currently has no effect
 							promptCacheKey: 'page-agent:' + this.id,
 						} as OpenAIResponsesProviderOptions,
 					},
 					/**
 					 * schema 出错时执行一次，不确定是否计入重试
 					 * 目前看起来像是会直接抛错，被 withRetry 处理
 					 * @note
 					 * 如果不提供，则 ai-sdk 会把 tool-error 加入 message 中重新调用一次，
 					 * 配合 stepCountIs 或者 hasToolCall 都会导致错误被 silent，toolResults 永远为 0
 					 * 遗憾的是，这里没有办法抛错（抛错后回到默认逻辑），只要这里 repair 不好，就会导致 silent error
 					 * 更糟糕的是，只要传入了 tools，无论 stopWhen 如何设置，都会被当作 multi-step，
 					 * 本质上就和我们 single step 的逻辑冲突
 					 * 长远来看必须删掉 ai-sdk，直接用 openAI API 实现
 					 */
 					// experimental_repairToolCall: (options): Promise<LanguageModelV2ToolCall | null> => {
 					// 	console.error('hahhah', options)
 					// 	throw options.error
 					// },
 				})
 				console.log(chalk.blue.bold('LLM:invoke finished'), result)
-				const toolError: any = result.content.find((part) => part.type === 'tool-error')
+				return result
 				if (toolError) throw toolError.error
 				assert(!result.text, 'Model returned text without calling done tool', true)
 				assert(result.toolCalls.length === 1, 'Model must call exactly one tool', true)
 				assert(result.toolResults.length === 1, 'Step must have exactly one tool result', true)
 				const toolCall = result.toolCalls[0]
 				const toolResult = result.toolResults[0]
 				const usage = result.totalUsage
 				return {
 					toolCall,
 					toolResult,
 					usage,
 				}
 			},
 			// retry settings
 			{
@@ -203,12 +121,15 @@ async function withRetry<T>(
 		try {
 			return await fn()
-		} catch (error: any) {
+		} catch (error: unknown) {
 			console.error(error)
 			settings.onError(error as Error, retries < settings.maxRetries)
 			// do not retry if aborted by user
-			if (error?.name === 'AbortError') throw error
+			if ((error as { name?: string })?.name === 'AbortError') throw error
 			// do not retry if error is not retryable (InvokeError)
 			if (error instanceof InvokeError && !error.retryable) throw error
 			lastError = error as Error
 			retries++
--- a/src/llms/types.ts
+++ b/src/llms/types.ts
@@ -0,0 +1,77 @@
 /**
 * Core types for LLM integration
 */
 import type { z } from 'zod'
 /**
 * Message format - OpenAI standard (industry standard)
 */
 export interface Message {
 	// Who authored the message
 	role: 'system' | 'user' | 'assistant' | 'tool'
 	// Text body; may be omitted or null
 	content?: string | null
 	// Function calls attached to the message
 	// NOTE(review): in the OpenAI format these appear on assistant messages — confirm usage
 	tool_calls?: {
 		id: string
 		type: 'function'
 		function: {
 			name: string
 			arguments: string // JSON string
 		}
 	}[]
 	// NOTE(review): presumably the `tool_calls[].id` a `tool` message replies to — confirm
 	tool_call_id?: string
 	name?: string
 }
 /**
 * Tool definition - uses Zod schema (LLM-agnostic)
 * Supports generics for type-safe parameters and return values
 */
 export interface Tool<TParams = any, TResult = any> {
 	// The tool's name is not stored here; it is the key under which the tool is registered
 	// name: string
 	// Sent to the model as the function description
 	description?: string
 	// Converted to JSON Schema for the request and used to validate the model's arguments
 	inputSchema: z.ZodType<TParams>
 	// Called with the arguments after they pass `inputSchema` validation
 	execute: (args: TParams) => Promise<TResult>
 }
 /**
 * LLM Client interface
 * Note: Does not use generics because each tool in the tools array has different types
 */
 export interface LLMClient {
 	// messages: conversation sent to the model
 	// tools: available tools, keyed by the name exposed to the model
 	// abortSignal: optional cancellation signal for the request
 	// Resolves with the tool call made by the model and the executed tool's result
 	invoke(
 		messages: Message[],
 		tools: Record<string, Tool>,
 		abortSignal?: AbortSignal
 	): Promise<InvokeResult>
 }
 /**
 * Invoke result (strict typing, supports generics)
 */
 export interface InvokeResult<TResult = unknown> {
 	// The single tool call selected by the model
 	toolCall: {
 		id?: string // OpenAI's tool_call_id
 		name: string
 		// Arguments after schema validation
 		args: Record<string, unknown>
 	}
 	// Value returned by the tool's `execute`
 	toolResult: TResult // Supports generics, but defaults to unknown
 	// Token accounting reported by the provider; counts default to 0 when absent
 	usage: {
 		promptTokens: number
 		completionTokens: number
 		totalTokens: number
 		cachedTokens?: number // Prompt cache hits
 		reasoningTokens?: number // OpenAI o1 series reasoning tokens
 	}
 	rawResponse?: unknown // Raw response for debugging
 }
 /**
 * OpenAI Client config
 */
 export interface OpenAIClientConfig {
 	// Sent as the `model` field of the request body
 	model: string
 	// Sent as the `Authorization: Bearer …` header
 	apiKey: string
 	// Prefix for the endpoint; the client appends `/chat/completions`
 	baseURL: string
 	// Sent as `temperature`; omitted from the JSON body when undefined
 	temperature?: number
 	// Sent as `max_tokens`; omitted from the JSON body when undefined
 	maxTokens?: number
 	// NOTE(review): not read by OpenAIClient.invoke; retries appear to be handled by the caller — confirm
 	maxRetries?: number
 }
--- a/src/llms/utils.ts
+++ b/src/llms/utils.ts
@@ -0,0 +1,21 @@
 /**
 * Utility functions for LLM integration
 */
 import { z } from 'zod'
 import type { Tool } from './types'
 /**
 * Convert Zod schema to OpenAI tool format
 * Uses Zod 4 native z.toJSONSchema()
 */
 export function zodToOpenAITool(name: string, tool: Tool) {
 	// Pull out the two fields the OpenAI function descriptor needs
 	const { description, inputSchema } = tool
 	// Zod 4 emits the schema natively; the OpenAPI 3.0 dialect is requested here
 	const parameters = z.toJSONSchema(inputSchema, { target: 'openapi-3.0' })
 	const descriptor = { name, description, parameters }
 	return {
 		type: 'function' as const,
 		function: descriptor,
 	}
 }
--- a/src/tools/index.ts
+++ b/src/tools/index.ts
@@ -2,8 +2,7 @@
 * Internal tools for PageAgent.
 * @note Adapted from browser-use
 */
-import { Tool, tool } from 'ai'
+import zod, { type z } from 'zod'
 import zod from 'zod'
 import type { PageAgent } from '@/PageAgent'
@@ -24,9 +23,24 @@ import * as utils from './actions'
 window.utils = utils
 /**
- * Internal tools for PageAgent.
+ * Internal tool definition that has access to PageAgent `this` context
 */
-export const tools = new Map<string, Tool>()
+export interface PageAgentTool<TParams = any> {
 	// name: string
 	description: string
 	inputSchema: z.ZodType<TParams>
 	execute: (this: PageAgent, args: TParams) => Promise<string>
 }
 // Identity helper: returns `options` unchanged. Its only purpose is typing —
 // wrapping a definition in `tool(...)` checks it against PageAgentTool<TParams>.
 export function tool<TParams>(options: PageAgentTool<TParams>): PageAgentTool<TParams> {
 	return options
 }
 /**
 * Internal tools for PageAgent.
 * Note: Using any to allow different parameter types for each tool
 */
 export const tools = new Map<string, PageAgentTool>()
 // tools.set(
 // 	'get_current_html',
@@ -49,9 +63,10 @@ tools.set(
 			text: zod.string(),
 			success: zod.boolean().default(true),
 		}),
-		execute: function (this: PageAgent, input) {
+		execute: async function (this: PageAgent, input) {
 			// @note main loop will handle this one
 			// this.onDone(input.text, input.success)
 			return Promise.resolve('Task completed')
 		},
 	})
 )
@@ -143,7 +158,7 @@ tools.set(
 		execute: async function (this: PageAgent, input) {
 			const element = getElementByIndex(this, input.index)
 			const elemText = this.elementTextMap.get(input.index)
-			await selectOptionElement(element as any, input.text)
+			await selectOptionElement(element as HTMLSelectElement, input.text)
 			return (
 				`✅ Selected option (${input.text}) in element (${elemText ? elemText : input.index}).` +
 				(await getSystemInfo())