refactor: remove ai-sdk

2025-10-17 18:43:41 +08:00
parent 16694ee86a
commit 6f332aa24a
12 changed files with 510 additions and 340 deletions
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -10,18 +10,19 @@ The development progress and future plans for PageAgent.
 - [x] **Multi model provider integration and testing**
 - [x] **UI with HITL** - Human-in-the-loop user interface
 - [x] **Landing and doc pages**
+- [x] **Remove ai-sdk** - Only one function is being used
 - [ ] **Robust LLM output**
+- [ ] **Hooks for Task and HITL**
+- [ ] **Hijacking `page_open` event**
 - [ ] **Custom knowledge base and instructions**
 - [ ] **Black/white-list safeguard**
 - [ ] **Data-masking**
- [ ] **Custom actions and hooks**
+- [ ] **Custom actions**
 - [ ] **Optimize for popular UI frameworks**
- [ ] **Handling page open event**
 - [ ] **Free evaluation plan?**
 - [ ] **Working homepage with live LLM API**
 - [ ] **free CDN**
 - [ ] **Testing suites**
- [ ] **Remove ai-sdk** - Only one function is being used
 - [ ] **Support custom llm fetch**
 - [ ] **Refactor: Separate Agent and Page-Controller** - Agent can run w/o dom

--- a/package-lock.json
+++ b/package-lock.json
@@ -9,8 +9,6 @@
 			"version": "0.0.0",
 			"license": "MIT",
 			"dependencies": {
-				"@ai-sdk/openai": "^2.0.49",
-				"ai": "^5.0.68",
 				"ai-motion": "^0.4.7",
 				"chalk": "^5.6.2",
 				"zod": "^4.1.12"
@@ -50,68 +48,6 @@
 				"node": ">=20.0.0"
 			}
 		},
-		"node_modules/@ai-sdk/gateway": {
-			"version": "1.0.39",
-			"resolved": "https://registry.npmjs.org/@ai-sdk/gateway/-/gateway-1.0.39.tgz",
-			"integrity": "sha512-ijYCKG2sbn2RBVfIgaXNXvzHAf2HpFXxQODtjMI+T7Z4CLryflytchsZZ9qrGtsjiQVopKOV6m6kj4lq5fnbsg==",
-			"license": "Apache-2.0",
-			"dependencies": {
-				"@ai-sdk/provider": "2.0.0",
-				"@ai-sdk/provider-utils": "3.0.12",
-				"@vercel/oidc": "3.0.2"
-			},
-			"engines": {
-				"node": ">=18"
-			},
-			"peerDependencies": {
-				"zod": "^3.25.76 || ^4.1.8"
-			}
-		},
-		"node_modules/@ai-sdk/openai": {
-			"version": "2.0.49",
-			"resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-2.0.49.tgz",
-			"integrity": "sha512-BkeTl+gfeJAEap0srZEMK36t9IuQDGYWJyvQTWEgmWn8zoT70LAuO5EKWgAaJYUfjLY5OsM6QlYkjF1XfxCRgw==",
-			"license": "Apache-2.0",
-			"dependencies": {
-				"@ai-sdk/provider": "2.0.0",
-				"@ai-sdk/provider-utils": "3.0.12"
-			},
-			"engines": {
-				"node": ">=18"
-			},
-			"peerDependencies": {
-				"zod": "^3.25.76 || ^4.1.8"
-			}
-		},
-		"node_modules/@ai-sdk/provider": {
-			"version": "2.0.0",
-			"resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-2.0.0.tgz",
-			"integrity": "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA==",
-			"license": "Apache-2.0",
-			"dependencies": {
-				"json-schema": "^0.4.0"
-			},
-			"engines": {
-				"node": ">=18"
-			}
-		},
-		"node_modules/@ai-sdk/provider-utils": {
-			"version": "3.0.12",
-			"resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-3.0.12.tgz",
-			"integrity": "sha512-ZtbdvYxdMoria+2SlNarEk6Hlgyf+zzcznlD55EAl+7VZvJaSg2sqPvwArY7L6TfDEDJsnCq0fdhBSkYo0Xqdg==",
-			"license": "Apache-2.0",
-			"dependencies": {
-				"@ai-sdk/provider": "2.0.0",
-				"@standard-schema/spec": "^1.0.0",
-				"eventsource-parser": "^3.0.5"
-			},
-			"engines": {
-				"node": ">=18"
-			},
-			"peerDependencies": {
-				"zod": "^3.25.76 || ^4.1.8"
-			}
-		},
 		"node_modules/@babel/code-frame": {
 			"version": "7.27.1",
 			"resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz",
@@ -1663,15 +1599,6 @@
 				"node": ">= 8"
 			}
 		},
-		"node_modules/@opentelemetry/api": {
-			"version": "1.9.0",
-			"resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz",
-			"integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==",
-			"license": "Apache-2.0",
-			"engines": {
-				"node": ">=8.0.0"
-			}
-		},
 		"node_modules/@rolldown/pluginutils": {
 			"version": "1.0.0-beta.35",
 			"resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.35.tgz",
@@ -2166,12 +2093,6 @@
 				"sprintf-js": "~1.0.2"
 			}
 		},
-		"node_modules/@standard-schema/spec": {
-			"version": "1.0.0",
-			"resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.0.0.tgz",
-			"integrity": "sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA==",
-			"license": "MIT"
-		},
 		"node_modules/@swc/core": {
 			"version": "1.13.20",
 			"resolved": "https://registry.npmjs.org/@swc/core/-/core-1.13.20.tgz",
@@ -3089,15 +3010,6 @@
 				"url": "https://opencollective.com/typescript-eslint"
 			}
 		},
-		"node_modules/@vercel/oidc": {
-			"version": "3.0.2",
-			"resolved": "https://registry.npmjs.org/@vercel/oidc/-/oidc-3.0.2.tgz",
-			"integrity": "sha512-JekxQ0RApo4gS4un/iMGsIL1/k4KUBe3HmnGcDvzHuFBdQdudEJgTqcsJC7y6Ul4Yw5CeykgvQbX2XeEJd0+DA==",
-			"license": "Apache-2.0",
-			"engines": {
-				"node": ">= 20"
-			}
-		},
 		"node_modules/@vitejs/plugin-react-swc": {
 			"version": "4.1.0",
 			"resolved": "https://registry.npmjs.org/@vitejs/plugin-react-swc/-/plugin-react-swc-4.1.0.tgz",
@@ -3167,24 +3079,6 @@
 				"acorn": "^6.0.0 || ^7.0.0 || ^8.0.0"
 			}
 		},
-		"node_modules/ai": {
-			"version": "5.0.68",
-			"resolved": "https://registry.npmjs.org/ai/-/ai-5.0.68.tgz",
-			"integrity": "sha512-SB6r+4TkKVlSg2ozGBSfuf6Is5hrcX/bpGBzOoyHIN3b4ILGhaly0IHEvP8+3GGIHXqtkPVEUmR6V05jKdjNlg==",
-			"license": "Apache-2.0",
-			"dependencies": {
-				"@ai-sdk/gateway": "1.0.39",
-				"@ai-sdk/provider": "2.0.0",
-				"@ai-sdk/provider-utils": "3.0.12",
-				"@opentelemetry/api": "1.9.0"
-			},
-			"engines": {
-				"node": ">=18"
-			},
-			"peerDependencies": {
-				"zod": "^3.25.76 || ^4.1.8"
-			}
-		},
 		"node_modules/ai-motion": {
 			"version": "0.4.7",
 			"resolved": "https://registry.npmjs.org/ai-motion/-/ai-motion-0.4.7.tgz",
@@ -4369,15 +4263,6 @@
 			"dev": true,
 			"license": "MIT"
 		},
-		"node_modules/eventsource-parser": {
-			"version": "3.0.6",
-			"resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.6.tgz",
-			"integrity": "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==",
-			"license": "MIT",
-			"engines": {
-				"node": ">=18.0.0"
-			}
-		},
 		"node_modules/exsolve": {
 			"version": "1.0.7",
 			"resolved": "https://registry.npmjs.org/exsolve/-/exsolve-1.0.7.tgz",
@@ -4998,12 +4883,6 @@
 			"dev": true,
 			"license": "MIT"
 		},
-		"node_modules/json-schema": {
-			"version": "0.4.0",
-			"resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz",
-			"integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==",
-			"license": "(AFL-2.1 OR BSD-3-Clause)"
-		},
 		"node_modules/json-schema-traverse": {
 			"version": "1.0.0",
 			"resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
--- a/package.json
+++ b/package.json
@@ -50,8 +50,6 @@
 		"prepare": "husky"
 	},
 	"dependencies": {
-		"@ai-sdk/openai": "^2.0.49",
-		"ai": "^5.0.68",
 		"ai-motion": "^0.4.7",
 		"chalk": "^5.6.2",
 		"zod": "^4.1.12"
--- a/src/PageAgent.ts
+++ b/src/PageAgent.ts
@@ -2,21 +2,19 @@
 * Copyright (C) 2025 Alibaba Group Holding Limited
 * All rights reserved.
 */
-import { tool } from 'ai'
-import type { LanguageModelUsage, ToolSet } from 'ai'
 import chalk from 'chalk'
 import zod from 'zod'

 import type { PageAgentConfig } from './config'
-import { MACRO_TOOL_NAME, MAX_STEPS, VIEWPORT_EXPANSION } from './config/constants'
+import { MAX_STEPS, VIEWPORT_EXPANSION } from './config/constants'
 import * as dom from './dom'
 import { FlatDomTree, InteractiveElementDomNode } from './dom/dom_tree/type'
 import { getPageInfo } from './dom/getPageInfo'
 import { I18n } from './i18n'
-import { LLM } from './llms'
+import { type InvokeResult, LLM, type Message, type Tool } from './llms'
 import { patchReact } from './patches/react'
 import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
-import { tools } from './tools'
+import { type PageAgentTool, tools } from './tools'
 import { Panel, getToolCompletedText, getToolExecutingText } from './ui/Panel'
 import { SimulatorMask } from './ui/SimulatorMask'
 import { trimLines, uid, waitUntil } from './utils'
@@ -32,14 +30,38 @@ export interface AgentBrain {
 	next_goal: string
 }

+/**
+ * MacroTool input structure: optional reflection fields plus the single action to run
+ */
+export interface MacroToolInput {
+	evaluation_previous_goal?: string
+	memory?: string
+	next_goal?: string
+	action: Record<string, any> // { toolName: toolInput }; only the first key is read
+}
+
+/**
+ * MacroTool output structure: the value the macro tool's execute resolves with
+ */
+export interface MacroToolResult {
+	input: MacroToolInput // the input execute was called with, returned as-is
+	output: string // result text of the executed tool (the wait tool gets a <sys> note appended)
+}
+
 export interface AgentHistory {
 	brain: AgentBrain
 	action: {
 		name: string
 		input: any
-		output: any
+		output: string
+	}
+	usage: {
+		promptTokens: number
+		completionTokens: number
+		totalTokens: number
+		cachedTokens?: number
+		reasoningTokens?: number
 	}
-	usage: LanguageModelUsage
 }

 export interface ExecutionResult {
@@ -148,19 +170,17 @@ export class PageAgent extends EventTarget {
 							content: this.#assembleUserPrompt(),
 						},
 					],
-					// tools,
-					this.#packMacroTool(),
+					{ AgentOutput: this.#packMacroTool() },
 					this.#abortController.signal
 				)

-				const toolResult = result.toolResult
-				const input = toolResult.input
-				const output = toolResult.output
+				const macroResult = result.toolResult as MacroToolResult
+				const input = macroResult.input
+				const output = macroResult.output
 				const brain = {
-					thinking: input.thinking,
-					evaluation_previous_goal: input.evaluation_previous_goal,
-					memory: input.memory,
-					next_goal: input.next_goal,
+					evaluation_previous_goal: input.evaluation_previous_goal || '',
+					memory: input.memory || '',
+					next_goal: input.next_goal || '',
 				}
 				const actionName = Object.keys(input.action)[0]
 				const action = {
@@ -188,8 +208,8 @@ export class PageAgent extends EventTarget {
 					}
 				}
 				if (actionName === 'done') {
-					const success = action.input.success || false
-					const text = action.input.text || 'no text provided'
+					const success = action.input?.success ?? false
+					const text = action.input?.text || 'no text provided'
 					console.log(chalk.green.bold('Task completed'), success, text)
 					this.#onDone(text, success)
 					return {
@@ -219,24 +239,8 @@ export class PageAgent extends EventTarget {
 	 * - action: { toolName: toolInput }
 	 * where action must be selected from tools defined in this.tools
 	 */
-	#packMacroTool(): ToolSet {
+	#packMacroTool(): Tool<MacroToolInput, MacroToolResult> {
 		const tools = this.tools
-		// discriminated version
-		// @note Success rate ~0, model seems unable to understand discriminated union
-
-		// // Create discriminated union schemas from tools
-		// const actionSchemas = Array.from(tools.entries()).map(([toolName, tool]) => {
-		// 	return zod.object({
-		// 		name: zod.literal(toolName),
-		// 		input: tool.inputSchema,
-		// 	})
-		// })
-
-		// // Ensure at least one tool exists
-		// assert(actionSchemas.length, 'No tools available to create macro tool')
-
-		// const actionSchema = zod.discriminatedUnion('name', actionSchemas as any)
-
 		// union version
 		const actionSchemas = Array.from(tools.entries()).map(([toolName, tool]) => {
 			return zod.object({
@@ -244,89 +248,96 @@ export class PageAgent extends EventTarget {
 			})
 		})

-		const actionSchema = zod.union(actionSchemas)
+		const actionSchema = zod.union(
+			actionSchemas as unknown as [zod.ZodType, zod.ZodType, ...zod.ZodType[]]
+		)
+
+		const macroToolSchema = zod.object({
+			// thinking: zod.string().optional(),
+			evaluation_previous_goal: zod.string().optional(),
+			memory: zod.string().optional(),
+			next_goal: zod.string().optional(),
+			action: actionSchema,
+		})

 		return {
-			[MACRO_TOOL_NAME]: tool({
-				// description: 'Output the result of the agent',
-				inputSchema: zod.object({
-					// thinking: zod.string().optional(),
-					evaluation_previous_goal: zod.string().optional(),
-					memory: zod.string().optional(),
-					next_goal: zod.string().optional(),
-					action: actionSchema,
-				}),
-				execute: async (input, options) => {
-					// abort
-					if (this.#abortController.signal.aborted) throw new Error('AbortError')
-					// pause
-					await waitUntil(() => !this.paused)
+			// name: MACRO_TOOL_NAME,
+			// description: 'Execute agent action', // @todo remote
+			inputSchema: macroToolSchema as zod.ZodType<MacroToolInput>,
+			execute: async (input: MacroToolInput): Promise<MacroToolResult> => {
+				// abort
+				if (this.#abortController.signal.aborted) throw new Error('AbortError')
+				// pause
+				await waitUntil(() => !this.paused)

-					console.log(chalk.blue.bold('MacroTool execute'), input)
-					const action = input.action!
+				console.log(chalk.blue.bold('MacroTool execute'), input)
+				const action = input.action

-					const toolName = Object.keys(action)[0]
-					const toolInput = action[toolName]
-					const brain = trimLines(`✅: ${input.evaluation_previous_goal}
+				const toolName = Object.keys(action)[0]
+				const toolInput = action[toolName]
+				const brain = trimLines(`✅: ${input.evaluation_previous_goal}
 						💾: ${input.memory}
 						🎯: ${input.next_goal}
 					`)

-					console.log(brain)
-					this.bus.emit('panel:update', {
-						type: 'thinking',
-						displayText: brain,
-					})
+				console.log(brain)
+				this.bus.emit('panel:update', {
+					type: 'thinking',
+					displayText: brain,
+				})

-					// Find the corresponding tool
-					const tool = tools.get(toolName)
-					assert(tool, `Tool ${toolName} not found. (@note should have been caught before this!!!)`)
+				// Find the corresponding tool
+				const tool = tools.get(toolName)
+				assert(tool, `Tool ${toolName} not found. (@note should have been caught before this!!!)`)

-					console.log(chalk.blue.bold(`Executing tool: ${toolName}`), toolInput, options)
+				console.log(chalk.blue.bold(`Executing tool: ${toolName}`), toolInput)
+				this.bus.emit('panel:update', {
+					type: 'tool_executing',
+					toolName,
+					toolArgs: toolInput,
+					displayText: getToolExecutingText(toolName, toolInput, this.i18n),
+				})
+
+				const startTime = Date.now()
+
+				// Execute tool, bind `this` to PageAgent
+				let result = await tool.execute.bind(this)(toolInput)
+
+				const duration = Date.now() - startTime
+				console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result)
+
+				if (toolName === 'wait') {
+					this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000)
+					result += `\n<sys> You have waited ${this.#totalWaitTime} seconds accumulatively.`
+					if (this.#totalWaitTime >= 3)
+						result += '\nDo NOT wait any longer unless you have a good reason.\n'
+					result += '</sys>'
+				} else {
+					// For other tools, reset wait time
+					this.#totalWaitTime = 0
+				}
+
+				// Briefly display execution result
+				const displayResult = getToolCompletedText(toolName, toolInput, this.i18n)
+				if (displayResult)
 					this.bus.emit('panel:update', {
 						type: 'tool_executing',
 						toolName,
 						toolArgs: toolInput,
-						displayText: getToolExecutingText(toolName, toolInput, this.i18n),
+						toolResult: result,
+						displayText: displayResult,
+						duration,
 					})

-					const startTime = Date.now()
+				// Wait a moment to let user see the result
+				await new Promise((resolve) => setTimeout(resolve, 100))

-					// Execute tool, passing options parameter
-					let result = await tool.execute!.bind(this)(toolInput, options)
-
-					const duration = Date.now() - startTime
-					console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result)
-
-					if (toolName === 'wait') {
-						this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000)
-						result += `\n<sys> You have waited ${this.#totalWaitTime} seconds accumulatively.`
-						if (this.#totalWaitTime >= 3)
-							result += '\nDo NOT wait any longer unless you have a good reason.\n'
-						result += '</sys>'
-					} else {
-						// For other tools, reset wait time
-						this.#totalWaitTime = 0
-					}
-
-					// Briefly display execution result
-					const displayResult = getToolCompletedText(toolName, toolInput, this.i18n)
-					if (displayResult)
-						this.bus.emit('panel:update', {
-							type: 'tool_executing',
-							toolName,
-							toolArgs: toolInput,
-							toolResult: result,
-							displayText: displayResult,
-							duration,
-						})
-
-					// Wait a moment to let user see the result
-					await new Promise((resolve) => setTimeout(resolve, 100))
-
-					return result
-				},
-			}),
+				// Return structured result
+				return {
+					input,
+					output: result,
+				}
+			},
 		}
 	}

--- a/src/config/constants.ts
+++ b/src/config/constants.ts
@@ -48,6 +48,5 @@ export const DEFAULT_BASE_URL: string =

 // internal

-export const MACRO_TOOL_NAME = 'AgentOutput' as const
 export const LLM_MAX_RETRIES = 2
 export const MAX_STEPS = 20
--- a/src/config/index.ts
+++ b/src/config/index.ts
@@ -1,9 +1,17 @@
 import type { DomConfig } from '@/dom'
 import type { SupportedLanguage } from '@/i18n'
-import type { LLMConfig } from '@/llms'

 import { DEFAULT_API_KEY, DEFAULT_BASE_URL, DEFAULT_MODEL_NAME, LLM_MAX_RETRIES } from './constants'

+export interface LLMConfig {
+	baseURL?: string // parseLLMConfig default: DEFAULT_BASE_URL
+	apiKey?: string // parseLLMConfig default: DEFAULT_API_KEY
+	modelName?: string // parseLLMConfig default: DEFAULT_MODEL_NAME
+	temperature?: number // parseLLMConfig default: 0.0
+	maxTokens?: number // parseLLMConfig default: 4096
+	maxRetries?: number // parseLLMConfig default: LLM_MAX_RETRIES
+}
+
 export interface UIConfig {
 	// theme?: 'light' | 'dark'
 	language?: SupportedLanguage
@@ -16,6 +24,8 @@ export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> {
 		baseURL: config.baseURL ?? DEFAULT_BASE_URL,
 		apiKey: config.apiKey ?? DEFAULT_API_KEY,
 		modelName: config.modelName ?? DEFAULT_MODEL_NAME,
+		temperature: config.temperature ?? 0.0,
+		maxTokens: config.maxTokens ?? 4096,
 		maxRetries: config.maxRetries ?? LLM_MAX_RETRIES,
 	}
 }
--- a/src/llms/OpenAIClient.ts
+++ b/src/llms/OpenAIClient.ts
@@ -0,0 +1,188 @@
+/**
+ * OpenAI Client implementation
+ */
+import { InvokeError, InvokeErrorType } from './errors'
+import type { InvokeResult, LLMClient, Message, OpenAIClientConfig, Tool } from './types'
+import { zodToOpenAITool } from './utils'
+
+export class OpenAIClient implements LLMClient {
+	config: OpenAIClientConfig
+
+	constructor(config: OpenAIClientConfig) {
+		this.config = config
+	}
+
+	/** Single round trip: request, validate the one tool call, execute it. Most failures throw InvokeError. */
+	async invoke(
+		messages: Message[],
+		tools: Record<string, Tool>,
+		abortSignal?: AbortSignal
+	): Promise<InvokeResult> {
+		// 1. Convert tools to OpenAI format
+		const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool))
+
+		// 2. Detect if Claude (auto-compatibility)
+		// const isClaude = this.config.model.toLowerCase().includes('claude')
+
+		// 3. Call API
+		let response: Response
+		try {
+			response = await fetch(`${this.config.baseURL}/chat/completions`, {
+				method: 'POST',
+				headers: {
+					'Content-Type': 'application/json',
+					Authorization: `Bearer ${this.config.apiKey}`,
+				},
+				body: JSON.stringify({
+					model: this.config.model,
+					messages,
+					parallel_tool_calls: false,
+					tools: openaiTools,
+					tool_choice: 'required',
+					// tool_choice: { type: 'function', function: { name: 'my_function' } },
+					// reasoning_effort: 'minimal',
+					// verbosity: 'low',
+					// Claude doesn't support tool_choice: 'required', auto-omit
+					// ...(isClaude ? {} : { tool_choice: 'required' }),
+					temperature: this.config.temperature,
+					max_tokens: this.config.maxTokens,
+				}),
+				signal: abortSignal,
+			})
+		} catch (error: unknown) {
+			// A user abort is not a network failure: rethrow it untouched so callers still see AbortError
+			if ((error as { name?: string })?.name === 'AbortError') throw error
+			throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error)
+		}
+
+		// 4. Handle HTTP errors
+		if (!response.ok) {
+			const errorData = await response.json().catch(() => ({}))
+			const errorMessage =
+				(errorData as { error?: { message?: string } }).error?.message || response.statusText
+
+			if (response.status === 401 || response.status === 403) {
+				throw new InvokeError(
+					InvokeErrorType.AUTH_ERROR,
+					`Authentication failed: ${errorMessage}`,
+					errorData
+				)
+			}
+			if (response.status === 429) {
+				throw new InvokeError(
+					InvokeErrorType.RATE_LIMIT,
+					`Rate limit exceeded: ${errorMessage}`,
+					errorData
+				)
+			}
+			if (response.status >= 500) {
+				throw new InvokeError(
+					InvokeErrorType.SERVER_ERROR,
+					`Server error: ${errorMessage}`,
+					errorData
+				)
+			}
+			throw new InvokeError(
+				InvokeErrorType.UNKNOWN,
+				`HTTP ${response.status}: ${errorMessage}`,
+				errorData
+			)
+		}
+
+		const data = await response.json()
+
+		// 5. Check finish_reason
+		const choice = data.choices?.[0]
+		if (!choice) {
+			throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', data)
+		}
+
+		switch (choice.finish_reason) {
+			case 'tool_calls':
+				break // ✅ Normal: the model called a tool
+			case 'length':
+				// ⚠️ Token limit reached
+				throw new InvokeError(
+					InvokeErrorType.CONTEXT_LENGTH,
+					'Response truncated: max tokens reached',
+					data
+				)
+			case 'content_filter':
+				// ❌ Content filtered
+				throw new InvokeError(
+					InvokeErrorType.CONTENT_FILTER,
+					'Content filtered by safety system',
+					data
+				)
+			case 'stop':
+				// ❌ Did not call tool (we require tool call)
+				throw new InvokeError(InvokeErrorType.NO_TOOL_CALL, 'Model did not call any tool', data)
+			default:
+				throw new InvokeError(
+					InvokeErrorType.UNKNOWN,
+					`Unexpected finish_reason: ${choice.finish_reason}`,
+					data
+				)
+		}
+
+		// 6. Parse tool call
+		const toolCall = choice.message?.tool_calls?.[0]
+		if (!toolCall) {
+			throw new InvokeError(InvokeErrorType.NO_TOOL_CALL, 'No tool call found in response', data)
+		}
+
+		const toolName = toolCall.function.name
+		const tool = tools[toolName]
+		if (!tool) {
+			throw new InvokeError(InvokeErrorType.UNKNOWN, `Tool ${toolName} not found`, data)
+		}
+
+		// 7. Parse and validate arguments
+		let toolArgs: unknown
+		try {
+			toolArgs = JSON.parse(toolCall.function.arguments)
+		} catch (e) {
+			throw new InvokeError(InvokeErrorType.INVALID_TOOL_ARGS, 'Invalid JSON in tool arguments', e)
+		}
+
+		// Validate against zod schema
+		const validation = tool.inputSchema.safeParse(toolArgs)
+		if (!validation.success) {
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				`Tool arguments validation failed: ${validation.error.message}`,
+				validation.error
+			)
+		}
+
+		// 8. Execute tool
+		let toolResult: unknown
+		try {
+			toolResult = await tool.execute(validation.data)
+		} catch (e) {
+			throw new InvokeError(
+				InvokeErrorType.TOOL_EXECUTION_ERROR,
+				`Tool execution failed: ${e instanceof Error ? e.message : String(e)}`,
+				e
+			)
+		}
+
+		// 9. Return result (including cache tokens)
+		return {
+			toolCall: {
+				id: toolCall.id,
+				name: toolName,
+				args: validation.data as Record<string, unknown>,
+			},
+			toolResult,
+			usage: {
+				promptTokens: data.usage?.prompt_tokens ?? 0,
+				completionTokens: data.usage?.completion_tokens ?? 0,
+				totalTokens: data.usage?.total_tokens ?? 0,
+				cachedTokens: data.usage?.prompt_tokens_details?.cached_tokens,
+				reasoningTokens: data.usage?.completion_tokens_details?.reasoning_tokens,
+			},
+			rawResponse: data,
+		}
+	}
+}
--- a/src/llms/errors.ts
+++ b/src/llms/errors.ts
@@ -0,0 +1,50 @@
+/**
+ * Error types and error handling for LLM invocations
+ */
+
+export const InvokeErrorType = {
+	// Retryable (these are the types listed in InvokeError.isRetryable)
+	NETWORK_ERROR: 'network_error', // Network error, retry
+	RATE_LIMIT: 'rate_limit', // Rate limit, retry
+	SERVER_ERROR: 'server_error', // 5xx, retry
+	NO_TOOL_CALL: 'no_tool_call', // Model did not call tool
+	INVALID_TOOL_ARGS: 'invalid_tool_args', // Tool args are not valid JSON or don't match schema
+	TOOL_EXECUTION_ERROR: 'tool_execution_error', // Tool execution error
+
+	UNKNOWN: 'unknown', // Unclassified failure; also treated as retryable
+
+	// Non-retryable
+	AUTH_ERROR: 'auth_error', // Authentication failed
+	CONTEXT_LENGTH: 'context_length', // Token limit reached (OpenAIClient raises it for finish_reason 'length')
+	CONTENT_FILTER: 'content_filter', // Content filtered
+} as const
+
+export type InvokeErrorType = (typeof InvokeErrorType)[keyof typeof InvokeErrorType]
+
+export class InvokeError extends Error {
+	type: InvokeErrorType
+	retryable: boolean // derived from `type` in the constructor
+	statusCode?: number // never assigned by the constructor
+	rawError?: unknown // whatever the thrower passed along (caught error, response payload, ...)
+
+	constructor(type: InvokeErrorType, message: string, rawError?: unknown) {
+		super(message)
+		this.name = 'InvokeError'
+		this.type = type
+		this.retryable = this.isRetryable(type)
+		this.rawError = rawError
+	}
+
+	private isRetryable(type: InvokeErrorType): boolean { // allow-list: a type not listed below is non-retryable
+		const retryableTypes: InvokeErrorType[] = [
+			InvokeErrorType.NETWORK_ERROR,
+			InvokeErrorType.RATE_LIMIT,
+			InvokeErrorType.SERVER_ERROR,
+			InvokeErrorType.NO_TOOL_CALL,
+			InvokeErrorType.INVALID_TOOL_ARGS,
+			InvokeErrorType.TOOL_EXECUTION_ERROR,
+			InvokeErrorType.UNKNOWN,
+		]
+		return retryableTypes.includes(type)
+	}
+}
--- a/src/llms/index.ts
+++ b/src/llms/index.ts
@@ -31,29 +31,22 @@
 * - 永远使用 tool call 来返回结构化数据，禁止模型直接返回（视为出错）
 * - 不能假设 tool 参数合法，必须有修复机制，而且修复也应该使用 tool call 返回
 */
-import { OpenAIProvider, OpenAIResponsesProviderOptions, createOpenAI } from '@ai-sdk/openai'
-import type { LanguageModelV2, LanguageModelV2ToolCall } from '@ai-sdk/provider'
-import type { LanguageModelUsage, ModelMessage, TypedToolCall, TypedToolResult } from 'ai'
-import { ToolSet, generateText, stepCountIs } from 'ai'
 import chalk from 'chalk'

+import type { LLMConfig } from '@/config'
 import { parseLLMConfig } from '@/config'
-import { MACRO_TOOL_NAME } from '@/config/constants'
-import { assert } from '@/utils/assert'
 import { EventBus, getEventBus } from '@/utils/bus'

-export interface LLMConfig {
-	baseURL?: string
-	apiKey?: string
-	modelName?: string
-	maxRetries?: number
-}
+import { OpenAIClient } from './OpenAIClient'
+import { InvokeError } from './errors'
+import type { InvokeResult, LLMClient, Message, Tool } from './types'
+
+export type { Message, Tool, InvokeResult, LLMClient }

 export class LLM {
 	config: Required<LLMConfig>
 	id: string
-	#openai: OpenAIProvider
-	#model: LanguageModelV2
+	client: LLMClient
 	#bus: EventBus

 	constructor(config: LLMConfig, id: string) {
@@ -62,11 +55,14 @@ export class LLM {

 		this.#bus = getEventBus(id)

-		this.#openai = createOpenAI({ baseURL: this.config.baseURL, apiKey: this.config.apiKey })
-		this.#model = this.#openai.chat(this.config.modelName)
-
-		// @note Will throw JSON parsing error
-		// this.#model = this.#openai.responses(modelName)
+		// Default to OpenAI client
+		this.client = new OpenAIClient({
+			model: this.config.modelName,
+			apiKey: this.config.apiKey,
+			baseURL: this.config.baseURL,
+			temperature: this.config.temperature,
+			maxTokens: this.config.maxTokens,
+		})
 	}

 	/**
@@ -74,96 +70,18 @@ export class LLM {
 	 * - invoke tool call *once*
 	 * - return the result of the tool
 	 */
-	async invoke<T extends ToolSet>(
-		messages: ModelMessage[],
-		tools: T,
+	async invoke(
+		messages: Message[],
+		tools: Record<string, Tool>,
 		abortSignal: AbortSignal
-	): Promise<{
-		toolCall: TypedToolCall<T>
-		toolResult: TypedToolResult<T>
-		usage: LanguageModelUsage
-	}> {
-		const isClaude = this.config.modelName.slice(0, 8).includes('claude')
-		// const isQwen = this.config.modelName.slice(0, 6).includes('qwen')
-		// const isGPT = this.config.modelName.slice(0, 5).includes('gpt')
-
+	): Promise<InvokeResult> {
 		return await withRetry(
 			async () => {
-				const result = await generateText({
-					model: this.#model,
-					messages,
-					tools,
-					abortSignal,
-					/**
-					 * 文档中没有说明，从源码看，@facts
-					 * - 只会重试被识别为 retryable 的 API_CALL_ERROR
-					 * - 返回无法解析的 json 应该不会重试
-					 * - experimental_repairToolCall 只会执行一次，不算作重试
-					 * @facts
-					 * - 许多 proxy 过的 openAI 兼容接口返回的错误格式并不规范，通常不会被识别为 retryable
-					 * @conclusion
-					 * - 看起来并不实用，不如完全手工控制粗粒度重试
-					 */
-					// maxRetries: this.config.maxRetries,
-					maxRetries: 0,
-					// toolChoice: 'required',
-					// @note incompatible to Claude
-					toolChoice: isClaude ? undefined : { type: 'tool', toolName: MACRO_TOOL_NAME as any },
-					/**
-					 * controlled by main loop. our method only call api once
-					 */
-					// stopWhen: [hasToolCall('done'), stepCountIs(100)],
-					stopWhen: [stepCountIs(1)],
-					// stopWhen: [hasToolCall('AgentOutput')],
-					providerOptions: {
-						openai: {
-							// @note this one needs all fields in tool schema must be `required`
-							// strictJsonSchema: true,
-							// This way only at most one tool can be called at a time
-							parallelToolCalls: false,
-							reasoningEffort: 'minimal',
-							// @note not working
-							// serviceTier: 'priority',
-							textVerbosity: 'low',
-							// @note Optimize OpenAI model caching, should be unique per user, currently has no effect
-							promptCacheKey: 'page-agent:' + this.id,
-						} as OpenAIResponsesProviderOptions,
-					},
-					/**
-					 * schema 出错时执行一次，不确定是否计入重试
-					 * 目前看起来像是会直接抛错，被 withRetry 处理
-					 * @note
-					 * 如果不提供，则 ai-sdk 会把 tool-error 加入 message 中重新调用一次，
-					 * 配合 stepCountIs 或者 hasToolCall 都会导致错误被 silent，toolResults 永远为 0
-					 * 遗憾的是，这里没有办法抛错（抛错后回到默认逻辑），只要这里 repair 不好，就会导致 silent error
-					 * 更糟糕的是，只要传入了 tools，无论 stopWhen 如何设置，都会被当作 multi-step，
-					 * 本质上就和我们 single step 的逻辑冲突
-					 * 长远来看必须删掉 ai-sdk，直接用 openAI API 实现
-					 */
-					// experimental_repairToolCall: (options): Promise<LanguageModelV2ToolCall | null> => {
-					// 	console.error('hahhah', options)
-					// 	throw options.error
-					// },
-				})
+				const result = await this.client.invoke(messages, tools, abortSignal)

 				console.log(chalk.blue.bold('LLM:invoke finished'), result)

-				const toolError: any = result.content.find((part) => part.type === 'tool-error')
-				if (toolError) throw toolError.error
-
-				assert(!result.text, 'Model returned text without calling done tool', true)
-				assert(result.toolCalls.length === 1, 'Model must call exactly one tool', true)
-				assert(result.toolResults.length === 1, 'Step must have exactly one tool result', true)
-
-				const toolCall = result.toolCalls[0]
-				const toolResult = result.toolResults[0]
-				const usage = result.totalUsage
-
-				return {
-					toolCall,
-					toolResult,
-					usage,
-				}
+				return result
 			},
 			// retry settings
 			{
@@ -203,12 +121,15 @@ async function withRetry<T>(

 		try {
 			return await fn()
-		} catch (error: any) {
+		} catch (error: unknown) {
 			console.error(error)
 			settings.onError(error as Error, retries < settings.maxRetries)

 			// do not retry if aborted by user
-			if (error?.name === 'AbortError') throw error
+			if ((error as { name?: string })?.name === 'AbortError') throw error
+
+			// do not retry if error is not retryable (InvokeError)
+			if (error instanceof InvokeError && !error.retryable) throw error

 			lastError = error as Error
 			retries++
--- a/src/llms/types.ts
+++ b/src/llms/types.ts
@@ -0,0 +1,77 @@
+/**
+ * Core types for LLM integration
+ */
+import type { z } from 'zod'
+
+/**
+ * Message format - OpenAI standard (industry standard); OpenAIClient sends these as the request `messages`
+ */
+export interface Message {
+	role: 'system' | 'user' | 'assistant' | 'tool'
+	content?: string | null
+	tool_calls?: {
+		id: string
+		type: 'function'
+		function: {
+			name: string
+			arguments: string // JSON string
+		}
+	}[]
+	tool_call_id?: string
+	name?: string
+}
+
+/**
+ * Tool definition - uses Zod schema (LLM-agnostic)
+ * Supports generics for type-safe parameters and return values
+ */
+export interface Tool<TParams = any, TResult = any> {
+	// name: string (not stored here: the key in the tools record serves as the name)
+	description?: string
+	inputSchema: z.ZodType<TParams> // converted to JSON Schema for the request, then used to validate the model's arguments
+	execute: (args: TParams) => Promise<TResult> // called with the validated arguments
+}
+
+/**
+ * LLM Client interface
+ * Note: Does not use generics because each tool in the tools record (keyed by tool name) has different types
+ */
+export interface LLMClient {
+	invoke(
+		messages: Message[],
+		tools: Record<string, Tool>,
+		abortSignal?: AbortSignal
+	): Promise<InvokeResult>
+}
+
+/**
+ * Invoke result (strict typing, supports generics): the tool call made, what the tool returned, token usage
+ */
+export interface InvokeResult<TResult = unknown> {
+	toolCall: {
+		id?: string // OpenAI's tool_call_id
+		name: string
+		args: Record<string, unknown> // arguments after schema validation
+	}
+	toolResult: TResult // Value the tool's execute resolved with; supports generics, but defaults to unknown
+	usage: {
+		promptTokens: number
+		completionTokens: number
+		totalTokens: number
+		cachedTokens?: number // Prompt cache hits
+		reasoningTokens?: number // OpenAI o1 series reasoning tokens
+	}
+	rawResponse?: unknown // Raw response for debugging
+}
+
+/**
+ * OpenAI Client config
+ */
+export interface OpenAIClientConfig {
+	model: string
+	apiKey: string // sent as the Bearer token
+	baseURL: string // '/chat/completions' is appended to this
+	temperature?: number
+	maxTokens?: number // sent as max_tokens
+	maxRetries?: number // NOTE(review): not read by OpenAIClient.invoke; retrying happens in the caller
+}
--- a/src/llms/utils.ts
+++ b/src/llms/utils.ts
@@ -0,0 +1,21 @@
+/**
+ * Utility functions for LLM integration
+ */
+import { z } from 'zod'
+
+import type { Tool } from './types'
+
+/**
+ * Build the OpenAI `function` tool entry for one named tool.
+ * The Zod input schema is rendered with Zod 4's native z.toJSONSchema() (target 'openapi-3.0').
+ */
+export function zodToOpenAITool(name: string, tool: Tool) {
+	const { description, inputSchema } = tool
+	// JSON Schema placed under the function's `parameters`
+	const parameters = z.toJSONSchema(inputSchema, { target: 'openapi-3.0' })
+
+	return {
+		type: 'function' as const,
+		function: { name, description, parameters },
+	}
+}
--- a/src/tools/index.ts
+++ b/src/tools/index.ts
@@ -2,8 +2,7 @@
 * Internal tools for PageAgent.
 * @note Adapted from browser-use
 */
-import { Tool, tool } from 'ai'
-import zod from 'zod'
+import zod, { type z } from 'zod'

 import type { PageAgent } from '@/PageAgent'

@@ -24,9 +23,24 @@ import * as utils from './actions'
 window.utils = utils

 /**
- * Internal tools for PageAgent.
+ * Internal tool definition that has access to PageAgent `this` context
 */
-export const tools = new Map<string, Tool>()
+export interface PageAgentTool<TParams = any> {
+	// name: string (not stored here: tools are registered in the `tools` Map under their name)
+	description: string
+	inputSchema: z.ZodType<TParams>
+	execute: (this: PageAgent, args: TParams) => Promise<string> // run with the PageAgent bound as `this`
+}
+
+export function tool<TParams>(options: PageAgentTool<TParams>): PageAgentTool<TParams> {
+	return options // returned unchanged: the helper only gives the definition its PageAgentTool<TParams> type
+}
+
+/**
+ * Internal tools for PageAgent.
+ * Note: Using any to allow different parameter types for each tool
+ */
+export const tools = new Map<string, PageAgentTool>()

 // tools.set(
 // 	'get_current_html',
@@ -49,9 +63,10 @@ tools.set(
 			text: zod.string(),
 			success: zod.boolean().default(true),
 		}),
-		execute: function (this: PageAgent, input) {
+		execute: async function (this: PageAgent, input) {
 			// @note main loop will handle this one
 			// this.onDone(input.text, input.success)
+			return Promise.resolve('Task completed')
 		},
 	})
 )
@@ -143,7 +158,7 @@ tools.set(
 		execute: async function (this: PageAgent, input) {
 			const element = getElementByIndex(this, input.index)
 			const elemText = this.elementTextMap.get(input.index)
-			await selectOptionElement(element as any, input.text)
+			await selectOptionElement(element as HTMLSelectElement, input.text)
 			return (
 				`✅ Selected option (${input.text}) in element (${elemText ? elemText : input.index}).` +
 				(await getSystemInfo())