refactor: monorepo

2025-12-01 20:11:12 +08:00
parent 1b9970da14
commit adec9d8197
98 changed files with 1144 additions and 1129 deletions
--- a/packages/page-agent/env.d.ts
+++ b/packages/page-agent/env.d.ts
@@ -0,0 +1,20 @@
+/// <reference types="vite/client" />
+import type { PageAgent } from './src/PageAgent'
+
+declare module '*.module.css' { // typing for CSS module imports
+	const classes: Record<string, string> // string-to-string map exposed as the default export
+	export default classes
+}
+
+declare module '*.md?raw' { // typing for markdown files imported with the `?raw` suffix
+	const content: string // the default export is typed as a plain string
+	export default content
+}
+
+declare global {
+	interface Window {
+		pageAgent?: PageAgent // optional PageAgent instance attached to window
+		PageAgent: typeof PageAgent // the PageAgent class itself
+		__PAGE_AGENT_IDS__: string[] // NOTE(review): presumably ids of created agents — confirm at the write site
+	}
+}
--- a/packages/page-agent/package.json
+++ b/packages/page-agent/package.json
@@ -0,0 +1,57 @@
+{
+	"name": "page-agent",
+	"private": false,
+	"version": "0.0.4",
+	"type": "module",
+	"main": "./dist/lib/page-agent.js",
+	"module": "./dist/lib/page-agent.js",
+	"types": "./dist/lib/PageAgent.d.ts",
+	"exports": {
+		".": {
+			"types": "./dist/lib/PageAgent.d.ts",
+			"import": "./dist/lib/page-agent.js",
+			"default": "./dist/lib/page-agent.js"
+		}
+	},
+	"files": [
+		"dist/",
+		"README.md",
+		"LICENSE",
+		"NOTICE"
+	],
+	"description": "AI-powered UI agent for web applications - add intelligent automation to any webpage with a single script tag",
+	"keywords": [
+		"ai",
+		"automation",
+		"ui-agent",
+		"browser-automation",
+		"web-agent",
+		"llm",
+		"dom-interaction",
+		"intelligent-ui"
+	],
+	"author": "Simon<gaomeng1900>",
+	"license": "MIT",
+	"repository": {
+		"type": "git",
+		"url": "https://github.com/alibaba/page-agent.git",
+		"directory": "packages/page-agent"
+	},
+	"homepage": "https://alibaba.github.io/page-agent/",
+	"scripts": {
+		"build": "MODE=lib vite build && MODE=umd vite build",
+		"build:lib": "MODE=lib vite build",
+		"build:umd": "MODE=umd vite build",
+		"build:watch": "MODE=lib vite build --watch"
+	},
+	"dependencies": {
+		"ai-motion": "^0.4.7",
+		"chalk": "^5.6.2",
+		"zod": "^4.1.12"
+	},
+	"devDependencies": {
+		"@microsoft/api-extractor": "^7.55.1",
+		"unplugin-dts": "^1.0.0-beta.6",
+		"vite-plugin-css-injected-by-js": "^3.5.2"
+	}
+}
--- a/packages/page-agent/src/PageAgent.ts
+++ b/packages/page-agent/src/PageAgent.ts
@@ -0,0 +1,537 @@
+/**
+ * Copyright (C) 2025 Alibaba Group Holding Limited
+ * All rights reserved.
+ */
+import chalk from 'chalk'
+import zod from 'zod'
+
+import type { PageAgentConfig } from './config'
+import { MAX_STEPS, VIEWPORT_EXPANSION } from './config/constants'
+import * as dom from './dom'
+import { FlatDomTree, InteractiveElementDomNode } from './dom/dom_tree/type'
+import { getPageInfo } from './dom/getPageInfo'
+import { I18n } from './i18n'
+import { LLM, type Tool } from './llms'
+import { patchReact } from './patches/react'
+import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
+import { tools } from './tools'
+import { Panel, getToolCompletedText, getToolExecutingText } from './ui/Panel'
+import { SimulatorMask } from './ui/SimulatorMask'
+import { trimLines, uid, waitUntil } from './utils'
+import { assert } from './utils/assert'
+import { getEventBus } from './utils/bus'
+
+export type { PageAgentConfig }
+export { tool, type PageAgentTool } from './tools'
+
+export interface AgentBrain {
+	// thinking?: string
+	evaluation_previous_goal: string // stored as '' when the model omitted it
+	memory: string // stored as '' when the model omitted it
+	next_goal: string // stored as '' when the model omitted it
+}
+
+/**
+ * MacroTool input: optional reflection fields plus `action`, an object `{ toolName: toolInput }`
+ */
+export interface MacroToolInput {
+	evaluation_previous_goal?: string
+	memory?: string
+	next_goal?: string
+	action: Record<string, any> // only the first key is read as the tool name
+}
+
+/**
+ * MacroTool output: the input that was executed, paired with the tool's textual result
+ */
+export interface MacroToolResult {
+	input: MacroToolInput
+	output: string
+}
+
+export interface AgentHistory { // one entry per executed step
+	brain: AgentBrain
+	action: {
+		name: string // tool name, i.e. the first key of the macro tool's `action`
+		input: any // the value stored under that key
+		output: string // text returned by the tool
+	}
+	usage: { // copied from the LLM invocation result of the step
+		promptTokens: number
+		completionTokens: number
+		totalTokens: number
+		cachedTokens?: number
+		reasoningTokens?: number
+	}
+}
+
+export interface ExecutionResult { // value resolved by PageAgent.execute()
+	success: boolean // false when the step limit is hit or an error/abort occurs
+	data: string // final text of the `done` action, or the failure message
+	history: AgentHistory[] // steps recorded during the task
+}
+
+export class PageAgent extends EventTarget {
+	config: PageAgentConfig
+	id = uid()
+	bus = getEventBus(this.id)
+	i18n: I18n
+	panel: Panel
+	tools: typeof tools
+	paused = false
+	disposed = false
+	task = ''
+	taskId = ''
+
+	#llm: LLM
+	#totalWaitTime = 0
+	#abortController = new AbortController()
+
+	/** Corresponds to eval_page in browser-use */
+	flatTree: FlatDomTree | null = null
+	/**
+	 * All highlighted index-mapped interactive elements
+	 * Corresponds to DOMState.selector_map in browser-use
+	 */
+	selectorMap = new Map<number, InteractiveElementDomNode>()
+	/** highlight index -> element text */
+	elementTextMap = new Map<number, string>()
+	/** Corresponds to clickable_elements_to_string in browser-use */
+	simplifiedHTML = '<EMPTY>'
+	/** last time the tree was updated */
+	lastTimeUpdate = 0
+
+	/** Fullscreen mask */
+	mask = new SimulatorMask()
+	/** History records */
+	history: AgentHistory[] = []
+
+	constructor(config: PageAgentConfig = {}) {
+		super()
+		// Per-instance services, plus a private copy of the built-in tool registry
+		this.config = config
+		this.#llm = new LLM(this.config, this.id)
+		this.i18n = new I18n(this.config.language)
+		this.panel = new Panel(this)
+		this.tools = new Map(tools)
+		// customTools: a `null` value removes the tool of that name, anything else adds/overrides it
+		if (this.config.customTools) {
+			for (const [name, tool] of Object.entries(this.config.customTools)) {
+				if (tool === null) {
+					this.tools.delete(name)
+					continue
+				}
+				this.tools.set(name, tool)
+			}
+		}
+		// Runs after customTools, so 'execute_javascript' is removed unless explicitly enabled
+		if (!this.config.experimentalScriptExecutionTool) {
+			this.tools.delete('execute_javascript')
+		}
+
+		patchReact(this)
+		// `once`: the agent is disposed after the first event, so the listener can be released then
+		window.addEventListener('beforeunload', () => {
+			if (!this.disposed) this.dispose('PAGE_UNLOADING')
+		}, { once: true })
+	}
+
+	/**
+	 * Run the agent loop for `task`; each step is one LLM call that also executes the chosen tool.
+	 * Ends on a `done` action, after MAX_STEPS steps, or on an error/abort (reported as success=false).
+	 */
+	async execute(task: string): Promise<ExecutionResult> {
+		if (!task) throw new Error('Task is required')
+		this.task = task
+		this.taskId = uid()
+
+		const onBeforeStep = this.config.onBeforeStep || (() => void 0)
+		const onAfterStep = this.config.onAfterStep || (() => void 0)
+		const onBeforeTask = this.config.onBeforeTask || (() => void 0)
+		const onAfterTask = this.config.onAfterTask || (() => void 0)
+
+		await onBeforeTask.call(this)
+
+		// Show mask and panel
+		this.mask.show()
+		this.bus.emit('panel:show')
+		this.bus.emit('panel:reset')
+
+		this.bus.emit('panel:update', {
+			type: 'input',
+			displayText: this.task,
+		})
+
+		// Cancel whatever still listens to the previous signal, then start from a fresh controller
+		this.#abortController.abort()
+		this.#abortController = new AbortController()
+
+		this.history = []
+
+		try {
+			let step = 0
+
+			while (true) {
+				await onBeforeStep.call(this, step)
+
+				console.group(`step: ${step + 1}`)
+
+				// abort
+				if (this.#abortController.signal.aborted) throw new Error('AbortError')
+				// pause
+				await waitUntil(() => !this.paused)
+
+				// Update status to thinking
+				console.log(chalk.blue('Thinking...'))
+				this.bus.emit('panel:update', {
+					type: 'thinking',
+					displayText: this.i18n.t('ui.panel.thinking'),
+				})
+
+				const result = await this.#llm.invoke(
+					[
+						{
+							role: 'system',
+							content: this.#getSystemPrompt(),
+						},
+						{
+							role: 'user',
+							content: this.#assembleUserPrompt(),
+						},
+					],
+					{ AgentOutput: this.#packMacroTool() },
+					this.#abortController.signal
+				)
+
+				const macroResult = result.toolResult as MacroToolResult
+				const input = macroResult.input
+				const output = macroResult.output
+				const brain = {
+					evaluation_previous_goal: input.evaluation_previous_goal || '',
+					memory: input.memory || '',
+					next_goal: input.next_goal || '',
+				}
+				const actionName = Object.keys(input.action)[0]
+				const action = {
+					name: actionName,
+					input: input.action[actionName],
+					output: output,
+				}
+
+				this.history.push({
+					brain,
+					action,
+					usage: result.usage,
+				})
+
+				console.log(chalk.green('Step finished:'), actionName)
+				console.groupEnd()
+
+				await onAfterStep.call(this, step, this.history)
+
+				step++
+				// Check `done` before the step budget so a task finishing on its last step still succeeds
+				if (actionName === 'done') {
+					const success = action.input?.success ?? false
+					const text = action.input?.text || 'no text provided'
+					console.log(chalk.green.bold('Task completed'), success, text)
+					this.#onDone(text, success)
+					const result: ExecutionResult = {
+						success,
+						data: text,
+						history: this.history,
+					}
+					await onAfterTask.call(this, result)
+					return result
+				}
+				if (step >= MAX_STEPS) {
+					this.#onDone('Step count exceeded maximum limit', false)
+					const result: ExecutionResult = {
+						success: false,
+						data: 'Step count exceeded maximum limit',
+						history: this.history,
+					}
+					await onAfterTask.call(this, result)
+					return result
+				}
+			}
+		} catch (error: unknown) {
+			console.error('Task failed', error)
+			this.#onDone(String(error), false)
+			const result: ExecutionResult = {
+				success: false,
+				data: String(error),
+				history: this.history,
+			}
+			await onAfterTask.call(this, result)
+			return result
+		}
+	}
+
+	/**
+	 * Merge all tools into a single MacroTool with the following input:
+	 * - evaluation_previous_goal: string (optional)
+	 * - memory: string (optional)
+	 * - next_goal: string (optional)
+	 * - action: { toolName: toolInput }
+	 * where action must be selected from tools defined in this.tools.
+	 * Executing it runs the selected tool with `this` bound to the agent and reports progress to the panel.
+	 */
+	#packMacroTool(): Tool<MacroToolInput, MacroToolResult> {
+		const tools = this.tools
+
+		const actionSchemas = Array.from(tools.entries()).map(([toolName, tool]) => {
+			return zod.object({
+				[toolName]: tool.inputSchema,
+			})
+		})
+
+		const actionSchema = zod.union(
+			actionSchemas as unknown as [zod.ZodType, zod.ZodType, ...zod.ZodType[]]
+		)
+
+		const macroToolSchema = zod.object({
+			// thinking: zod.string().optional(),
+			evaluation_previous_goal: zod.string().optional(),
+			memory: zod.string().optional(),
+			next_goal: zod.string().optional(),
+			action: actionSchema,
+		})
+
+		return {
+			inputSchema: macroToolSchema as zod.ZodType<MacroToolInput>,
+			execute: async (input: MacroToolInput): Promise<MacroToolResult> => {
+				// abort
+				if (this.#abortController.signal.aborted) throw new Error('AbortError')
+				// pause
+				await waitUntil(() => !this.paused)
+
+				console.log(chalk.blue.bold('MacroTool execute'), input)
+				const action = input.action
+				const toolName = Object.keys(action)[0]
+				const toolInput = action[toolName]
+				// Reflection fields are optional in the schema: show missing ones as empty, not "undefined"
+				const brain = trimLines(`✅: ${input.evaluation_previous_goal ?? ''}
+						💾: ${input.memory ?? ''}
+						🎯: ${input.next_goal ?? ''}
+					`)
+
+				console.log(brain)
+				this.bus.emit('panel:update', {
+					type: 'thinking',
+					displayText: brain,
+				})
+
+				// Find the corresponding tool
+				const tool = tools.get(toolName)
+				assert(tool, `Tool ${toolName} not found. (@note should have been caught before this!!!)`)
+
+				console.log(chalk.blue.bold(`Executing tool: ${toolName}`), toolInput)
+				this.bus.emit('panel:update', {
+					type: 'tool_executing',
+					toolName,
+					toolArgs: toolInput,
+					displayText: getToolExecutingText(toolName, toolInput, this.i18n),
+				})
+
+				const startTime = Date.now()
+
+				// Execute tool, bind `this` to PageAgent
+				let result = await tool.execute.bind(this)(toolInput)
+
+				const duration = Date.now() - startTime
+				console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result)
+
+				if (toolName === 'wait') {
+					this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000)
+					result += `\n<sys> You have waited ${this.#totalWaitTime} seconds accumulatively.`
+					if (this.#totalWaitTime >= 3)
+						result += '\nDo NOT wait any longer unless you have a good reason.\n'
+					result += '</sys>'
+				} else {
+					// For other tools, reset wait time
+					this.#totalWaitTime = 0
+				}
+
+				// Briefly display execution result
+				const displayResult = getToolCompletedText(toolName, toolInput, this.i18n)
+				if (displayResult)
+					this.bus.emit('panel:update', {
+						type: 'tool_executing',
+						toolName,
+						toolArgs: toolInput,
+						toolResult: result,
+						displayText: displayResult,
+						duration,
+					})
+
+				// Wait a moment to let user see the result
+				await new Promise((resolve) => setTimeout(resolve, 100))
+
+				// Return structured result
+				return {
+					input,
+					output: result,
+				}
+			},
+		}
+	}
+
+	/**
+	 * Build the system prompt from the bundled template, pointing its
+	 * "Default working language" line at the configured language.
+	 */
+	#getSystemPrompt(): string {
+		// Anything other than zh-CN falls back to English
+		const workingLanguage = this.config.language === 'zh-CN' ? '中文' : 'English'
+
+		// No g flag: only the first occurrence in the template is rewritten
+		const languageLine = /Default working language: \*\*.*?\*\*/
+		const replacement = `Default working language: **${workingLanguage}**`
+
+		return SYSTEM_PROMPT.replace(languageLine, replacement)
+	}
+
+	#assembleUserPrompt(): string {
+		/*
+		 * The user prompt is three sections in a fixed order:
+		 *   <agent_history>  one <step_N> block per recorded step
+		 *   <agent_state>    the user request plus step counter and timestamp
+		 *   <browser_state>  produced by #getBrowserState()
+		 * and the concatenation is passed through trimLines before it is returned.
+		 */
+
+		let prompt = '<agent_history>\n'
+
+		for (const [index, entry] of this.history.entries()) {
+			const stepTag = `step_${index + 1}`
+			prompt += `<${stepTag}>
+				Evaluation of Previous Step: ${entry.brain.evaluation_previous_goal}
+				Memory: ${entry.brain.memory}
+				Next Goal: ${entry.brain.next_goal}
+				Action Results: ${entry.action.output}
+				</${stepTag}>
+			`
+		}
+
+		prompt += '</agent_history>\n\n'
+
+		// The step about to run is numbered one past the recorded history
+		const upcomingStep = this.history.length + 1
+		prompt += `<agent_state>
+			<user_request>
+			${this.task}
+			</user_request>
+			<step_info>
+			Step ${upcomingStep} of ${MAX_STEPS} max possible steps
+			Current date and time: ${new Date().toISOString()}
+			</step_info>
+			</agent_state>
+		`
+
+		// #getBrowserState() also refreshes the DOM snapshot before describing it
+		prompt += this.#getBrowserState()
+
+		return trimLines(prompt)
+	}
+
+	#onDone(text: string, success = true) {
+		// Finish the current task: clear highlights, report the outcome, hide the mask, cancel work
+		dom.cleanUpHighlights()
+
+		// The final text is sent to the panel as 'output' on success and as 'error' otherwise
+		const outcome = success ? 'output' : 'error'
+		this.bus.emit('panel:update', { type: outcome, displayText: text })
+
+		// A 'completed' update follows in either case
+		this.bus.emit('panel:update', {
+			type: 'completed',
+			displayText: this.i18n.t('ui.panel.taskCompleted'),
+		})
+
+		this.mask.hide()
+
+		// Abort the controller so anything still bound to this task's signal stops
+		this.#abortController.abort()
+	}
+
+	#getBrowserState(): string {
+		// Read URL, title and scroll metrics first, then refresh the DOM snapshot used below
+		const url = window.location.href
+		const title = document.title
+		const page = getPageInfo()
+		this.#updateTree()
+
+		const fullPage = VIEWPORT_EXPANSION === -1
+		const scope = fullPage
+			? 'Interactive elements from top layer of the current page (full page):'
+			: 'Interactive elements from top layer of the current page inside the viewport:'
+
+		let prompt = trimLines(`<browser_state>
+			Current Page: [${title}](${url})
+
+			Page info: ${page.viewport_width}x${page.viewport_height}px viewport, ${page.page_width}x${page.page_height}px total page size, ${page.pages_above.toFixed(1)} pages above, ${page.pages_below.toFixed(1)} pages below, ${page.total_pages.toFixed(1)} total pages, at ${(page.current_page_position * 100).toFixed(0)}% of page
+
+			${scope}
+
+		`)
+
+		// Leading marker: a scroll hint when content is hidden above (viewport mode only)
+		const hiddenAbove = page.pixels_above > 4 && !fullPage
+		prompt += hiddenAbove
+			? `... ${page.pixels_above} pixels above (${page.pages_above.toFixed(1)} pages) - scroll to see more ...\n`
+			: `[Start of page]\n`
+
+		// Simplified HTML produced by the tree update above
+		prompt += this.simplifiedHTML
+		prompt += `\n`
+
+		// Trailing marker: same rule for content hidden below
+		const hiddenBelow = page.pixels_below > 4 && !fullPage
+		prompt += hiddenBelow
+			? `... ${page.pixels_below} pixels below (${page.pages_below.toFixed(1)} pages) - scroll to see more ...\n`
+			: `[End of page]\n`
+
+		prompt += `</browser_state>\n`
+		return prompt
+	}
+
+	/**
+	 * Rebuild the DOM snapshot (flat tree, simplified HTML, selector/text maps) between before/afterUpdate events
+	 */
+	#updateTree() {
+		this.dispatchEvent(new Event('beforeUpdate'))
+		this.lastTimeUpdate = Date.now()
+		dom.cleanUpHighlights()
+		const marked = document.querySelectorAll('[data-page-agent-not-interactive]')
+		const interactiveBlacklist = [...(this.config.interactiveBlacklist || []), ...marked.values()]
+		// getFlatTree calls user-supplied getters and may throw; never leave the mask click-through
+		this.mask.wrapper.style.pointerEvents = 'none'
+		try {
+			this.flatTree = dom.getFlatTree({ ...this.config, interactiveBlacklist })
+		} finally {
+			this.mask.wrapper.style.pointerEvents = 'auto'
+		}
+		this.simplifiedHTML = dom.flatTreeToString(this.flatTree, this.config.include_attributes)
+		this.selectorMap.clear()
+		this.selectorMap = dom.getSelectorMap(this.flatTree)
+		this.elementTextMap.clear()
+		this.elementTextMap = dom.getElementTextMap(this.simplifiedHTML)
+		this.dispatchEvent(new Event('afterUpdate'))
+	}
+
+	dispose(reason?: string) { // tear down UI and state, abort pending work, then call onDispose(reason)
+		if (this.disposed) return // idempotent: a repeated call must not re-run teardown or onDispose
+		console.log('Disposing PageAgent...')
+		this.disposed = true
+		dom.cleanUpHighlights()
+		this.flatTree = null
+		this.selectorMap.clear()
+		this.elementTextMap.clear()
+		this.panel.dispose()
+		this.mask.dispose()
+		this.history = []
+		this.#abortController.abort(reason ?? 'PageAgent disposed')
+		this.config.onDispose?.call(this, reason)
+	}
+}
--- a/packages/page-agent/src/config/constants.ts
+++ b/packages/page-agent/src/config/constants.ts
@@ -0,0 +1,29 @@
+/**
+ * @note isTopElement depends on elementFromPoint, which returns null outside the viewport,
+ * so the expansion size has no practical effect: only the difference between -1 and 0 matters.
+ */
+// export const VIEWPORT_EXPANSION = 100
+export const VIEWPORT_EXPANSION = -1
+
+// Dev builds use the LLM_* env values when they are set; otherwise the testing API defaults apply
+export const DEFAULT_MODEL_NAME: string =
+	import.meta.env.DEV && import.meta.env.LLM_MODEL_NAME
+		? import.meta.env.LLM_MODEL_NAME
+		: 'PAGE-AGENT-FREE-TESTING-RANDOM'
+
+export const DEFAULT_API_KEY: string =
+	import.meta.env.DEV && import.meta.env.LLM_API_KEY
+		? import.meta.env.LLM_API_KEY
+		: 'PAGE-AGENT-FREE-TESTING-RANDOM'
+
+export const DEFAULT_BASE_URL: string =
+	import.meta.env.DEV && import.meta.env.LLM_BASE_URL
+		? import.meta.env.LLM_BASE_URL
+		: 'https://hwcxiuzfylggtcktqgij.supabase.co/functions/v1/llm-testing-proxy'
+
+// internal defaults, not read from env
+
+export const LLM_MAX_RETRIES = 2
+export const MAX_STEPS = 20
+export const DEFAULT_TEMPERATURE = 0.7 // higher randomness helps auto-recovery
+export const DEFAULT_MAX_TOKENS = 4096
--- a/packages/page-agent/src/config/index.ts
+++ b/packages/page-agent/src/config/index.ts
@@ -0,0 +1,108 @@
+import type { AgentHistory, ExecutionResult, PageAgent } from '../PageAgent'
+import type { DomConfig } from '../dom'
+import type { SupportedLanguage } from '../i18n'
+import type { PageAgentTool } from '../tools'
+import {
+	DEFAULT_API_KEY,
+	DEFAULT_BASE_URL,
+	DEFAULT_MAX_TOKENS,
+	DEFAULT_MODEL_NAME,
+	DEFAULT_TEMPERATURE,
+	LLM_MAX_RETRIES,
+} from './constants'
+
+export interface LLMConfig { // every field is optional; parseLLMConfig fills in the defaults
+	baseURL?: string // default: DEFAULT_BASE_URL
+	apiKey?: string // default: DEFAULT_API_KEY
+	model?: string // default: DEFAULT_MODEL_NAME
+	temperature?: number // default: DEFAULT_TEMPERATURE
+	maxTokens?: number // default: DEFAULT_MAX_TOKENS
+	maxRetries?: number // default: LLM_MAX_RETRIES
+}
+
+export interface AgentConfig {
+	// theme?: 'light' | 'dark'
+	language?: SupportedLanguage
+
+	/**
+	 * Custom tools to extend PageAgent capabilities
+	 * @experimental
+	 * @note You can also override or remove internal tools by using the same name.
+	 * @see [tools](../tools/index.ts)
+	 *
+	 * @example
+	 * // override internal tool
+	 * import { tool } from 'page-agent'
+	 * const customTools = {
+	 * ask_user: tool({
+	 * 	description:
+	 * 		'Ask the user or parent model a question and wait for their answer. Use this if you need more information or clarification.',
+	 * 	inputSchema: zod.object({
+	 * 		question: zod.string(),
+	 * 	}),
+	 * 	execute: async function (this: PageAgent, input) {
+	 * 		const answer = await do_some_thing(input.question)
+	 * 		return "✅ Received user answer: " + answer
+	 * 	},
+	 * })
+	 * }
+	 *
+	 * @example
+	 * // remove internal tool
+	 * const customTools = {
+	 * 	ask_user: null // never ask user questions
+	 * }
+	 */
+	customTools?: Record<string, PageAgentTool | null>
+
+	// lifecycle hooks
+	// @todo: use events instead of hooks
+
+	onBeforeStep?: (this: PageAgent, stepCnt: number) => Promise<void> | void
+	onAfterStep?: (this: PageAgent, stepCnt: number, history: AgentHistory[]) => Promise<void> | void
+	onBeforeTask?: (this: PageAgent) => Promise<void> | void
+	onAfterTask?: (this: PageAgent, result: ExecutionResult) => Promise<void> | void
+
+	/**
+	 * @note this hook can block the disposal process
+	 * @note when dispose is caused by page unload, reason will be 'PAGE_UNLOADING'. This method CANNOT block unloading; async operations may be cut off.
+	 */
+	onDispose?: (this: PageAgent, reason?: string) => void
+
+	// page behavior hooks
+
+	/**
+	 * @experimental
+	 * Enable the experimental script execution tool that allows executing generated JavaScript code on the page.
+	 * @note Can cause unpredictable side effects.
+	 * @note May bypass some safeguards and data-masking mechanisms.
+	 */
+	experimentalScriptExecutionTool?: boolean
+
+	/**
+	 * TODO: @unimplemented
+	 * hook called when an action causes a new page to be opened
+	 * @note PageAgent will try to detect new pages and decide whether an action caused them, but this is not very reliable.
+	 */
+	onNewPageOpen?: (this: PageAgent, url: string) => Promise<void> | void
+
+	/**
+	 * TODO: @unimplemented
+	 * try to navigate to a new page instead of opening a new tab/window.
+	 * @note will unload the current page when an action tries to open a new page, so that everything stays in the same tab/window.
+	 */
+	experimentalPreventNewPage?: boolean
+}
+
+export type PageAgentConfig = LLMConfig & AgentConfig & DomConfig // full option set accepted by the PageAgent constructor
+
+export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> {
+	// Each option that is null/undefined falls back to its constant default
+	const baseURL = config.baseURL ?? DEFAULT_BASE_URL
+	const apiKey = config.apiKey ?? DEFAULT_API_KEY
+	const model = config.model ?? DEFAULT_MODEL_NAME
+	const temperature = config.temperature ?? DEFAULT_TEMPERATURE
+	const maxTokens = config.maxTokens ?? DEFAULT_MAX_TOKENS
+	const maxRetries = config.maxRetries ?? LLM_MAX_RETRIES
+	return { baseURL, apiKey, model, temperature, maxTokens, maxRetries }
+}
--- a/packages/page-agent/src/dom/dom_tree/index.js
+++ b/packages/page-agent/src/dom/dom_tree/index.js
--- a/packages/page-agent/src/dom/dom_tree/type.ts
+++ b/packages/page-agent/src/dom/dom_tree/type.ts
@@ -0,0 +1,51 @@
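+// FlatDomTree: a flattened DOM tree structure, suited to efficient storage and traversal of the page structure.
+// Every node is indexed through `map`; both text and element nodes are supported, and fields distinguish undefined from false.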
+
+export interface FlatDomTree {
+	rootId: string // key of the root node inside `map`
+	map: Record<string, DomNode> // all nodes, keyed by node id
+}
+
+export type DomNode = TextDomNode | ElementDomNode | InteractiveElementDomNode // any value stored in FlatDomTree.map
+
+export interface TextDomNode {
+	type: 'TEXT_NODE' // tag that marks a node as text
+	text: string
+	isVisible: boolean
+	// other optional fields
+	[key: string]: unknown
+}
+
+export interface ElementDomNode {
+	tagName: string
+	attributes?: Record<string, string>
+	xpath?: string
+	children?: string[] // ids of child nodes, looked up in FlatDomTree.map
+	isVisible?: boolean
+	isTopElement?: boolean
+	isInViewport?: boolean
+	isNew?: boolean
+	isInteractive?: false
+	highlightIndex?: number
+	extra?: Record<string, any>
+	// other optional fields
+	[key: string]: unknown
+}
+
+export interface InteractiveElementDomNode {
+	tagName: string
+	attributes?: Record<string, string>
+	xpath?: string
+	children?: string[] // ids of child nodes, looked up in FlatDomTree.map
+	isVisible?: boolean
+	isTopElement?: boolean
+	isInViewport?: boolean
+	isInteractive: true
+	highlightIndex: number
+	/**
+	 * DOM reference of the interactive element
+	 */
+	ref: HTMLElement
+	// other optional fields
+	[key: string]: unknown
+}
--- a/packages/page-agent/src/dom/getPageInfo.ts
+++ b/packages/page-agent/src/dom/getPageInfo.ts
@@ -0,0 +1,42 @@
+export function getPageInfo() {
+	// Snapshot of viewport size, page size and scroll offsets (px), plus the same in viewport heights
+	const { innerWidth: viewport_width, innerHeight: viewport_height } = window
+	const { documentElement: root, body } = document
+
+	const page_width = Math.max(root.scrollWidth, body.scrollWidth || 0)
+	const page_height = Math.max(root.scrollHeight, body.scrollHeight || 0)
+
+	const scroll_x = window.scrollX || window.pageXOffset || root.scrollLeft || 0
+	const scroll_y = window.scrollY || window.pageYOffset || root.scrollTop || 0
+	const pixels_below = Math.max(0, page_height - (viewport_height + scroll_y))
+	const pixels_right = Math.max(0, page_width - (viewport_width + scroll_x))
+
+	// Express a pixel distance in viewport heights; 0 when the viewport has no height
+	const toPages = (pixels: number) => (viewport_height > 0 ? pixels / viewport_height : 0)
+
+	return {
+		// Size of the visible area
+		viewport_width,
+		viewport_height,
+
+		// Size of the whole document
+		page_width,
+		page_height,
+
+		// Scroll offsets
+		scroll_x,
+		scroll_y,
+
+		pixels_above: scroll_y,
+		pixels_below,
+
+		pages_above: toPages(scroll_y),
+		pages_below: toPages(pixels_below),
+		total_pages: toPages(page_height),
+
+		current_page_position: scroll_y / Math.max(1, page_height - viewport_height),
+
+		pixels_left: scroll_x,
+		pixels_right,
+	}
+}
--- a/packages/page-agent/src/dom/index.ts
+++ b/packages/page-agent/src/dom/index.ts
@@ -0,0 +1,475 @@
+import { VIEWPORT_EXPANSION } from '../config/constants'
+import domTree from './dom_tree/index'
+import {
+	ElementDomNode,
+	FlatDomTree,
+	InteractiveElementDomNode,
+	TextDomNode,
+} from './dom_tree/type'
+
+export interface DomConfig {
+	interactiveBlacklist?: (Element | (() => Element))[] // elements, or getters returning one, passed to domTree
+	interactiveWhitelist?: (Element | (() => Element))[] // same shape as the blacklist
+	include_attributes?: string[] // extra attribute names for flatTreeToString, listed before its defaults
+	highlightOpacity?: number // getFlatTree falls back to 0.0
+	highlightLabelOpacity?: number // getFlatTree falls back to 0.1
+}
+
+/**
+ * Used to detect whether an interactive element has newly appeared.
+ */
+const newElementsCache = new WeakMap<HTMLElement, string>()
+
+/**
+ * Take a fresh snapshot of the page as a flat DOM tree.
+ *
+ * Resolves the configured black/white lists, runs the domTree script with highlighting
+ * enabled, and flags interactive elements that were not seen in any earlier snapshot.
+ *
+ * @param config - DOM options: element lists and highlight opacities
+ * @returns the tree produced by domTree, with `isNew` set on first-seen interactive nodes
+ */
+export function getFlatTree(config: DomConfig): FlatDomTree {
+	// List entries are either elements or getter functions; getters are invoked here
+	const resolveAll = (entries: (Element | (() => Element))[]): Element[] =>
+		Array.from(entries, (entry) => (typeof entry === 'function' ? entry() : entry))
+
+	const interactiveBlacklist = resolveAll(config.interactiveBlacklist || [])
+	const interactiveWhitelist = resolveAll(config.interactiveWhitelist || [])
+
+	// Opacities fall back to 0.0 (highlight) and 0.1 (label) when not configured
+	const elements = domTree({
+		doHighlightElements: true,
+		debugMode: true,
+		focusHighlightIndex: -1,
+		viewportExpansion: VIEWPORT_EXPANSION,
+		interactiveBlacklist,
+		interactiveWhitelist,
+		highlightOpacity: config.highlightOpacity ?? 0.0,
+		highlightLabelOpacity: config.highlightLabelOpacity ?? 0.1,
+	}) as FlatDomTree
+
+	const currentUrl = window.location.href
+
+	/**
+	 * Mark elements that have newly appeared.
+	 * @todo browser-use decides whether two elements are the same by hashing position,
+	 *       attributes, etc., which also covers 1. elements removed and re-added 2. page unload.
+	 *       Kept simple here for now.
+	 */
+	for (const nodeId in elements.map) {
+		const node = elements.map[nodeId]
+		if (!node.isInteractive || !node.ref) continue
+
+		const ref = node.ref as HTMLElement
+		// @note Comparing the cached URL would be too strict: elements can persist across pages
+		// if (newElementsCache.get(ref) !== currentUrl) {
+		if (newElementsCache.has(ref)) continue
+
+		// First sighting: remember the element together with the URL it appeared on
+		newElementsCache.set(ref, currentUrl)
+		node.isNew = true
+	}
+
+	return elements
+}
+
+/**
+ * Tree node type used internally by flatTreeToString
+ */
+interface TreeNode {
+	type: 'text' | 'element'
+	parent: TreeNode | null
+	children: TreeNode[]
+	isVisible: boolean
+	// Text node properties
+	text?: string
+	// Element node properties
+	tagName?: string
+	attributes?: Record<string, string>
+	isInteractive?: boolean
+	isTopElement?: boolean
+	isNew?: boolean
+	highlightIndex?: number
+	extra?: Record<string, any>
+}
+
+/**
+ * Counterpart of views::clickable_elements_to_string in the Python implementation:
+ * renders the DOM information as text that is easy for an LLM to read.
+ * Example output:
+ * ``` text
+ * [0]<a aria-label=page-agent.js 首页 />
+ * [1]<div >P />
+ * [2]<div >page-agent.js
+ * UI Agent in your webpage />
+ * [3]<a >文档 />
+ * [4]<a aria-label=查看源码（在新窗口打开）>源码 />
+ * UI Agent in your webpage
+ * 用户输入需求，AI 理解页面并自动操作。
+ * [5]<a role=button>快速开始 />
+ * [6]<a role=button>查看文档 />
+ * 无需后端
+ * ```
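+ * Interactive elements are marked with an index, which tells the LLM it can operate on them by index.
+ * Indentation represents parent-child relationships.
+ * Plain text is listed as-is.
+ *
+ * @todo data-masking filter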
+ */
+export function flatTreeToString(flatTree: FlatDomTree, include_attributes?: string[]): string {
+	const DEFAULT_INCLUDE_ATTRIBUTES = [
+		'title',
+		'type',
+		'checked',
+		'name',
+		'role',
+		'value',
+		'placeholder',
+		'data-date-format',
+		'alt',
+		'aria-label',
+		'aria-expanded',
+		'data-state',
+		'aria-checked',
+
+		// @edit added for better form handling
+		'id',
+		'for',
+
+		// for jump check
+		'target',
+
+		// absolute 定位的下拉菜单
+		'aria-haspopup',
+		'aria-controls',
+		'aria-owns',
+	]
+
+	const includeAttrs = [...(include_attributes || []), ...DEFAULT_INCLUDE_ATTRIBUTES]
+
+	// Helper function to cap text length
+	const capTextLength = (text: string, maxLength: number): string => {
+		if (text.length > maxLength) {
+			return text.substring(0, maxLength) + '...'
+		}
+		return text
+	}
+
+	// Build tree structure from flat map
+	const buildTreeNode = (nodeId: string): TreeNode | null => {
+		const node = flatTree.map[nodeId]
+		if (!node) return null
+
+		if (node.type === 'TEXT_NODE') {
+			const textNode = node as TextDomNode
+			return {
+				type: 'text',
+				text: textNode.text,
+				isVisible: textNode.isVisible,
+				parent: null,
+				children: [],
+			}
+		} else {
+			const elementNode = node as ElementDomNode
+			const children: TreeNode[] = []
+
+			if (elementNode.children) {
+				for (const childId of elementNode.children) {
+					const child = buildTreeNode(childId)
+					if (child) {
+						child.parent = null // Will be set later
+						children.push(child)
+					}
+				}
+			}
+
+			return {
+				type: 'element',
+				tagName: elementNode.tagName,
+				attributes: elementNode.attributes ?? {},
+				isVisible: elementNode.isVisible ?? false,
+				isInteractive: elementNode.isInteractive ?? false,
+				isTopElement: elementNode.isTopElement ?? false,
+				isNew: elementNode.isNew ?? false,
+				highlightIndex: elementNode.highlightIndex,
+				parent: null,
+				children,
+				extra: elementNode.extra ?? {},
+			}
+		}
+	}
+
+	// Set parent references
+	const setParentReferences = (node: TreeNode, parent: TreeNode | null = null) => {
+		node.parent = parent
+		for (const child of node.children) {
+			setParentReferences(child, node)
+		}
+	}
+
+	// Build root node
+	const rootNode = buildTreeNode(flatTree.rootId)
+	if (!rootNode) return ''
+
+	setParentReferences(rootNode)
+
+	// Helper to check if text node has parent with highlight index
+	const hasParentWithHighlightIndex = (node: TreeNode): boolean => {
+		let current = node.parent
+		while (current) {
+			if (current.type === 'element' && current.highlightIndex !== undefined) {
+				return true
+			}
+			current = current.parent
+		}
+		return false
+	}
+
+	// Helper to check if parent is top element
+	// const isParentTopElement = (node: TreeNode): boolean => {
+	// 	return node.parent?.type === 'element' && node.parent.isTopElement === true
+	// }
+
+	// Main processing function
+	const processNode = (node: TreeNode, depth: number, result: string[]): void => {
+		let nextDepth = depth
+		const depthStr = '\t'.repeat(depth)
+
+		if (node.type === 'element') {
+			// Add element with highlight_index
+			if (node.highlightIndex !== undefined) {
+				nextDepth += 1
+
+				const text = getAllTextTillNextClickableElement(node)
+				let attributesHtmlStr = ''
+
+				if (includeAttrs.length > 0 && node.attributes) {
+					const attributesToInclude: Record<string, string> = {}
+
+					// Filter attributes
+					for (const key of includeAttrs) {
+						const value = node.attributes[key]
+						if (value && value.trim() !== '') {
+							attributesToInclude[key] = value.trim()
+						}
+					}
+
+					// Remove duplicate values (for attributes longer than 5 chars)
+					const orderedKeys = includeAttrs.filter((key) => key in attributesToInclude)
+					if (orderedKeys.length > 1) {
+						const keysToRemove = new Set<string>()
+						const seenValues: Record<string, string> = {}
+
+						for (const key of orderedKeys) {
+							const value = attributesToInclude[key]
+							if (value.length > 5) {
+								if (value in seenValues) {
+									keysToRemove.add(key)
+								} else {
+									seenValues[value] = key
+								}
+							}
+						}
+
+						for (const key of keysToRemove) {
+							delete attributesToInclude[key]
+						}
+					}
+
+					// Remove role if it matches tagName
+					if (attributesToInclude.role === node.tagName) {
+						delete attributesToInclude.role
+					}
+
+					// Remove attributes that duplicate text content
+					const attrsToRemoveIfTextMatches = ['aria-label', 'placeholder', 'title']
+					for (const attr of attrsToRemoveIfTextMatches) {
+						if (
+							attributesToInclude[attr] &&
+							attributesToInclude[attr].toLowerCase().trim() === text.toLowerCase().trim()
+						) {
+							delete attributesToInclude[attr]
+						}
+					}
+
+					if (Object.keys(attributesToInclude).length > 0) {
+						attributesHtmlStr = Object.entries(attributesToInclude)
+							.map(([key, value]) => `${key}=${capTextLength(value, 20)}`)
+							.join(' ')
+					}
+				}
+
+				// Build the line
+				const highlightIndicator = node.isNew
+					? `*[${node.highlightIndex}]`
+					: `[${node.highlightIndex}]`
+				let line = `${depthStr}${highlightIndicator}<${node.tagName ?? ''}`
+
+				if (attributesHtmlStr) {
+					line += ` ${attributesHtmlStr}`
+				}
+
+				/**
+				 * @edit scrollable 数据
+				 */
+				if (node.extra) {
+					if (node.extra.scrollable) {
+						let scrollDataText = ''
+						if (node.extra.scrollData?.left)
+							scrollDataText += `left=${node.extra.scrollData.left}, `
+						if (node.extra.scrollData?.top) scrollDataText += `top=${node.extra.scrollData.top}, `
+						if (node.extra.scrollData?.right)
+							scrollDataText += `right=${node.extra.scrollData.right}, `
+						if (node.extra.scrollData?.bottom)
+							scrollDataText += `bottom=${node.extra.scrollData.bottom}`
+
+						line += ` data-scrollable="${scrollDataText}"`
+					}
+				}
+
+				if (text) {
+					const trimmedText = text.trim()
+					if (!attributesHtmlStr) {
+						line += ' '
+					}
+					line += `>${trimmedText}`
+				} else if (!attributesHtmlStr) {
+					line += ' '
+				}
+
+				line += ' />'
+				result.push(line)
+			}
+
+			// Process children regardless
+			for (const child of node.children) {
+				processNode(child, nextDepth, result)
+			}
+		} else if (node.type === 'text') {
+			// Add text only if it doesn't have a highlighted parent
+			if (hasParentWithHighlightIndex(node)) {
+				return
+			}
+
+			if (
+				node.parent &&
+				node.parent.type === 'element' &&
+				node.parent.isVisible &&
+				node.parent.isTopElement
+			) {
+				result.push(`${depthStr}${node.text ?? ''}`)
+			}
+		}
+	}
+
+	const result: string[] = []
+	processNode(rootNode, 0, result)
+	return result.join('\n')
+}
+
+/**
+ * Collect the text that belongs to `node`.
+ *
+ * Walks the subtree below `node` and gathers every non-empty text node, but does not
+ * descend into descendant elements that carry their own `highlightIndex`.
+ *
+ * @param node - Node to start from; the highlight check is never applied to it.
+ * @param maxDepth - Maximum depth below `node` to visit; `-1` (default) means unlimited.
+ * @returns The collected pieces joined with newlines, trimmed at both ends.
+ */
+export const getAllTextTillNextClickableElement = (node: TreeNode, maxDepth = -1): string => {
+	const collected: string[] = []
+	const unlimited = maxDepth === -1
+
+	const walk = (current: TreeNode, level: number): void => {
+		if (!unlimited && level > maxDepth) return
+
+		if (current.type === 'text') {
+			// Missing or empty text contributes nothing.
+			if (current.text) collected.push(current.text)
+			return
+		}
+		if (current.type !== 'element') return
+
+		// A highlighted descendant owns its own text; only the starting node is exempt.
+		if (current !== node && current.highlightIndex !== undefined) return
+
+		for (const child of current.children) {
+			walk(child, level + 1)
+		}
+	}
+
+	walk(node, 0)
+	return collected.join('\n').trim()
+}
+
+/**
+ * Index the interactive nodes of a flat DOM tree by their highlight index.
+ *
+ * @param flatTree - Flat tree whose `map` holds the nodes to scan.
+ * @returns A map from highlight index to node. Nodes that are not interactive, or whose
+ *   `highlightIndex` is not a number, are left out.
+ */
+export function getSelectorMap(flatTree: FlatDomTree): Map<number, InteractiveElementDomNode> {
+	const byIndex = new Map<number, InteractiveElementDomNode>()
+
+	for (const entry of Object.values(flatTree.map)) {
+		// Guard clause: anything without a numeric highlight index cannot be addressed.
+		if (!entry.isInteractive || typeof entry.highlightIndex !== 'number') continue
+		byIndex.set(entry.highlightIndex, entry as InteractiveElementDomNode)
+	}
+
+	return byIndex
+}
+
+/**
+ * Map each highlight index found in the simplified HTML to its (trimmed) line.
+ *
+ * Both line forms produced by the serializer in this file are recognized: `[N]<tag …>` and
+ * `*[N]<tag …>`, the latter being emitted for nodes flagged `isNew`. Lines that do not start
+ * with an index marker (plain text, blank lines) are ignored.
+ *
+ * @param simplifiedHTML - Newline-separated output of the serializer.
+ * @returns Map from highlight index to the whole trimmed line, including any leading `*`.
+ */
+export function getElementTextMap(simplifiedHTML: string) {
+	// Optional `*`, the bracketed index, then an opening tag up to its first `>`.
+	// Built once per call instead of once per line.
+	const indexedLine = /^\*?\[(\d+)\]<[^>]+>/
+	const elementTextMap = new Map<number, string>()
+
+	for (const rawLine of simplifiedHTML.split('\n')) {
+		const line = rawLine.trim()
+		const match = indexedLine.exec(line)
+		if (match) {
+			elementTextMap.set(parseInt(match[1], 10), line)
+		}
+	}
+
+	return elementTextMap
+}
+
+/**
+ * Run every registered highlight cleanup callback and reset the registry.
+ *
+ * Callbacks live in `window._highlightCleanupFunctions`. The registry is emptied before the
+ * callbacks run and each callback is isolated, so one failing cleanup can neither stop the
+ * remaining ones nor leave stale callbacks behind to be re-run on the next call.
+ */
+export function cleanUpHighlights() {
+	const pending: unknown = (window as any)._highlightCleanupFunctions
+	;(window as any)._highlightCleanupFunctions = []
+
+	// Tolerate a missing or non-array registry instead of throwing from `for...of`.
+	const callbacks: unknown[] = Array.isArray(pending) ? pending : []
+	for (const cleanup of callbacks) {
+		if (typeof cleanup !== 'function') continue
+		try {
+			cleanup()
+		} catch (error) {
+			console.error('Highlight cleanup failed:', error)
+		}
+	}
+}
+
+// Any change of URL invalidates the current highlights, so clear them immediately.
+for (const eventName of ['popstate', 'hashchange', 'beforeunload'] as const) {
+	window.addEventListener(eventName, () => {
+		cleanUpHighlights()
+	})
+}
+
+// Use the `navigate` event when `window.navigation` offers addEventListener;
+// otherwise fall back to comparing location.href every 500 ms.
+const navigation = (window as any).navigation
+const canListenForNavigate = Boolean(navigation) && typeof navigation.addEventListener === 'function'
+if (!canListenForNavigate) {
+	let lastSeenUrl = window.location.href
+	setInterval(() => {
+		const href = window.location.href
+		if (href === lastSeenUrl) return
+		lastSeenUrl = href
+		cleanUpHighlights()
+	}, 500)
+} else {
+	navigation.addEventListener('navigate', () => {
+		cleanUpHighlights()
+	})
+}
--- a/packages/page-agent/src/entry.ts
+++ b/packages/page-agent/src/entry.ts
@@ -0,0 +1,40 @@
+/**
+ * Auto-run entry for page-agent.js. Insert this script into your page to get page-agent functionality.
+ *
+ * The query string of the script URL may carry `model`, `baseURL`, `apiKey` and `lang`;
+ * anything missing (or an unsupported `lang`) falls back to the demo defaults below.
+ */
+import { PageAgent, type PageAgentConfig } from './PageAgent'
+
+// Clean up existing instances to prevent multiple injections from bookmarklet
+if (window.pageAgent) {
+	window.pageAgent.dispose()
+}
+
+// Mount to global window object
+window.PageAgent = PageAgent
+
+// Export for ES module usage
+// export { PageAgent }
+
+console.log('🚀 page-agent.js loaded!')
+
+const DEMO_MODEL = 'PAGE-AGENT-FREE-TESTING-RANDOM'
+const DEMO_BASE_URL = 'https://hwcxiuzfylggtcktqgij.supabase.co/functions/v1/llm-testing-proxy'
+const DEMO_API_KEY = 'PAGE-AGENT-FREE-TESTING-RANDOM'
+const DEMO_LANGUAGE = 'zh-CN'
+
+/**
+ * Query parameters of the URL a script was loaded from.
+ * Returns null when `src` is missing or empty (inline script) or cannot be parsed;
+ * `new URL('')` would otherwise throw and abort initialization.
+ */
+function readScriptParams(src: string | undefined): URLSearchParams | null {
+	if (!src) return null
+	try {
+		return new URL(src).searchParams
+	} catch {
+		return null
+	}
+}
+
+const currentScript = document.currentScript as HTMLScriptElement | null
+const scriptParams = readScriptParams(currentScript?.src)
+if (currentScript && scriptParams) {
+	console.log('🚀 page-agent.js detected current script:', currentScript.src)
+	const model = scriptParams.get('model') || DEMO_MODEL
+	const baseURL = scriptParams.get('baseURL') || DEMO_BASE_URL
+	const apiKey = scriptParams.get('apiKey') || DEMO_API_KEY
+	// Only pass on languages that are actually supported; anything else uses the default.
+	const requestedLang = scriptParams.get('lang')
+	const language =
+		requestedLang === 'zh-CN' || requestedLang === 'en-US' ? requestedLang : DEMO_LANGUAGE
+	const config: PageAgentConfig = { model, baseURL, apiKey, language }
+	window.pageAgent = new PageAgent(config)
+} else {
+	console.log('🚀 page-agent.js no current script detected, using default demo config')
+	window.pageAgent = new PageAgent()
+}
+
+console.log('🚀 page-agent.js initialized with config:', window.pageAgent.config)
+
+window.pageAgent.bus.emit('panel:show') // Show panel
--- a/packages/page-agent/src/i18n/index.ts
+++ b/packages/page-agent/src/i18n/index.ts
@@ -0,0 +1,50 @@
+import {
+	type SupportedLanguage,
+	type TranslationKey,
+	type TranslationParams,
+	type TranslationSchema,
+	locales,
+} from './locales'
+
+/**
+ * Translation lookup bound to one language.
+ *
+ * Keys are dot-separated paths into the locale table; `{{name}}` placeholders in the
+ * translated string are replaced with the matching entry of `params`.
+ */
+export class I18n {
+	private language: SupportedLanguage
+	private translations: TranslationSchema
+
+	/**
+	 * @param language - Requested locale. A value that is not a key of `locales`
+	 *   falls back to 'en-US'.
+	 */
+	constructor(language: SupportedLanguage = 'en-US') {
+		this.language = language in locales ? language : 'en-US'
+		// Index with the resolved language, not the raw argument: for an unsupported value
+		// `locales[language]` is undefined and every later lookup would miss.
+		this.translations = locales[this.language]
+	}
+
+	/**
+	 * Type-safe translation lookup.
+	 *
+	 * @param key - Dot-separated path of the translation.
+	 * @param params - Values for the `{{name}}` placeholders of the translation.
+	 * @returns The translated (and interpolated) string, or `key` itself when no string is
+	 *   stored under that path; a warning is logged in that case.
+	 */
+	t(key: TranslationKey, params?: TranslationParams): string {
+		const value = this.getNestedValue(this.translations, key)
+		// TranslationKey also contains non-leaf paths (e.g. 'ui.panel'), which resolve to an
+		// object; treat those like a missing key instead of returning a non-string.
+		if (typeof value !== 'string' || !value) {
+			console.warn(`Translation key "${key}" not found for language "${this.language}"`)
+			return key
+		}
+
+		return params ? this.interpolate(value, params) : value
+	}
+
+	/** Follow a dot-separated path through `obj`; yields undefined as soon as a step is missing. */
+	private getNestedValue(obj: any, path: string): unknown {
+		return path.split('.').reduce((current, key) => current?.[key], obj)
+	}
+
+	/** Replace each `{{name}}` with `params[name]`; unknown placeholders are left untouched. */
+	private interpolate(template: string, params: TranslationParams): string {
+		return template.replace(/\{\{(\w+)\}\}/g, (match, key) => {
+			// Use != null to check for both null and undefined, allow empty strings
+			return params[key] != null ? params[key].toString() : match
+		})
+	}
+
+	/** The language actually in use (after any fallback). */
+	getLanguage(): SupportedLanguage {
+		return this.language
+	}
+}
+
+// Re-export the translation types and the locale table
+export type { TranslationKey, SupportedLanguage, TranslationParams }
+export { locales }
--- a/packages/page-agent/src/i18n/locales.ts
+++ b/packages/page-agent/src/i18n/locales.ts
@@ -0,0 +1,126 @@
+// English translations (base/reference language).
+// TranslationSchema and TranslationKey below are derived from the shape of this object.
+// `{{name}}` markers are placeholders that I18n replaces with the matching lookup parameter.
+const enUS = {
+	ui: {
+		panel: {
+			ready: 'Ready',
+			thinking: 'Thinking...',
+			paused: 'Paused',
+			taskInput: 'Enter new task, describe steps in detail, press Enter to submit',
+			userAnswerPrompt: 'Please answer the question above, press Enter to submit',
+			taskTerminated: 'Task terminated',
+			taskCompleted: 'Task completed',
+			continueExecution: 'Continue execution',
+			userAnswer: 'User answer: {{input}}',
+			question: 'Question: {{question}}',
+			waitingPlaceholder: 'Waiting for task to start...',
+			pause: 'Pause',
+			continue: 'Continue',
+			stop: 'Stop',
+			expand: 'Expand history',
+			collapse: 'Collapse history',
+			step: 'Step {{number}} · {{time}}{{duration}}',
+		},
+		tools: {
+			clicking: 'Clicking element [{{index}}]...',
+			inputting: 'Inputting text to element [{{index}}]...',
+			selecting: 'Selecting option "{{text}}"...',
+			scrolling: 'Scrolling page...',
+			waiting: 'Waiting {{seconds}} seconds...',
+			done: 'Task done',
+			clicked: '🖱️ Clicked element [{{index}}]',
+			inputted: '⌨️ Inputted text "{{text}}"',
+			selected: '☑️ Selected option "{{text}}"',
+			scrolled: '🛞 Page scrolled',
+			waited: '⌛️ Wait completed',
+			executing: 'Executing {{toolName}}...',
+			resultSuccess: 'success',
+			resultFailure: 'failed',
+			resultError: 'error',
+		},
+		errors: {
+			elementNotFound: 'No interactive element found at index {{index}}',
+			taskRequired: 'Task description is required',
+			executionFailed: 'Task execution failed',
+			notInputElement: 'Element is not an input or textarea',
+			notSelectElement: 'Element is not a select element',
+			optionNotFound: 'Option "{{text}}" not found',
+		},
+	},
+} as const
+
+// Chinese translations (must match the structure of enUS).
+// NOTE(review): nothing on this declaration itself enforces the match; a mismatch would only
+// surface where the object is used as a TranslationSchema — confirm that is sufficient.
+const zhCN = {
+	ui: {
+		panel: {
+			ready: '准备就绪',
+			thinking: '正在思考...',
+			paused: '暂停中，稍后',
+			taskInput: '输入新任务，详细描述步骤，回车提交',
+			userAnswerPrompt: '请回答上面问题，回车提交',
+			taskTerminated: '任务已终止',
+			taskCompleted: '任务结束',
+			continueExecution: '继续执行',
+			userAnswer: '用户回答: {{input}}',
+			question: '询问: {{question}}',
+			waitingPlaceholder: '等待任务开始...',
+			pause: '暂停',
+			continue: '继续',
+			stop: '终止',
+			expand: '展开历史',
+			collapse: '收起历史',
+			step: '步骤 {{number}} · {{time}}{{duration}}',
+		},
+		tools: {
+			clicking: '正在点击元素 [{{index}}]...',
+			inputting: '正在输入文本到元素 [{{index}}]...',
+			selecting: '正在选择选项 "{{text}}"...',
+			scrolling: '正在滚动页面...',
+			waiting: '等待 {{seconds}} 秒...',
+			done: '结束任务',
+			clicked: '🖱️ 已点击元素 [{{index}}]',
+			inputted: '⌨️ 已输入文本 "{{text}}"',
+			selected: '☑️ 已选择选项 "{{text}}"',
+			scrolled: '🛞 页面滚动完成',
+			waited: '⌛️ 等待完成',
+			executing: '正在执行 {{toolName}}...',
+			resultSuccess: '成功',
+			resultFailure: '失败',
+			resultError: '错误',
+		},
+		errors: {
+			elementNotFound: '未找到索引为 {{index}} 的交互元素',
+			taskRequired: '任务描述不能为空',
+			executionFailed: '任务执行失败',
+			notInputElement: '元素不是输入框或文本域',
+			notSelectElement: '元素不是选择框',
+			optionNotFound: '未找到选项 "{{text}}"',
+		},
+	},
+} as const
+
+// Type definitions generated from English base structure (but with string values).
+// DeepStringify widens every string-literal leaf to plain `string` and recurses into nested
+// objects, so another locale can supply different text for the same keys.
+type DeepStringify<T> = {
+	[K in keyof T]: T[K] extends string ? string : T[K] extends object ? DeepStringify<T[K]> : T[K]
+}
+
+// Shape every locale table has to provide.
+export type TranslationSchema = DeepStringify<typeof enUS>
+
+// Utility type: Extract all nested paths from translation object, joined with '.'.
+// For a key whose value is an object, both the key itself and every deeper path are produced,
+// so intermediate paths such as 'ui' or 'ui.panel' are part of the union, not only leaves.
+type NestedKeyOf<ObjectType extends object> = {
+	[Key in keyof ObjectType & (string | number)]: ObjectType[Key] extends object
+		? `${Key}` | `${Key}.${NestedKeyOf<ObjectType[Key]>}`
+		: `${Key}`
+}[keyof ObjectType & (string | number)]
+
+// Extract all possible key paths from translation structure
+export type TranslationKey = NestedKeyOf<TranslationSchema>
+
+// Parameterized translation types: placeholder name -> value substituted for `{{name}}`
+export type TranslationParams = Record<string, string | number>
+
+// Registry of the available locale tables, keyed by language tag.
+export const locales = {
+	'en-US': enUS,
+	'zh-CN': zhCN,
+} as const
+
+// Union of the language tags registered in `locales`
+export type SupportedLanguage = keyof typeof locales
--- a/packages/page-agent/src/llms/OpenAIClient.ts
+++ b/packages/page-agent/src/llms/OpenAIClient.ts
@@ -0,0 +1,188 @@
+/**
+ * OpenAI Client implementation
+ */
+import { InvokeError, InvokeErrorType } from './errors'
+import type { InvokeResult, LLMClient, Message, OpenAIClientConfig, Tool } from './types'
+import { modelPatch, zodToOpenAITool } from './utils'
+
+/**
+ * Strict OpenAI-compatible chat-completions client.
+ *
+ * A single `invoke` sends one request that forces the `AgentOutput` tool, checks `finish_reason`,
+ * validates the returned arguments against the tool's Zod schema and executes the tool.
+ */
+export class OpenAIClient implements LLMClient {
+	config: OpenAIClientConfig
+
+	constructor(config: OpenAIClientConfig) {
+		this.config = config
+	}
+
+	/**
+	 * Call the model once and run the tool it selected.
+	 *
+	 * @param messages - Conversation, sent unchanged as the request's `messages`.
+	 * @param tools - Tools offered to the model, keyed by tool name.
+	 * @param abortSignal - Optional signal forwarded to `fetch`; an abort is rethrown unwrapped.
+	 * @returns The tool call, the tool's result, token usage and the raw response body.
+	 * @throws InvokeError for network/HTTP failures, an unusable `finish_reason`, a missing or
+	 *   unknown tool call, invalid arguments, or a failing tool.
+	 */
+	async invoke(
+		messages: Message[],
+		tools: Record<string, Tool>,
+		abortSignal?: AbortSignal
+	): Promise<InvokeResult> {
+		// 1. Convert tools to OpenAI format
+		const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool))
+
+		// 2. Call API
+		let response: Response
+		try {
+			response = await fetch(`${this.config.baseURL}/chat/completions`, {
+				method: 'POST',
+				headers: {
+					'Content-Type': 'application/json',
+					Authorization: `Bearer ${this.config.apiKey}`,
+				},
+				body: JSON.stringify(
+					modelPatch({
+						model: this.config.model,
+						temperature: this.config.temperature,
+						max_tokens: this.config.maxTokens,
+						messages,
+
+						tools: openaiTools,
+						// tool_choice: 'required',
+						tool_choice: { type: 'function', function: { name: 'AgentOutput' } },
+
+						// model specific params
+
+						// reasoning_effort: 'minimal',
+						// verbosity: 'low',
+						parallel_tool_calls: false,
+					})
+				),
+				signal: abortSignal,
+			})
+		} catch (error: unknown) {
+			// Keep a user abort recognizable: the retry loop in llms/index.ts stops on errors named
+			// 'AbortError'; wrapping it as a (retryable) NETWORK_ERROR would hide that name.
+			if ((error as { name?: string } | null)?.name === 'AbortError') throw error
+
+			// Network error
+			throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error)
+		}
+
+		// 3. Handle HTTP errors
+		if (!response.ok) {
+			// The error body is not guaranteed to be JSON (e.g. an HTML gateway page). Fall back to
+			// undefined so a parse failure cannot mask the HTTP status handling below.
+			const errorData: unknown = await response.json().catch(() => undefined)
+			const errorMessage =
+				(errorData as { error?: { message?: string } } | null | undefined)?.error?.message ||
+				response.statusText
+
+			if (response.status === 401 || response.status === 403) {
+				throw new InvokeError(
+					InvokeErrorType.AUTH_ERROR,
+					`Authentication failed: ${errorMessage}`,
+					errorData
+				)
+			}
+			if (response.status === 429) {
+				throw new InvokeError(
+					InvokeErrorType.RATE_LIMIT,
+					`Rate limit exceeded: ${errorMessage}`,
+					errorData
+				)
+			}
+			if (response.status >= 500) {
+				throw new InvokeError(
+					InvokeErrorType.SERVER_ERROR,
+					`Server error: ${errorMessage}`,
+					errorData
+				)
+			}
+			throw new InvokeError(
+				InvokeErrorType.UNKNOWN,
+				`HTTP ${response.status}: ${errorMessage}`,
+				errorData
+			)
+		}
+
+		const data = await response.json()
+
+		// 4. Check finish_reason
+		const choice = data.choices?.[0]
+		if (!choice) {
+			throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', data)
+		}
+
+		switch (choice.finish_reason) {
+			case 'tool_calls':
+				// ✅ Normal
+				break
+			case 'length':
+				// ⚠️ Token limit reached
+				throw new InvokeError(
+					InvokeErrorType.CONTEXT_LENGTH,
+					'Response truncated: max tokens reached',
+					data
+				)
+			case 'content_filter':
+				// ❌ Content filtered
+				throw new InvokeError(
+					InvokeErrorType.CONTENT_FILTER,
+					'Content filtered by safety system',
+					data
+				)
+			case 'stop':
+				// ❌ Did not call tool (we require tool call)
+				throw new InvokeError(InvokeErrorType.NO_TOOL_CALL, 'Model did not call any tool', data)
+			default:
+				throw new InvokeError(
+					InvokeErrorType.UNKNOWN,
+					`Unexpected finish_reason: ${choice.finish_reason}`,
+					data
+				)
+		}
+
+		// 5. Parse tool call (only the first one is used; parallel calls are disabled above)
+		const toolCall = choice.message?.tool_calls?.[0]
+		if (!toolCall) {
+			throw new InvokeError(InvokeErrorType.NO_TOOL_CALL, 'No tool call found in response', data)
+		}
+
+		const toolName = toolCall.function.name
+		const tool = tools[toolName]
+		if (!tool) {
+			throw new InvokeError(InvokeErrorType.UNKNOWN, `Tool ${toolName} not found`, data)
+		}
+
+		// 6. Parse and validate arguments
+		let toolArgs: unknown
+		try {
+			toolArgs = JSON.parse(toolCall.function.arguments)
+		} catch (e) {
+			throw new InvokeError(InvokeErrorType.INVALID_TOOL_ARGS, 'Invalid JSON in tool arguments', e)
+		}
+
+		// Validate against zod schema
+		const validation = tool.inputSchema.safeParse(toolArgs)
+		if (!validation.success) {
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				`Tool arguments validation failed: ${validation.error.message}`,
+				validation.error
+			)
+		}
+
+		// 7. Execute tool
+		let toolResult: unknown
+		try {
+			toolResult = await tool.execute(validation.data)
+		} catch (e) {
+			throw new InvokeError(
+				InvokeErrorType.TOOL_EXECUTION_ERROR,
+				`Tool execution failed: ${(e as Error).message}`,
+				e
+			)
+		}
+
+		// 8. Return result (including cache tokens)
+		return {
+			toolCall: {
+				// id: toolCall.id,
+				name: toolName,
+				args: validation.data as Record<string, unknown>,
+			},
+			toolResult,
+			usage: {
+				promptTokens: data.usage?.prompt_tokens ?? 0,
+				completionTokens: data.usage?.completion_tokens ?? 0,
+				totalTokens: data.usage?.total_tokens ?? 0,
+				cachedTokens: data.usage?.prompt_tokens_details?.cached_tokens,
+				reasoningTokens: data.usage?.completion_tokens_details?.reasoning_tokens,
+			},
+			rawResponse: data,
+		}
+	}
+}
--- a/packages/page-agent/src/llms/OpenAILenientClient.ts
+++ b/packages/page-agent/src/llms/OpenAILenientClient.ts
@@ -0,0 +1,128 @@
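+/**
+ * Lenient OpenAI client implementation: sends the same request as the strict client,
+ * but parses the response with lenientParseMacroToolCall.
+ */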
+import type { MacroToolInput } from '../PageAgent'
+import { InvokeError, InvokeErrorType } from './errors'
+import type { InvokeResult, LLMClient, Message, OpenAIClientConfig, Tool } from './types'
+import { lenientParseMacroToolCall, modelPatch, zodToOpenAITool } from './utils'
+
+/**
+ * OpenAI-compatible chat-completions client with lenient response parsing.
+ *
+ * A single `invoke` sends one request that forces the `AgentOutput` tool, hands the whole
+ * response to `lenientParseMacroToolCall`, and executes `AgentOutput` with the parsed input.
+ */
+export class OpenAIClient implements LLMClient {
+	config: OpenAIClientConfig
+
+	constructor(config: OpenAIClientConfig) {
+		this.config = config
+	}
+
+	/**
+	 * Call the model once and run the `AgentOutput` tool.
+	 *
+	 * @param messages - Conversation, sent unchanged as the request's `messages`.
+	 * @param tools - Must contain the `AgentOutput` tool; it is the only tool executed.
+	 * @param abortSignal - Optional signal forwarded to `fetch`; an abort is rethrown unwrapped.
+	 * @returns The tool call, the tool's result, token usage and the raw response body.
+	 * @throws InvokeError for network/HTTP failures and for a failing tool; parsing failures
+	 *   come from `lenientParseMacroToolCall`.
+	 */
+	async invoke(
+		messages: Message[],
+		tools: { AgentOutput: Tool<MacroToolInput> },
+		abortSignal?: AbortSignal
+	): Promise<InvokeResult> {
+		// 1. Convert tools to OpenAI format
+		const openaiTools = Object.entries(tools).map(([name, tool]) => zodToOpenAITool(name, tool))
+
+		// 2. Call API
+		let response: Response
+		try {
+			response = await fetch(`${this.config.baseURL}/chat/completions`, {
+				method: 'POST',
+				headers: {
+					'Content-Type': 'application/json',
+					Authorization: `Bearer ${this.config.apiKey}`,
+				},
+				body: JSON.stringify(
+					modelPatch({
+						model: this.config.model,
+						temperature: this.config.temperature,
+						max_tokens: this.config.maxTokens,
+						messages,
+
+						tools: openaiTools,
+						// tool_choice: 'required',
+						tool_choice: { type: 'function', function: { name: 'AgentOutput' } },
+
+						// model specific params
+
+						// reasoning_effort: 'minimal',
+						// verbosity: 'low',
+						parallel_tool_calls: false,
+					})
+				),
+				signal: abortSignal,
+			})
+		} catch (error: unknown) {
+			// Keep a user abort recognizable: the retry loop in llms/index.ts stops on errors named
+			// 'AbortError'; wrapping it as a (retryable) NETWORK_ERROR would hide that name.
+			if ((error as { name?: string } | null)?.name === 'AbortError') throw error
+
+			// Network error
+			throw new InvokeError(InvokeErrorType.NETWORK_ERROR, 'Network request failed', error)
+		}
+
+		// 3. Handle HTTP errors
+		if (!response.ok) {
+			// The error body is not guaranteed to be JSON (e.g. an HTML gateway page). Fall back to
+			// undefined so a parse failure cannot mask the HTTP status handling below.
+			const errorData: unknown = await response.json().catch(() => undefined)
+			const errorMessage =
+				(errorData as { error?: { message?: string } } | null | undefined)?.error?.message ||
+				response.statusText
+
+			if (response.status === 401 || response.status === 403) {
+				throw new InvokeError(
+					InvokeErrorType.AUTH_ERROR,
+					`Authentication failed: ${errorMessage}`,
+					errorData
+				)
+			}
+			if (response.status === 429) {
+				throw new InvokeError(
+					InvokeErrorType.RATE_LIMIT,
+					`Rate limit exceeded: ${errorMessage}`,
+					errorData
+				)
+			}
+			if (response.status >= 500) {
+				throw new InvokeError(
+					InvokeErrorType.SERVER_ERROR,
+					`Server error: ${errorMessage}`,
+					errorData
+				)
+			}
+			throw new InvokeError(
+				InvokeErrorType.UNKNOWN,
+				`HTTP ${response.status}: ${errorMessage}`,
+				errorData
+			)
+		}
+
+		// 4. Parse response: finish_reason checks and argument extraction are delegated to the
+		// lenient parser.
+		const data = await response.json()
+		const tool = tools.AgentOutput
+		const macroToolInput = lenientParseMacroToolCall(data, tool.inputSchema as any)
+
+		// 5. Execute tool
+		let toolResult: unknown
+		try {
+			toolResult = await tool.execute(macroToolInput)
+		} catch (e) {
+			throw new InvokeError(
+				InvokeErrorType.TOOL_EXECUTION_ERROR,
+				`Tool execution failed: ${(e as Error).message}`,
+				e
+			)
+		}
+
+		// 6. Return result (including cache tokens)
+		return {
+			toolCall: {
+				// id: toolCall.id,
+				name: 'AgentOutput',
+				args: macroToolInput,
+			},
+			toolResult,
+			usage: {
+				promptTokens: data.usage?.prompt_tokens ?? 0,
+				completionTokens: data.usage?.completion_tokens ?? 0,
+				totalTokens: data.usage?.total_tokens ?? 0,
+				cachedTokens: data.usage?.prompt_tokens_details?.cached_tokens,
+				reasoningTokens: data.usage?.completion_tokens_details?.reasoning_tokens,
+			},
+			rawResponse: data,
+		}
+	}
+}
--- a/packages/page-agent/src/llms/errors.ts
+++ b/packages/page-agent/src/llms/errors.ts
@@ -0,0 +1,50 @@
+/**
+ * Error types and error handling for LLM invocations
+ */
+
+// Categories of invocation failure. Whether a category is retried is decided by
+// InvokeError.isRetryable, which must be kept in sync with the grouping below.
+export const InvokeErrorType = {
+	// Retryable
+	NETWORK_ERROR: 'network_error', // Network error, retry
+	RATE_LIMIT: 'rate_limit', // Rate limit, retry
+	SERVER_ERROR: 'server_error', // 5xx, retry
+	NO_TOOL_CALL: 'no_tool_call', // Model did not call tool
+	INVALID_TOOL_ARGS: 'invalid_tool_args', // Tool args don't match schema
+	TOOL_EXECUTION_ERROR: 'tool_execution_error', // Tool execution error
+
+	// Unclassified failure; also treated as retryable
+	UNKNOWN: 'unknown',
+
+	// Non-retryable
+	AUTH_ERROR: 'auth_error', // Authentication failed
+	CONTEXT_LENGTH: 'context_length', // finish_reason 'length': response truncated at the token limit
+	CONTENT_FILTER: 'content_filter', // Content filtered
+} as const
+
+// Union of the string values above
+export type InvokeErrorType = (typeof InvokeErrorType)[keyof typeof InvokeErrorType]
+
+/**
+ * Error describing a failed LLM invocation.
+ *
+ * Carries the failure category, whether that category may be retried, and the underlying
+ * error or response payload for debugging.
+ */
+export class InvokeError extends Error {
+	type: InvokeErrorType
+	retryable: boolean
+	statusCode?: number
+	rawError?: unknown
+
+	/**
+	 * @param type - Failure category; determines `retryable`.
+	 * @param message - Human-readable description.
+	 * @param rawError - Optional underlying error or response data.
+	 */
+	constructor(type: InvokeErrorType, message: string, rawError?: unknown) {
+		super(message)
+		this.name = 'InvokeError'
+		this.type = type
+		this.retryable = this.isRetryable(type)
+		this.rawError = rawError
+	}
+
+	/** True for the categories worth another attempt; every other category is final. */
+	private isRetryable(type: InvokeErrorType): boolean {
+		switch (type) {
+			case InvokeErrorType.NETWORK_ERROR:
+			case InvokeErrorType.RATE_LIMIT:
+			case InvokeErrorType.SERVER_ERROR:
+			case InvokeErrorType.NO_TOOL_CALL:
+			case InvokeErrorType.INVALID_TOOL_ARGS:
+			case InvokeErrorType.TOOL_EXECUTION_ERROR:
+			case InvokeErrorType.UNKNOWN:
+				return true
+			default:
+				return false
+		}
+	}
+}
--- a/packages/page-agent/src/llms/index.ts
+++ b/packages/page-agent/src/llms/index.ts
@@ -0,0 +1,137 @@
+/**
+ * @topic LLM 与主流程的隔离
+ * @reasoning
+ * 将 llm 的调用和主流程分开是复杂的，
+ * 因为 agent 的 tool call 通常集成在 llm 模块中，而且要先得到 llm 返回，然后处理工具调用
+ * tools 和 llm 调用的逻辑不可避免地耦合在一起，tool 的执行又和主流程耦合在一起
+ * 而 history 的维护和更新逻辑，又必须嵌入多轮 tool call 中
+ * @reasoning
+ * - 放弃框架提供的自动的多轮调用，每轮调用都由主流程发起
+ * - 理想情况下，llm 调用应该获得 structured output，然后由额外的模块触发 tool call，目前模型和框架都无法实现
+ * - 当前只能将 llm api 和 本地 tool call 耦合在一起，不关心其中的衔接方式
+ * @conclusion
+ * - @llm responsibility boundary:
+ *   - call llm api with given messages and tools
+ *   - invoke tool call and get the result of the tool
+ *   - return the result to main loop
+ * - @main_loop responsibility boundary:
+ *   - maintain all behaviors of an **agent**
+ * @conclusion
+ * - 这里的 llm 模块不是 agent，只负责一轮 llm 调用和工具调用，无状态
+ */
+/**
+ * @topic 结构化输出
+ * @facts
+ * - 几乎所有模型都支持 tool call schema
+ * - 几乎所有模型都支持返回 json
+ *   - 只有 openAI/grok/gemini 支持 schema 并保证格式
+ * - 主流模型都支持 tool_choice: required
+ *   - 除了 qwen 必须指定一个函数名 (9月上新后支持)
+ * @conclusion
+ * - 永远使用 tool call 来返回结构化数据，禁止模型直接返回（视为出错）
+ * - 不能假设 tool 参数合法，必须有修复机制，而且修复也应该使用 tool call 返回
+ */
+import type { LLMConfig } from '../config'
+import { parseLLMConfig } from '../config'
+import { EventBus, getEventBus } from '../utils/bus'
+import { OpenAIClient } from './OpenAILenientClient'
+import { InvokeError } from './errors'
+import type { InvokeResult, LLMClient, Message, Tool } from './types'
+
+export type { Message, Tool, InvokeResult, LLMClient }
+
+/**
+ * Stateless wrapper around an LLM client: one `invoke` is one model call plus one tool call,
+ * retried according to the configuration. Retry and error notices go to the event bus
+ * obtained for `id`.
+ */
+export class LLM {
+	config: Required<LLMConfig>
+	id: string
+	client: LLMClient
+	#bus: EventBus
+
+	/**
+	 * @param config - LLM settings; normalized through `parseLLMConfig`.
+	 * @param id - Identifier used to look up the event bus.
+	 */
+	constructor(config: LLMConfig, id: string) {
+		this.config = parseLLMConfig(config)
+		this.id = id
+
+		this.#bus = getEventBus(id)
+
+		// Default to OpenAI client
+		const { model, apiKey, baseURL, temperature, maxTokens } = this.config
+		this.client = new OpenAIClient({ model, apiKey, baseURL, temperature, maxTokens })
+	}
+
+	/**
+	 * - call llm api *once*
+	 * - invoke tool call *once*
+	 * - return the result of the tool
+	 */
+	async invoke(
+		messages: Message[],
+		tools: Record<string, Tool>,
+		abortSignal: AbortSignal
+	): Promise<InvokeResult> {
+		const attempt = async () => this.client.invoke(messages, tools, abortSignal)
+
+		// Surface retry progress on the panel.
+		const notifyRetry = (retries: number): void => {
+			this.#bus.emit('panel:update', {
+				type: 'retry',
+				displayText: `retry-ing (${retries} / ${this.config.maxRetries})`,
+			})
+		}
+
+		// Surface each failed attempt on the panel.
+		const notifyError = (error: Error): void => {
+			this.#bus.emit('panel:update', {
+				type: 'error',
+				displayText: `step failed: ${error.message}`,
+			})
+		}
+
+		return await withRetry(attempt, {
+			maxRetries: this.config.maxRetries,
+			onRetry: notifyRetry,
+			onError: notifyError,
+		})
+	}
+}
+
+/** Pause (ms) applied after a failed attempt and again right before the next attempt. */
+const RETRY_DELAY_MS = 100
+
+/**
+ * Run `fn`, retrying failed attempts up to `settings.maxRetries` times.
+ *
+ * `fn` always runs at least once. An attempt is not retried when the error is named
+ * 'AbortError' (user cancellation), when it is an InvokeError marked non-retryable, or when
+ * the retry budget is used up; in those cases the error is rethrown as-is.
+ *
+ * @param fn - Operation to attempt.
+ * @param settings.maxRetries - Number of additional attempts after the first one.
+ * @param settings.onRetry - Called with the retry number (1-based) before each retry.
+ * @param settings.onError - Called after every failed attempt; the second argument tells
+ *   whether another attempt will follow.
+ * @returns The value of the first successful attempt.
+ */
+async function withRetry<T>(
+	fn: () => Promise<T>,
+	settings: {
+		maxRetries: number
+		onRetry: (retries: number) => void
+		onError: (error: Error, withRetry: boolean) => void
+	}
+): Promise<T> {
+	const pause = () => new Promise((resolve) => setTimeout(resolve, RETRY_DELAY_MS))
+
+	for (let attempt = 0; ; attempt++) {
+		if (attempt > 0) {
+			settings.onRetry(attempt)
+			await pause()
+		}
+
+		try {
+			return await fn()
+		} catch (error: unknown) {
+			console.error(error)
+
+			// do not retry if aborted by user
+			const aborted = (error as { name?: string } | null)?.name === 'AbortError'
+			// do not retry if error is not retryable (InvokeError)
+			const nonRetryable = error instanceof InvokeError && !error.retryable
+			const willRetry = !aborted && !nonRetryable && attempt < settings.maxRetries
+
+			// Report the real decision, not just whether budget is left.
+			settings.onError(error as Error, willRetry)
+			if (!willRetry) throw error
+
+			await pause()
+		}
+	}
+}
--- a/packages/page-agent/src/llms/types.ts
+++ b/packages/page-agent/src/llms/types.ts
@@ -0,0 +1,77 @@
+/**
+ * Core types for LLM integration
+ */
+import type { z } from 'zod'
+
+/**
+ * Message format - OpenAI standard (industry standard).
+ * Field names are snake_case because the clients send these objects unchanged as the
+ * `messages` array of the request body.
+ */
+export interface Message {
+	role: 'system' | 'user' | 'assistant' | 'tool'
+	content?: string | null
+	// Tool calls attached to the message
+	tool_calls?: {
+		id: string
+		type: 'function'
+		function: {
+			name: string
+			arguments: string // JSON string
+		}
+	}[]
+	// NOTE(review): presumably the `id` of the tool call a 'tool' message answers — confirm
+	tool_call_id?: string
+	name?: string
+}
+
+/**
+ * Tool definition - uses Zod schema (LLM-agnostic)
+ * Supports generics for type-safe parameters and return values
+ */
+export interface Tool<TParams = any, TResult = any> {
+	// No `name` field: a tool's name is the key it is registered under in the tools record
+	description?: string
+	// Schema of the arguments; also converted to JSON Schema for the request
+	inputSchema: z.ZodType<TParams>
+	// Runs the tool with the arguments
+	execute: (args: TParams) => Promise<TResult>
+}
+
+/**
+ * LLM Client interface
+ * Note: Does not use generics because each tool in the tools record has different types
+ */
+export interface LLMClient {
+	// One model call plus one tool execution; `tools` is keyed by tool name
+	invoke(
+		messages: Message[],
+		tools: Record<string, Tool>,
+		abortSignal?: AbortSignal
+	): Promise<InvokeResult>
+}
+
+/**
+ * Invoke result. `toolResult` is generic; `toolCall.args` is untyped (`any`).
+ */
+export interface InvokeResult<TResult = unknown> {
+	// The tool call that was executed
+	toolCall: {
+		// id?: string // OpenAI's tool_call_id
+		name: string
+		args: any
+	}
+	toolResult: TResult // Supports generics, but defaults to unknown
+	// Token accounting copied from the response's `usage` object
+	usage: {
+		promptTokens: number
+		completionTokens: number
+		totalTokens: number
+		cachedTokens?: number // Prompt cache hits
+		reasoningTokens?: number // OpenAI o1 series reasoning tokens
+	}
+	rawResponse?: unknown // Raw response for debugging
+}
+
+/**
+ * OpenAI Client config
+ */
+export interface OpenAIClientConfig {
+	model: string
+	apiKey: string // Sent as `Authorization: Bearer <apiKey>`
+	baseURL: string // `/chat/completions` is appended to this
+	temperature?: number
+	maxTokens?: number // Sent as `max_tokens`
+	// NOTE(review): not read by the OpenAI clients; retries are driven by LLM/withRetry — confirm
+	maxRetries?: number
+}
--- a/packages/page-agent/src/llms/utils.ts
+++ b/packages/page-agent/src/llms/utils.ts
@@ -0,0 +1,214 @@
+/**
+ * Utility functions for LLM integration
+ */
+import chalk from 'chalk'
+import { z } from 'zod'
+
+import type { MacroToolInput } from '../PageAgent'
+import { InvokeError, InvokeErrorType } from './errors'
+import type { Tool } from './types'
+
+/**
+ * Build the OpenAI `tools[]` entry for a single tool.
+ * The Zod input schema is serialized with Zod 4's built-in `z.toJSONSchema()`,
+ * targeting the OpenAPI 3.0 dialect.
+ */
+export function zodToOpenAITool(name: string, tool: Tool) {
+	const { description } = tool
+	const parameters = z.toJSONSchema(tool.inputSchema, { target: 'openapi-3.0' })
+
+	return {
+		type: 'function' as const,
+		function: { name, description, parameters },
+	}
+}
+
+/**
+ * Leniently extract a `MacroToolInput` from a raw chat-completions response.
+ *
+ * Models do not always return a well-formed `AgentOutput` tool call. These common
+ * deviations are repaired before the result is validated against `inputSchema`:
+ * - #1 the tool call names an action instead of `AgentOutput`;
+ * - the arguments arrive as JSON in the message content instead of a tool call;
+ * - #2 reasoning fields are present but `action` is missing (a 1s `wait` is substituted);
+ * - #3 / #4 the whole function-call envelope is returned as the arguments;
+ * - #5 only the action parameters are returned, without the MacroToolInput wrapper;
+ * - #6 the arguments extracted from an envelope are themselves a JSON string.
+ *
+ * @param responseData - Raw response body; only `choices[0]` is inspected.
+ * @param inputSchema - Zod object schema of the macro tool; must contain an `action` field.
+ * @returns The validated macro tool input.
+ * @throws InvokeError when there are no choices, the finish reason is unusable, neither a
+ * tool call nor text content is present, the payload is not valid JSON, or validation fails.
+ * @throws Error when `inputSchema` has no `action` field.
+ */
+export function lenientParseMacroToolCall(
+	responseData: any,
+	inputSchema: z.ZodObject<MacroToolInput & Record<string, any>>
+): MacroToolInput {
+	// A null/undefined response is reported the same way as a response without choices.
+	const choice = responseData?.choices?.[0]
+	if (!choice) {
+		throw new InvokeError(InvokeErrorType.UNKNOWN, 'No choices in response', responseData)
+	}
+
+	// Reject finish reasons that cannot carry a usable tool call.
+	switch (choice.finish_reason) {
+		case 'tool_calls':
+		case 'function_call': // gemini
+		case 'stop': // will try a robust parse
+			// ✅ Normal
+			break
+		case 'length':
+			// ⚠️ Token limit reached
+			throw new InvokeError(
+				InvokeErrorType.CONTEXT_LENGTH,
+				'Response truncated: max tokens reached'
+			)
+		case 'content_filter':
+			// ❌ Content filtered
+			throw new InvokeError(InvokeErrorType.CONTENT_FILTER, 'Content filtered by safety system')
+		default:
+			throw new InvokeError(
+				InvokeErrorType.UNKNOWN,
+				`Unexpected finish_reason: ${choice.finish_reason}`
+			)
+	}
+
+	// Extract action schema from MacroToolInput schema
+	const actionSchema = inputSchema.shape.action
+	if (!actionSchema) {
+		throw new Error('inputSchema must have an "action" field')
+	}
+
+	let arg: string | null = null
+
+	// try to use tool call
+	const toolCall = choice.message?.tool_calls?.[0]?.function
+	arg = toolCall?.arguments ?? null
+
+	if (arg && toolCall.name !== 'AgentOutput') {
+		// TODO: check if toolCall.name is a valid action name
+		// case: instead of AgentOutput, the model returned a action name as tool call
+		console.log(chalk.yellow('lenientParseMacroToolCall: #1 fixing incorrect tool call'))
+		let tmpArg
+		try {
+			tmpArg = JSON.parse(arg)
+		} catch (error) {
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				'Failed to parse tool arguments as JSON',
+				error
+			)
+		}
+		arg = JSON.stringify({ action: { [toolCall.name]: tmpArg } })
+	}
+
+	if (!arg) {
+		// try to use message content as JSON.
+		// `content` may be null, absent or non-string; in that case fall through to the
+		// NO_TOOL_CALL error below instead of crashing on `.trim()`.
+		const rawContent = choice.message?.content
+		arg = typeof rawContent === 'string' ? rawContent.trim() || null : null
+	}
+
+	if (!arg) {
+		throw new InvokeError(
+			InvokeErrorType.NO_TOOL_CALL,
+			'No tool call or content found in response',
+			responseData
+		)
+	}
+
+	// make sure is valid JSON
+
+	let parsedArgs: any
+	try {
+		parsedArgs = JSON.parse(arg)
+	} catch (error) {
+		throw new InvokeError(
+			InvokeErrorType.INVALID_TOOL_ARGS,
+			'Failed to parse tool arguments as JSON',
+			error
+		)
+	}
+
+	// `null` is valid JSON but has no properties to inspect; report it as invalid
+	// arguments rather than letting the property reads below throw a TypeError.
+	if (parsedArgs === null) {
+		throw new InvokeError(
+			InvokeErrorType.INVALID_TOOL_ARGS,
+			'Tool arguments must be a JSON object, got null',
+			null
+		)
+	}
+
+	// patch incomplete formats
+
+	if (parsedArgs.action || parsedArgs.evaluation_previous_goal || parsedArgs.next_goal) {
+		// case: nested MacroToolInput format (correct format)
+
+		// some models may give a empty action (they may think reasoning and action should be separate)
+		if (!parsedArgs.action) {
+			console.log(chalk.yellow('lenientParseMacroToolCall: #2 fixing incorrect tool call'))
+			parsedArgs.action = {
+				wait: { seconds: 1 },
+			}
+		}
+	} else if (parsedArgs.type && parsedArgs.function) {
+		// case: upper level function call format provided. only keep its arguments
+		// TODO: check if function name is a valid action name
+		if (parsedArgs.function.name !== 'AgentOutput')
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				`Expected function name "AgentOutput", got "${parsedArgs.function.name}"`,
+				null
+			)
+
+		console.log(chalk.yellow('lenientParseMacroToolCall: #3 fixing incorrect tool call'))
+		parsedArgs = parsedArgs.function.arguments
+	} else if (parsedArgs.name && parsedArgs.arguments) {
+		// case: upper level function call format provided. only keep its arguments
+		// TODO: check if function name is a valid action name
+		if (parsedArgs.name !== 'AgentOutput')
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				`Expected function name "AgentOutput", got "${parsedArgs.name}"`,
+				null
+			)
+
+		console.log(chalk.yellow('lenientParseMacroToolCall: #4 fixing incorrect tool call'))
+		parsedArgs = parsedArgs.arguments
+	} else {
+		// case: only action parameters provided, wrap into MacroToolInput
+		// TODO: check if action name is valid
+		console.log(chalk.yellow('lenientParseMacroToolCall: #5 fixing incorrect tool call'))
+		parsedArgs = { action: parsedArgs } as MacroToolInput
+	}
+
+	// make sure it's not wrapped as string
+	if (typeof parsedArgs === 'string') {
+		console.log(chalk.yellow('lenientParseMacroToolCall: #6 fixing incorrect tool call'))
+		try {
+			parsedArgs = JSON.parse(parsedArgs)
+		} catch (error) {
+			throw new InvokeError(
+				InvokeErrorType.INVALID_TOOL_ARGS,
+				'Failed to parse nested tool arguments as JSON',
+				error
+			)
+		}
+	}
+
+	const validation = inputSchema.safeParse(parsedArgs)
+	if (validation.success) {
+		return validation.data as unknown as MacroToolInput
+	} else {
+		// `parsedArgs` can be null/undefined here (e.g. nested arguments of "null"),
+		// so read `action` defensively while building the error message.
+		const action = parsedArgs?.action ?? {}
+		const actionName = Object.keys(action)[0] || 'unknown'
+		const actionArgs = JSON.stringify(action[actionName] || 'unknown')
+
+		// TODO: check if action name is valid. give a readable error message
+
+		throw new InvokeError(
+			InvokeErrorType.INVALID_TOOL_ARGS,
+			`Tool arguments validation failed: action "${actionName}" with args ${actionArgs}`,
+			validation.error
+		)
+	}
+}
+
+/**
+ * Apply vendor-specific tweaks to a chat-completions request body.
+ * The body is modified in place and the same object is returned.
+ *
+ * - Model names starting with "claude" (case-insensitive): force the `AgentOutput` tool
+ *   and disable thinking.
+ * - Model names containing "grok" (case-insensitive): drop `tool_choice` and turn
+ *   thinking/reasoning down.
+ */
+export function modelPatch(body: Record<string, any>) {
+	const modelId: string = body.model || ''
+	const lowered = modelId.toLowerCase()
+	const isClaude = lowered.startsWith('claude')
+	const isGrok = lowered.includes('grok')
+
+	if (isClaude) {
+		Object.assign(body, {
+			tool_choice: { type: 'tool', name: 'AgentOutput' },
+			thinking: { type: 'disabled' },
+		})
+	}
+
+	if (isGrok) {
+		console.log('Applying Grok patch: removing tool_choice')
+		delete body.tool_choice
+		console.log('Applying Grok patch: disable reasoning and thinking')
+		body.thinking = { type: 'disabled', effort: 'minimal' }
+		body.reasoning = { enabled: false, effort: 'low' }
+	}
+
+	return body
+}
--- a/packages/page-agent/src/patches/antd.ts
+++ b/packages/page-agent/src/patches/antd.ts
@@ -0,0 +1,20 @@
+import type { PageAgent } from '../PageAgent'
+
+// Cleanup callbacks; the 'afterUpdate' listener registered in patchAntd runs them and
+// then empties the list. Nothing in this file pushes to it yet.
+const clearFunctions = [] as (() => void)[]
+
+/**
+ * antd's Select is rendered as a div wrapping an input, and all of the information lives
+ * on the input tag. The input is not visible and does not appear in the cleaned tree,
+ * so the intent here is to surface it.
+ *
+ * NOTE(review): currently a stub — it collects the combobox inputs but does nothing with them.
+ */
+function fixAntdSelect() {
+	const selects = [...document.querySelectorAll('input[role="combobox"]')]
+	// for (const select of selects) {}
+}
+
+/**
+ * Hook the antd workarounds into the agent's update cycle: apply the select fix before
+ * each update, then run and discard every queued cleanup callback afterwards.
+ */
+export function patchAntd(pageAgent: PageAgent) {
+	const runAndResetCleanups = () => {
+		for (const cleanup of clearFunctions) cleanup()
+		clearFunctions.length = 0
+	}
+
+	pageAgent.addEventListener('beforeUpdate', fixAntdSelect)
+	pageAgent.addEventListener('afterUpdate', runAndResetCleanups)
+}
--- a/packages/page-agent/src/patches/react.ts
+++ b/packages/page-agent/src/patches/react.ts
@@ -0,0 +1,16 @@
+import type { PageAgent } from '../PageAgent'
+
+/**
+ * Tag likely React root containers with `data-page-agent-not-interactive="true"`.
+ * Only elements present in the document at call time are affected.
+ */
+export function patchReact(pageAgent: PageAgent) {
+	const rootSelector =
+		'[data-reactroot], [data-reactid], [data-react-checksum], #root, #app, [id^="root-"], [id^="app-"], #adex-wrapper, #adex-root'
+
+	document.querySelectorAll(rootSelector).forEach((rootElement) => {
+		rootElement.setAttribute('data-page-agent-not-interactive', 'true')
+	})
+}
+
+/**
+ * @todo (Heavy, might have false negatives) Interaction detection, if element width/height equals body offsetWidth/Height, consider it root element and non-interactive (React often attaches many events to root elements, causing false positives)
+ */
--- a/packages/page-agent/src/prompts/system_prompt.md
+++ b/packages/page-agent/src/prompts/system_prompt.md
@@ -0,0 +1,156 @@
+You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in <user_request>.
+
+<intro>
+You excel at the following tasks:
+1. Navigating complex websites and extracting precise information
+2. Automating form submissions and interactive web actions
+3. Gathering and saving information 
+4. Operate effectively in an agent loop
+5. Efficiently performing diverse web tasks
+</intro>
+
+<language_settings>
+- Default working language: **中文**
+- Use the language that user is using. Return in user's language.
+</language_settings>
+
+<input>
+At every step, your input will consist of: 
+1. <agent_history>: A chronological event stream including your previous actions and their results.
+2. <agent_state>: Current <user_request> and <step_info>.
+3. <browser_state>: Current URL, interactive elements indexed for actions, and visible page content.
+</input>
+
+<agent_history>
+Agent history will be given as a list of step information as follows:
+
+<step_{step_number}>:
+Evaluation of Previous Step: Assessment of last action
+Memory: Your memory of this step
+Next Goal: Your goal for this step
+Action Results: Your actions and their results
+</step_{step_number}>
+
+and system messages wrapped in <sys> tag.
+</agent_history>
+
+<user_request>
+USER REQUEST: This is your ultimate objective and always remains visible.
+- This has the highest priority. Make the user happy.
+- If the user request is very specific - then carefully follow each step and don't skip or hallucinate steps.
+- If the task is open ended you can plan yourself how to get it done.
+</user_request>
+
+<browser_state>
+1. Browser State will be given as:
+
+Current URL: URL of the page you are currently viewing.
+Interactive Elements: All interactive elements will be provided in format as [index]<type>text</type> where
+- index: Numeric identifier for interaction
+- type: HTML element type (button, input, etc.)
+- text: Element description
+
+Examples:
+[33]<div>User form</div>
+\t*[35]<button aria-label='Submit form'>Submit</button>
+
+Note that:
+- Only elements with numeric indexes in [] are interactive
+- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
+- Elements tagged with `*[` are the new clickable elements that appeared on the website since the last step - if url has not changed.
+- Pure text elements without [] are not interactive.
+</browser_state>
+
+<browser_rules>
+Strictly follow these rules while using the browser and navigating the web:
+- Only interact with elements that have a numeric [index] assigned.
+- Only use indexes that are explicitly provided.
+- If the page changes after, for example, an input text action, analyze if you need to interact with new elements, e.g. selecting the right option from the list.
+- By default, only elements in the visible viewport are listed. Use scrolling actions if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page.
+- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages).
+- All scrollable elements are marked with a `data-scrollable` attribute, which includes the scrollable distance in every direction. You can scroll *the element* itself if some of its content overflows.
+- If a captcha appears, tell the user that you cannot solve captchas. Finish the task and ask the user to solve it.
+- If expected elements are missing, try scrolling, or navigating back.
+- If the page is not fully loaded, use the `wait` action.
+- Do not repeat one action for more than 3 times unless some conditions changed.
+- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
+- If the <user_request> includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient.
+- The <user_request> is the ultimate goal. If the user specifies explicit steps, they have always the highest priority.
+- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion.
+- Don't login into a page if you don't have to. Don't login if you don't have the credentials. 
+- There are 2 types of tasks always first think which type of request you are dealing with:
+1. Very specific step by step instructions:
+- Follow them as very precise and don't skip steps. Try to complete everything as requested.
+2. Open ended tasks. Plan yourself, be creative in achieving them.
+- If you get stuck e.g. with logins or captcha in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search.
+</browser_rules>
+
+<capability>
+- You can only handle single page app. Do not jump out of current page.
+- Do not click on a link if it will open in a new page (e.g. <a target="_blank">)
+- It is ok to fail the task.
+	- User can be wrong. If the request of user is not achievable, inappropriate or you do not have enough information or tools to achieve it. Tell user to make a better request.
+	- Webpage can be broken. All webpages or apps have bugs. Some bug will make it hard for your job. It's encouraged to tell user the problem of current page. Your feedbacks (including failing) are valuable for user.
+	- Trying too hard can be harmful. Repeating some action back and forth or pushing for a complex procedure with little knowledge can cause unwanted results and harmful side-effects. The user would rather have you end the task with a failure.
+- If you are not clear about the request or steps. `ask_user` to clarify it.
+- If you do not have knowledge for the current webpage or task. You must require user to give specific instructions and detailed steps.
+</capability>
+
+<task_completion_rules>
+You must call the `done` action in any of the following cases:
+- When you have fully completed the USER REQUEST.
+- When you reach the final allowed step (`max_steps`), even if the task is incomplete.
+- When you feel stuck or unable to solve user request. Or user request is not clear or contains inappropriate content.
+- If it is ABSOLUTELY IMPOSSIBLE to continue.
+
+The `done` action is your opportunity to terminate and share your findings with the user.
+- Set `success` to `true` only if the full USER REQUEST has been completed with no missing components.
+- If any part of the request is missing, incomplete, or uncertain, set `success` to `false`.
+- You can use the `text` field of the `done` action to communicate your findings and to provide a coherent reply to the user and fulfill the USER REQUEST.
+- You are ONLY ALLOWED to call `done` as a single action. Don't call it together with other actions.
+- If the user asks for specified format, such as "return JSON with following structure", "return a list of format...", MAKE sure to use the right format in your answer.
+- If the user asks for a structured output, your `done` action's schema may be modified. Take this schema into account when solving the task!
+</task_completion_rules>
+
+<reasoning_rules>
+Exhibit the following reasoning patterns to successfully achieve the <user_request>:
+
+- Reason about <agent_history> to track progress and context toward <user_request>.
+- Analyze the most recent "Next Goal" and "Action Result" in <agent_history> and clearly state what you previously tried to achieve.
+- Analyze all relevant items in <agent_history> and <browser_state> to understand your state.
+- Explicitly judge success/failure/uncertainty of the last action. Never assume an action succeeded just because it appears to be executed in your last step in <agent_history>. If the expected change is missing, mark the last action as failed (or uncertain) and plan a recovery.
+- Analyze whether you are stuck, e.g. when you repeat the same actions multiple times without any progress. Then consider alternative approaches e.g. scrolling for more context or ask user for help.
+- `ask_user` for help if you have any difficulty. Users want to be kept in the loop.
+- If you see information relevant to <user_request>, plan saving the information to memory.
+- Always reason about the <user_request>. Make sure to carefully analyze the specific steps and information required. E.g. specific filters, specific form fields, specific information to search. Make sure to always compare the current trajectory with the user request and think carefully if that's how the user requested it.
+</reasoning_rules>
+
+<examples>
+Here are examples of good output patterns. Use them as reference but never copy them directly.
+
+<evaluation_examples>
+- Positive Examples:
+"evaluation_previous_goal": "Successfully navigated to the product page and found the target information. Verdict: Success"
+"evaluation_previous_goal": "Clicked the login button and user authentication form appeared. Verdict: Success"
+</evaluation_examples>
+
+<memory_examples>
+"memory": "Found many pending reports that need to be analyzed in the main page. Successfully processed the first 2 reports on quarterly sales data and moving on to inventory analysis and customer feedback reports."
+</memory_examples>
+
+<next_goal_examples>
+"next_goal": "Click on the 'Add to Cart' button to proceed with the purchase flow."
+"next_goal": "Extract details from the first item on the page."
+</next_goal_examples>
+</examples>
+
+<output>
+You must ALWAYS respond with a valid JSON in this exact format:
+
+{
+  "evaluation_previous_goal": "Concise one-sentence analysis of your last action. Clearly state success, failure, or uncertain.",
+  "memory": "1-3 concise sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.",
+  "next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.",
+  "action":{"one_action_name": {// action-specific parameter}}
+}
+</output>
--- a/packages/page-agent/src/tools/actions.ts
+++ b/packages/page-agent/src/tools/actions.ts
@@ -0,0 +1,430 @@
+/**
+ * Copyright (C) 2025 Alibaba Group Holding Limited
+ * All rights reserved.
+ */
+import type { PageAgent } from '../PageAgent'
+
+// ======= general utils =======
+
+/**
+ * Resolve after roughly `seconds` seconds (timer-based, so fractions are allowed).
+ */
+export async function waitFor(seconds: number): Promise<void> {
+	const delayMs = seconds * 1000
+	await new Promise((done) => {
+		setTimeout(done, delayMs)
+	})
+}
+
+// Last URL that was reported through getSystemInfo (initialised at module load).
+let currentUrl = window.location.href
+/**
+ * Produce a `<sys>` note when the page URL differs from the last reported one.
+ * Returns an empty string when nothing changed; otherwise waits 0.3s, records the
+ * (re-read) URL and returns the note.
+ */
+export async function getSystemInfo() {
+	const urlChanged = currentUrl !== window.location.href
+	if (urlChanged) {
+		await waitFor(0.3) // Wait a bit longer for page to load
+
+		// Re-read after the wait: the location may have moved again in the meantime.
+		currentUrl = window.location.href
+		return `\n<sys> Current URL changed to: ${currentUrl} </sys>`
+	}
+
+	// If current URL is already up to date, no need to add message
+	return ''
+}
+
+// ======= dom utils =======
+
+/**
+ * Announce the centre of `element`'s bounding box via a `PageAgent::MovePointerTo`
+ * window event, then pause 0.3s.
+ */
+export async function movePointerToElement(element: HTMLElement) {
+	const { left, top, width, height } = element.getBoundingClientRect()
+	const detail = {
+		x: left + width / 2,
+		y: top + height / 2,
+	}
+
+	window.dispatchEvent(new CustomEvent('PageAgent::MovePointerTo', { detail }))
+
+	await waitFor(0.3)
+}
+
+/**
+ * Resolve an interactive-element index to its live HTMLElement using
+ * `pageAgent.selectorMap`.
+ *
+ * @throws Error when the index is unknown, the node carries no element reference, or the
+ * reference is not an HTMLElement.
+ */
+export function getElementByIndex(pageAgent: PageAgent, index: number): HTMLElement {
+	const node = pageAgent.selectorMap.get(index)
+	if (!node) throw new Error(`No interactive element found at index ${index}`)
+
+	const { ref } = node
+	if (!ref) throw new Error(`Element at index ${index} does not have a reference`)
+
+	if (ref instanceof HTMLElement) return ref
+
+	throw new Error(`Element at index ${index} is not an HTMLElement`)
+}
+
+// Element most recently targeted by clickElement; null once it has been blurred.
+let lastClickedElement: HTMLElement | null = null
+
+/**
+ * Release the previously clicked element, if any: blur it, send it a bubbling
+ * `mouseout`, and forget it.
+ */
+function blurLastClickedElement() {
+	if (!lastClickedElement) return
+
+	lastClickedElement.blur()
+	const leaveEvent = new MouseEvent('mouseout', { bubbles: true, cancelable: true })
+	lastClickedElement.dispatchEvent(leaveEvent)
+	lastClickedElement = null
+}
+
+/**
+ * Perform a synthetic click on `element`: release the previous target, scroll the new
+ * one into view, move the pointer, then dispatch hover, press, focus, release and click.
+ */
+export async function clickElement(element: HTMLElement) {
+	const fireMouseEvent = (type: string) =>
+		element.dispatchEvent(new MouseEvent(type, { bubbles: true, cancelable: true }))
+
+	blurLastClickedElement()
+	lastClickedElement = element
+
+	await scrollIntoViewIfNeeded(element)
+	await movePointerToElement(element)
+	window.dispatchEvent(new CustomEvent('PageAgent::ClickPointer'))
+	await waitFor(0.1)
+
+	// hover, then press
+	for (const type of ['mouseenter', 'mouseover', 'mousedown']) fireMouseEvent(type)
+
+	// focus between press and release so the element is focused when the click lands
+	element.focus()
+
+	for (const type of ['mouseup', 'click']) fireMouseEvent(type)
+
+	// give click handlers a moment to run
+	await waitFor(0.1)
+}
+
+// Setter of the `value` property as defined on HTMLInputElement.prototype.
+// inputTextElement invokes it with `.call(element, text)` instead of assigning
+// `element.value` directly — presumably so per-instance `value` overrides installed by
+// UI frameworks are bypassed; confirm.
+// eslint-disable-next-line @typescript-eslint/unbound-method
+const nativeInputValueSetter = Object.getOwnPropertyDescriptor(
+	window.HTMLInputElement.prototype,
+	'value'
+)!.set!
+
+// Same as above, for HTMLTextAreaElement.prototype.
+// eslint-disable-next-line @typescript-eslint/unbound-method
+const nativeTextAreaValueSetter = Object.getOwnPropertyDescriptor(
+	window.HTMLTextAreaElement.prototype,
+	'value'
+)!.set!
+
+/**
+ * Dispatch a synthetic key press on `elem`: `keydown`, then (for inputs and textareas
+ * only) `beforeinput` and `input`, then `keyup`, with 10ms pauses in between.
+ * Only `key` is populated on the keyboard events.
+ */
+export async function createSyntheticInputEvent(elem: HTMLElement, key: string) {
+	const keyInit = { bubbles: true, cancelable: true, key }
+
+	elem.dispatchEvent(new KeyboardEvent('keydown', keyInit))
+	await waitFor(0.01)
+
+	const isTextField = elem instanceof HTMLInputElement || elem instanceof HTMLTextAreaElement
+	if (isTextField) {
+		for (const type of ['beforeinput', 'input']) {
+			elem.dispatchEvent(new Event(type, { bubbles: true }))
+			await waitFor(0.01)
+		}
+	}
+
+	elem.dispatchEvent(new KeyboardEvent('keyup', keyInit))
+}
+
+/**
+ * Replace the value of an input or textarea with `text`.
+ * The element is clicked first, the value is written through the prototype-level setter,
+ * a bubbling `input` event is dispatched, and finally the element is released again.
+ *
+ * @throws Error when `element` is neither an input nor a textarea.
+ */
+export async function inputTextElement(element: HTMLElement, text: string) {
+	const isTextArea = element instanceof HTMLTextAreaElement
+	if (!isTextArea && !(element instanceof HTMLInputElement)) {
+		throw new Error('Element is not an input or textarea')
+	}
+
+	await clickElement(element)
+
+	const writeValue = isTextArea ? nativeTextAreaValueSetter : nativeInputValueSetter
+	writeValue.call(element, text)
+
+	element.dispatchEvent(new Event('input', { bubbles: true }))
+
+	// let input listeners settle before releasing the element
+	await waitFor(0.1)
+
+	blurLastClickedElement()
+}
+
+/**
+ * Select the option of a `<select>` whose trimmed text equals the trimmed `optionText`,
+ * then dispatch a bubbling `change` event.
+ *
+ * @throws Error when the element is not a select, or no option has that text.
+ * @todo browser-use version is very complex and supports menu tags, need to follow up
+ */
+export async function selectOptionElement(selectElement: HTMLSelectElement, optionText: string) {
+	const isSelect = selectElement instanceof HTMLSelectElement
+	if (!isSelect) {
+		throw new Error('Element is not a select element')
+	}
+
+	let matched: HTMLOptionElement | undefined
+	for (const candidate of Array.from(selectElement.options)) {
+		if (candidate.textContent?.trim() === optionText.trim()) {
+			matched = candidate
+			break
+		}
+	}
+
+	if (!matched) {
+		throw new Error(`Option with text "${optionText}" not found in select element`)
+	}
+
+	selectElement.value = matched.value
+	const changeEvent = new Event('change', { bubbles: true })
+	selectElement.dispatchEvent(changeEvent)
+
+	// let change listeners settle
+	await waitFor(0.1)
+}
+
+/**
+ * Bring `element` into view, using the element's own `scrollIntoViewIfNeeded` when it
+ * has one and falling back to a centred, non-animated `scrollIntoView` otherwise.
+ */
+// eslint-disable-next-line @typescript-eslint/require-await
+export async function scrollIntoViewIfNeeded(element: HTMLElement) {
+	const target = element as any
+
+	if (!target.scrollIntoViewIfNeeded) {
+		// @todo visibility check
+		target.scrollIntoView({ behavior: 'auto', block: 'center', inline: 'nearest' })
+		return
+	}
+
+	target.scrollIntoViewIfNeeded()
+}
+
+/**
+ * Scroll vertically, either inside the container of a given element or at page level.
+ *
+ * With `element`: walk from the element up through its ancestors (at most 10 steps,
+ * stopping at body/html) and scroll the first one that has a scrollable `overflow-y`,
+ * overflowing content, and actually moves by more than 0.5px. That path scrolls by
+ * `scroll_amount / 3`, clamped to the remaining scroll range.
+ *
+ * Without `element`: pick a scroll target starting from the focused element and its
+ * ancestors, then the first suitable element in the document, then the document's
+ * scrolling element, and scroll it by `scroll_amount`.
+ *
+ * NOTE(review): `down` is never read in this function; the direction is taken solely
+ * from the sign of `scroll_amount` (unlike scrollHorizontally, which applies `right`).
+ * Confirm that callers pass a signed amount.
+ *
+ * @param down - Unused here (see note).
+ * @param scroll_amount - Pixels to scroll; positive values increase scrollTop.
+ * @param element - Optional element whose scroll container should be scrolled.
+ * @returns A human-readable description of what was (or was not) scrolled.
+ */
+export async function scrollVertically(
+	down: boolean,
+	scroll_amount: number,
+	element?: HTMLElement | null
+) {
+	// Element-specific scrolling if element is provided
+	if (element) {
+		const targetElement = element
+		console.log(
+			'[SCROLL DEBUG] Starting direct container scroll for element:',
+			targetElement.tagName
+		)
+
+		let currentElement = targetElement as HTMLElement | null
+		let scrollSuccess = false
+		let scrolledElement: HTMLElement | null = null
+		let scrollDelta = 0
+		let attempts = 0
+		const dy = scroll_amount
+
+		// Climb the ancestor chain until something scrolls or the step budget runs out.
+		while (currentElement && attempts < 10) {
+			const computedStyle = window.getComputedStyle(currentElement)
+			const hasScrollableY = /(auto|scroll|overlay)/.test(computedStyle.overflowY)
+			const canScrollVertically = currentElement.scrollHeight > currentElement.clientHeight
+
+			console.log(
+				'[SCROLL DEBUG] Checking element:',
+				currentElement.tagName,
+				'hasScrollableY:',
+				hasScrollableY,
+				'canScrollVertically:',
+				canScrollVertically,
+				'scrollHeight:',
+				currentElement.scrollHeight,
+				'clientHeight:',
+				currentElement.clientHeight
+			)
+
+			if (hasScrollableY && canScrollVertically) {
+				const beforeScroll = currentElement.scrollTop
+				const maxScroll = currentElement.scrollHeight - currentElement.clientHeight
+
+				// Only a third of the requested amount is applied to containers.
+				let scrollAmount = dy / 3
+
+				// Clamp so the target position stays within [0, maxScroll].
+				if (scrollAmount > 0) {
+					scrollAmount = Math.min(scrollAmount, maxScroll - beforeScroll)
+				} else {
+					scrollAmount = Math.max(scrollAmount, -beforeScroll)
+				}
+
+				currentElement.scrollTop = beforeScroll + scrollAmount
+
+				// Read the position back to see whether the container really moved.
+				const afterScroll = currentElement.scrollTop
+				const actualScrollDelta = afterScroll - beforeScroll
+
+				console.log(
+					'[SCROLL DEBUG] Scroll attempt:',
+					currentElement.tagName,
+					'before:',
+					beforeScroll,
+					'after:',
+					afterScroll,
+					'delta:',
+					actualScrollDelta
+				)
+
+				// Movements of 0.5px or less are treated as "did not scroll"; keep climbing.
+				if (Math.abs(actualScrollDelta) > 0.5) {
+					scrollSuccess = true
+					scrolledElement = currentElement
+					scrollDelta = actualScrollDelta
+					console.log(
+						'[SCROLL DEBUG] Successfully scrolled container:',
+						currentElement.tagName,
+						'delta:',
+						actualScrollDelta
+					)
+					break
+				}
+			}
+
+			// Do not climb past the document root.
+			if (currentElement === document.body || currentElement === document.documentElement) {
+				break
+			}
+			currentElement = currentElement.parentElement
+			attempts++
+		}
+
+		if (scrollSuccess) {
+			return `Scrolled container (${scrolledElement?.tagName}) by ${scrollDelta}px`
+		} else {
+			return `No scrollable container found for element (${targetElement.tagName})`
+		}
+	}
+
+	// Page-level scrolling (default or fallback)
+
+	const dy = scroll_amount
+	// A candidate must be at least half the viewport height.
+	const bigEnough = (el: HTMLElement) => el.clientHeight >= window.innerHeight * 0.5
+	// Scrollable overflow-y, overflowing content, and big enough.
+	const canScroll = (el: HTMLElement | null) =>
+		el &&
+		/(auto|scroll|overlay)/.test(getComputedStyle(el).overflowY) &&
+		el.scrollHeight > el.clientHeight &&
+		bigEnough(el)
+
+	// Start from the focused element and climb until a scrollable ancestor or body.
+	let el: HTMLElement | null = document.activeElement as HTMLElement | null
+	while (el && !canScroll(el) && el !== document.body) el = el.parentElement
+
+	// Otherwise: first scrollable element in the document, then the document scroller.
+	el = canScroll(el)
+		? el
+		: Array.from(document.querySelectorAll<HTMLElement>('*')).find(canScroll) ||
+			(document.scrollingElement as HTMLElement) ||
+			(document.documentElement as HTMLElement)
+
+	if (el === document.scrollingElement || el === document.documentElement || el === document.body) {
+		window.scrollBy(0, dy)
+		return `✅ Scrolled page by ${dy}px.`
+	} else {
+		el!.scrollBy({ top: dy, behavior: 'smooth' })
+		await waitFor(0.1) // Animation playback
+		return `✅ Scrolled container (${el!.tagName}) by ${dy}px.`
+	}
+}
+
+/**
+ * Scroll horizontally, either inside the container of a given element or at page level.
+ *
+ * With `element`: walk from the element up through its ancestors (at most 10 steps,
+ * stopping at body/html) and scroll the first one that has a scrollable `overflow-x`,
+ * overflowing content, and actually moves by more than 0.5px. That path scrolls by a
+ * third of the requested distance, clamped to the remaining scroll range.
+ *
+ * Without `element`: pick a scroll target starting from the focused element and its
+ * ancestors, then the first suitable element in the document, then the document's
+ * scrolling element, and scroll it by the full distance.
+ *
+ * @param right - Direction: `scroll_amount` is applied as-is when true, negated when false.
+ * @param scroll_amount - Distance in pixels.
+ * @param element - Optional element whose scroll container should be scrolled.
+ * @returns A human-readable description of what was (or was not) scrolled.
+ */
+export async function scrollHorizontally(
+	right: boolean,
+	scroll_amount: number,
+	element?: HTMLElement | null
+) {
+	// Element-specific scrolling if element is provided
+	if (element) {
+		const targetElement = element
+		console.log(
+			'[SCROLL DEBUG] Starting direct container scroll for element:',
+			targetElement.tagName
+		)
+
+		let currentElement = targetElement as HTMLElement | null
+		let scrollSuccess = false
+		let scrolledElement: HTMLElement | null = null
+		let scrollDelta = 0
+		let attempts = 0
+		// Signed distance: positive moves right, negative moves left.
+		const dx = right ? scroll_amount : -scroll_amount
+
+		// Climb the ancestor chain until something scrolls or the step budget runs out.
+		while (currentElement && attempts < 10) {
+			const computedStyle = window.getComputedStyle(currentElement)
+			const hasScrollableX = /(auto|scroll|overlay)/.test(computedStyle.overflowX)
+			const canScrollHorizontally = currentElement.scrollWidth > currentElement.clientWidth
+
+			console.log(
+				'[SCROLL DEBUG] Checking element:',
+				currentElement.tagName,
+				'hasScrollableX:',
+				hasScrollableX,
+				'canScrollHorizontally:',
+				canScrollHorizontally,
+				'scrollWidth:',
+				currentElement.scrollWidth,
+				'clientWidth:',
+				currentElement.clientWidth
+			)
+
+			if (hasScrollableX && canScrollHorizontally) {
+				const beforeScroll = currentElement.scrollLeft
+				const maxScroll = currentElement.scrollWidth - currentElement.clientWidth
+
+				// Only a third of the requested distance is applied to containers.
+				let scrollAmount = dx / 3
+
+				// Clamp so the target position stays within [0, maxScroll].
+				if (scrollAmount > 0) {
+					scrollAmount = Math.min(scrollAmount, maxScroll - beforeScroll)
+				} else {
+					scrollAmount = Math.max(scrollAmount, -beforeScroll)
+				}
+
+				currentElement.scrollLeft = beforeScroll + scrollAmount
+
+				// Read the position back to see whether the container really moved.
+				const afterScroll = currentElement.scrollLeft
+				const actualScrollDelta = afterScroll - beforeScroll
+
+				console.log(
+					'[SCROLL DEBUG] Scroll attempt:',
+					currentElement.tagName,
+					'before:',
+					beforeScroll,
+					'after:',
+					afterScroll,
+					'delta:',
+					actualScrollDelta
+				)
+
+				// Movements of 0.5px or less are treated as "did not scroll"; keep climbing.
+				if (Math.abs(actualScrollDelta) > 0.5) {
+					scrollSuccess = true
+					scrolledElement = currentElement
+					scrollDelta = actualScrollDelta
+					console.log(
+						'[SCROLL DEBUG] Successfully scrolled container:',
+						currentElement.tagName,
+						'delta:',
+						actualScrollDelta
+					)
+					break
+				}
+			}
+
+			// Do not climb past the document root.
+			if (currentElement === document.body || currentElement === document.documentElement) {
+				break
+			}
+			currentElement = currentElement.parentElement
+			attempts++
+		}
+
+		if (scrollSuccess) {
+			return `Scrolled container (${scrolledElement?.tagName}) horizontally by ${scrollDelta}px`
+		} else {
+			return `No horizontally scrollable container found for element (${targetElement.tagName})`
+		}
+	}
+
+	// Page-level scrolling (default or fallback)
+
+	const dx = right ? scroll_amount : -scroll_amount
+	// A candidate must be at least half the viewport width.
+	const bigEnough = (el: HTMLElement) => el.clientWidth >= window.innerWidth * 0.5
+	// Scrollable overflow-x, overflowing content, and big enough.
+	const canScroll = (el: HTMLElement | null) =>
+		el &&
+		/(auto|scroll|overlay)/.test(getComputedStyle(el).overflowX) &&
+		el.scrollWidth > el.clientWidth &&
+		bigEnough(el)
+
+	// Start from the focused element and climb until a scrollable ancestor or body.
+	let el: HTMLElement | null = document.activeElement as HTMLElement | null
+	while (el && !canScroll(el) && el !== document.body) el = el.parentElement
+
+	// Otherwise: first scrollable element in the document, then the document scroller.
+	el = canScroll(el)
+		? el
+		: Array.from(document.querySelectorAll<HTMLElement>('*')).find(canScroll) ||
+			(document.scrollingElement as HTMLElement) ||
+			(document.documentElement as HTMLElement)
+
+	if (el === document.scrollingElement || el === document.documentElement || el === document.body) {
+		window.scrollBy(dx, 0)
+		return `✅ Scrolled page horizontally by ${dx}px`
+	} else {
+		el!.scrollBy({ left: dx, behavior: 'smooth' })
+		await waitFor(0.1) // Animation playback
+		return `✅ Scrolled container (${el!.tagName}) horizontally by ${dx}px`
+	}
+}
--- a/packages/page-agent/src/tools/index.ts
+++ b/packages/page-agent/src/tools/index.ts
@@ -0,0 +1,243 @@
+/**
+ * Internal tools for PageAgent.
+ * @note Adapted from browser-use
+ */
+import zod, { type z } from 'zod'
+
+import type { PageAgent } from '../PageAgent'
+import {
+	clickElement,
+	getElementByIndex,
+	getSystemInfo,
+	inputTextElement,
+	scrollHorizontally,
+	scrollVertically,
+	selectOptionElement,
+	waitFor,
+} from './actions'
+// debug
+import * as utils from './actions'
+
+// Debug aid: publishes every export of './actions' as `window.utils`.
+// NOTE(review): this assignment is unconditional, so it also runs outside of
+// debugging sessions — confirm that is intended.
+// @ts-expect-error debug only
+window.utils = utils
+
+/**
+ * Internal tool definition that has access to PageAgent `this` context.
+ *
+ * @typeParam TParams - Type of the arguments object passed to `execute`,
+ * as described by `inputSchema`.
+ */
+export interface PageAgentTool<TParams = any> {
+	// name: string
+	/** Natural-language explanation of what the tool does and how to call it. */
+	description: string
+	/** Zod schema describing the tool's arguments. */
+	inputSchema: z.ZodType<TParams>
+	/** Runs the tool with a PageAgent bound as `this`; resolves to a text result. */
+	execute: (this: PageAgent, args: TParams) => Promise<string>
+}
+
+/**
+ * Identity helper for declaring a tool: returns `options` unchanged.
+ * Passing an object literal through it types the literal as `PageAgentTool<TParams>`.
+ */
+export function tool<TParams>(options: PageAgentTool<TParams>): PageAgentTool<TParams> {
+	return options
+}
+
+/**
+ * Registry of the internal PageAgent tools, keyed by tool name.
+ * Note: the value type relies on the default `any` type parameter so that
+ * tools with different parameter types can share one map.
+ */
+export const tools = new Map<string, PageAgentTool>()
+
+// tools.set(
+// 	'get_current_html',
+// 	tool({
+// 		description: 'Get the current (updated) simplified HTML of the page',
+// 		inputSchema: zod.object({}),
+// 		execute: function (this: PageAgent) {
+// 			this.updateTree()
+// 			return this.simplifiedHTML
+// 		},
+// 	})
+// )
+
+// `done`: lets the model hand its final summary back to the user.
+tools.set(
+	'done',
+	tool({
+		description:
+			'Complete task - provide a summary of results for the user. Set success=True if task completed successfully, false otherwise. Text should be your response to the user summarizing results.',
+		inputSchema: zod.object({
+			text: zod.string(),
+			success: zod.boolean().default(true),
+		}),
+		// @note The main loop handles the `done` arguments itself; this only acknowledges the call.
+		async execute(this: PageAgent, _input) {
+			// this.onDone(input.text, input.success)
+			return 'Task completed'
+		},
+	})
+)
+
+// `wait`: pauses the agent for a bounded number of seconds.
+tools.set(
+	'wait',
+	tool({
+		description:
+			'Wait for x seconds. default 1s (max 10 seconds, min 1 second). This can be used to wait until the page or data is fully loaded.',
+		inputSchema: zod.object({
+			seconds: zod.number().min(1).max(10).default(1),
+		}),
+		/**
+		 * Sleeps for the requested time minus the time that has already passed
+		 * since `this.lastTimeUpdate`, never less than zero.
+		 */
+		async execute(this: PageAgent, { seconds }) {
+			const since = this.lastTimeUpdate
+			const elapsedSeconds = (Date.now() - since) / 1000
+			const actualWaitTime = Math.max(0, seconds - elapsedSeconds)
+			console.log(`actualWaitTime: ${actualWaitTime} seconds`)
+			await waitFor(actualWaitTime)
+			const systemInfo = await getSystemInfo()
+			return `✅ Waited for ${seconds} seconds.` + systemInfo
+		},
+	})
+)
+
+// `ask_user`: forwards a question to the panel and waits for the reply.
+tools.set(
+	'ask_user',
+	tool({
+		description:
+			'Ask the user a question and wait for their answer. Use this if you need more information or clarification.',
+		inputSchema: zod.object({
+			question: zod.string(),
+		}),
+		/** Resolves once `this.panel.askUser` has produced an answer. */
+		async execute(this: PageAgent, { question }) {
+			const reply = await this.panel.askUser(question)
+			const systemInfo = await getSystemInfo()
+			return `✅ Received user answer: ${reply}` + systemInfo
+		},
+	})
+)
+
+// `click_element_by_index`: clicks the element registered under the given index.
+tools.set(
+	'click_element_by_index',
+	tool({
+		description: 'Click element by index',
+		inputSchema: zod.object({
+			index: zod.int().min(0),
+		}),
+		async execute(this: PageAgent, { index }) {
+			const target = getElementByIndex(this, index)
+			// Report the element by its recorded text when there is one, else by its index
+			const label = this.elementTextMap.get(index) || index
+			await clickElement(target)
+
+			// @workaround: links that open in a new tab get a warning instead of the usual result
+			const opensNewTab = target instanceof HTMLAnchorElement && target.target === '_blank'
+			if (opensNewTab) {
+				return `⚠️ Clicked link that opens in a new tab (${label}). You are not capable of reading new tabs.`
+			}
+
+			const systemInfo = await getSystemInfo()
+			return `✅ Clicked element (${label}).` + systemInfo
+		},
+	})
+)
+
+// `input_text`: types text into the element registered under the given index.
+tools.set(
+	'input_text',
+	tool({
+		description: 'Click and input text into a input interactive element',
+		inputSchema: zod.object({
+			index: zod.int().min(0),
+			text: zod.string(),
+		}),
+		async execute(this: PageAgent, { index, text }) {
+			const field = getElementByIndex(this, index)
+			// Report the element by its recorded text when there is one, else by its index
+			const label = this.elementTextMap.get(index) || index
+			await inputTextElement(field, text)
+			const systemInfo = await getSystemInfo()
+			return `✅ Input text (${text}) into element (${label}).` + systemInfo
+		},
+	})
+)
+
+// `select_dropdown_option`: picks an option, identified by its text, in the indexed element.
+tools.set(
+	'select_dropdown_option',
+	tool({
+		description:
+			'Select dropdown option for interactive element index by the text of the option you want to select',
+		inputSchema: zod.object({
+			index: zod.int().min(0),
+			text: zod.string(),
+		}),
+		async execute(this: PageAgent, { index, text }) {
+			// The element is handed on as a <select>; no runtime check is made here
+			const dropdown = getElementByIndex(this, index) as HTMLSelectElement
+			// Report the element by its recorded text when there is one, else by its index
+			const label = this.elementTextMap.get(index) || index
+			await selectOptionElement(dropdown, text)
+			const systemInfo = await getSystemInfo()
+			return `✅ Selected option (${text}) in element (${label}).` + systemInfo
+		},
+	})
+)
+
+/**
+ * `scroll`: vertical scrolling of the page, or of the scroll container around
+ * an indexed element.
+ * @note Reference from browser-use
+ */
+tools.set(
+	'scroll',
+	tool({
+		description:
+			'Scroll the page by specified number of pages (set down=True to scroll down, down=False to scroll up, num_pages=number of pages to scroll like 0.5 for half page, 1.0 for one page, etc.). Optional index parameter to scroll within a specific element or its scroll container (works well for dropdowns and custom UI components). Optional pixels parameter to scroll by a specific number of pixels instead of pages.',
+		inputSchema: zod.object({
+			down: zod.boolean().default(true),
+			num_pages: zod.number().min(0).max(10).optional().default(0.1),
+			pixels: zod.number().int().min(0).optional(),
+			index: zod.number().int().min(0).optional(),
+		}),
+		/**
+		 * Converts the request into a signed pixel distance (positive = down) and
+		 * delegates to `scrollVertically`.
+		 */
+		execute: async function (this: PageAgent, input) {
+			const { down, num_pages, index, pixels } = input
+
+			// A truthy `pixels` wins over `num_pages`; 0 or undefined falls back to pages.
+			const distance = pixels ? pixels : num_pages * window.innerHeight
+			// Apply the direction to both units. Previously only the `num_pages` branch
+			// was signed, so `down=false` combined with `pixels` was passed on unsigned.
+			// NOTE(review): assumes scrollVertically treats scroll_amount as signed, as the
+			// `num_pages` branch always did — confirm against its implementation.
+			const scroll_amount = distance * (down ? 1 : -1)
+
+			const element = index !== undefined ? getElementByIndex(this, index) : null
+
+			return (await scrollVertically(down, scroll_amount, element)) + (await getSystemInfo())
+		},
+	})
+)
+
+// `scroll_horizontally`: horizontal scrolling of the page, or of the scroll
+// container around an indexed element.
+tools.set(
+	'scroll_horizontally',
+	tool({
+		description:
+			'Scroll the page or element horizontally (set right=True to scroll right, right=False to scroll left, pixels=number of pixels to scroll). Optional index parameter to scroll within a specific element or its scroll container (works well for wide tables).',
+		inputSchema: zod.object({
+			right: zod.boolean().default(true),
+			pixels: zod.number().int().min(0),
+			index: zod.number().int().min(0).optional(),
+		}),
+		/** Delegates to `scrollHorizontally` with the direction flag and an unsigned distance. */
+		execute: async function (this: PageAgent, input) {
+			const { right, pixels, index } = input
+
+			// scrollHorizontally derives the sign itself: its page-level path computes
+			// `dx = right ? scroll_amount : -scroll_amount`. Negating here as well turned
+			// `right=false` back into a rightward scroll, so pass the plain magnitude.
+			// NOTE(review): confirm the element-specific path of scrollHorizontally applies
+			// `right` the same way.
+			const scroll_amount = pixels
+
+			const element = index !== undefined ? getElementByIndex(this, index) : null
+
+			return (await scrollHorizontally(right, scroll_amount, element)) + (await getSystemInfo())
+		},
+	})
+)
+
+// `execute_javascript`: evaluates a script supplied through the tool arguments.
+// SECURITY NOTE(review): the script text is passed to `eval` verbatim. Because this
+// is a direct `eval` that builds an arrow function, the script runs with access to
+// this function's scope, including `this` (the PageAgent) and `input`. Make sure
+// only trusted input can reach this tool.
+tools.set(
+	'execute_javascript',
+	tool({
+		description:
+			'Execute JavaScript code on the current page. Supports async/await syntax. Use with caution!',
+		inputSchema: zod.object({
+			script: zod.string(),
+		}),
+		execute: async function (this: PageAgent, input) {
+			try {
+				// Wrap script in async function to support await
+				const asyncFunction = eval(`(async () => { ${input.script} })`)
+				const result = await asyncFunction()
+				// The result is interpolated into the message, i.e. converted with its string form
+				return `✅ Executed JavaScript. Result: ${result}` + (await getSystemInfo())
+			} catch (error) {
+				// Both syntax errors raised by `eval` and errors thrown by the script end up here
+				return `❌ Error executing JavaScript: ${error}` + (await getSystemInfo())
+			}
+		},
+	})
+)
+
+// @todo get_dropdown_options
+// @todo select_dropdown_option (a first version is registered above; revisit or drop this entry)
+// @todo send_keys
+// @todo upload_file
+// @todo go_back
+// @todo extract_structured_data
--- a/packages/page-agent/src/ui/Panel.module.css
+++ b/packages/page-agent/src/ui/Panel.module.css
@@ -0,0 +1,597 @@
+/* Panel root: fixed, horizontally centered container that starts transparent and shifted 20px down */
+.wrapper {
+	position: fixed;
+	bottom: 100px;
+	left: 50%;
+	transform: translateX(-50%) translateY(20px);
+	opacity: 0;
+	z-index: 2147483642; /* One layer above SimulatorMask */
+	box-sizing: border-box;
+
+	overflow: visible;
+
+	* {
+		box-sizing: border-box;
+	}
+
+	--width: 360px;
+	--height: 40px;
+	--border-radius: 12px;
+
+	--side-space: 12px; /* Spacing on each side of the control bar */
+	--history-width: calc(var(--width) - var(--side-space) * 2);
+
+	--color-1: rgb(57, 182, 255);
+	--color-2: rgb(189, 69, 251);
+	--color-3: rgb(255, 87, 51);
+	--color-4: rgb(255, 214, 0);
+
+	width: var(--width);
+	height: var(--height);
+
+	transition: all 0.3s ease-in-out;
+
+	/* Responsive design */
+	@media (max-width: 480px) {
+		width: calc(100vw - 40px);
+		--width: calc(100vw - 40px);
+	}
+
+	/* Blurred gradient glow behind the control bar */
+	.background {
+		position: absolute;
+		inset: -2px -8px;
+		border-radius: calc(var(--border-radius) + 4px);
+		filter: blur(16px);
+		overflow: hidden;
+		/* mix-blend-mode: lighten; */
+		/* display: none; */
+
+		&::before {
+			content: '';
+			z-index: -1;
+			pointer-events: none;
+			position: absolute;
+			width: 100%;
+			height: 100%;
+			/* left: -100%; */
+			left: 0;
+			top: 0;
+
+			background-image: linear-gradient(
+				to bottom left,
+				var(--color-1),
+				var(--color-2),
+				var(--color-1)
+			);
+			animation: mask-running 2s linear infinite;
+		}
+		/* Same sweep with the gradient colors swapped, started 1s later */
+		&::after {
+			content: '';
+			z-index: -1;
+			pointer-events: none;
+			position: absolute;
+			width: 100%;
+			height: 100%;
+			left: 0;
+			top: 0;
+
+			background-image: linear-gradient(
+				to bottom left,
+				var(--color-2),
+				var(--color-1),
+				var(--color-2)
+			);
+			animation: mask-running 2s linear infinite;
+			animation-delay: 1s;
+		}
+	}
+}
+
+/* Horizontal sweep from -100% to 100% of the element's own width; used by the .background pseudo-elements */
+@keyframes mask-running {
+	from {
+		transform: translateX(-100%);
+	}
+	to {
+		transform: translateX(100%);
+	}
+}
+
+/* Control bar */
+.header {
+	display: flex;
+	align-items: center;
+	justify-content: space-between;
+	padding: 8px 12px;
+	user-select: none;
+
+	position: absolute;
+	inset: 0;
+
+	cursor: pointer;
+	flex-shrink: 0; /* Prevent the header from being shrunk */
+
+	background: rgba(0, 0, 0, 0.5);
+	backdrop-filter: blur(10px);
+	border-radius: var(--border-radius);
+	background-clip: padding-box;
+
+	box-shadow:
+		0 0 0px 2px rgba(255, 255, 255, 0.4),
+		0 0 5px 1px rgba(255, 255, 255, 0.3);
+
+	.statusSection {
+		display: flex;
+		align-items: center;
+		gap: 8px;
+		flex: 1;
+		min-height: 24px; /* Ensure vertical centering */
+
+		.indicator {
+			width: 6px;
+			height: 6px;
+			border-radius: 50%;
+			background: rgba(255, 255, 255, 0.5);
+			flex-shrink: 0;
+			animation: none; /* No animation by default */
+
+			/* Running states - animated */
+			&.thinking {
+				background: rgb(57, 182, 255);
+				animation: pulse 0.8s ease-in-out infinite;
+			}
+
+			&.tool_executing {
+				background: rgb(189, 69, 251);
+				animation: pulse 0.6s ease-in-out infinite;
+			}
+
+			&.retry {
+				background: rgb(255, 214, 0);
+				animation: retryPulse 1s ease-in-out infinite;
+			}
+
+			/* Idle states - no animation */
+			&.completed,
+			&.input,
+			&.output {
+				background: rgb(34, 197, 94);
+				animation: none;
+			}
+
+			&.error {
+				background: rgb(239, 68, 68);
+				animation: none;
+			}
+		}
+
+		.statusText {
+			color: white;
+			font-size: 12px;
+			line-height: 1;
+			font-weight: 500;
+			transition: all 0.3s ease-in-out;
+			position: relative;
+			overflow: hidden;
+			display: flex;
+			align-items: center;
+			min-height: 24px; /* Ensure vertical centering */
+
+			&.fadeOut {
+				animation: statusTextFadeOut 0.3s ease forwards;
+			}
+
+			&.fadeIn {
+				animation: statusTextFadeIn 0.3s ease forwards;
+			}
+		}
+	}
+
+	.controls {
+		display: flex;
+		align-items: center;
+		gap: 4px;
+
+		.controlButton {
+			width: 24px;
+			height: 24px;
+			border: none;
+			border-radius: 4px;
+			background: rgba(255, 255, 255, 0.1);
+			color: white;
+			cursor: pointer;
+			display: flex;
+			align-items: center;
+			justify-content: center;
+			font-size: 12px;
+			line-height: 1;
+
+			&:hover {
+				background: rgba(255, 255, 255, 0.2);
+			}
+		}
+
+		.pauseButton {
+			font-weight: 600;
+			&.paused {
+				background: rgba(34, 197, 94, 0.2); /* Green background indicates execution can be resumed */
+				color: rgb(34, 197, 94);
+
+				&:hover {
+					background: rgba(34, 197, 94, 0.3);
+				}
+			}
+		}
+
+		.stopButton {
+			background: rgba(239, 68, 68, 0.2);
+			color: rgb(255, 41, 41);
+			font-weight: 600;
+
+			&:hover {
+				background: rgba(239, 68, 68, 0.3);
+			}
+		}
+	}
+}
+
+/* Status text entrance: fades in while rising 5px into place (applied by .statusText.fadeIn) */
+@keyframes statusTextFadeIn {
+	0% {
+		opacity: 0;
+		transform: translateY(5px);
+	}
+	100% {
+		opacity: 1;
+		transform: translateY(0);
+	}
+}
+
+/* Status text exit: fades out while rising 5px (applied by .statusText.fadeOut) */
+@keyframes statusTextFadeOut {
+	0% {
+		opacity: 1;
+		transform: translateY(0);
+	}
+	100% {
+		opacity: 0;
+		transform: translateY(-5px);
+	}
+}
+
+/* History list positioned above the control bar; collapsed unless an ancestor has .expanded */
+.historySectionWrapper {
+	position: absolute;
+	width: var(--history-width);
+	bottom: var(--height);
+	left: var(--side-space);
+	z-index: -2;
+
+	padding-top: 0px;
+	visibility: collapse;
+	overflow: hidden;
+
+	transition: all 0.2s;
+
+	background: rgba(2, 0, 20, 0.5);
+	/* background: rgba(186, 186, 186, 0.2); */
+	backdrop-filter: blur(10px);
+
+	text-shadow: 0 0 1px rgba(0, 0, 0, 0.2);
+
+	border-top-left-radius: calc(var(--border-radius) + 4px);
+	border-top-right-radius: calc(var(--border-radius) + 4px);
+
+	/* border: 2px solid rgba(255, 255, 255, 0.8); */
+	border: 2px solid rgba(255, 255, 255, 0.4);
+	box-shadow: 0 4px 16px rgba(0, 0, 0, 0.6);
+
+	/* @media (prefers-color-scheme: dark) {
+		box-shadow:
+			0 8px 32px 0 rgba(0, 0, 0, 0.85),
+			0 2px 12px 0 rgba(57, 182, 255, 0.1);
+	} */
+
+	.expanded & {
+		padding-top: 8px;
+		visibility: visible;
+	}
+
+	.historySection {
+		position: relative;
+		overflow-y: auto;
+		overscroll-behavior: contain;
+		scrollbar-width: none;
+		max-height: 0;
+		padding-inline: 8px;
+
+		transition: max-height 0.2s;
+
+		.expanded & {
+			max-height: 400px;
+		}
+
+		.historyItem {
+			/* backdrop-filter: blur(10px); */
+			padding: 8px 10px;
+			margin-bottom: 6px;
+			background: linear-gradient(135deg, rgba(255, 255, 255, 0.08), rgba(255, 255, 255, 0.03));
+			border-radius: 8px;
+			border-left: 2px solid rgba(57, 182, 255, 0.5);
+			font-size: 12px;
+			color: white;
+			/* color: black; */
+			line-height: 1.3;
+			position: relative;
+			overflow: hidden;
+
+			/* Subtle inner shadow */
+			box-shadow:
+				inset 0 1px 0 rgba(255, 255, 255, 0.1),
+				0 1px 3px rgba(0, 0, 0, 0.1);
+
+			&::before {
+				content: '';
+				position: absolute;
+				top: 0;
+				left: 0;
+				right: 0;
+				height: 1px;
+				background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent);
+			}
+
+			&:hover {
+				background: linear-gradient(135deg, rgba(255, 255, 255, 0.12), rgba(255, 255, 255, 0.06));
+				/* transform: translateY(-1px); */
+				box-shadow:
+					inset 0 1px 0 rgba(255, 255, 255, 0.15),
+					0 2px 4px rgba(0, 0, 0, 0.15);
+			}
+
+			&:last-child {
+				margin-bottom: 10px;
+			}
+
+			&.completed,
+			&.input,
+			&.output {
+				border-left-color: rgb(34, 197, 94);
+				background: linear-gradient(135deg, rgba(34, 197, 94, 0.1), rgba(34, 197, 94, 0.05));
+			}
+
+			&.error {
+				border-left-color: rgb(239, 68, 68);
+				background: linear-gradient(135deg, rgba(239, 68, 68, 0.1), rgba(239, 68, 68, 0.05));
+			}
+
+			&.retry {
+				border-left-color: rgb(255, 214, 0);
+				background: linear-gradient(135deg, rgba(255, 214, 0, 0.1), rgba(255, 214, 0, 0.05));
+			}
+
+			/* Highlight a successful `done` result */
+			&.doneSuccess {
+				background: linear-gradient(
+					135deg,
+					rgba(34, 197, 94, 0.25),
+					rgba(34, 197, 94, 0.15),
+					rgba(34, 197, 94, 0.08)
+				);
+				border: none;
+				border-left: 4px solid rgb(34, 197, 94);
+				box-shadow:
+					0 4px 12px rgba(34, 197, 94, 0.3),
+					inset 0 1px 0 rgba(255, 255, 255, 0.2),
+					0 0 20px rgba(34, 197, 94, 0.1);
+				font-weight: 600;
+				color: rgb(220, 252, 231);
+				padding: 10px 12px;
+				margin-bottom: 8px;
+				border-radius: 8px;
+				position: relative;
+				overflow: hidden;
+
+				&::before {
+					background: linear-gradient(90deg, transparent, rgba(34, 197, 94, 0.4), transparent);
+				}
+
+				&::after {
+					content: '';
+					position: absolute;
+					top: 0;
+					left: -100%;
+					width: 100%;
+					height: 100%;
+					background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.1), transparent);
+					animation: shimmer 2s ease-in-out infinite;
+				}
+
+				.historyContent {
+					.statusIcon {
+						font-size: 16px;
+						animation: celebrate 0.8s ease-in-out;
+						filter: drop-shadow(0 2px 4px rgba(34, 197, 94, 0.5));
+					}
+				}
+			}
+
+			/* Highlight a failed `done` result */
+			&.doneError {
+				background: linear-gradient(
+					135deg,
+					rgba(239, 68, 68, 0.25),
+					rgba(239, 68, 68, 0.15),
+					rgba(239, 68, 68, 0.08)
+				);
+				border: none;
+				border-left: 4px solid rgb(239, 68, 68);
+				box-shadow:
+					0 4px 12px rgba(239, 68, 68, 0.3),
+					inset 0 1px 0 rgba(255, 255, 255, 0.2),
+					0 0 20px rgba(239, 68, 68, 0.1);
+				font-weight: 600;
+				color: rgb(254, 226, 226);
+				padding: 10px 12px;
+				margin-bottom: 8px;
+				border-radius: 8px;
+				position: relative;
+				overflow: hidden;
+
+				&::before {
+					background: linear-gradient(90deg, transparent, rgba(239, 68, 68, 0.4), transparent);
+				}
+
+				.historyContent {
+					.statusIcon {
+						font-size: 16px;
+						filter: drop-shadow(0 2px 4px rgba(239, 68, 68, 0.5));
+					}
+				}
+			}
+
+			.historyContent {
+				display: flex;
+				align-items: center;
+				gap: 8px;
+
+				word-break: break-all;
+				white-space: pre-wrap;
+
+				/* overflow-x: auto; */
+
+				.statusIcon {
+					font-size: 12px;
+					flex-shrink: 0;
+					line-height: 1;
+					transition: all 0.3s ease;
+				}
+			}
+
+			.historyMeta {
+				font-size: 10px;
+				color: rgba(255, 255, 255, 0.6);
+				/* color: rgb(61, 61, 61); */
+				margin-top: 8px;
+				line-height: 1;
+			}
+		}
+	}
+}
+
+/* Animation keyframes - faster blinking */
+@keyframes pulse {
+	0%,
+	100% {
+		opacity: 1;
+		transform: scale(1);
+	}
+	50% {
+		opacity: 0.4;
+		transform: scale(1.3);
+	}
+}
+
+/* Retry animation - rotating pulse */
+@keyframes retryPulse {
+	0%,
+	100% {
+		opacity: 1;
+		transform: scale(1) rotate(0deg);
+	}
+	25% {
+		opacity: 0.6;
+		transform: scale(1.2) rotate(90deg);
+	}
+	50% {
+		opacity: 0.8;
+		transform: scale(1.1) rotate(180deg);
+	}
+	75% {
+		opacity: 0.6;
+		transform: scale(1.2) rotate(270deg);
+	}
+}
+
+/* Celebration animation */
+@keyframes celebrate {
+	0%,
+	100% {
+		transform: scale(1);
+	}
+	25% {
+		transform: scale(1.2) rotate(-5deg);
+	}
+	75% {
+		transform: scale(1.2) rotate(5deg);
+	}
+}
+
+/* Gloss (shimmer) effect for the `done` card */
+@keyframes shimmer {
+	0% {
+		left: -100%;
+	}
+	100% {
+		left: 100%;
+	}
+}
+
+/* Input area styles */
+.inputSectionWrapper {
+	position: absolute;
+	width: var(--history-width);
+	top: var(--height);
+	left: var(--side-space);
+	z-index: -1;
+
+	visibility: visible;
+	overflow: hidden;
+
+	height: 48px;
+
+	transition: all 0.2s;
+
+	background: rgba(186, 186, 186, 0.2);
+	backdrop-filter: blur(10px);
+
+	border-bottom-left-radius: calc(var(--border-radius) + 4px);
+	border-bottom-right-radius: calc(var(--border-radius) + 4px);
+
+	border: 2px solid rgba(255, 255, 255, 0.3);
+	box-shadow: 0 1px 16px rgba(0, 0, 0, 0.4);
+
+	/* Hidden state: collapsed to zero height */
+	&.hidden {
+		visibility: collapse;
+		height: 0;
+	}
+
+	.inputSection {
+		display: flex;
+		align-items: center;
+		gap: 4px;
+		padding: 8px 8px;
+
+		.taskInput {
+			flex: 1;
+			background: rgba(255, 255, 255, 0.4);
+			border: 1px solid rgba(255, 255, 255, 0.3);
+			border-radius: 10px;
+			padding-inline: 10px;
+			color: rgb(20, 20, 20);
+			font-size: 12px;
+			height: 28px;
+			line-height: 1;
+			outline: none;
+			transition: all 0.2s ease;
+
+			/* text-shadow: 0 0 2px rgba(255, 255, 255, 0.8); */
+
+			/* border-color: rgba(57, 182, 255, 0.3); */
+
+			&::placeholder {
+				color: rgb(53, 53, 53);
+			}
+
+			&:focus {
+				background: rgba(255, 255, 255, 0.8);
+				border-color: rgba(57, 182, 255, 0.6);
+				box-shadow: 0 0 0 2px rgba(57, 182, 255, 0.2);
+			}
+		}
+	}
+}
--- a/packages/page-agent/src/ui/Panel.ts
+++ b/packages/page-agent/src/ui/Panel.ts
@@ -0,0 +1,596 @@
+import type { PageAgent } from '../PageAgent'
+import type { I18n } from '../i18n'
+import { truncate } from '../utils'
+import type { EventBus } from '../utils/bus'
+import { type Step, UIState } from './UIState'
+
+import styles from './Panel.module.css'
+
+/**
+ * Agent control panel
+ */
+export class Panel {
+	// Element references, looked up once in the constructor
+	#wrapper: HTMLElement
+	#indicator: HTMLElement
+	#statusText: HTMLElement
+	#historySection: HTMLElement
+	#expandButton: HTMLElement
+	#pauseButton: HTMLElement
+	#stopButton: HTMLElement
+	#inputSection: HTMLElement
+	#taskInput: HTMLInputElement
+	// Event bus taken from the owning PageAgent
+	#bus: EventBus
+
+	// Step history behind the status indicator and the history list
+	#state = new UIState()
+	#isExpanded = false
+	#pageAgent: PageAgent
+	// Resolver of the promise returned by askUser(); null when no question is pending
+	#userAnswerResolver: ((input: string) => void) | null = null
+	#isWaitingForUserAnswer: boolean = false
+	// Interval handle of the header update loop; null while the loop is stopped
+	#headerUpdateTimer: ReturnType<typeof setInterval> | null = null
+	// Most recent header text that has not been displayed yet
+	#pendingHeaderText: string | null = null
+	// True while a header text fade animation is running
+	#isAnimating = false
+
+	/** Root element of the panel (read-only access to the private field). */
+	get wrapper(): HTMLElement {
+		return this.#wrapper
+	}
+
+	/**
+	 * Builds the panel DOM, caches its child elements, wires DOM and bus events,
+	 * starts the header update loop and shows the input area.
+	 */
+	constructor(pageAgent: PageAgent) {
+		this.#pageAgent = pageAgent
+		this.#bus = pageAgent.bus
+
+		const root = this.#createWrapper()
+		this.#wrapper = root
+
+		// Looks up a descendant of the panel by its CSS-module class name
+		const byClass = <T extends HTMLElement = HTMLElement>(className: string): T =>
+			root.querySelector<T>(`.${className}`)!
+
+		this.#indicator = byClass(styles.indicator)
+		this.#statusText = byClass(styles.statusText)
+		this.#historySection = byClass(styles.historySection)
+		this.#expandButton = byClass(styles.expandButton)
+		this.#pauseButton = byClass(styles.pauseButton)
+		this.#stopButton = byClass(styles.stopButton)
+		this.#inputSection = byClass(styles.inputSectionWrapper)
+		this.#taskInput = byClass<HTMLInputElement>(styles.taskInput)
+
+		this.#setupEventListeners()
+		this.#startHeaderUpdateLoop()
+		// this.#expand() // debug
+
+		this.#showInputArea()
+
+		// Panel commands arriving over the event bus
+		const bus = this.#bus
+		bus.on('panel:show', () => this.#show())
+		bus.on('panel:hide', () => this.#hide())
+		bus.on('panel:reset', () => this.#reset())
+		bus.on('panel:update', (stepData) => this.#update(stepData))
+		bus.on('panel:expand', () => this.#expand())
+		bus.on('panel:collapse', () => this.#collapse())
+	}
+
+	/**
+	 * Ask the user a question through the panel.
+	 * The returned promise resolves with the text passed to the stored resolver
+	 * (see #handleUserAnswer).
+	 */
+	async askUser(question: string): Promise<string> {
+		return new Promise<string>((resolve) => {
+			// Enter the `waiting for user answer` state first, so the #update() call
+			// below keeps the input area visible
+			this.#isWaitingForUserAnswer = true
+			this.#userAnswerResolver = resolve
+
+			// Record the question in the history as an `output` step
+			const displayText = this.#pageAgent.i18n.t('ui.panel.question', { question })
+			this.#update({ type: 'output', displayText })
+
+			// Make sure the history panel is expanded
+			if (!this.#isExpanded) this.#expand()
+
+			// Show the input with the answer prompt as placeholder
+			const prompt = this.#pageAgent.i18n.t('ui.panel.userAnswerPrompt')
+			this.#showInputArea(prompt)
+		})
+	}
+
+	/**
+	 * Dispose panel: leaves the waiting-for-answer state, stops the header update
+	 * loop and removes the panel element from the DOM.
+	 *
+	 * NOTE(review): a promise still pending from askUser() is not settled here, and
+	 * the bus listeners registered in the constructor are not removed — confirm the
+	 * owner takes care of both.
+	 */
+	dispose(): void {
+		this.#isWaitingForUserAnswer = false
+		this.#stopHeaderUpdateLoop()
+		this.wrapper.remove()
+	}
+
+	/**
+	 * Record a new step and refresh everything derived from it: queued header
+	 * text, status indicator, history list and input area visibility.
+	 */
+	#update(stepData: Omit<Step, 'id' | 'stepNumber' | 'timestamp'>): void {
+		const { type, displayText } = this.#state.addStep(stepData)
+
+		// The header text is only queued here; the periodic loop displays it
+		this.#pendingHeaderText = truncate(displayText, 20)
+
+		this.#updateStatusIndicator(type)
+		this.#updateHistory()
+
+		// A finished task (completed or error) reveals the history automatically
+		const isFinished = type === 'completed' || type === 'error'
+		if (isFinished && !this.#isExpanded) {
+			this.#expand()
+		}
+
+		// Input area visibility follows the current status
+		if (!this.#shouldShowInputArea()) {
+			this.#hideInputArea()
+			return
+		}
+		this.#showInputArea()
+	}
+
+	/**
+	 * Show panel: make it displayable, then set the visible opacity/position.
+	 */
+	#show(): void {
+		const panel = this.wrapper
+		panel.style.display = 'block'
+		// Reading offsetHeight forces a reflow so the next style changes animate
+		void panel.offsetHeight
+		panel.style.opacity = '1'
+		panel.style.transform = 'translateX(-50%) translateY(0)'
+	}
+
+	/**
+	 * Hide panel: restore the hidden opacity/position and stop displaying it.
+	 * All three styles are set in the same synchronous call.
+	 */
+	#hide(): void {
+		const { style } = this.wrapper
+		style.opacity = '0'
+		style.transform = 'translateX(-50%) translateY(20px)'
+		style.display = 'none'
+	}
+
+	/**
+	 * Reset state: clears the step history, restores the `ready` header text and
+	 * the `thinking` indicator, collapses the history, un-pauses the agent, leaves
+	 * the waiting-for-answer state and shows the input area.
+	 *
+	 * NOTE(review): a resolver still stored from askUser() is dropped without being
+	 * called, so that promise stays pending — confirm this is intended.
+	 */
+	#reset(): void {
+		this.#state.reset()
+		// Drop header text queued by an earlier step; otherwise the periodic header
+		// loop would replace the `ready` text below with stale text.
+		this.#pendingHeaderText = null
+		this.#statusText.textContent = this.#pageAgent.i18n.t('ui.panel.ready')
+		this.#updateStatusIndicator('thinking')
+		this.#updateHistory()
+		this.#collapse()
+		// Reset pause state
+		this.#pageAgent.paused = false
+		this.#updatePauseButton()
+		// Reset user input state
+		this.#isWaitingForUserAnswer = false
+		this.#userAnswerResolver = null
+		// Show input area
+		this.#showInputArea()
+	}
+
+	/**
+	 * Toggle pause state of the agent and mirror it in the button, the header
+	 * text and the status indicator.
+	 */
+	#togglePause(): void {
+		const agent = this.#pageAgent
+		agent.paused = !agent.paused
+		this.#updatePauseButton()
+
+		// Header text for the new state
+		const nowPaused = agent.paused
+		this.#statusText.textContent = agent.i18n.t(
+			nowPaused ? 'ui.panel.paused' : 'ui.panel.continueExecution'
+		)
+		// Paused reuses the existing `thinking` state; resuming restores `tool_executing`
+		this.#updateStatusIndicator(nowPaused ? 'thinking' : 'tool_executing')
+	}
+
+	/**
+	 * Update pause button state: icon, tooltip and the `paused` class all follow
+	 * the agent's current `paused` flag.
+	 */
+	#updatePauseButton(): void {
+		const paused = this.#pageAgent.paused
+		const button = this.#pauseButton
+
+		button.textContent = paused ? '▶' : '⏸︎'
+		button.title = this.#pageAgent.i18n.t(paused ? 'ui.panel.continue' : 'ui.panel.pause')
+		button.classList.toggle(styles.paused, paused)
+	}
+
+	/**
+	 * Stop Agent: records a `taskTerminated` error step in the panel, then
+	 * disposes the PageAgent.
+	 */
+	#stopAgent(): void {
+		// Update status display
+		this.#update({
+			type: 'error',
+			displayText: this.#pageAgent.i18n.t('ui.panel.taskTerminated'),
+		})
+
+		this.#pageAgent.dispose()
+	}
+
+	/**
+	 * Submit the trimmed content of the input field: as the answer to a pending
+	 * question when one is waiting, otherwise as a new task for the agent.
+	 * Empty input is ignored.
+	 */
+	#submitTask() {
+		const text = this.#taskInput.value.trim()
+		if (text === '') return
+
+		// The input area is hidden as soon as something was submitted
+		this.#hideInputArea()
+
+		if (!this.#isWaitingForUserAnswer) {
+			this.#pageAgent.execute(text)
+			return
+		}
+
+		// User input mode
+		this.#handleUserAnswer(text)
+	}
+
+	/**
+	 * Handle user answer: logs the answer in the history and resolves the promise
+	 * returned by askUser() with it.
+	 *
+	 * @param input - The answer text submitted by the user.
+	 */
+	#handleUserAnswer(input: string): void {
+		// Leave the waiting state BEFORE logging the answer. #update() shows the input
+		// area whenever #isWaitingForUserAnswer is true, which would undo the hide
+		// just performed by #submitTask() while the agent continues running.
+		this.#isWaitingForUserAnswer = false
+		const resolveAnswer = this.#userAnswerResolver
+		this.#userAnswerResolver = null
+
+		// Add user input to history
+		this.#update({
+			type: 'input',
+			displayText: this.#pageAgent.i18n.t('ui.panel.userAnswer', { input }),
+		})
+
+		// Call resolver to return user input
+		if (resolveAnswer) {
+			resolveAnswer(input)
+		}
+	}
+
+	/**
+	 * Show input area with an emptied field and move focus to it shortly after.
+	 * @param placeholder - Placeholder text; falls back to the task input prompt.
+	 */
+	#showInputArea(placeholder?: string): void {
+		const field = this.#taskInput
+		field.value = ''
+		field.placeholder = placeholder || this.#pageAgent.i18n.t('ui.panel.taskInput')
+
+		this.#inputSection.classList.remove(styles.hidden)
+
+		// Focus is applied after a 100ms delay
+		setTimeout(() => field.focus(), 100)
+	}
+
+	/**
+	 * Hide input area by adding the `hidden` class to its wrapper.
+	 */
+	#hideInputArea(): void {
+		this.#inputSection.classList.add(styles.hidden)
+	}
+
+	/**
+	 * Check if input area should be shown: while a user answer is awaited, before
+	 * any step exists, or once the latest step is `completed` or `error`.
+	 */
+	#shouldShowInputArea(): boolean {
+		if (this.#isWaitingForUserAnswer) return true
+
+		const history = this.#state.getAllSteps()
+		// Initial state: nothing has happened yet
+		if (history.length === 0) return true
+
+		const latestType = history[history.length - 1].type
+		return latestType === 'completed' || latestType === 'error'
+	}
+
+	/**
+	 * Creates the panel root element, fills it with the panel markup and appends
+	 * it to document.body. The history starts with a single placeholder item and
+	 * the input area starts with the `hidden` class.
+	 *
+	 * NOTE(review): `styles.collapsed` and `styles.expandButton` are used for class
+	 * names, but Panel.module.css does not appear to declare `.collapsed` or
+	 * `.expandButton` — verify what these resolve to at runtime.
+	 *
+	 * @returns The root element, already attached to the document.
+	 */
+	#createWrapper(): HTMLElement {
+		const wrapper = document.createElement('div')
+		wrapper.id = 'page-agent-runtime_agent-panel'
+		wrapper.className = `${styles.wrapper} ${styles.collapsed}`
+		wrapper.setAttribute('data-browser-use-ignore', 'true')
+
+		// Markup order: background glow, history list, header (status + controls), input area
+		wrapper.innerHTML = `
+			<div class="${styles.background}"></div>
+			<div class="${styles.historySectionWrapper}">
+				<div class="${styles.historySection}">
+					${this.#createHistoryItem({
+						id: 'placeholder',
+						stepNumber: 0,
+						timestamp: new Date(),
+						type: 'thinking',
+						displayText: this.#pageAgent.i18n.t('ui.panel.waitingPlaceholder'),
+					})}
+				</div>
+			</div>
+			<div class="${styles.header}">
+				<div class="${styles.statusSection}">
+					<div class="${styles.indicator} ${styles.thinking}"></div>
+					<div class="${styles.statusText}">${this.#pageAgent.i18n.t('ui.panel.ready')}</div>
+				</div>
+				<div class="${styles.controls}">
+					<button class="${styles.controlButton} ${styles.expandButton}" title="${this.#pageAgent.i18n.t('ui.panel.expand')}">
+						▼
+					</button>
+					<button class="${styles.controlButton} ${styles.pauseButton}" title="${this.#pageAgent.i18n.t('ui.panel.pause')}">
+						⏸︎
+					</button>
+					<button class="${styles.controlButton} ${styles.stopButton}" title="${this.#pageAgent.i18n.t('ui.panel.stop')}">
+						X
+					</button>
+				</div>
+			</div>
+			<div class="${styles.inputSectionWrapper} ${styles.hidden}">
+				<div class="${styles.inputSection}">
+					<input 
+						type="text" 
+						class="${styles.taskInput}" 
+						maxlength="200"
+					/>
+				</div>
+			</div>
+		`
+
+		document.body.appendChild(wrapper)
+		return wrapper
+	}
+
+	/**
+	 * Attach the DOM listeners of the panel: header toggle, the three control
+	 * buttons, Enter-to-submit on the input field, and click containment for the
+	 * input area.
+	 */
+	#setupEventListeners(): void {
+		// Runs `action` on click and keeps the click from propagating further
+		const onButtonClick = (button: HTMLElement, action: () => void): void => {
+			button.addEventListener('click', (event) => {
+				event.stopPropagation()
+				action()
+			})
+		}
+
+		// Clicking the header expands/collapses, unless the click landed on a control button
+		const header = this.wrapper.querySelector(`.${styles.header}`)!
+		header.addEventListener('click', (event) => {
+			const clickedButton = (event.target as HTMLElement).closest(`.${styles.controlButton}`)
+			if (!clickedButton) {
+				this.#toggle()
+			}
+		})
+
+		onButtonClick(this.#expandButton, () => this.#toggle())
+		onButtonClick(this.#pauseButton, () => this.#togglePause())
+		onButtonClick(this.#stopButton, () => this.#stopAgent())
+
+		// Enter submits the input field; keys pressed during IME composition are ignored
+		this.#taskInput.addEventListener('keydown', (event) => {
+			if (event.isComposing || event.key !== 'Enter') return
+			event.preventDefault()
+			this.#submitTask()
+		})
+
+		// Clicks inside the input area do not bubble further
+		this.#inputSection.addEventListener('click', (event) => {
+			event.stopPropagation()
+		})
+	}
+
+	/** Switch between the expanded and the collapsed history. */
+	#toggle(): void {
+		if (!this.#isExpanded) {
+			this.#expand()
+			return
+		}
+		this.#collapse()
+	}
+
+	/** Mark the panel as expanded: swap the wrapper's state class and flip the button arrow. */
+	#expand(): void {
+		this.#isExpanded = true
+		this.wrapper.classList.remove(styles.collapsed)
+		this.wrapper.classList.add(styles.expanded)
+		this.#expandButton.textContent = '▲'
+	}
+
+	/** Mark the panel as collapsed: swap the wrapper's state class and flip the button arrow. */
+	#collapse(): void {
+		this.#isExpanded = false
+		this.wrapper.classList.remove(styles.expanded)
+		this.wrapper.classList.add(styles.collapsed)
+		this.#expandButton.textContent = '▼'
+	}
+
+	/**
+	 * Start periodic header update loop.
+	 * The interval handle is stored in #headerUpdateTimer so that
+	 * #stopHeaderUpdateLoop() can clear it.
+	 */
+	#startHeaderUpdateLoop(): void {
+		// Check every 450ms (same as total animation duration: 150ms + 300ms in #animateTextChange)
+		this.#headerUpdateTimer = setInterval(() => {
+			this.#checkAndUpdateHeader()
+		}, 450)
+	}
+
+	/**
+	 * Stop periodic header update loop; does nothing when it is not running.
+	 */
+	#stopHeaderUpdateLoop(): void {
+		if (!this.#headerUpdateTimer) return
+
+		clearInterval(this.#headerUpdateTimer)
+		this.#headerUpdateTimer = null
+	}
+
+	/**
+	 * One tick of the header loop: when text is queued and no animation is
+	 * running, consume the queued text and animate to it, unless it is already
+	 * what the header shows.
+	 */
+	#checkAndUpdateHeader(): void {
+		const queued = this.#pendingHeaderText
+
+		// Nothing queued, or an animation is still running: leave the queue untouched
+		if (this.#isAnimating || !queued) return
+
+		// The queued text is consumed whether or not an animation is needed
+		this.#pendingHeaderText = null
+
+		if (this.#statusText.textContent !== queued) {
+			this.#animateTextChange(queued)
+		}
+	}
+
+	/**
+	 * Replace the header text with a fade-out / fade-in sequence.
+	 * #isAnimating stays true from the start until the fade-in class is removed.
+	 */
+	#animateTextChange(newText: string): void {
+		const label = this.#statusText
+
+		// Last phase: drop the fade-in class and release the animation lock
+		const finish = (): void => {
+			label.classList.remove(styles.fadeIn)
+			this.#isAnimating = false
+		}
+
+		// Middle phase: swap the text and switch from fade-out to fade-in
+		const swapText = (): void => {
+			label.textContent = newText
+			label.classList.remove(styles.fadeOut)
+			label.classList.add(styles.fadeIn)
+			setTimeout(finish, 300)
+		}
+
+		// First phase: start fading out the current text
+		this.#isAnimating = true
+		label.classList.add(styles.fadeOut)
+		// 150ms is half of the 0.3s fade-out animation
+		setTimeout(swapText, 150)
+	}
+
+	/**
+	 * Reset the indicator to its base class and apply the class named after the step type
+	 */
+	#updateStatusIndicator(type: Step['type']): void {
+		const indicator = this.#indicator
+
+		// Assigning className discards any previously applied status class
+		indicator.className = styles.indicator
+		indicator.classList.add(styles[type])
+	}
+
+	/**
+	 * Re-render the whole history section from the recorded steps and scroll to the newest entry
+	 */
+	#updateHistory(): void {
+		let markup = ''
+		for (const step of this.#state.getAllSteps()) {
+			markup += this.#createHistoryItem(step)
+		}
+		this.#historySection.innerHTML = markup
+
+		this.#scrollToBottom()
+	}
+
+	/**
+	 * Scroll the history section to its end on the next timer tick
+	 */
+	#scrollToBottom(): void {
+		const pinToEnd = () => {
+			this.#historySection.scrollTop = this.#historySection.scrollHeight
+		}
+
+		// Deferred so the DOM update has completed before scrollHeight is read
+		setTimeout(pinToEnd, 0)
+	}
+
+	/**
+	 * Build the HTML markup for a single history entry
+	 *
+	 * The returned string is assigned to `innerHTML` by `#updateHistory`, so the
+	 * step's display text is HTML-escaped before it is interpolated.
+	 *
+	 * @param step - The step to render
+	 * @returns Markup holding the status icon, the display text and the step label
+	 */
+	#createHistoryItem(step: Step): string {
+		// Replace characters that are significant in HTML with numeric character references
+		const escapeHtml = (value: unknown): string =>
+			String(value).replace(/[&<>"']/g, (char) => `&#${char.charCodeAt(0)};`)
+
+		const time = step.timestamp.toLocaleTimeString('zh-CN', {
+			hour12: false,
+			hour: '2-digit',
+			minute: '2-digit',
+			second: '2-digit',
+		})
+
+		let typeClass = ''
+		let statusIcon = ''
+
+		// Set styles and icons based on step type
+		if (step.type === 'completed') {
+			// Check if this is a result from done tool
+			if (step.toolName === 'done') {
+				// Judge success or failure based on result.
+				// `toolResult` is typed `any`, so it is stringified before searching it;
+				// a missing/empty result counts as success.
+				const resultText = step.toolResult ? String(step.toolResult) : ''
+				const failureKeyword = this.#pageAgent.i18n.t('ui.tools.resultFailure')
+				const errorKeyword = this.#pageAgent.i18n.t('ui.tools.resultError')
+				const isSuccess =
+					!resultText ||
+					(!resultText.includes(failureKeyword) && !resultText.includes(errorKeyword))
+				typeClass = isSuccess ? styles.doneSuccess : styles.doneError
+				statusIcon = isSuccess ? '🎉' : '❌'
+			} else {
+				typeClass = styles.completed
+				statusIcon = '✅'
+			}
+		} else if (step.type === 'error') {
+			typeClass = styles.error
+			statusIcon = '❌'
+		} else if (step.type === 'tool_executing') {
+			statusIcon = '⚙️'
+		} else if (step.type === 'output') {
+			typeClass = styles.output
+			statusIcon = '🤖'
+		} else if (step.type === 'input') {
+			typeClass = styles.input
+			statusIcon = '🎯'
+		} else if (step.type === 'retry') {
+			typeClass = styles.retry
+			statusIcon = '🔄'
+		} else {
+			statusIcon = '🧠'
+		}
+
+		// A duration of 0 (or undefined) is rendered as no duration at all
+		const durationText = step.duration ? ` · ${step.duration}ms` : ''
+		const stepLabel = this.#pageAgent.i18n.t('ui.panel.step', {
+			number: step.stepNumber.toString(),
+			time,
+			duration: durationText || '', // Explicitly pass empty string to replace template
+		})
+
+		// NOTE(review): stepLabel is inserted as-is because the i18n template may
+		// legitimately contain markup - confirm against the locale files.
+		return `
+			<div class="${styles.historyItem} ${typeClass}">
+				<div class="${styles.historyContent}">
+					<span class="${styles.statusIcon}">${statusIcon}</span>
+					<span>${escapeHtml(step.displayText)}</span>
+				</div>
+				<div class="${styles.historyMeta}">
+					${stepLabel}
+				</div>
+			</div>
+		`
+	}
+}
+
+/**
+ * Resolve the localized "in progress" label for a tool call
+ *
+ * @param toolName - Name of the tool being executed
+ * @param args - Arguments of the tool call; only the fields used by the matching label are read
+ * @param i18n - Translator used to resolve the `ui.tools.*` messages
+ * @returns The localized label; unrecognized tools get the generic `ui.tools.executing` message
+ */
+export function getToolExecutingText(toolName: string, args: any, i18n: I18n): string {
+	if (toolName === 'click_element_by_index') {
+		return i18n.t('ui.tools.clicking', { index: args.index })
+	}
+	if (toolName === 'input_text') {
+		return i18n.t('ui.tools.inputting', { index: args.index })
+	}
+	if (toolName === 'select_dropdown_option') {
+		return i18n.t('ui.tools.selecting', { text: args.text })
+	}
+	if (toolName === 'scroll') {
+		return i18n.t('ui.tools.scrolling')
+	}
+	if (toolName === 'wait') {
+		return i18n.t('ui.tools.waiting', { seconds: args.seconds })
+	}
+	if (toolName === 'done') {
+		return i18n.t('ui.tools.done')
+	}
+	return i18n.t('ui.tools.executing', { toolName })
+}
+
+/**
+ * Resolve the localized "finished" label for a tool call
+ *
+ * @param toolName - Name of the tool that finished
+ * @param args - Arguments of the tool call; only the fields used by the matching label are read
+ * @param i18n - Translator used to resolve the `ui.tools.*` messages
+ * @returns The localized label, or `null` for `done` and for unrecognized tools
+ */
+export function getToolCompletedText(toolName: string, args: any, i18n: I18n): string | null {
+	if (toolName === 'click_element_by_index') {
+		return i18n.t('ui.tools.clicked', { index: args.index })
+	}
+	if (toolName === 'input_text') {
+		return i18n.t('ui.tools.inputted', { text: args.text })
+	}
+	if (toolName === 'select_dropdown_option') {
+		return i18n.t('ui.tools.selected', { text: args.text })
+	}
+	if (toolName === 'scroll') {
+		return i18n.t('ui.tools.scrolled')
+	}
+	if (toolName === 'wait') {
+		return i18n.t('ui.tools.waited')
+	}
+	// `done` and every other tool have no completion label
+	return null
+}
--- a/packages/page-agent/src/ui/SimulatorMask.module.css
+++ b/packages/page-agent/src/ui/SimulatorMask.module.css
@@ -0,0 +1,10 @@
+/* Full-viewport overlay element of SimulatorMask */
+.wrapper {
+	position: fixed;
+	inset: 0;
+	z-index: 2147483641; /* Keep above every other element, except the panel */
+	/* pointer-events: none; */
+	cursor: not-allowed;
+	overflow: hidden;
+
+	/* Hidden by default; SimulatorMask.show() switches the inline display to block */
+	display: none;
+}
--- a/packages/page-agent/src/ui/SimulatorMask.ts
+++ b/packages/page-agent/src/ui/SimulatorMask.ts
@@ -0,0 +1,172 @@
+import { Motion } from 'ai-motion'
+
+import { isPageDark } from '../utils/checkDarkMode'
+
+import styles from './SimulatorMask.module.css'
+import cursorStyles from './cursor.module.css'
+
+/**
+ * Full-viewport overlay shown while the agent operates the page
+ *
+ * The overlay cancels mouse, wheel and keyboard events that reach it, hosts the
+ * `ai-motion` effect and renders a simulated cursor that eases toward the position
+ * received through `PageAgent::MovePointerTo` window events.
+ *
+ * Call `dispose()` when the mask is no longer needed: it stops the cursor animation
+ * loop, removes the window listeners and detaches the overlay from the document.
+ */
+export class SimulatorMask {
+	wrapper = document.createElement('div')
+	motion = new Motion({
+		mode: isPageDark() ? 'dark' : 'light',
+		styles: {
+			position: 'absolute',
+			inset: '0',
+		},
+	})
+
+	#cursor = document.createElement('div')
+
+	// Position currently rendered
+	#currentCursorX = 0
+	#currentCursorY = 0
+
+	// Position the cursor is easing toward
+	#targetCursorX = 0
+	#targetCursorY = 0
+
+	// Handles kept so that dispose()/show() can cancel pending work
+	#animationFrameId: number | null = null
+	#hideTimer: ReturnType<typeof setTimeout> | null = null
+	#disposed = false
+
+	// Window listeners are stored as fields so dispose() can remove the same references
+	#onMovePointer = (event: Event): void => {
+		const { x, y } = (event as CustomEvent).detail
+		this.setCursorPosition(x, y)
+	}
+
+	#onClickPointer = (): void => {
+		this.triggerClickAnimation()
+	}
+
+	constructor() {
+		this.wrapper.id = 'page-agent-runtime_simulator-mask'
+		this.wrapper.className = styles.wrapper
+		this.wrapper.setAttribute('data-browser-use-ignore', 'true')
+
+		this.wrapper.appendChild(this.motion.element)
+		this.motion.autoResize(this.wrapper)
+
+		// Capture all mouse, keyboard, and wheel events
+		const swallow = (e: Event): void => {
+			e.stopPropagation()
+			e.preventDefault()
+		}
+		const blockedEvents = ['click', 'mousedown', 'mouseup', 'mousemove', 'wheel', 'keydown', 'keyup']
+		for (const type of blockedEvents) {
+			this.wrapper.addEventListener(type, swallow)
+		}
+
+		// Create AI cursor
+		this.#createCursor()
+
+		document.body.appendChild(this.wrapper)
+
+		// Kick off the per-frame easing loop
+		this.#moveCursorToTarget()
+
+		window.addEventListener('PageAgent::MovePointerTo', this.#onMovePointer)
+		window.addEventListener('PageAgent::ClickPointer', this.#onClickPointer)
+	}
+
+	/**
+	 * Assemble the cursor element (ripple, filling and border layers) and attach it to the overlay
+	 */
+	#createCursor() {
+		this.#cursor.className = cursorStyles.cursor
+
+		// Create ripple effect container
+		const rippleContainer = document.createElement('div')
+		rippleContainer.className = cursorStyles.cursorRipple
+		this.#cursor.appendChild(rippleContainer)
+
+		// Create filling layer
+		const fillingLayer = document.createElement('div')
+		fillingLayer.className = cursorStyles.cursorFilling
+		this.#cursor.appendChild(fillingLayer)
+
+		// Create border layer
+		const borderLayer = document.createElement('div')
+		borderLayer.className = cursorStyles.cursorBorder
+		this.#cursor.appendChild(borderLayer)
+
+		this.wrapper.appendChild(this.#cursor)
+	}
+
+	/**
+	 * Advance the cursor one animation frame toward its target and schedule the next frame
+	 *
+	 * Each frame covers 20% of the remaining distance per axis and snaps to the
+	 * target once the remainder is below 2px. The loop ends after `dispose()`.
+	 */
+	#moveCursorToTarget() {
+		if (this.#disposed) return
+
+		const newX = this.#currentCursorX + (this.#targetCursorX - this.#currentCursorX) * 0.2
+		const newY = this.#currentCursorY + (this.#targetCursorY - this.#currentCursorY) * 0.2
+
+		const xDistance = Math.abs(newX - this.#targetCursorX)
+		if (xDistance > 0) {
+			if (xDistance < 2) {
+				this.#currentCursorX = this.#targetCursorX
+			} else {
+				this.#currentCursorX = newX
+			}
+			this.#cursor.style.left = `${this.#currentCursorX}px`
+		}
+
+		const yDistance = Math.abs(newY - this.#targetCursorY)
+		if (yDistance > 0) {
+			if (yDistance < 2) {
+				this.#currentCursorY = this.#targetCursorY
+			} else {
+				this.#currentCursorY = newY
+			}
+			this.#cursor.style.top = `${this.#currentCursorY}px`
+		}
+
+		// Keep the id so dispose() can cancel the pending frame
+		this.#animationFrameId = requestAnimationFrame(() => this.#moveCursorToTarget())
+	}
+
+	/**
+	 * Cancel a pending hide() timeout, if any
+	 */
+	#clearHideTimer() {
+		if (this.#hideTimer !== null) {
+			clearTimeout(this.#hideTimer)
+			this.#hideTimer = null
+		}
+	}
+
+	/**
+	 * Set the position (in px, relative to the overlay) the cursor should ease toward
+	 */
+	setCursorPosition(x: number, y: number) {
+		this.#targetCursorX = x
+		this.#targetCursorY = y
+	}
+
+	/**
+	 * Restart the click animation on the cursor
+	 */
+	triggerClickAnimation() {
+		this.#cursor.classList.remove(cursorStyles.clicking)
+		// Force reflow to restart animation
+		void this.#cursor.offsetHeight
+		this.#cursor.classList.add(cursorStyles.clicking)
+	}
+
+	/**
+	 * Display the overlay, start the motion effect and centre the cursor in the viewport
+	 */
+	show() {
+		// A hide() that is still waiting for its timeout must not hide the mask again
+		this.#clearHideTimer()
+
+		this.motion.start()
+		this.motion.fadeIn()
+
+		this.wrapper.style.display = 'block'
+
+		// Initialize cursor position
+		this.#currentCursorX = window.innerWidth / 2
+		this.#currentCursorY = window.innerHeight / 2
+		this.#targetCursorX = this.#currentCursorX
+		this.#targetCursorY = this.#currentCursorY
+		this.#cursor.style.left = `${this.#currentCursorX}px`
+		this.#cursor.style.top = `${this.#currentCursorY}px`
+	}
+
+	/**
+	 * Fade the motion effect out and hide the overlay 800ms later
+	 */
+	hide() {
+		this.motion.fadeOut()
+		this.motion.pause()
+
+		this.#cursor.classList.remove(cursorStyles.clicking)
+
+		this.#clearHideTimer()
+		this.#hideTimer = setTimeout(() => {
+			this.#hideTimer = null
+			this.wrapper.style.display = 'none'
+		}, 800) // Match the animation duration
+	}
+
+	/**
+	 * Release everything the mask holds: animation frame, timers, window listeners,
+	 * the motion effect and the overlay element
+	 */
+	dispose() {
+		this.#disposed = true
+
+		if (this.#animationFrameId !== null) {
+			cancelAnimationFrame(this.#animationFrameId)
+			this.#animationFrameId = null
+		}
+		this.#clearHideTimer()
+
+		window.removeEventListener('PageAgent::MovePointerTo', this.#onMovePointer)
+		window.removeEventListener('PageAgent::ClickPointer', this.#onClickPointer)
+
+		this.motion.dispose()
+		this.wrapper.remove()
+	}
+}
--- a/packages/page-agent/src/ui/UIState.ts
+++ b/packages/page-agent/src/ui/UIState.ts
@@ -0,0 +1,93 @@
+/**
+ * Agent execution state management
+ */
+
+/**
+ * One entry in the agent's execution history
+ *
+ * `id`, `stepNumber` and `timestamp` are filled in by `UIState.addStep`; callers
+ * provide the remaining fields.
+ */
+export interface Step {
+	// Generated by UIState as `step_<epoch ms>_<random base-36 suffix>`
+	id: string
+	// Sequence number within the current run; restarts after UIState.reset()
+	stepNumber: number
+	// Moment the step was added
+	timestamp: Date
+	// Kind of step; also determines the overall AgentStatus
+	type: 'thinking' | 'tool_executing' | 'completed' | 'error' | 'output' | 'input' | 'retry'
+
+	// Tool execution related
+	toolName?: string
+	toolArgs?: any
+	toolResult?: any
+
+	// Display data
+	displayText: string
+	// Presumably the elapsed time in milliseconds (the panel renders it with an `ms` suffix)
+	duration?: number
+}
+
+/**
+ * Overall state of an agent run as tracked by UIState
+ *
+ * NOTE(review): UIState itself only ever assigns 'idle', 'running', 'completed'
+ * and 'error'; 'paused' is presumably set or consumed elsewhere - confirm.
+ */
+export type AgentStatus = 'idle' | 'running' | 'paused' | 'completed' | 'error'
+
+/**
+ * In-memory record of the steps produced during an agent run, together with the
+ * overall status derived from the type of the most recently added step
+ */
+export class UIState {
+	private steps: Step[] = []
+	private currentStep: Step | null = null
+	private status: AgentStatus = 'idle'
+	private stepCounter = 0
+
+	/**
+	 * Record a new step, completing it with an id, a sequence number and a timestamp
+	 * @returns The stored step, which also becomes the current step
+	 */
+	addStep(stepData: Omit<Step, 'id' | 'stepNumber' | 'timestamp'>): Step {
+		const id = this.generateId()
+		this.stepCounter += 1
+
+		const step: Step = {
+			id,
+			stepNumber: this.stepCounter,
+			timestamp: new Date(),
+			...stepData,
+		}
+
+		this.steps.push(step)
+		this.currentStep = step
+
+		// The overall status follows the type of the newest step
+		this.updateStatus(step.type)
+
+		return step
+	}
+
+	/**
+	 * Merge `updates` into the current step in place
+	 * @returns The updated step, or `null` when there is no current step
+	 */
+	updateCurrentStep(updates: Partial<Step>): Step | null {
+		const target = this.currentStep
+		return target ? Object.assign(target, updates) : null
+	}
+
+	/** The most recently added step, or `null` when none exists */
+	getCurrentStep(): Step | null {
+		return this.currentStep
+	}
+
+	/** A shallow copy of the step list (the step objects themselves are shared) */
+	getAllSteps(): Step[] {
+		return this.steps.slice()
+	}
+
+	/** The overall status of the run */
+	getStatus(): AgentStatus {
+		return this.status
+	}
+
+	/** Drop every step and return to the initial idle state */
+	reset(): void {
+		this.stepCounter = 0
+		this.status = 'idle'
+		this.currentStep = null
+		this.steps = []
+	}
+
+	/** Derive the overall status from a step type */
+	private updateStatus(stepType: Step['type']): void {
+		const statusByType: Record<Step['type'], AgentStatus> = {
+			thinking: 'running',
+			tool_executing: 'running',
+			output: 'running',
+			input: 'running',
+			retry: 'running',
+			completed: 'completed',
+			error: 'error',
+		}
+
+		// A type outside the table leaves the status untouched
+		const next = statusByType[stepType]
+		if (next) {
+			this.status = next
+		}
+	}
+
+	/** Build an id of the form `step_<epoch ms>_<random base-36 suffix>` */
+	private generateId(): string {
+		const stamp = Date.now()
+		const suffix = Math.random().toString(36).substring(2, 11)
+		return 'step_' + stamp + '_' + suffix
+	}
+}
--- a/packages/page-agent/src/ui/cursor.module.css
+++ b/packages/page-agent/src/ui/cursor.module.css
@@ -0,0 +1,91 @@
+/* AI cursor styles */
+/* Root of the simulated cursor; SimulatorMask positions it through inline left/top */
+.cursor {
+	position: absolute;
+	/* Size defaults to 75px unless --cursor-size is defined */
+	width: var(--cursor-size, 75px);
+	height: var(--cursor-size, 75px);
+	pointer-events: none;
+	z-index: 10000;
+	/* Same offset as in the cursor-enter keyframes */
+	transform: translate(-30%, -30%);
+
+	animation: cursor-enter 300ms ease-out forwards;
+}
+
+/* Gradient layer clipped by a mask image, animated with cursor-breathe */
+.cursorBorder {
+	position: absolute;
+	inset: 0;
+	background: linear-gradient(45deg, rgb(57, 182, 255), rgb(189, 69, 251));
+	mask-image: url(https://img.alicdn.com/imgextra/i1/O1CN01YHLVYR1LvqWIyo5kH_!!6000000001362-2-tps-202-202.png);
+	mask-size: 100% 100%;
+	mask-repeat: no-repeat;
+	animation: cursor-breathe 2s ease-in-out infinite;
+}
+
+/* Static image layer stretched over the whole cursor box */
+.cursorFilling {
+	position: absolute;
+	inset: 0;
+	background: url(https://img.alicdn.com/imgextra/i3/O1CN01JZOqOS1Tu1sIKbPLW_!!6000000002441-2-tps-202-202.png);
+	background-size: 100% 100%;
+	background-repeat: no-repeat;
+}
+
+/* Container whose ::after pseudo-element draws the click ripple */
+.cursorRipple {
+	position: absolute;
+	inset: 0;
+	pointer-events: none;
+}
+
+/* Ripple ring, present only while the cursor carries the `clicking` class
+   (added by SimulatorMask.triggerClickAnimation) */
+.cursor.clicking .cursorRipple::after {
+	content: '';
+	position: absolute;
+	width: 100%;
+	height: 100%;
+	left: -30%;
+	top: -30%;
+	border: 4px solid rgba(57, 182, 255, 1);
+	border-radius: 50%;
+	animation: cursor-ripple 300ms ease-out forwards;
+}
+
+/* Cursor animation keyframes */
+/* Gentle pulse used by .cursorBorder */
+@keyframes cursor-breathe {
+	0%,
+	100% {
+		transform: scale(1);
+		opacity: 0.9;
+	}
+	50% {
+		transform: scale(1.05);
+		opacity: 1;
+	}
+}
+
+/* Full clockwise turn. NOTE(review): no rule in this file references cursor-rotate - confirm it is still needed */
+@keyframes cursor-rotate {
+	0% {
+		transform: rotate(0deg);
+	}
+	100% {
+		transform: rotate(360deg);
+	}
+}
+
+/* Entrance of .cursor: scale up from 50% while fading in; the translate repeats
+   the offset declared on .cursor */
+@keyframes cursor-enter {
+	0% {
+		transform: translate(-30%, -30%) scale(0.5);
+		opacity: 0;
+	}
+	100% {
+		transform: translate(-30%, -30%) scale(1);
+		opacity: 1;
+	}
+}
+
+/* Click ripple: grow from nothing to twice the size while fading out */
+@keyframes cursor-ripple {
+	0% {
+		transform: scale(0);
+		opacity: 1;
+	}
+	100% {
+		transform: scale(2);
+		opacity: 0;
+	}
+}
--- a/packages/page-agent/src/ui/motion-css/createMotion.ts
+++ b/packages/page-agent/src/ui/motion-css/createMotion.ts
@@ -0,0 +1,64 @@
+import styles from './motion.module.css'
+
+/**
+ * Build the CSS-only motion effect element
+ *
+ * The element contains a colour group and a border group, each holding three
+ * layers (A, B, C) styled by `motion.module.css`.
+ *
+ * @returns The root element plus `show`/`hide` functions that restart the entry
+ * and exit animations respectively
+ */
+export function createMotion() {
+	const wrapper = document.createElement('div')
+	wrapper.className = styles.wrapper
+
+	// Append one group to the wrapper and fill it with the A/B/C layers
+	const appendLayerGroup = (groupClass: string, layerClass: string): void => {
+		const group = document.createElement('div')
+		group.className = groupClass
+		wrapper.appendChild(group)
+
+		for (const variant of [styles.layerA, styles.layerB, styles.layerC]) {
+			const layer = document.createElement('div')
+			layer.className = layerClass + ' ' + variant
+			group.appendChild(layer)
+		}
+	}
+
+	appendLayerGroup(styles.colorWrapper, styles.colorLayer)
+	appendLayerGroup(styles.borderWrapper, styles.borderLayer)
+
+	// Clear both state classes, then apply the requested one
+	const restartWith = (stateClass: string): void => {
+		wrapper.classList.remove(styles.entry, styles.exit)
+		// Force reflow to restart animation
+		void wrapper.offsetHeight
+		wrapper.classList.add(stateClass)
+	}
+
+	return {
+		element: wrapper,
+		show: () => restartWith(styles.entry),
+		hide: () => restartWith(styles.exit),
+	}
+}
--- a/packages/page-agent/src/ui/motion-css/motion.module.css
+++ b/packages/page-agent/src/ui/motion-css/motion.module.css
@@ -0,0 +1,397 @@
+/* Root of the motion effect; declares the palette and blend mode consumed by the layers */
+.wrapper {
+	position: absolute;
+	inset: 0;
+	pointer-events: none;
+
+	transform-origin: center;
+
+	/* --color-1..3 feed the layerA/B/C gradients; --color-4 is not referenced in this file */
+	--color-1: rgb(57, 182, 255);
+	--color-2: rgb(189, 69, 251);
+	--color-3: rgb(255, 87, 51);
+	--color-4: rgb(255, 214, 0);
+
+	--blend-mode: screen;
+}
+
+/* Base colour layer. The nested `.colorWrapper .colorLayer` rule further down
+   re-declares the blend mode and mask size. */
+.colorLayer {
+	position: absolute;
+	inset: 0;
+
+	/* Lightening blend mode */
+	/* mix-blend-mode: screen; */
+	/* mix-blend-mode: overlay; */
+	/* mix-blend-mode: multiply; */
+	/* NOTE(review): `add` is not a mix-blend-mode keyword in the CSS Compositing
+	   spec (the additive keyword is `plus-lighter`) - confirm the intent */
+	mix-blend-mode: add;
+
+	/* Border mask - transparent in the middle, opaque at the edges */
+	mask-image: url(https://img.alicdn.com/imgextra/i2/O1CN01iW1wfX1C0ICvoPbTq_!!6000000000018-2-tps-512-512.png);
+	mask-repeat: no-repeat;
+	mask-size: calc(100% + 10px) calc(100% + 10px);
+}
+
+/* Container of the border layers; the same selector is declared again further
+   down with the blend mode and nested layer styles */
+.borderWrapper {
+	position: absolute;
+	inset: 0;
+
+	/* filter: blur(10px); */
+}
+
+/* Base border layer: two gradient masks keep only a 2px band along the
+   left/right and top/bottom edges */
+.borderLayer {
+	position: absolute;
+	inset: 0;
+
+	/* Lightening blend mode */
+	/* mix-blend-mode: overlay; */
+	/* NOTE(review): `add` is not a mix-blend-mode keyword in the CSS Compositing
+	   spec (the additive keyword is `plus-lighter`) - confirm the intent */
+	mix-blend-mode: add;
+
+	mask-image:
+		linear-gradient(
+			to right,
+			black 0px,
+			black 2px,
+			transparent 2px,
+			transparent calc(100% - 2px),
+			black calc(100% - 2px),
+			black 100%
+		),
+		linear-gradient(
+			to top,
+			black 0px,
+			black 2px,
+			transparent 2px,
+			transparent calc(100% - 2px),
+			black calc(100% - 2px),
+			black 100%
+		);
+
+	mask-composite: add;
+	mask-repeat: no-repeat;
+	mask-size: 100% 100%;
+
+	/* filter: blur(100px); */
+}
+
+/* Blurred blue bar rotating clockwise.
+   NOTE(review): createMotion.ts does not apply this class - possibly legacy, confirm */
+.blueLayer {
+	&.colorLayer {
+		mask-position: left -5px top -5px;
+	}
+
+	&::after {
+		content: '';
+		position: absolute;
+		/* inset: 0; */
+		width: calc(max(100vw, 100vh) * 1.5);
+		height: 600px;
+		top: calc(50% - 300px);
+		left: 50%;
+		filter: blur(100px);
+		background: rgb(57, 182, 255);
+		animation: rotate-clockwise 4s linear infinite;
+		animation-delay: -3s;
+	}
+}
+
+/* Blurred purple bar rotating clockwise.
+   NOTE(review): createMotion.ts does not apply this class - possibly legacy, confirm */
+.purpleLayer {
+	&.colorLayer {
+		mask-position: left -3px top -7px;
+	}
+
+	&::after {
+		content: '';
+		position: absolute;
+		/* inset: 0; */
+		width: calc(max(100vw, 100vh) * 1.5);
+		height: 600px;
+		top: calc(50% - 300px);
+		left: 50%;
+		filter: blur(100px);
+		background: rgb(189, 69, 251);
+		animation: rotate-clockwise 4s linear infinite;
+		animation-delay: -2s;
+	}
+}
+
+/* Blurred orange bar rotating counter-clockwise.
+   NOTE(review): createMotion.ts does not apply this class - possibly legacy, confirm */
+.orangeLayer {
+	/* opacity: 0.5; */
+
+	&.colorLayer {
+		mask-position: left -7px top -2px;
+	}
+
+	&::after {
+		content: '';
+		position: absolute;
+		/* inset: 0; */
+		width: calc(max(100vw, 100vh) * 1.5);
+		height: 600px;
+		top: calc(50% - 300px);
+		left: 50%;
+		filter: blur(100px);
+		background: rgb(255, 87, 51);
+		animation: rotate-counter-clockwise 3s linear infinite;
+		animation-delay: -2s;
+	}
+}
+
+/* Blurred yellow bar rotating counter-clockwise.
+   NOTE(review): createMotion.ts does not apply this class - possibly legacy, confirm */
+.yellowLayer {
+	/* opacity: 0.5; */
+
+	&.colorLayer {
+		mask-position: left -6px top -4px;
+	}
+
+	&::after {
+		content: '';
+		position: absolute;
+		/* inset: 0; */
+		width: calc(max(100vw, 100vh) * 1.5);
+		height: 600px;
+		top: calc(50% - 300px);
+		left: 50%;
+		filter: blur(100px);
+		background: rgb(255, 214, 0);
+		animation: rotate-counter-clockwise 4s linear infinite;
+		animation-delay: -1s;
+	}
+}
+
+/* Rotation animations */
+/* Used by the ::after bars of the blue/purple/orange/yellow layers; translateX(-50%)
+   re-centres a bar positioned with left: 50% */
+@keyframes rotate-clockwise {
+	0% {
+		transform: translateX(-50%) rotate(0deg);
+	}
+	100% {
+		transform: translateX(-50%) rotate(360deg);
+	}
+}
+
+@keyframes rotate-counter-clockwise {
+	0% {
+		transform: translateX(-50%) rotate(0deg);
+	}
+	100% {
+		transform: translateX(-50%) rotate(-360deg);
+	}
+}
+
+/* Scale from 110% down to 100%; the .exit rules play the same keyframes in reverse */
+@keyframes wrapper-entry {
+	from {
+		transform: scale(1.1);
+	}
+	to {
+		transform: scale(1);
+	}
+}
+
+/* 
+rgb(57, 182, 255)
+rgb(189, 69, 251)
+rgb(255, 87, 51)
+rgb(255, 214, 0)
+*/
+
+/* Horizontal slide by one element width, left to right (layerA, layerC) */
+@keyframes mask-running {
+	from {
+		transform: translateX(0%);
+	}
+	to {
+		transform: translateX(100%);
+	}
+}
+
+/* Same slide in the opposite direction (layerB) */
+@keyframes mask-running-reverse {
+	from {
+		transform: translateX(100%);
+	}
+	to {
+		transform: translateX(0%);
+	}
+}
+
+/* Colour group created by createMotion(); its layers blend using --blend-mode */
+.colorWrapper {
+	position: absolute;
+	inset: 0;
+
+	.colorLayer {
+		position: absolute;
+		inset: 0;
+
+		mix-blend-mode: var(--blend-mode);
+
+		/* Border mask - transparent in the middle, opaque at the edges */
+		mask-image: url(https://img.alicdn.com/imgextra/i2/O1CN01iW1wfX1C0ICvoPbTq_!!6000000000018-2-tps-512-512.png);
+		mask-repeat: no-repeat;
+		mask-size: 100% 100%;
+	}
+}
+
+/* Border group created by createMotion(); overrides --blend-mode for its subtree */
+.borderWrapper {
+	position: absolute;
+	inset: 0;
+
+	--blend-mode: lighten;
+
+	.borderLayer {
+		position: absolute;
+		inset: 0;
+
+		mix-blend-mode: var(--blend-mode);
+
+		/* Standard property followed by the -webkit- prefixed equivalent */
+		mask-border: url(https://img.alicdn.com/imgextra/i3/O1CN01bFjRug1yssyWEUbKL_!!6000000006635-2-tps-256-256.png)
+			25;
+		-webkit-mask-box-image: url(https://img.alicdn.com/imgextra/i3/O1CN01bFjRug1yssyWEUbKL_!!6000000006635-2-tps-256-256.png)
+			25;
+
+		mask-repeat: no-repeat;
+		mask-size: 100% 100%;
+
+		background-color: var(--color-2);
+	}
+}
+
+/* `entry` is set on the root by createMotion().show() */
+.entry .colorWrapper,
+.entry .borderWrapper {
+	animation: wrapper-entry 0.8s ease-in-out forwards;
+}
+
+/* `exit` is set on the root by createMotion().hide(); plays the entry keyframes backwards */
+.exit .colorWrapper,
+.exit .borderWrapper {
+	animation: wrapper-entry 0.8s ease-in-out reverse forwards;
+}
+
+/* Layer A: two identical --color-1 gradients, one starting a full width to the
+   left (::before) and one at the origin (::after), both sliding right over 2s */
+.layerA {
+	position: absolute;
+	inset: 0;
+
+	&::before {
+		mix-blend-mode: var(--blend-mode);
+		content: '';
+		display: block;
+		position: absolute;
+		width: 100%;
+		height: 100%;
+		left: -100%;
+		top: 0;
+		background-image: linear-gradient(
+			to right bottom,
+			transparent,
+			var(--color-1),
+			transparent,
+			var(--color-1),
+			transparent
+		);
+		animation: mask-running 2s linear infinite;
+	}
+
+	&::after {
+		mix-blend-mode: var(--blend-mode);
+		content: '';
+		display: block;
+		position: absolute;
+		width: 100%;
+		height: 100%;
+		left: 0;
+		top: 0;
+		background-image: linear-gradient(
+			to right bottom,
+			transparent,
+			var(--color-1),
+			transparent,
+			var(--color-1),
+			transparent
+		);
+		animation: mask-running 2s linear infinite;
+	}
+}
+
+/* Layer B: two identical --color-2 gradients, one starting a full width to the
+   left (::before) and one at the origin (::after), both sliding left over 3s */
+.layerB {
+	position: absolute;
+	inset: 0;
+
+	&::before {
+		mix-blend-mode: var(--blend-mode);
+		content: '';
+		display: block;
+		position: absolute;
+		width: 100%;
+		height: 100%;
+		left: -100%;
+		top: 0;
+		background: linear-gradient(
+			to right top,
+			transparent,
+			var(--color-2),
+			transparent,
+			var(--color-2),
+			transparent
+		);
+		animation: mask-running-reverse 3s linear infinite;
+	}
+
+	&::after {
+		mix-blend-mode: var(--blend-mode);
+		content: '';
+		display: block;
+		position: absolute;
+		width: 100%;
+		height: 100%;
+		left: 0;
+		top: 0;
+		background: linear-gradient(
+			to right top,
+			transparent,
+			var(--color-2),
+			transparent,
+			var(--color-2),
+			transparent
+		);
+		animation: mask-running-reverse 3s linear infinite;
+	}
+}
+
+/* Layer C: half-opacity layer with two identical --color-3 gradients, one starting
+   a full width to the left (::before) and one at the origin (::after), both
+   sliding right over 1s */
+.layerC {
+	position: absolute;
+	inset: 0;
+
+	opacity: 0.5;
+
+	&::before {
+		mix-blend-mode: var(--blend-mode);
+		content: '';
+		display: block;
+		position: absolute;
+		width: 100%;
+		height: 100%;
+		left: -100%;
+		top: 0;
+		background: linear-gradient(
+			to right top,
+			transparent,
+			var(--color-3),
+			transparent,
+			var(--color-3),
+			transparent
+		);
+		animation: mask-running 1s linear infinite;
+	}
+
+	&::after {
+		mix-blend-mode: var(--blend-mode);
+		content: '';
+		display: block;
+		position: absolute;
+		width: 100%;
+		height: 100%;
+		left: 0;
+		top: 0;
+		background: linear-gradient(
+			to right top,
+			transparent,
+			var(--color-3),
+			transparent,
+			var(--color-3),
+			transparent
+		);
+		animation: mask-running 1s linear infinite;
+	}
+}
--- a/packages/page-agent/src/ui/motion-css/readme
+++ b/packages/page-agent/src/ui/motion-css/readme
@@ -0,0 +1,5 @@
+This is the CSS implementation of ai-motion.
+
+Easy to use, but its performance is terrible: it causes full-screen glitching in some browsers.
+
+Use it only in a small area.
--- a/packages/page-agent/src/utils/assert.ts
+++ b/packages/page-agent/src/utils/assert.ts
@@ -0,0 +1,17 @@
+import chalk from 'chalk'
+
+/**
+ * Throw when `condition` is falsy; otherwise do nothing
+ *
+ * On success TypeScript narrows `condition` to a truthy value for the caller.
+ *
+ * @param condition - The value to check
+ * @param message - Error message to use; defaults to `'Assertion failed'`
+ * @param silent - When truthy, the failure is not logged to the console before throwing
+ * @throws Error carrying the message if `condition` is falsy
+ */
+export function assert(condition: unknown, message?: string, silent?: boolean): asserts condition {
+	if (condition) return
+
+	const errorMessage = message ?? 'Assertion failed'
+
+	if (!silent) {
+		console.error(chalk.red(`❌ assert: ${errorMessage}`))
+	}
+
+	throw new Error(errorMessage)
+}
--- a/packages/page-agent/src/utils/bus.ts
+++ b/packages/page-agent/src/utils/bus.ts
@@ -0,0 +1,122 @@
+/**
+ * Type-safe event bus for decoupling PageAgent and Panel
+ */
+import type { Step } from '../ui/UIState'
+
+/**
+ * Event mapping definitions
+ *
+ * Each key is an event name; its `params` type is the single argument passed to
+ * `emit` and received by handlers (`undefined` means the event carries no argument).
+ *
+ * @note Event bus callbacks must be repeatable without errors
+ */
+export interface PageAgentEventMap {
+	// Panel control events
+	// call panel.show()
+	'panel:show': { params: undefined }
+	// call panel.hide()
+	'panel:hide': { params: undefined }
+	// call panel.reset()
+	'panel:reset': { params: undefined }
+	// call panel.update()
+	'panel:update': { params: Omit<Step, 'id' | 'stepNumber' | 'timestamp'> }
+	// call panel.expand()
+	'panel:expand': { params: undefined }
+	// call panel.collapse()
+	'panel:collapse': { params: undefined }
+
+	// PageAgent status events
+	// 'agent:beforeUpdate': { params: undefined }
+	// 'agent:afterUpdate': { params: undefined }
+	// 'agent:execute': { params: { task: string } }
+	// 'agent:done': { params: { text: string; success: boolean } }
+	// 'agent:paused': { params: undefined }
+	// 'agent:resumed': { params: undefined }
+	// 'agent:disposed': { params: undefined }
+	// 'agent:error': { params: { error: string | Error } }
+
+	// Task status change events
+	// 'task:start': { params: { task: string } }
+	// 'task:step': { params: Omit<AgentStep, 'id' | 'stepNumber' | 'timestamp'> }
+	// 'task:complete': { params: { text: string; success: boolean } }
+	// 'task:error': { params: { error: string | Error } }
+
+	// Index signature for dynamic event names
+	// [key: string]: { params: any }
+}
+
+/**
+ * Event handler type definitions
+ *
+ * Resolves to a zero-argument function for events whose `params` is `undefined`,
+ * and to a one-argument function taking the event's `params` otherwise.
+ */
+export type EventHandler<T extends keyof PageAgentEventMap> =
+	PageAgentEventMap[T]['params'] extends undefined
+		? () => void
+		: (params: PageAgentEventMap[T]['params']) => void
+
+/**
+ * Async event handler type definitions
+ *
+ * Same shape as `EventHandler`, but the handler returns a `Promise<void>`.
+ * NOTE(review): not referenced by the EventBus methods in this file - confirm where it is used.
+ */
+export type AsyncEventHandler<T extends keyof PageAgentEventMap> =
+	PageAgentEventMap[T]['params'] extends undefined
+		? () => Promise<void>
+		: (params: PageAgentEventMap[T]['params']) => Promise<void>
+
+/**
+ * Type-safe event bus
+ * @note Mainly used to decouple logic and UI
+ * @note All modules of a PageAgent instance share the same EventBus instance for communication
+ * @note Use with caution if delivery guarantee is needed for logic communication
+ * @note `on` `once` `emit` methods handle built-in events with type protection, use `addEventListener` for other events
+ */
+class EventBus extends EventTarget {
+	/**
+	 * Adapt a typed handler into a DOM listener that unpacks the emitted params
+	 * from the first entry of the CustomEvent's `detail` array
+	 */
+	#toListener<T extends keyof PageAgentEventMap>(handler: EventHandler<T>) {
+		return (e: Event) => {
+			const params = (e as CustomEvent).detail?.[0]
+			return handler(params)
+		}
+	}
+
+	/**
+	 * Subscribe to a built-in event
+	 */
+	on<T extends keyof PageAgentEventMap>(event: T, handler: EventHandler<T>): void {
+		this.addEventListener(event, this.#toListener(handler))
+	}
+
+	/**
+	 * Subscribe to a built-in event for a single delivery
+	 */
+	once<T extends keyof PageAgentEventMap>(event: T, handler: EventHandler<T>): void {
+		this.addEventListener(event, this.#toListener(handler), { once: true })
+	}
+
+	/**
+	 * Dispatch a built-in event; the arguments travel as the CustomEvent's `detail` array
+	 */
+	emit<T extends keyof PageAgentEventMap>(
+		event: T,
+		...args: PageAgentEventMap[T]['params'] extends undefined
+			? []
+			: [PageAgentEventMap[T]['params']]
+	): void {
+		this.dispatchEvent(new CustomEvent(event, { detail: args }))
+	}
+}
+
+const buses = new Map<string, EventBus>()
+
+/**
+ * Look up the event bus registered for `channel`, creating and registering it on first use
+ * @returns The same bus instance for every call with the same channel
+ */
+export function getEventBus(channel: string) {
+	let bus = buses.get(channel)
+	if (!bus) {
+		bus = new EventBus()
+		buses.set(channel, bus)
+	}
+	return bus
+}
+
+export type { EventBus }
--- a/packages/page-agent/src/utils/checkDarkMode.ts
+++ b/packages/page-agent/src/utils/checkDarkMode.ts
@@ -0,0 +1,110 @@
+/**
+ * Look for well-known dark theme markers on the `<html>` and `<body>` elements
+ * @returns True when either element carries a known dark-mode class, or when the
+ * `data-theme` attribute of `<html>` contains "dark" (case-insensitive)
+ */
+function hasDarkModeClass() {
+	const darkClassNames = ['dark', 'dark-mode', 'theme-dark', 'night', 'night-mode']
+
+	const root = document.documentElement
+	const body = document.body
+
+	// Class names on <html> or <body>
+	const hasDarkClass = darkClassNames.some(
+		(name) => root.classList.contains(name) || body.classList.contains(name)
+	)
+	if (hasDarkClass) {
+		return true
+	}
+
+	// Some sites use a data attribute instead
+	const theme = root.getAttribute('data-theme')
+	return theme?.toLowerCase().includes('dark') === true
+}
+
+/**
+ * Extract the red, green and blue channels from an `rgb()` / `rgba()` color string
+ *
+ * Both the legacy comma-separated syntax (`rgb(34, 34, 34)`, `rgba(0, 0, 0, 0.5)`)
+ * and the space-separated syntax (`rgb(34 34 34 / 0.5)`) are accepted, with integer
+ * or fractional channel values. Any alpha component is ignored.
+ *
+ * @param colorString - The CSS color string to parse
+ * @returns The numeric channels, or `null` when the string is not an rgb/rgba color
+ */
+function parseRgbColor(colorString: string) {
+	// One channel: an integer or a decimal number
+	const channel = '(\\d+(?:\\.\\d+)?)'
+	// Channels are separated by a comma and/or whitespace
+	const separator = '\\s*[,\\s]\\s*'
+	const pattern = new RegExp(`rgba?\\(\\s*${channel}${separator}${channel}${separator}${channel}`)
+
+	const rgbMatch = pattern.exec(colorString)
+	if (!rgbMatch) {
+		return null // Not a valid rgb/rgba string
+	}
+	return {
+		r: Number(rgbMatch[1]),
+		g: Number(rgbMatch[2]),
+		b: Number(rgbMatch[3]),
+	}
+}
+
+/**
+ * Decide whether a color counts as dark by comparing its weighted luminance
+ * (0.299 R + 0.587 G + 0.114 B) with a threshold
+ *
+ * NOTE(review): the transparency test is a prefix match, so it also treats
+ * `rgba(0, 0, 0, 0.x)` (partially transparent black) as transparent - confirm this is intended.
+ *
+ * @param colorString - The CSS color string (e.g., "rgb(50, 50, 50)")
+ * @param threshold - Luminance (0-255) below which a color is dark. Default is 128.
+ * @returns True if the color is dark; false for empty, transparent or unparsable colors
+ */
+function isColorDark(colorString: string, threshold = 128) {
+	const isTransparent =
+		!colorString || colorString === 'transparent' || colorString.startsWith('rgba(0, 0, 0, 0)')
+
+	// Transparent and unparsable colors are never dark
+	const rgb = isTransparent ? null : parseRgbColor(colorString)
+	if (!rgb) {
+		return false
+	}
+
+	const { r, g, b } = rgb
+	return 0.299 * r + 0.587 * g + 0.114 * b < threshold
+}
+
+/**
+ * Decide whether the page background is dark from the computed background colors
+ * of `<body>` and, when the body is transparent, of `<html>`
+ * @returns True when the effective background color is dark
+ */
+function isBackgroundDark() {
+	// Some pages set the color on <html> rather than <body>, so read both
+	const htmlBgColor = window.getComputedStyle(document.documentElement).backgroundColor
+	const bodyBgColor = window.getComputedStyle(document.body).backgroundColor
+
+	if (isColorDark(bodyBgColor)) {
+		return true
+	}
+
+	// Only a transparent body defers to the <html> background
+	const bodyIsTransparent =
+		bodyBgColor === 'transparent' || bodyBgColor.startsWith('rgba(0, 0, 0, 0)')
+	return bodyIsTransparent ? isColorDark(htmlBgColor) : false
+}
+
+/**
+ * Heuristically determine whether the page currently uses a dark theme
+ *
+ * Two strategies are tried in order: well-known dark-mode classes/attributes,
+ * then the computed background color.
+ *
+ * @returns True if the page is likely dark
+ */
+export function isPageDark() {
+	// @TODO add more checks here, e.g., analyzing text color,
+	// or checking the background of major layout elements like <main> or #app.
+	return hasDarkModeClass() || isBackgroundDark()
+}
--- a/packages/page-agent/src/utils/index.ts
+++ b/packages/page-agent/src/utils/index.ts
@@ -0,0 +1,80 @@
+/**
+ * Poll `check` every 100 ms (after one immediate check) until it returns true.
+ * @param timeout Maximum wait in milliseconds; defaults to one hour.
+ * @returns Resolves with `true` once the condition holds; rejects with an Error on timeout.
+ */
+export async function waitUntil(check: () => boolean, timeout = 60 * 60 * 1000): Promise<boolean> {
+	if (check()) return true
+
+	return new Promise((resolve, reject) => {
+		const start = Date.now()
+		const interval = setInterval(() => {
+			if (check()) {
+				clearInterval(interval)
+				resolve(true)
+			} else if (Date.now() - start > timeout) {
+				clearInterval(interval)
+				reject(new Error('Timeout waiting for condition to become true'))
+			}
+		}, 100)
+	})
+}
+
+//
+
+/** Shorten `text` to its first `maxLength` characters followed by '...'; shorter text is returned as is. */
+export function truncate(text: string, maxLength: number): string {
+	const isTooLong = text.length > maxLength
+	// Text that already fits is handed back untouched, without an ellipsis.
+	return isTooLong ? `${text.substring(0, maxLength)}...` : text
+}
+
+//
+
+/** Strip leading and trailing whitespace from every '\n'-separated line of `text`. */
+export function trimLines(text: string): string {
+	const cleaned: string[] = []
+	for (const rawLine of text.split('\n')) cleaned.push(rawLine.trim())
+	return cleaned.join('\n')
+}
+
+//
+
+/**
+ * Produce a short random base-36 identifier.
+ * @param existingIDs Optional list of taken IDs; the returned ID is never one of them.
+ * @throws Error when more than 1000 regenerations were needed to avoid a collision.
+ */
+export function randomID(existingIDs?: string[]): string {
+	const generate = () => Math.random().toString(36).substring(2, 11)
+	let candidate = generate()
+	if (!existingIDs) return candidate
+
+	const MAX_TRY = 1000
+	for (let attempts = 1; existingIDs.includes(candidate); attempts++) {
+		candidate = generate()
+		if (attempts > MAX_TRY) {
+			throw new Error('randomID: too many try')
+		}
+	}
+
+	return candidate
+}
+
+//
+
+// Reuse the ID list already stored on `window`, or create an empty one
+// when none exists yet.
+window.__PAGE_AGENT_IDS__ ||= []
+
+const ids = window.__PAGE_AGENT_IDS__
+
+/**
+ * Generate a random ID and record it in the list shared through `window`.
+ * @note Unique within this window.
+ */
+export function uid() {
+	const freshID = randomID(ids)
+	ids.push(freshID)
+	return freshID
+}
--- a/packages/page-agent/tsconfig.json
+++ b/packages/page-agent/tsconfig.json
@@ -0,0 +1,10 @@
+{
+	"extends": "../../tsconfig.json",
+	"compilerOptions": {
+		"composite": true,
+		"noEmit": false,
+		"outDir": "./dist",
+		"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.tsbuildinfo"
+	},
+	"include": ["src", "env.d.ts"]
+}
--- a/packages/page-agent/vite.config.js
+++ b/packages/page-agent/vite.config.js
@@ -0,0 +1,85 @@
+// @ts-check
+import chalk from 'chalk'
+import 'dotenv/config'
+import process from 'node:process'
+import { dirname, resolve } from 'path'
+import dts from 'unplugin-dts/vite'
+import { fileURLToPath } from 'url'
+import { defineConfig } from 'vite'
+import cssInjectedByJsPlugin from 'vite-plugin-css-injected-by-js'
+
+const __dirname = dirname(fileURLToPath(import.meta.url)) // directory of this config file, derived from the module URL
+
+// ============================================================================
+// Library Config (ES Module for NPM Package)
+// ============================================================================
+/** @type {import('vite').UserConfig} */
+const libConfig = {
+	clearScreen: false,
+	plugins: [
+		dts({ tsconfigPath: './tsconfig.json', bundleTypes: true }),
+		cssInjectedByJsPlugin({ relativeCSSInjection: true }),
+	],
+	publicDir: false,
+	esbuild: {
+		keepNames: true,
+	},
+	build: {
+		lib: {
+			entry: resolve(__dirname, 'src/PageAgent.ts'),
+			name: 'PageAgent',
+			fileName: 'page-agent',
+			formats: ['es'], // ES module output only
+		},
+		outDir: resolve(__dirname, 'dist', 'lib'), // package.json "main"/"module"/"types" point into dist/lib
+		rollupOptions: {
+			external: ['ai', 'ai-motion', 'chalk', 'zod'], // left as imports instead of being bundled
+		},
+		minify: false,
+		sourcemap: true,
+		cssCodeSplit: true,
+	},
+	define: {
+		'process.env.NODE_ENV': '"production"', // inner quotes make the replacement a string literal
+	},
+}
+
+// ============================================================================
+// UMD Config (Browser Bundle for CDN)
+// ============================================================================
+/** @type {import('vite').UserConfig} */
+const umdConfig = {
+	plugins: [cssInjectedByJsPlugin({ relativeCSSInjection: true })],
+	publicDir: false,
+	esbuild: {
+		keepNames: true,
+	},
+	build: {
+		lib: {
+			entry: resolve(__dirname, 'src/entry.ts'), // separate entry from the library build (src/PageAgent.ts)
+			name: 'PageAgent', // global name for the UMD bundle
+			fileName: 'page-agent',
+			formats: ['umd'],
+		},
+		outDir: resolve(__dirname, 'dist', 'umd'), // kept apart from the library output in dist/lib
+		cssCodeSplit: true,
+	},
+	define: {
+		'process.env.NODE_ENV': '"production"', // inner quotes make the replacement a string literal
+	},
+}
+
+// ============================================================================
+
+const MODE = process.env.MODE
+
+console.log(chalk.cyan(`📦 Build mode: ${chalk.bold(MODE || 'lib')}`))
+
+let config
+if (MODE === 'umd') {
+	config = umdConfig
+} else {
+	config = libConfig
+}
+
+export default defineConfig(config)