feat: init

This commit is contained in:
Simon
2025-09-29 16:33:15 +08:00
parent e8041e0582
commit 847620b5e8
98 changed files with 20166 additions and 0 deletions

524
src/PageAgent.ts Normal file
View File

@@ -0,0 +1,524 @@
/**
* Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved.
*/
import { tool } from 'ai'
import type { LanguageModelUsage, ToolSet } from 'ai'
import chalk from 'chalk'
import zod from 'zod'
import type { PageAgentConfig } from './config'
import { MACRO_TOOL_NAME, MAX_STEPS, VIEWPORT_EXPANSION } from './config/constants'
import * as dom from './dom'
import { FlatDomTree, InteractiveElementDomNode } from './dom/dom_tree/type'
import { getPageInfo } from './dom/getPageInfo'
import { I18n } from './i18n'
import { LLM } from './llms'
import { patchReact } from './patches/react'
import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
import { tools } from './tools'
import { Panel, getToolCompletedText, getToolExecutingText } from './ui/Panel'
import { SimulatorMask } from './ui/SimulatorMask'
import { trimLines, uid, waitUntil } from './utils'
import { assert } from './utils/assert'
import { getEventBus } from './utils/bus'
export type { PageAgentConfig }
/**
 * Reasoning fields the model emits together with every action.
 * They are replayed to the model as part of `<agent_history>` on later steps.
 */
export interface AgentBrain {
// thinking?: string
/** The model's evaluation of the previous step. */
evaluation_previous_goal: string
/** Free-form notes the model carries from step to step. */
memory: string
/** What the model wants to achieve with the next action. */
next_goal: string
}
/**
 * Record of one completed agent step: the model's reasoning, the action it
 * chose (with the tool's input and output) and the token usage of the call.
 */
export interface AgentHistory {
brain: AgentBrain
action: {
/** Name of the tool selected inside the macro tool's `action` object. */
name: string
/** Input the model supplied for that tool. */
input: any
/** Value returned by the macro tool's `execute` for this step. */
output: any
}
usage: LanguageModelUsage
}
/**
 * Result returned by `PageAgent.execute`.
 */
export interface ExecutionResult {
/** Whether the task finished successfully. */
success: boolean
/** Final text from the `done` action, or the failure message. */
data: string
/** All steps recorded during the run. */
history: AgentHistory[]
}
export class PageAgent extends EventTarget {
config: PageAgentConfig
id = uid()
bus = getEventBus(this.id)
i18n: I18n
paused = false
disposed = false
task = ''
#llm: LLM
#totalWaitTime = 0
#abortController = new AbortController()
/** Corresponds to eval_page in browser-use */
flatTree: FlatDomTree | null = null
/**
* All highlighted index-mapped interactive elements
* Corresponds to DOMState.selector_map in browser-use
*/
selectorMap = new Map<number, InteractiveElementDomNode>()
/** highlight index -> element text */
elementTextMap = new Map<number, string>()
/** Corresponds to clickable_elements_to_string in browser-use */
simplifiedHTML = '<EMPTY>'
/** last time the tree was updated */
lastTimeUpdate = 0
/** Corresponds to actions in browser-use */
tools = new Map(tools)
/** Fullscreen mask */
mask = new SimulatorMask()
/** Interactive panel */
panel = new Panel(this)
/** History records */
history: AgentHistory[] = []
/**
 * Create an agent.
 * @param config LLM, DOM and UI options; every field is optional here.
 */
constructor(config: PageAgentConfig = {}) {
super()
this.config = config
// The LLM client receives this agent's id in addition to the config.
this.#llm = new LLM(this.config, this.id)
this.i18n = new I18n(this.config.language)
patchReact(this)
}
/**
 * Run the agent loop for `task`.
 *
 * Each iteration sends the system prompt and the assembled user prompt to the
 * LLM, which must answer with one macro-tool call; the call is executed and
 * recorded in `this.history`. The loop ends when
 * - the model selects the `done` action,
 * - `MAX_STEPS` steps have been executed without `done`, or
 * - anything throws (including an abort), which is reported as a failure.
 *
 * @param task Natural-language description of what to do. Must be non-empty.
 * @returns The outcome, the final text and the recorded step history.
 * @throws Error when `task` is empty. Failures inside the loop are not thrown;
 * they are returned as `{ success: false }`.
 */
async execute(task: string): Promise<ExecutionResult> {
  if (!task) throw new Error('Task is required')
  this.task = task
  // Show mask and panel
  this.mask.show()
  this.bus.emit('panel:show')
  this.bus.emit('panel:reset')
  this.bus.emit('panel:update', {
    type: 'input',
    displayText: task,
  })
  // Cancel whatever a previous run may still have in flight and start over
  // with a fresh signal.
  if (this.#abortController) {
    this.#abortController.abort()
    this.#abortController = new AbortController()
  }
  this.history = []
  try {
    let step = 0
    while (true) {
      console.group(`step: ${step + 1}`)
      let action: AgentHistory['action']
      try {
        // abort
        if (this.#abortController.signal.aborted) throw new Error('AbortError')
        // pause
        await waitUntil(() => !this.paused)
        // Update status to thinking
        console.log(chalk.blue('Thinking...'))
        this.bus.emit('panel:update', {
          type: 'thinking',
          displayText: this.i18n.t('ui.panel.thinking'),
        })
        const result = await this.#llm.invoke(
          [
            {
              role: 'system',
              content: this.#getSystemPrompt(),
            },
            {
              role: 'user',
              content: this.#assembleUserPrompt(),
            },
          ],
          this.#packMacroTool(),
          this.#abortController.signal
        )
        const toolResult = result.toolResult
        const input = toolResult.input
        const output = toolResult.output
        const brain = {
          thinking: input.thinking,
          evaluation_previous_goal: input.evaluation_previous_goal,
          memory: input.memory,
          next_goal: input.next_goal,
        }
        // The macro tool's `action` is an object with a single key: the tool name.
        const actionName = Object.keys(input.action)[0]
        action = {
          name: actionName,
          input: input.action[actionName],
          output: output,
        }
        this.history.push({
          brain,
          action,
          usage: result.usage,
        })
        console.log(chalk.green('Step finished:'), actionName)
      } finally {
        // Always close the console group, even when the step throws,
        // so failed runs do not leave nested groups open.
        console.groupEnd()
      }
      step++
      // Check for completion first so a `done` on the last allowed step counts.
      if (action.name === 'done') {
        const success = action.input.success || false
        const text = action.input.text || 'no text provided'
        console.log(chalk.green.bold('Task completed'), success, text)
        this.#onDone(text, success)
        return {
          success,
          data: text,
          history: this.history,
        }
      }
      // Stop after exactly MAX_STEPS executed steps, matching the
      // "Step N of MAX_STEPS" budget announced in the prompt.
      if (step >= MAX_STEPS) {
        this.#onDone('Step count exceeded maximum limit', false)
        return {
          success: false,
          data: 'Step count exceeded maximum limit',
          history: this.history,
        }
      }
    }
  } catch (error: unknown) {
    console.error('Task failed', error)
    this.#onDone(String(error), false)
    return {
      success: false,
      data: String(error),
      history: this.history,
    }
  }
}
/**
 * Merge all tools into a single MacroTool with the following input:
 * - thinking: string
 * - evaluation_previous_goal: string
 * - memory: string
 * - next_goal: string
 * - action: { toolName: toolInput }
 * where action must be selected from tools defined in this.tools
 *
 * @topic Should all tools be merged into one?
 * @facts
 * - The model has to return its reasoning (evaluation/memory/goal) on every step.
 * - browser-use merges everything into one huge tool:
 * ```json
 * {
 *   "memory": "...",
 *   "goal": "...",
 *   "actions": [
 *     {
 *       "name": "...",
 *       "args": "..."
 *     }
 *     // ...
 *   ]
 * }
 * ```
 * - qwen currently needs an explicit function name to guarantee a tool call.
 * @reasoning
 * - The system must not be designed around qwen's shortcomings.
 * - A more complex tool is more error-prone.
 * - Separate tools make it easier to use ai-sdk's retry mechanism and to handle errors.
 * - This data cannot be produced in an extra step: it would be too slow, and
 *   things like the goal have to be generated together with the call.
 * @options
 * - Plan @A
 *   - Do exactly what browser-use does: one big tool that is required on every call.
 *   - Makes the tool definition very complex and raises the error rate.
 * - Plan @B
 *   - Call two tools each time, one of them only to output the reasoning.
 *   - Such a complex rule is hard to enforce through the prompt.
 * - Plan @C
 *   - Automatically add fixed reasoning/memory/goal inputs to every tool and
 *     intercept/extract them automatically.
 *   - Makes every tool definition long.
 * @conclusion
 * - Use @A
 */
#packMacroTool(): ToolSet {
  const registry = this.tools
  // A discriminated-union schema ({ name: literal, input }) was tried first and
  // had a success rate of ~0: the model seems unable to understand it.
  // A plain union of single-key objects ({ [toolName]: input }) is used instead.
  const actionSchema = zod.union(
    Array.from(registry.entries()).map(([name, definition]) =>
      zod.object({
        [name]: definition.inputSchema,
      })
    )
  )
  const macroTool = tool({
    inputSchema: zod.object({
      // thinking: zod.string().optional(),
      evaluation_previous_goal: zod.string().optional(),
      memory: zod.string().optional(),
      next_goal: zod.string().optional(),
      action: actionSchema,
    }),
    execute: async (input, options) => {
      // Honour abort and pause before touching the page.
      if (this.#abortController.signal.aborted) throw new Error('AbortError')
      await waitUntil(() => !this.paused)
      console.log(chalk.blue.bold('MacroTool execute'), input)
      // `action` carries exactly one key: the name of the tool to run.
      const requested = input.action!
      const toolName = Object.keys(requested)[0]
      const toolInput = requested[toolName]
      const brainText = trimLines(`✅: ${input.evaluation_previous_goal}
💾: ${input.memory}
🎯: ${input.next_goal}
`)
      console.log(brainText)
      this.bus.emit('panel:update', {
        type: 'thinking',
        displayText: brainText,
      })
      // Look up the tool the model asked for.
      const target = registry.get(toolName)
      assert(target, `Tool ${toolName} not found. (@note should have been caught before this!!!)`)
      console.log(chalk.blue.bold(`Executing tool: ${toolName}`), toolInput, options)
      this.bus.emit('panel:update', {
        type: 'tool_executing',
        toolName,
        toolArgs: toolInput,
        displayText: getToolExecutingText(toolName, toolInput, this.i18n),
      })
      // Run the tool with this agent as `this`, forwarding the call options.
      const startedAt = Date.now()
      let result = await target.execute!.call(this, toolInput, options)
      const duration = Date.now() - startedAt
      console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result)
      if (toolName !== 'wait') {
        // Any other tool resets the accumulated wait time.
        this.#totalWaitTime = 0
      } else {
        // NOTE(review): if the wait tool itself sleeps for `seconds`, `duration`
        // already contains that time and it is counted twice here - confirm.
        this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000)
        const notice = [`\n<sys> You have waited ${this.#totalWaitTime} seconds accumulatively.`]
        if (this.#totalWaitTime >= 3) {
          notice.push('\nDo NOT wait any longer unless you have a good reason.\n')
        }
        notice.push('</sys>')
        result += notice.join('')
      }
      // Briefly show the outcome in the panel.
      const completedText = getToolCompletedText(toolName, toolInput, this.i18n)
      if (completedText) {
        this.bus.emit('panel:update', {
          type: 'tool_executing',
          toolName,
          toolArgs: toolInput,
          toolResult: result,
          displayText: completedText,
          duration,
        })
      }
      // Give the user a moment to see the result.
      await new Promise((resolve) => setTimeout(resolve, 100))
      return result
    },
  })
  return {
    [MACRO_TOOL_NAME]: macroTool,
  }
}
/**
 * Return the system prompt with its "Default working language" marker set
 * from the configured language: 中文 for `zh-CN`, English for anything else.
 */
#getSystemPrompt(): string {
  const languageLabel = this.config.language === 'zh-CN' ? '中文' : 'English'
  // Only the first occurrence of the marker is rewritten.
  return SYSTEM_PROMPT.replace(
    /Default working language: \*\*.*?\*\*/,
    `Default working language: **${languageLabel}**`
  )
}
/**
 * Build the user prompt for the next step. It has three sections, in order:
 * `<agent_history>` (one `<step_N>` per recorded step), `<agent_state>`
 * (the task plus step/time info) and `<browser_state>`.
 */
#assembleUserPrompt(): string {
  // <agent_history> with one <step_N> entry per finished step
  const stepEntries = this.history.map((entry, position) => {
    const stepNumber = position + 1
    return `<step_${stepNumber}>
Evaluation of Previous Step: ${entry.brain.evaluation_previous_goal}
Memory: ${entry.brain.memory}
Next Goal: ${entry.brain.next_goal}
Action Results: ${entry.action.output}
</step_${stepNumber}>
`
  })
  const historySection = '<agent_history>\n' + stepEntries.join('') + '</agent_history>\n\n'
  // <agent_state> with <user_request> and <step_info>
  const stateSection = `<agent_state>
<user_request>
${this.task}
</user_request>
<step_info>
Step ${this.history.length + 1} of ${MAX_STEPS} max possible steps
Current date and time: ${new Date().toISOString()}
</step_info>
</agent_state>
`
  // <browser_state> is produced last; building it also refreshes the DOM tree.
  const browserSection = this.#getBrowserState()
  return trimLines(historySection + stateSection + browserSection)
}
/**
 * Finish a run: remove element highlights, report the result in the panel,
 * hide the mask and abort the current controller.
 * @param text Final message to display.
 * @param success Selects the panel entry type: 'output' or 'error'.
 */
#onDone(text: string, success = true) {
dom.cleanUpHighlights()
// Update panel status
this.bus.emit('panel:update', {
type: success ? 'output' : 'error',
displayText: text,
})
// Task completed
this.bus.emit('panel:update', {
type: 'completed',
displayText: this.i18n.t('ui.panel.taskCompleted'),
})
this.mask.hide()
// Aborting here signals anything still using this run's signal.
this.#abortController.abort()
}
/**
 * Render the `<browser_state>` prompt section: page title/URL, scroll and
 * size info, and the simplified list of interactive elements.
 * Refreshes the DOM tree as a side effect.
 */
#getBrowserState(): string {
  const pageUrl = window.location.href
  const pageTitle = document.title
  const pi = getPageInfo()
  this.#updateTree()
  const header = trimLines(`<browser_state>
Current Page: [${pageTitle}](${pageUrl})
Page info: ${pi.viewport_width}x${pi.viewport_height}px viewport, ${pi.page_width}x${pi.page_height}px total page size, ${pi.pages_above.toFixed(1)} pages above, ${pi.pages_below.toFixed(1)} pages below, ${pi.total_pages.toFixed(1)} total pages, at ${(pi.current_page_position * 100).toFixed(0)}% of page
${VIEWPORT_EXPANSION === -1 ? 'Interactive elements from top layer of the current page (full page):' : 'Interactive elements from top layer of the current page inside the viewport:'}
`)
  // Scroll hints are only shown when the element list is limited to the
  // viewport and more than 4px of content is hidden in that direction.
  const showScrollHints = VIEWPORT_EXPANSION !== -1
  const topMarker =
    pi.pixels_above > 4 && showScrollHints
      ? `... ${pi.pixels_above} pixels above (${pi.pages_above.toFixed(1)} pages) - scroll to see more ...\n`
      : `[Start of page]\n`
  const bottomMarker =
    pi.pixels_below > 4 && showScrollHints
      ? `... ${pi.pixels_below} pixels below (${pi.pages_below.toFixed(1)} pages) - scroll to see more ...\n`
      : `[End of page]\n`
  return [header, topMarker, this.simplifiedHTML, `\n`, bottomMarker, `</browser_state>\n`].join('')
}
/**
 * Rebuild the DOM snapshot used by the prompt and by the tools.
 *
 * Dispatches `beforeUpdate`, clears old highlights, re-extracts the flat tree
 * and then refreshes `simplifiedHTML`, `selectorMap` and `elementTextMap`
 * before dispatching `afterUpdate`.
 */
#updateTree() {
  this.dispatchEvent(new Event('beforeUpdate'))
  this.lastTimeUpdate = Date.now()
  dom.cleanUpHighlights()
  // The mask is made click-through while the tree is extracted.
  // NOTE(review): presumably so hit-testing (elementFromPoint) sees the page
  // rather than the mask - confirm against dom_tree.
  const maskStyle = this.mask.wrapper.style
  maskStyle.pointerEvents = 'none'
  try {
    this.flatTree = dom.getFlatTree({
      ...this.config,
      interactiveBlacklist: [
        ...(this.config.interactiveBlacklist || []),
        ...document.querySelectorAll('[data-page-agent-not-interactive]').values(),
      ],
    })
  } finally {
    // Restore even when extraction throws, otherwise the mask would stay
    // click-through for the rest of the session.
    maskStyle.pointerEvents = 'auto'
  }
  this.simplifiedHTML = dom.flatTreeToString(this.flatTree, this.config.include_attributes)
  // Clear the old maps before replacing them so stale element references are dropped.
  this.selectorMap.clear()
  this.selectorMap = dom.getSelectorMap(this.flatTree)
  this.elementTextMap.clear()
  this.elementTextMap = dom.getElementTextMap(this.simplifiedHTML)
  this.dispatchEvent(new Event('afterUpdate'))
}
/**
 * Tear the agent down: mark it disposed, remove highlights, drop the DOM
 * snapshot and history, dispose the panel and mask, and abort the current run.
 */
dispose() {
console.log('Disposing PageAgent...')
this.disposed = true
dom.cleanUpHighlights()
this.flatTree = null
this.selectorMap.clear()
this.elementTextMap.clear()
this.panel.dispose()
this.mask.dispose()
this.history = []
// The string is passed as the abort reason.
this.#abortController.abort('PageAgent disposed')
}
}

49
src/config/constants.ts Normal file
View File

@@ -0,0 +1,49 @@
/**
 * Viewport expansion passed to the DOM extractor.
 * @note Since isTopElement depends on elementFromPoint,
 * it returns null when out of viewport, this feature has no practical use, only differ between -1 and 0
 */
// export const VIEWPORT_EXPANSION = 100
export const VIEWPORT_EXPANSION = -1
// models
// 🥇 GPT-4.1 (best so far)
export const DEFAULT_MODEL_NAME: string = 'gpt-41-mini-0414-global' // baseline 🌟
// export const DEFAULT_MODEL_NAME: string = 'gpt-41-0414-global' // unnecessary
// 🤞 qwen (tool call format often irregular)
// export const DEFAULT_MODEL_NAME: string = 'qwen-plus-latest' // okay
// export const DEFAULT_MODEL_NAME: string = 'qwen-turbo-latest' // BAD☠
// 👍 Anthropic
// export const DEFAULT_MODEL_NAME: string = 'claude_sonnet4'
// 👌 DeepSeek
// export const DEFAULT_MODEL_NAME: string = 'DeepSeek-V3-671B'
// export const DEFAULT_MODEL_NAME: string = 'deepseek-v3.1'
// export const DEFAULT_MODEL_NAME: string = 'deepseek-v3'
// ☠️❌🙂‍↔️ GPT-5 (slow as hell)
// export const DEFAULT_MODEL_NAME: string = '_gpt-5-nano-0807-global'
// export const DEFAULT_MODEL_NAME: string = '_gpt-5-mini-0807-global'
// export const DEFAULT_MODEL_NAME: string = '_gpt-5-0807-global'
// ❌ Gemini (incapable tool call json schema)
// @todo need a special client for gemini
// export const DEFAULT_MODEL_NAME: string = 'gemini-2.5-pro-06-17'
// export const DEFAULT_MODEL_NAME: string = import.meta.env.OPEN_ROUTER_MODEL!
// ak (API key); the default is a placeholder value
export const DEFAULT_API_KEY: string = 'not-needed'
// export const DEFAULT_API_KEY: string = import.meta.env.OPEN_ROUTER_KEY!
// base url; the default points at a local endpoint
export const DEFAULT_BASE_URL: string = 'http://localhost:3000/api/agent'
// export const DEFAULT_BASE_URL: string = import.meta.env.OPEN_ROUTER_BASE_URL!
// internal
// Key under which the single merged tool is exposed to the model.
export const MACRO_TOOL_NAME = 'AgentOutput' as const
// NOTE(review): presumably the retry count for LLM calls; not used in this file - confirm.
export const LLM_MAX_RETRIES = 2
// Step budget for one task; also quoted to the model in the prompt.
export const MAX_STEPS = 20

10
src/config/index.ts Normal file
View File

@@ -0,0 +1,10 @@
import type { DomConfig } from '@/dom'
import type { SupportedLanguage } from '@/i18n'
import type { LLMConfig } from '@/llms'
/** UI-related options. */
export interface UIConfig {
// theme?: 'light' | 'dark'
/** Language used for UI texts and for the prompt's working language. */
language?: SupportedLanguage
}
/** Full agent configuration: LLM, DOM-extraction and UI options combined. */
export type PageAgentConfig = LLMConfig & DomConfig & UIConfig

1685
src/dom/dom_tree/index.js Normal file

File diff suppressed because it is too large Load Diff

51
src/dom/dom_tree/type.ts Normal file
View File

@@ -0,0 +1,51 @@
// FlatDomTree: a flattened DOM tree structure, suited to efficient storage and traversal of the page structure.
// Every node is indexed through `map`; text nodes and element nodes are supported, and fields distinguish undefined from false.
export interface FlatDomTree {
/** Key in `map` of the root node. */
rootId: string
/** All nodes, keyed by node id; children are referenced by id. */
map: Record<string, DomNode>
}
/** Any node that can appear in a FlatDomTree. */
export type DomNode = TextDomNode | ElementDomNode | InteractiveElementDomNode
/** A text node; identified by `type: 'TEXT_NODE'`. */
export interface TextDomNode {
type: 'TEXT_NODE'
text: string
isVisible: boolean
// Other optional fields
[key: string]: unknown
}
/** An element node that is not interactive (`isInteractive` is absent or false). */
export interface ElementDomNode {
tagName: string
attributes?: Record<string, string>
xpath?: string
/** Ids of child nodes in the owning tree's `map`. */
children?: string[]
isVisible?: boolean
isTopElement?: boolean
isInViewport?: boolean
isNew?: boolean
isInteractive?: false
highlightIndex?: number
extra?: Record<string, any>
// Other optional fields
[key: string]: unknown
}
/** An interactive element node; always carries a highlight index and a DOM reference. */
export interface InteractiveElementDomNode {
tagName: string
attributes?: Record<string, string>
xpath?: string
/** Ids of child nodes in the owning tree's `map`. */
children?: string[]
isVisible?: boolean
isTopElement?: boolean
isInViewport?: boolean
isInteractive: true
highlightIndex: number
/**
* DOM reference of the interactive element
*/
ref: HTMLElement
// Other optional fields
[key: string]: unknown
}

42
src/dom/getPageInfo.ts Normal file
View File

@@ -0,0 +1,42 @@
/**
 * Measure the current viewport, page size and scroll position.
 *
 * Pixel values come straight from `window` / `document`; the `pages_*`
 * values express the same distances in viewport heights (0 when the viewport
 * height is 0). `current_page_position` is the scroll offset divided by the
 * scrollable height (at least 1 to avoid dividing by zero).
 */
export function getPageInfo() {
  const { innerWidth: viewport_width, innerHeight: viewport_height } = window
  const root = document.documentElement
  const body = document.body
  const page_width = Math.max(root.scrollWidth, body.scrollWidth || 0)
  const page_height = Math.max(root.scrollHeight, body.scrollHeight || 0)
  const scroll_x = window.scrollX || window.pageXOffset || root.scrollLeft || 0
  const scroll_y = window.scrollY || window.pageYOffset || root.scrollTop || 0
  const pixels_below = Math.max(0, page_height - (viewport_height + scroll_y))
  const pixels_right = Math.max(0, page_width - (viewport_width + scroll_x))
  // Convert a pixel distance into a number of viewport heights.
  const inViewports = (pixels: number): number =>
    viewport_height > 0 ? pixels / viewport_height : 0
  return {
    // Current viewport dimensions
    viewport_width,
    viewport_height,
    // Total page dimensions
    page_width,
    page_height,
    // Current scroll position
    scroll_x,
    scroll_y,
    pixels_above: scroll_y,
    pixels_below,
    pages_above: inViewports(scroll_y),
    pages_below: inViewports(pixels_below),
    total_pages: inViewports(page_height),
    current_page_position: scroll_y / Math.max(1, page_height - viewport_height),
    pixels_left: scroll_x,
    pixels_right,
  }
}

475
src/dom/index.ts Normal file
View File

@@ -0,0 +1,475 @@
import { VIEWPORT_EXPANSION } from '@/config/constants'
import domTree from '@/dom/dom_tree/index'
import {
ElementDomNode,
FlatDomTree,
InteractiveElementDomNode,
TextDomNode,
} from '@/dom/dom_tree/type'
/** Options controlling DOM extraction and how elements are rendered for the model. */
export interface DomConfig {
/** Elements (or getters returning one) handed to the extractor as its blacklist. */
interactiveBlacklist?: (Element | (() => Element))[]
/** Elements (or getters returning one) handed to the extractor as its whitelist. */
interactiveWhitelist?: (Element | (() => Element))[]
/** Extra attribute names to print, in addition to the built-in defaults. */
include_attributes?: string[]
/** Highlight opacity; defaults to 0.0 when omitted. */
highlightOpacity?: number
/** Highlight label opacity; defaults to 0.1 when omitted. */
highlightLabelOpacity?: number
}
/**
* Used to detect whether an interactive element has newly appeared.
* Maps each element already seen to the URL at which it was first seen.
*/
const newElementsCache = new WeakMap<HTMLElement, string>()
/**
 * Turn a list of elements-or-getters into concrete elements, preserving order.
 * Getters are invoked once each.
 */
function resolveElementList(items: (Element | (() => Element))[] | undefined): Element[] {
  return (items || []).map((item) => (typeof item === 'function' ? item() : item))
}

/**
 * Extract the flat DOM tree of the current page, highlight interactive
 * elements and flag the ones that have not been seen before with `isNew`.
 * @param config Blacklist/whitelist and highlight opacity options.
 */
export function getFlatTree(config: DomConfig): FlatDomTree {
  const interactiveBlacklist = resolveElementList(config.interactiveBlacklist)
  const interactiveWhitelist = resolveElementList(config.interactiveWhitelist)
  const tree = domTree({
    doHighlightElements: true,
    debugMode: true,
    focusHighlightIndex: -1,
    viewportExpansion: VIEWPORT_EXPANSION,
    interactiveBlacklist,
    interactiveWhitelist,
    highlightOpacity: config.highlightOpacity ?? 0.0,
    highlightLabelOpacity: config.highlightLabelOpacity ?? 0.1,
  }) as FlatDomTree
  const currentUrl = window.location.href
  /**
   * Mark newly appeared elements.
   * @todo browser-use hashes position, attributes, etc. to decide whether two
   * elements are the same, which handles 1. elements removed and re-added and
   * 2. page unloads. Kept simple here for now.
   */
  for (const node of Object.values(tree.map)) {
    if (!(node.isInteractive && node.ref)) continue
    const element = node.ref as HTMLElement
    // @note Comparing against the stored URL would be too strict: elements can
    // persist across pages, so only "ever seen" is checked.
    if (newElementsCache.has(element)) continue
    newElementsCache.set(element, currentUrl)
    node.isNew = true
  }
  return tree
}
/**
* Node type used internally when converting the flat tree to text:
* a nested tree with parent links, covering both text and element nodes.
*/
interface TreeNode {
type: 'text' | 'element'
parent: TreeNode | null
children: TreeNode[]
isVisible: boolean
// Text node properties
text?: string
// Element node properties
tagName?: string
attributes?: Record<string, string>
isInteractive?: boolean
isTopElement?: boolean
isNew?: boolean
highlightIndex?: number
extra?: Record<string, any>
}
/**
 * Counterpart of `views::clickable_elements_to_string` in the Python
 * browser-use project: renders the DOM snapshot as text suited to an LLM.
 *
 * @example
 * ``` text
 * [0]<a aria-label=page-agent.js home />
 * [1]<div >P />
 * [2]<div >page-agent.js
 * UI Agent in your webpage />
 * [3]<a >Docs />
 * UI Agent in your webpage
 * [5]<a role=button>Quick start />
 * No backend required
 * ```
 * Interactive elements are prefixed with their index so the LLM can refer to
 * them; elements seen for the first time are prefixed with `*`.
 * Indentation expresses nesting below an indexed element.
 * Plain text is listed as-is.
 *
 * @param flatTree Snapshot produced by `getFlatTree`.
 * @param include_attributes Extra attribute names to print in addition to the
 * defaults. Names that are already defaults are not duplicated.
 * @returns One line per indexed element or free text node, joined with `\n`;
 * an empty string when the root node is missing.
 *
 * @todo sensitive-data filter
 */
export function flatTreeToString(flatTree: FlatDomTree, include_attributes?: string[]): string {
  const DEFAULT_INCLUDE_ATTRIBUTES = [
    'title',
    'type',
    'checked',
    'name',
    'role',
    'value',
    'placeholder',
    'data-date-format',
    'alt',
    'aria-label',
    'aria-expanded',
    'data-state',
    'aria-checked',
    // @edit added for better form handling
    'id',
    'for',
    // for jump check
    'target',
    // absolutely positioned dropdown menus
    'aria-haspopup',
    'aria-controls',
    'aria-owns',
  ]
  // De-duplicate the attribute list. Without this, a caller-supplied name that
  // is also a default would be visited twice by the duplicate-value pass
  // below and removed as a "duplicate" of itself.
  const includeAttrs = Array.from(
    new Set([...(include_attributes || []), ...DEFAULT_INCLUDE_ATTRIBUTES])
  )

  // Cap text length, marking truncation with an ellipsis.
  const capTextLength = (text: string, maxLength: number): string =>
    text.length > maxLength ? text.substring(0, maxLength) + '...' : text

  // Build the nested tree from the flat map (parent links are set afterwards).
  const buildTreeNode = (nodeId: string): TreeNode | null => {
    const node = flatTree.map[nodeId]
    if (!node) return null
    if (node.type === 'TEXT_NODE') {
      const textNode = node as TextDomNode
      return {
        type: 'text',
        text: textNode.text,
        isVisible: textNode.isVisible,
        parent: null,
        children: [],
      }
    }
    const elementNode = node as ElementDomNode
    const children: TreeNode[] = []
    if (elementNode.children) {
      for (const childId of elementNode.children) {
        const child = buildTreeNode(childId)
        if (child) children.push(child)
      }
    }
    return {
      type: 'element',
      tagName: elementNode.tagName,
      attributes: elementNode.attributes ?? {},
      isVisible: elementNode.isVisible ?? false,
      isInteractive: elementNode.isInteractive ?? false,
      isTopElement: elementNode.isTopElement ?? false,
      isNew: elementNode.isNew ?? false,
      highlightIndex: elementNode.highlightIndex,
      parent: null,
      children,
      extra: elementNode.extra ?? {},
    }
  }

  // Set parent references top-down.
  const setParentReferences = (node: TreeNode, parent: TreeNode | null = null) => {
    node.parent = parent
    for (const child of node.children) {
      setParentReferences(child, node)
    }
  }

  const rootNode = buildTreeNode(flatTree.rootId)
  if (!rootNode) return ''
  setParentReferences(rootNode)

  // True when any ancestor is an indexed element; such text is already part
  // of that element's line.
  const hasParentWithHighlightIndex = (node: TreeNode): boolean => {
    let current = node.parent
    while (current) {
      if (current.type === 'element' && current.highlightIndex !== undefined) {
        return true
      }
      current = current.parent
    }
    return false
  }

  // Render the `key=value` list for an indexed element.
  const formatAttributes = (node: TreeNode, text: string): string => {
    if (includeAttrs.length === 0 || !node.attributes) return ''
    // A Map keeps insertion order and is not affected by Object.prototype keys.
    const selected = new Map<string, string>()
    for (const key of includeAttrs) {
      const value = node.attributes[key]
      if (value && value.trim() !== '') {
        selected.set(key, value.trim())
      }
    }
    // Drop attributes whose value (longer than 5 chars) repeats an earlier one.
    const seenValues = new Set<string>()
    for (const [key, value] of Array.from(selected)) {
      if (value.length <= 5) continue
      if (seenValues.has(value)) {
        selected.delete(key)
      } else {
        seenValues.add(value)
      }
    }
    // Remove role if it matches tagName
    if (selected.get('role') === node.tagName) {
      selected.delete('role')
    }
    // Remove attributes that merely repeat the element's text
    const normalizedText = text.toLowerCase().trim()
    for (const attr of ['aria-label', 'placeholder', 'title']) {
      const value = selected.get(attr)
      if (value && value.toLowerCase().trim() === normalizedText) {
        selected.delete(attr)
      }
    }
    return Array.from(selected, ([key, value]) => `${key}=${capTextLength(value, 20)}`).join(' ')
  }

  // Render the ` data-scrollable="..."` suffix, or '' for non-scrollable nodes.
  // @edit scrollable data
  const formatScrollData = (node: TreeNode): string => {
    if (!node.extra || !node.extra.scrollable) return ''
    const scrollData = node.extra.scrollData
    let scrollDataText = ''
    if (scrollData?.left) scrollDataText += `left=${scrollData.left}, `
    if (scrollData?.top) scrollDataText += `top=${scrollData.top}, `
    if (scrollData?.right) scrollDataText += `right=${scrollData.right}, `
    if (scrollData?.bottom) scrollDataText += `bottom=${scrollData.bottom}`
    return ` data-scrollable="${scrollDataText}"`
  }

  // Depth-first walk emitting one line per indexed element / free text node.
  const processNode = (node: TreeNode, depth: number, result: string[]): void => {
    const depthStr = '\t'.repeat(depth)
    if (node.type === 'element') {
      let nextDepth = depth
      if (node.highlightIndex !== undefined) {
        // Only indexed elements add a level of indentation.
        nextDepth += 1
        const text = getAllTextTillNextClickableElement(node)
        const attributesHtmlStr = formatAttributes(node, text)
        const highlightIndicator = node.isNew
          ? `*[${node.highlightIndex}]`
          : `[${node.highlightIndex}]`
        let line = `${depthStr}${highlightIndicator}<${node.tagName ?? ''}`
        if (attributesHtmlStr) {
          line += ` ${attributesHtmlStr}`
        }
        line += formatScrollData(node)
        // Without attributes a space separates the tag name from what follows.
        if (!attributesHtmlStr) {
          line += ' '
        }
        if (text) {
          line += `>${text.trim()}`
        }
        line += ' />'
        result.push(line)
      }
      // Process children regardless
      for (const child of node.children) {
        processNode(child, nextDepth, result)
      }
      return
    }
    if (node.type === 'text') {
      // Text under an indexed element is already included in that element's line.
      if (hasParentWithHighlightIndex(node)) return
      const parent = node.parent
      if (parent && parent.type === 'element' && parent.isVisible && parent.isTopElement) {
        result.push(`${depthStr}${node.text ?? ''}`)
      }
    }
  }

  const result: string[] = []
  processNode(rootNode, 0, result)
  return result.join('\n')
}
/**
 * Collect the text below `node`, depth-first, stopping at any other element
 * that has its own highlight index.
 * @param node Element to start from (its own highlight index does not stop the walk).
 * @param maxDepth Maximum depth to descend; -1 means unlimited.
 * @returns The collected text parts joined with newlines and trimmed.
 */
export const getAllTextTillNextClickableElement = (node: TreeNode, maxDepth = -1): string => {
  const pieces: string[] = []
  const unlimited = maxDepth === -1
  const visit = (current: TreeNode, depth: number): void => {
    if (!unlimited && depth > maxDepth) return
    if (current.type === 'text') {
      if (current.text) pieces.push(current.text)
      return
    }
    if (current.type !== 'element') return
    // A nested indexed element gets its own line, so its text is not repeated here.
    const isOtherIndexedElement = current !== node && current.highlightIndex !== undefined
    if (isOtherIndexedElement) return
    for (const child of current.children) {
      visit(child, depth + 1)
    }
  }
  visit(node, 0)
  return pieces.join('\n').trim()
}
/**
 * Index the interactive nodes of a flat tree by their highlight index.
 * Nodes that are not interactive or have no numeric index are skipped.
 */
export function getSelectorMap(flatTree: FlatDomTree): Map<number, InteractiveElementDomNode> {
  const byIndex = new Map<number, InteractiveElementDomNode>()
  for (const candidate of Object.values(flatTree.map)) {
    if (candidate.isInteractive && typeof candidate.highlightIndex === 'number') {
      byIndex.set(candidate.highlightIndex, candidate as InteractiveElementDomNode)
    }
  }
  return byIndex
}
// Matches one rendered element line: an optional `*` marker for new elements,
// the `[index]`, then the opening `<tag ...>`.
const ELEMENT_LINE_PATTERN = /^\*?\[(\d+)\]<[^>]+>([^<]*)/

/**
 * Map each highlight index to the (trimmed) line that describes it in the
 * simplified HTML.
 *
 * Lines for newly appeared elements are prefixed with `*` (`*[3]<a ...`);
 * they are included as well.
 * @param simplifiedHTML Text produced by `flatTreeToString`.
 * @returns Map from highlight index to the full trimmed line.
 */
export function getElementTextMap(simplifiedHTML: string) {
  const elementTextMap = new Map<number, string>()
  for (const rawLine of simplifiedHTML.split('\n')) {
    const line = rawLine.trim()
    if (line.length === 0) continue
    const match = ELEMENT_LINE_PATTERN.exec(line)
    if (match) {
      elementTextMap.set(parseInt(match[1], 10), line)
    }
  }
  return elementTextMap
}
/**
 * Run every registered highlight cleanup callback stored on
 * `window._highlightCleanupFunctions`, then reset that list to empty.
 * Entries that are not functions are ignored.
 */
export function cleanUpHighlights() {
  const host = window as any
  const pending = host._highlightCleanupFunctions || []
  for (const callback of pending) {
    if (typeof callback !== 'function') continue
    callback()
  }
  host._highlightCleanupFunctions = []
}
// Listen for any URL change and clear the highlights immediately.
// These listeners are registered as a side effect of importing this module.
window.addEventListener('popstate', () => {
// console.log('URL changed (popstate), highlights cleaned up.')
cleanUpHighlights()
})
window.addEventListener('hashchange', () => {
// console.log('URL changed (hashchange), highlights cleaned up.')
cleanUpHighlights()
})
window.addEventListener('beforeunload', () => {
// console.log('Page is unloading, highlights cleaned up.')
cleanUpHighlights()
})
// Use `window.navigation` events when available.
const navigation = (window as any).navigation
if (navigation && typeof navigation.addEventListener === 'function') {
navigation.addEventListener('navigate', () => {
// console.log('Navigation event detected, highlights cleaned up.')
cleanUpHighlights()
})
} else {
// Fallback timer: poll the URL every 500 ms and clean up when it changes.
let currentUrl = window.location.href
setInterval(() => {
if (window.location.href !== currentUrl) {
currentUrl = window.location.href
// console.log('URL changed (interval), highlights cleaned up.')
cleanUpHighlights()
}
}, 500)
}

33
src/entry.ts Normal file
View File

@@ -0,0 +1,33 @@
/**
* Auto-run entry for page-agent.js. Insert this script into your page to get page-agent functionality.
*/
import { PageAgent, type PageAgentConfig } from './PageAgent'
import { DEFAULT_MODEL_NAME } from './config/constants'
// Clean up existing instances to prevent multiple injections from bookmarklet
if (window.pageAgent) {
  window.pageAgent.dispose()
}
// Mount to global window object
window.PageAgent = PageAgent
// Export for ES module usage
export { PageAgent }
console.log('🚀 page-agent.js loaded!')
// Read `model` and `lang` from the query string of the <script src="..."> that
// loaded this file. `document.currentScript` is null for module scripts, and
// `src` is empty for inline scripts; `new URL('')` would throw, so both cases
// fall back to the default configuration.
const currentScript = document.currentScript as HTMLScriptElement | null
if (currentScript?.src) {
  const url = new URL(currentScript.src)
  const modelName = url.searchParams.get('model') || DEFAULT_MODEL_NAME
  const language = (url.searchParams.get('lang') as 'zh-CN' | 'en-US') || 'zh-CN'
  const config = { modelName, language } as PageAgentConfig
  window.pageAgent = new PageAgent(config)
} else {
  window.pageAgent = new PageAgent()
}
console.log('🚀 page-agent.js initialized with config:', window.pageAgent.config)
window.pageAgent.bus.emit('panel:show') // Show panel

44
src/i18n/index.ts Normal file
View File

@@ -0,0 +1,44 @@
import { type SupportedLanguage, locales } from './locales'
import type { TranslationKey, TranslationParams, TranslationSchema } from './types'
/**
 * Translation lookup for one language.
 *
 * Unsupported languages fall back to `en-US`, both for the reported language
 * and for the translation table that is actually used.
 */
export class I18n {
  private language: SupportedLanguage
  private translations: TranslationSchema

  /**
   * @param language Requested language; anything that is not an own key of
   * `locales` falls back to `en-US`.
   */
  constructor(language: SupportedLanguage = 'en-US') {
    // Own-property check: `in` would also accept inherited keys such as "toString".
    this.language = Object.prototype.hasOwnProperty.call(locales, language) ? language : 'en-US'
    // Index with the validated language, not the raw argument, so the
    // fallback really has a translation table.
    this.translations = locales[this.language]
  }

  /**
   * Type-safe translation lookup.
   * @param key Dot-separated path into the translation schema.
   * @param params Values for `{{name}}` placeholders in the template.
   * @returns The translated (and interpolated) text, or `key` itself when no
   * non-empty translation exists.
   */
  t(key: TranslationKey, params?: TranslationParams): string {
    const value = this.getNestedValue(this.translations, key)
    if (!value) {
      console.warn(`Translation key "${key}" not found for language "${this.language}"`)
      return key
    }
    if (params) {
      return this.interpolate(value, params)
    }
    return value
  }

  /** Walk `path` through `obj`; only string leaves count as a hit. */
  private getNestedValue(obj: any, path: string): string | undefined {
    const found = path.split('.').reduce((current, key) => current?.[key], obj)
    return typeof found === 'string' ? found : undefined
  }

  /**
   * Replace `{{name}}` placeholders with `params[name]`.
   * A placeholder is left untouched only when the parameter is missing;
   * an empty string is a valid value and renders as nothing.
   */
  private interpolate(template: string, params: TranslationParams): string {
    return template.replace(/\{\{(\w+)\}\}/g, (match, key) => {
      return params[key]?.toString() ?? match
    })
  }

  /** The language actually in use (after fallback). */
  getLanguage(): SupportedLanguage {
    return this.language
  }
}
// Re-export the translation types and the locale table
export type { TranslationKey, SupportedLanguage, TranslationParams }
export { locales }

98
src/i18n/locales.ts Normal file
View File

@@ -0,0 +1,98 @@
import type { TranslationSchema } from './types'
// Chinese translations (the reference locale)
const zhCN: TranslationSchema = {
ui: {
panel: {
ready: '准备就绪',
thinking: '正在思考...',
paused: '暂停中,稍后',
taskInput: '输入新任务,详细描述步骤,回车提交',
userAnswerPrompt: '请回答上面问题,回车提交',
taskTerminated: '任务已终止',
taskCompleted: '任务结束',
continueExecution: '继续执行',
userAnswer: '用户回答: {{input}}',
pause: '暂停',
continue: '继续',
stop: '终止',
expand: '展开历史',
collapse: '收起历史',
step: '步骤 {{number}} · {{time}}{{duration}}',
},
tools: {
clicking: '正在点击元素 [{{index}}]...',
inputting: '正在输入文本到元素 [{{index}}]...',
selecting: '正在选择选项 "{{text}}"...',
scrolling: '正在滚动页面...',
waiting: '等待 {{seconds}} 秒...',
done: '结束任务',
clicked: '🖱️ 已点击元素 [{{index}}]',
inputted: '⌨️ 已输入文本 "{{text}}"',
selected: '☑️ 已选择选项 "{{text}}"',
scrolled: '🛞 页面滚动完成',
waited: '⌛️ 等待完成',
executing: '正在执行 {{toolName}}...',
},
errors: {
elementNotFound: '未找到索引为 {{index}} 的交互元素',
taskRequired: '任务描述不能为空',
executionFailed: '任务执行失败',
notInputElement: '元素不是输入框或文本域',
notSelectElement: '元素不是选择框',
optionNotFound: '未找到选项 "{{text}}"',
},
},
} as const
/**
 * English (en-US) translations. Must follow exactly the same structure as the
 * other locale tables (enforced by the `TranslationSchema` annotation).
 * `{{name}}` tokens are placeholders filled in at lookup time from the params passed to `t()`.
 */
const enUS: TranslationSchema = {
  ui: {
    // Status texts and button labels of the panel
    panel: {
      ready: 'Ready',
      thinking: 'Thinking...',
      paused: 'Paused',
      taskInput: 'Enter new task, describe steps in detail, press Enter to submit',
      userAnswerPrompt: 'Please answer the question above, press Enter to submit',
      taskTerminated: 'Task terminated',
      taskCompleted: 'Task completed',
      continueExecution: 'Continue execution',
      userAnswer: 'User answer: {{input}}',
      pause: 'Pause',
      continue: 'Continue',
      stop: 'Stop',
      expand: 'Expand history',
      collapse: 'Collapse history',
      step: 'Step {{number}} · {{time}}{{duration}}',
    },
    // Per-tool progress ("...ing") and completion ("...ed") messages
    tools: {
      clicking: 'Clicking element [{{index}}]...',
      inputting: 'Inputting text to element [{{index}}]...',
      selecting: 'Selecting option "{{text}}"...',
      scrolling: 'Scrolling page...',
      waiting: 'Waiting {{seconds}} seconds...',
      done: 'Task done',
      clicked: '🖱️ Clicked element [{{index}}]',
      inputted: '⌨️ Inputted text "{{text}}"',
      selected: '☑️ Selected option "{{text}}"',
      scrolled: '🛞 Page scrolled',
      waited: '⌛️ Wait completed',
      // Was a copy of the Chinese string, which leaked into the English UI.
      executing: 'Executing {{toolName}}...',
    },
    // Error messages
    errors: {
      elementNotFound: 'No interactive element found at index {{index}}',
      taskRequired: 'Task description is required',
      executionFailed: 'Task execution failed',
      notInputElement: 'Element is not an input or textarea',
      notSelectElement: 'Element is not a select element',
      optionNotFound: 'Option "{{text}}" not found',
    },
  },
} as const
/** All available translation tables, keyed by language tag. */
export const locales = {
'zh-CN': zhCN,
'en-US': enUS,
} as const
/** Union of the language tags that have a table in `locales`. */
export type SupportedLanguage = keyof typeof locales

57
src/i18n/types.ts Normal file
View File

@@ -0,0 +1,57 @@
/**
 * Shape of a translation table.
 * Every locale object is annotated with this interface, so all locales must provide the same keys.
 */
export interface TranslationSchema {
ui: {
/** Status texts and button labels of the panel. */
panel: {
ready: string
thinking: string
paused: string
taskInput: string
userAnswerPrompt: string
taskTerminated: string
taskCompleted: string
continueExecution: string
userAnswer: string
pause: string
continue: string
stop: string
expand: string
collapse: string
step: string
}
/** Per-tool progress and completion messages. */
tools: {
clicking: string
inputting: string
selecting: string
scrolling: string
waiting: string
done: string
clicked: string
inputted: string
selected: string
scrolled: string
waited: string
executing: string
}
/** Error messages. */
errors: {
elementNotFound: string
taskRequired: string
executionFailed: string
notInputElement: string
notSelectElement: string
optionNotFound: string
}
}
}
/**
 * Utility type: union of every dot-separated key path of a nested object type.
 *
 * For a key whose value is itself an object, the union contains the key on its own
 * (e.g. `ui`) as well as every deeper path prefixed with it (e.g. `ui.panel.ready`),
 * so intermediate paths are included alongside leaf paths.
 */
type NestedKeyOf<ObjectType extends object> = {
[Key in keyof ObjectType & (string | number)]: ObjectType[Key] extends object
? `${Key}` | `${Key}.${NestedKeyOf<ObjectType[Key]>}`
: `${Key}`
}[keyof ObjectType & (string | number)]
/** Every key path that can be derived from the translation schema. */
export type TranslationKey = NestedKeyOf<TranslationSchema>
/** Values substituted into `{{name}}` placeholders of a parameterised translation. */
export type TranslationParams = Record<string, string | number>

243
src/llms/index.ts Normal file
View File

@@ -0,0 +1,243 @@
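/**
 * @topic Isolating the LLM from the main loop
 * @reasoning
 * Separating the LLM call from the main loop is hard,
 * because an agent's tool calls are usually integrated into the LLM module: the LLM response arrives first and the tool call is handled afterwards.
 * The tool logic and the LLM call logic are therefore unavoidably coupled, and tool execution is in turn coupled with the main loop,
 * while the logic that maintains and updates the history has to be embedded in the multi-turn tool calls.
 * @reasoning
 * - Give up the framework's automatic multi-turn calling; every turn is initiated by the main loop.
 * - Ideally the LLM call would return structured output and a separate module would trigger the tool call; neither the models nor the frameworks can do this today.
 * - For now the LLM API and the local tool call have to be coupled together, without caring how the two are bridged.
 * @conclusion
 * - @llm responsibility boundary:
 * - call llm api with given messages and tools
 * - invoke tool call and get the result of the tool
 * - return the result to main loop
 * - @main_loop responsibility boundary:
 * - maintain all behaviors of an **agent**
 * @conclusion
 * - The llm module here is not an agent: it is only responsible for one round of LLM call plus tool call, and it is stateless.
 */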
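/**
 * @topic Structured output
 * @facts
 * - Almost all models support a tool call schema
 * - Almost all models support returning JSON
 * - Only OpenAI/Grok/Gemini accept a schema and guarantee the output format
 * - Mainstream models all support tool_choice: required
 * - except qwen, which must be given a specific function name (supported after its September release)
 * @conclusion
 * - Always use a tool call to return structured data; the model must not return it directly (treated as an error)
 * - Never assume tool arguments are valid: a repair mechanism is required, and the repair should also be returned through a tool call
 */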
import { OpenAIProvider, OpenAIResponsesProviderOptions, createOpenAI } from '@ai-sdk/openai'
import type { LanguageModelV2, LanguageModelV2ToolCall } from '@ai-sdk/provider'
import type { LanguageModelUsage, ModelMessage, TypedToolCall, TypedToolResult } from 'ai'
import { ToolSet, generateText, stepCountIs } from 'ai'
import chalk from 'chalk'
import {
DEFAULT_API_KEY,
DEFAULT_BASE_URL,
DEFAULT_MODEL_NAME,
LLM_MAX_RETRIES,
MACRO_TOOL_NAME,
} from '@/config/constants'
import { assert } from '@/utils/assert'
import { EventBus, getEventBus } from '@/utils/bus'
/**
 * Connection and retry options accepted by the `LLM` constructor.
 * Every field is optional; the constructor starts from the `DEFAULT_*` / `LLM_MAX_RETRIES`
 * constants and lets the fields provided here override them.
 */
export interface LLMConfig {
/** Base URL handed to `createOpenAI`. */
baseURL?: string
/** API key handed to `createOpenAI`. */
apiKey?: string
/** Model identifier passed to the provider's `chat()` factory. */
modelName?: string
/** Retry budget handed to `withRetry` for each `invoke` call. */
maxRetries?: number
}
/**
 * Wrapper around a single LLM round trip.
 *
 * One `invoke` performs one `generateText` call (limited to one step) in which the
 * model must call exactly one tool, and returns that tool call, its result and the
 * token usage. The whole round trip is retried by `withRetry`; retry and error
 * notifications are emitted on the event bus as `panel:update` events.
 */
export class LLM {
  /** Effective configuration with every field resolved to a concrete value. */
  config: Required<LLMConfig>
  /** Identifier used to look up the event bus and to build the prompt cache key. */
  id: string
  #openai: OpenAIProvider
  #model: LanguageModelV2
  #bus: EventBus

  /**
   * @param config - Overrides for the default base URL, API key, model name and retry budget.
   * @param id - Identifier passed to `getEventBus` and appended to the prompt cache key.
   */
  constructor(config: LLMConfig, id: string) {
    // Resolve each known field with `??` instead of spreading `config` over the
    // defaults: a key that is present but `undefined` would otherwise overwrite
    // its default and break the `Required<LLMConfig>` guarantee.
    this.config = {
      ...config,
      baseURL: config.baseURL ?? DEFAULT_BASE_URL,
      apiKey: config.apiKey ?? DEFAULT_API_KEY,
      modelName: config.modelName ?? DEFAULT_MODEL_NAME,
      maxRetries: config.maxRetries ?? LLM_MAX_RETRIES,
    }
    this.id = id
    this.#bus = getEventBus(id)
    this.#openai = createOpenAI({ baseURL: this.config.baseURL, apiKey: this.config.apiKey })
    this.#model = this.#openai.chat(this.config.modelName)
    // @note Will throw JSON parsing error
    // this.#model = this.#openai.responses(modelName)
  }

  /**
   * - call llm api *once*
   * - invoke tool call *once*
   * - return the result of the tool
   *
   * @param messages - Conversation sent to the model.
   * @param tools - Tool set offered to the model.
   * @param abortSignal - Forwarded to `generateText`; an `AbortError` is not retried.
   * @returns The single tool call, its result and the total token usage of the call.
   * @throws The tool error when the response contains a `tool-error` part, an assertion
   *         error when the model returned text or did not produce exactly one tool
   *         call/result, or the last error once the retry budget is exhausted.
   */
  async invoke<T extends ToolSet>(
    messages: ModelMessage[],
    tools: T,
    abortSignal: AbortSignal
  ): Promise<{
    toolCall: TypedToolCall<T>
    toolResult: TypedToolResult<T>
    usage: LanguageModelUsage
  }> {
    // Forcing a specific tool via `toolChoice` is incompatible with Claude (see below).
    // Only the first 8 characters of the model name are inspected.
    const isClaude = this.config.modelName.slice(0, 8).includes('claude')

    return await withRetry(
      async () => {
        const result = await generateText({
          model: this.#model,
          messages,
          tools,
          abortSignal,
          /**
           * Not documented; judging from the source code, @facts
           * - only an API_CALL_ERROR recognised as retryable is retried
           * - a response containing unparseable JSON is presumably not retried
           * - experimental_repairToolCall runs only once and does not count as a retry
           * @facts
           * - many proxied OpenAI-compatible endpoints return non-standard error payloads,
           *   which are usually not recognised as retryable
           * @conclusion
           * - not very useful in practice; coarse-grained, fully manual retry is preferable
           */
          // maxRetries: this.config.maxRetries,
          maxRetries: 0,
          // toolChoice: 'required',
          // @note incompatible to Claude
          toolChoice: isClaude ? undefined : { type: 'tool', toolName: MACRO_TOOL_NAME as any },
          /**
           * controlled by main loop. our method only call api once
           */
          // stopWhen: [hasToolCall('done'), stepCountIs(100)],
          stopWhen: [stepCountIs(1)],
          // stopWhen: [hasToolCall('AgentOutput')],
          providerOptions: {
            openai: {
              // @note this one needs all fields in tool schema must be `required`
              // strictJsonSchema: true,
              // This way only at most one tool can be called at a time
              parallelToolCalls: false,
              reasoningEffort: 'minimal',
              // @note not working
              // serviceTier: 'priority',
              textVerbosity: 'low',
              // @note Optimize OpenAI model caching, should be unique per user, currently has no effect
              promptCacheKey: 'page-agent:' + this.id,
            } as OpenAIResponsesProviderOptions,
          },
          /**
           * Runs once when the schema is violated; unclear whether it counts as a retry.
           * Currently it looks like the error is thrown directly and handled by withRetry.
           * @note
           * If it is not provided, ai-sdk appends the tool-error to the messages and calls
           * the model once more; combined with stepCountIs or hasToolCall this silences the
           * error and toolResults is always empty.
           * Unfortunately there is no way to throw from here (throwing falls back to the
           * default logic), so whenever the repair fails the result is a silent error.
           * Worse, as soon as tools are passed the call is treated as multi-step, no matter
           * how stopWhen is configured, which fundamentally conflicts with our single-step logic.
           * In the long run ai-sdk has to be removed and this implemented directly on the OpenAI API.
           */
          // experimental_repairToolCall: (options): Promise<LanguageModelV2ToolCall | null> => {
          // 	console.error('hahhah', options)
          // 	throw options.error
          // },
        })

        console.log(chalk.blue.bold('LLM:invoke finished'), result)

        // Surface tool errors as exceptions so that withRetry can handle them.
        const toolError: any = result.content.find((part) => part.type === 'tool-error')
        if (toolError) throw toolError.error

        assert(!result.text, 'Model returned text without calling done tool', true)
        assert(result.toolCalls.length === 1, 'Model must call exactly one tool', true)
        assert(result.toolResults.length === 1, 'Step must have exactly one tool result', true)

        return {
          toolCall: result.toolCalls[0],
          toolResult: result.toolResults[0],
          usage: result.totalUsage,
        }
        // @note ai-sdk internal errors (APICallError, NoSuchModelError, ...) could be
        // inspected here with a try/catch; currently useless since most of the ai-sdk
        // logic is bypassed.
      },
      // retry settings
      {
        maxRetries: this.config.maxRetries,
        onRetry: (retries: number) => {
          this.#bus.emit('panel:update', {
            type: 'retry',
            displayText: `retry-ing (${retries} / ${this.config.maxRetries})`,
          })
        },
        onError: (error: Error) => {
          this.#bus.emit('panel:update', {
            type: 'error',
            displayText: `step failed: ${error.message}`,
          })
        },
      }
    )
  }
}
/**
 * Run `fn`, retrying it after a failure until it succeeds or the retry budget is used up.
 *
 * `fn` is always attempted at least once: a negative or NaN `maxRetries` is treated
 * as 0 (previously `fn` was never called and `null` was thrown).
 * Between two attempts the sequence is: `onError` -> 100 ms pause -> `onRetry` -> 100 ms pause.
 * After the final failed attempt the error is rethrown immediately, without a pause.
 *
 * @param fn - Operation to run; each attempt calls it anew.
 * @param settings.maxRetries - Number of retries allowed in addition to the first attempt.
 * @param settings.onRetry - Called before each retry with the 1-based retry number.
 * @param settings.onError - Called after each failure; the second argument tells whether
 *        retry budget is left (for an `AbortError` no retry happens even when it is `true`).
 * @returns The value of the first successful attempt.
 * @throws The error itself when its `name` is `AbortError` (never retried), otherwise
 *         the error of the last attempt.
 */
async function withRetry<T>(
  fn: () => Promise<T>,
  settings: {
    maxRetries: number
    onRetry: (retries: number) => void
    onError: (error: Error, withRetry: boolean) => void
  }
): Promise<T> {
  const pause = () => new Promise<void>((resolve) => setTimeout(resolve, 100))
  // `>= 0` is false for NaN, undefined and negatives, so all of them collapse to "no retries".
  const maxRetries = settings.maxRetries >= 0 ? settings.maxRetries : 0
  let lastError: unknown

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    if (attempt > 0) {
      settings.onRetry(attempt)
      await pause()
    }

    try {
      return await fn()
    } catch (error: unknown) {
      console.error(error)
      const willRetry = attempt < maxRetries
      settings.onError(error as Error, willRetry)

      // do not retry if aborted by user
      if ((error as { name?: unknown } | null | undefined)?.name === 'AbortError') throw error

      lastError = error
      // Pausing is only useful when another attempt follows.
      if (willRetry) await pause()
    }
  }

  throw lastError
}

20
src/patches/antd.ts Normal file
View File

@@ -0,0 +1,20 @@
import type { PageAgent } from '@/PageAgent'
// Cleanup callbacks; patchAntd runs and then empties this list on every `afterUpdate` event.
// NOTE(review): nothing in this file pushes to it yet.
const clearFunctions = [] as (() => void)[]
/**
 * antd's Select is a div wrapping an input. All the information lives on the
 * input tag, but the input is invisible and does not show up in the cleaned
 * tree, so the intent is to lift it up here.
 *
 * Currently a stub: the combobox inputs are collected but not processed yet.
 */
function fixAntdSelect() {
  const comboboxInputs = Array.from(document.querySelectorAll('input[role="combobox"]'))
  // for (const input of comboboxInputs) {}
}
/**
 * Hook the antd fixes into the agent's update cycle: `fixAntdSelect` runs on
 * `beforeUpdate`, and every queued cleanup callback runs (and is then dropped)
 * on `afterUpdate`.
 */
export function patchAntd(pageAgent: PageAgent) {
  const runAndResetCleanups = () => {
    for (const cleanup of clearFunctions) {
      cleanup()
    }
    clearFunctions.length = 0
  }

  pageAgent.addEventListener('beforeUpdate', fixAntdSelect)
  pageAgent.addEventListener('afterUpdate', runAndResetCleanups)
}

16
src/patches/react.ts Normal file
View File

@@ -0,0 +1,16 @@
import type { PageAgent } from '@/PageAgent'
/**
 * Mark common React root elements with `data-page-agent-not-interactive="true"`.
 * The `pageAgent` argument is not used by this function.
 */
export function patchReact(pageAgent: PageAgent) {
  document
    .querySelectorAll(
      '[data-reactroot], [data-reactid], [data-react-checksum], #root, #app, [id^="root-"], [id^="app-"], #adex-wrapper, #adex-root'
    )
    .forEach((rootCandidate) => {
      rootCandidate.setAttribute('data-page-agent-not-interactive', 'true')
    })
}
/**
* @todo (Heavy, might have false negatives) Interaction detection, if element width/height equals body offsetWidth/Height, consider it root element and non-interactive (React often attaches many events to root elements, causing false positives)
*/

View File

@@ -0,0 +1,156 @@
You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in <user_request>.
<intro>
You excel at following tasks:
1. Navigating complex websites and extracting precise information
2. Automating form submissions and interactive web actions
3. Gathering and saving information
4. Operate effectively in an agent loop
5. Efficiently performing diverse web tasks
</intro>
<language_settings>
- Default working language: **中文**
- Use the language that user is using. Return in user's language.
</language_settings>
<input>
At every step, your input will consist of:
1. <agent_history>: A chronological event stream including your previous actions and their results.
2. <agent_state>: Current <user_request> and <step_info>.
3. <browser_state>: Current URL, interactive elements indexed for actions, and visible page content.
</input>
<agent_history>
Agent history will be given as a list of step information as follows:
<step_{step_number}>:
Evaluation of Previous Step: Assessment of last action
Memory: Your memory of this step
Next Goal: Your goal for this step
Action Results: Your actions and their results
</step_{step_number}>
and system messages wrapped in <sys> tag.
</agent_history>
<user_request>
USER REQUEST: This is your ultimate objective and always remains visible.
- This has the highest priority. Make the user happy.
- If the user request is very specific, carefully follow each step and don't skip or hallucinate steps.
- If the task is open ended you can plan yourself how to get it done.
</user_request>
<browser_state>
1. Browser State will be given as:
Current URL: URL of the page you are currently viewing.
Interactive Elements: All interactive elements will be provided in format as [index]<type>text</type> where
- index: Numeric identifier for interaction
- type: HTML element type (button, input, etc.)
- text: Element description
Examples:
[33]<div>User form</div>
\t*[35]<button aria-label='Submit form'>Submit</button>
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements tagged with `*[` are the new clickable elements that appeared on the website since the last step - if url has not changed.
- Pure text elements without [] are not interactive.
</browser_state>
<browser_rules>
Strictly follow these rules while using the browser and navigating the web:
- Only interact with elements that have a numeric [index] assigned.
- Only use indexes that are explicitly provided.
- If the page changes after, for example, an input text action, analyze if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed. Use scrolling actions if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page.
- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages).
- All scrollable elements are marked with the `data-scrollable` attribute, which includes the scrollable distance in every direction. You can scroll *the element* itself when part of its content overflows.
- If a captcha appears, tell the user that you cannot solve captchas, then finish the task and ask the user to solve it.
- If expected elements are missing, try scrolling, or navigating back.
- If the page is not fully loaded, use the `wait` action.
- Do not repeat one action for more than 3 times unless some conditions changed.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the <user_request> includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient.
- The <user_request> is the ultimate goal. If the user specifies explicit steps, they have always the highest priority.
- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion.
- Don't login into a page if you don't have to. Don't login if you don't have the credentials.
- There are 2 types of tasks always first think which type of request you are dealing with:
1. Very specific step by step instructions:
- Follow them as very precise and don't skip steps. Try to complete everything as requested.
2. Open ended tasks. Plan yourself, be creative in achieving them.
- If you get stuck e.g. with logins or captcha in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search.
</browser_rules>
<capability>
- You can only handle single page app. Do not jump out of current page.
- Do not click on a link if it will open in a new page (e.g. <a target="_blank">).
- It is ok to fail the task.
- User can be wrong. If the request of user is not achievable, inappropriate or you do not have enough information or tools to achieve it. Tell user to make a better request.
- Webpage can be broken. All webpages or apps have bugs. Some bug will make it hard for your job. It's encouraged to tell user the problem of current page. Your feedbacks (including failing) are valuable for user.
- Trying too hard can be harmful. Repeating an action back and forth, or pushing through a complex procedure with little knowledge, can cause unwanted results and harmful side effects. The user would rather have you end the task with a failure.
- If you are not clear about the request or steps. `ask_user` to clarify it.
- If you do not have knowledge for the current webpage or task. You must require user to give specific instructions and detailed steps.
</capability>
<task_completion_rules>
You must call the `done` action in any of the following cases:
- When you have fully completed the USER REQUEST.
- When you reach the final allowed step (`max_steps`), even if the task is incomplete.
- When you feel stuck or unable to solve user request. Or user request is not clear or contains inappropriate content.
- If it is ABSOLUTELY IMPOSSIBLE to continue.
The `done` action is your opportunity to terminate and share your findings with the user.
- Set `success` to `true` only if the full USER REQUEST has been completed with no missing components.
- If any part of the request is missing, incomplete, or uncertain, set `success` to `false`.
- You can use the `text` field of the `done` action to communicate your findings and to provide a coherent reply to the user and fulfill the USER REQUEST.
- You are ONLY ALLOWED to call `done` as a single action. Don't call it together with other actions.
- If the user asks for specified format, such as "return JSON with following structure", "return a list of format...", MAKE sure to use the right format in your answer.
- If the user asks for a structured output, your `done` action's schema may be modified. Take this schema into account when solving the task!
</task_completion_rules>
<reasoning_rules>
Exhibit the following reasoning patterns to successfully achieve the <user_request>:
- Reason about <agent_history> to track progress and context toward <user_request>.
- Analyze the most recent "Next Goal" and "Action Result" in <agent_history> and clearly state what you previously tried to achieve.
- Analyze all relevant items in <agent_history> and <browser_state> to understand your state.
- Explicitly judge success/failure/uncertainty of the last action. Never assume an action succeeded just because it appears to be executed in your last step in <agent_history>. If the expected change is missing, mark the last action as failed (or uncertain) and plan a recovery.
- Analyze whether you are stuck, e.g. when you repeat the same actions multiple times without any progress. Then consider alternative approaches e.g. scrolling for more context or ask user for help.
- `ask_user` for help if you have any difficulty. Users want to be kept in the loop.
- If you see information relevant to <user_request>, plan saving the information to memory.
- Always reason about the <user_request>. Make sure to carefully analyze the specific steps and information required, e.g. specific filters, specific form fields, specific information to search. Make sure to always compare the current trajectory with the user request and think carefully about whether that's how the user requested it.
</reasoning_rules>
<examples>
Here are examples of good output patterns. Use them as reference but never copy them directly.
<evaluation_examples>
- Positive Examples:
"evaluation_previous_goal": "Successfully navigated to the product page and found the target information. Verdict: Success"
"evaluation_previous_goal": "Clicked the login button and user authentication form appeared. Verdict: Success"
</evaluation_examples>
<memory_examples>
"memory": "Found many pending reports that need to be analyzed in the main page. Successfully processed the first 2 reports on quarterly sales data and moving on to inventory analysis and customer feedback reports."
</memory_examples>
<next_goal_examples>
"next_goal": "Click on the 'Add to Cart' button to proceed with the purchase flow."
"next_goal": "Extract details from the first item on the page."
</next_goal_examples>
</examples>
<output>
You must ALWAYS respond with a valid JSON in this exact format:
{
"evaluation_previous_goal": "Concise one-sentence analysis of your last action. Clearly state success, failure, or uncertain.",
"memory": "1-3 concise sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.",
"next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.",
"action":{"one_action_name": {// action-specific parameter}}
}
</output>

430
src/tools/actions.ts Normal file
View File

@@ -0,0 +1,430 @@
/**
* Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved.
*/
import type { PageAgent } from '../PageAgent'
// ======= general utils =======
/**
 * Resolve after the given number of seconds.
 * @param seconds - Delay in seconds (fractions allowed).
 */
export async function waitFor(seconds: number): Promise<void> {
  const delayMs = seconds * 1000
  await new Promise((done) => {
    setTimeout(done, delayMs)
  })
}
// URL last reported by getSystemInfo (initially the URL at module load time)
let currentUrl = window.location.href

/**
 * Build a `<sys>` message when the page URL differs from the one last recorded.
 * @returns An empty string when the URL is unchanged, otherwise the message
 *          announcing the new URL (which is recorded as the current one).
 */
export async function getSystemInfo() {
  const urlChanged = currentUrl !== window.location.href
  // Nothing to report while the recorded URL is still up to date
  if (!urlChanged) return ''

  // Give the page a little more time to load before reading the URL again
  await waitFor(0.3)
  currentUrl = window.location.href
  return `\n<sys> Current URL changed to: ${currentUrl} </sys>`
}
// ======= dom utils =======
/**
 * Announce the centre of `element`'s bounding box through a
 * `PageAgent::MovePointerTo` window event, then wait 0.3 s.
 */
export async function movePointerToElement(element: HTMLElement) {
  const { left, top, width, height } = element.getBoundingClientRect()
  const center = { x: left + width / 2, y: top + height / 2 }

  window.dispatchEvent(new CustomEvent('PageAgent::MovePointerTo', { detail: center }))
  await waitFor(0.3)
}
/**
 * Resolve an index of PageAgent's `selectorMap` to its DOM element.
 *
 * @param pageAgent - Agent whose `selectorMap` is consulted.
 * @param index - Index of the interactive element.
 * @returns The element referenced by the map entry.
 * @throws Error when the index is unknown, the entry has no `ref`, or the
 *         `ref` is not an `HTMLElement`.
 */
export function getElementByIndex(pageAgent: PageAgent, index: number): HTMLElement {
  const node = pageAgent.selectorMap.get(index)
  if (!node) throw new Error(`No interactive element found at index ${index}`)

  const target = node.ref
  if (!target) throw new Error(`Element at index ${index} does not have a reference`)

  if (target instanceof HTMLElement) return target
  throw new Error(`Element at index ${index} is not an HTMLElement`)
}
// Element most recently handled by clickElement, or null when none is pending
let lastClickedElement: HTMLElement | null = null

/**
 * Blur the last clicked element, send it a bubbling `mouseout` event and
 * forget it. Does nothing when no element is recorded.
 */
function blurLastClickedElement() {
  if (!lastClickedElement) return

  lastClickedElement.blur()
  const leaveEvent = new MouseEvent('mouseout', { bubbles: true, cancelable: true })
  lastClickedElement.dispatchEvent(leaveEvent)
  lastClickedElement = null
}
/**
 * Simulate a click on `element`.
 *
 * Releases the previously clicked element, scrolls the target into view, moves
 * the simulated pointer to it, then dispatches the event sequence
 * mouseenter -> mouseover -> mousedown -> (focus) -> mouseup -> click.
 */
export async function clickElement(element: HTMLElement) {
  const fireMouseEvent = (type: string) =>
    element.dispatchEvent(new MouseEvent(type, { bubbles: true, cancelable: true }))

  blurLastClickedElement()
  lastClickedElement = element

  await scrollIntoViewIfNeeded(element)
  await movePointerToElement(element)
  window.dispatchEvent(new CustomEvent('PageAgent::ClickPointer'))
  await waitFor(0.1)

  // hover, then press
  for (const type of ['mouseenter', 'mouseover', 'mousedown']) fireMouseEvent(type)

  // focus between press and release so the element receives the click
  element.focus()

  for (const type of ['mouseup', 'click']) fireMouseEvent(type)
  // element.click() is intentionally not used

  await waitFor(0.1) // let the click handlers finish
}
// Setter of the `value` property as defined on HTMLInputElement.prototype.
// inputTextElement invokes it with `.call(element, text)` instead of assigning `element.value`.
// NOTE(review): presumably this bypasses per-instance `value` overrides installed by UI frameworks — confirm.
// eslint-disable-next-line @typescript-eslint/unbound-method
const nativeInputValueSetter = Object.getOwnPropertyDescriptor(
window.HTMLInputElement.prototype,
'value'
)!.set!
// Same as above, for HTMLTextAreaElement.prototype.
// eslint-disable-next-line @typescript-eslint/unbound-method
const nativeTextAreaValueSetter = Object.getOwnPropertyDescriptor(
window.HTMLTextAreaElement.prototype,
'value'
)!.set!
/**
 * Dispatch a synthetic key stroke on `elem`: `keydown`, then (for inputs and
 * textareas only) `beforeinput` and `input`, then `keyup`, with a 10 ms pause
 * after every event except the last. Only `key` is set on the keyboard events.
 */
export async function createSyntheticInputEvent(elem: HTMLElement, key: string) {
  const keyboardInit = { bubbles: true, cancelable: true, key }

  elem.dispatchEvent(new KeyboardEvent('keydown', keyboardInit))
  await waitFor(0.01)

  const isTextField = elem instanceof HTMLInputElement || elem instanceof HTMLTextAreaElement
  if (isTextField) {
    for (const type of ['beforeinput', 'input']) {
      elem.dispatchEvent(new Event(type, { bubbles: true }))
      await waitFor(0.01)
    }
  }

  elem.dispatchEvent(new KeyboardEvent('keyup', keyboardInit))
}
/**
 * Replace the value of an input or textarea with `text`.
 *
 * Clicks the element first, writes the value through the prototype's native
 * `value` setter, dispatches a bubbling `input` event, and finally blurs the element.
 *
 * @throws Error when `element` is neither an input nor a textarea.
 */
export async function inputTextElement(element: HTMLElement, text: string) {
  const isTextArea = element instanceof HTMLTextAreaElement
  if (!isTextArea && !(element instanceof HTMLInputElement)) {
    throw new Error('Element is not an input or textarea')
  }

  await clickElement(element)

  const setNativeValue = isTextArea ? nativeTextAreaValueSetter : nativeInputValueSetter
  setNativeValue.call(element, text)

  element.dispatchEvent(new Event('input', { bubbles: true }))
  await waitFor(0.1) // let the input handlers finish

  blurLastClickedElement()
}
/**
 * Select the option of a `<select>` whose trimmed text equals the trimmed
 * `optionText`, then dispatch a bubbling `change` event.
 *
 * @throws Error when the element is not a select or no option matches.
 * @todo browser-use version is very complex and supports menu tags, need to follow up
 */
export async function selectOptionElement(selectElement: HTMLSelectElement, optionText: string) {
  if (!(selectElement instanceof HTMLSelectElement)) {
    throw new Error('Element is not a select element')
  }

  let matched: HTMLOptionElement | undefined
  for (const candidate of Array.from(selectElement.options)) {
    if (candidate.textContent?.trim() === optionText.trim()) {
      matched = candidate
      break
    }
  }
  if (!matched) {
    throw new Error(`Option with text "${optionText}" not found in select element`)
  }

  selectElement.value = matched.value
  selectElement.dispatchEvent(new Event('change', { bubbles: true }))
  await waitFor(0.1) // let the change handlers finish
}
/**
 * Bring `element` into view, using the element's own `scrollIntoViewIfNeeded`
 * method when it exists and a centred `scrollIntoView` otherwise.
 */
// eslint-disable-next-line @typescript-eslint/require-await
export async function scrollIntoViewIfNeeded(element: HTMLElement) {
  const target = element as any

  if (!target.scrollIntoViewIfNeeded) {
    // @todo visibility check
    target.scrollIntoView({ behavior: 'auto', block: 'center', inline: 'nearest' })
    // await waitFor(0.5) // Animation playback
    return
  }

  target.scrollIntoViewIfNeeded()
  // await waitFor(0.5) // Animation playback
}
/**
 * Scroll vertically, either inside the nearest scrollable ancestor of `element`
 * or, when no element is given, on the page / its dominant scroll container.
 *
 * NOTE(review): `down` is never read in this function; the direction comes solely
 * from the sign of `scroll_amount`. Confirm that callers pass a signed amount.
 *
 * @param down - Unused here (see note above).
 * @param scroll_amount - Vertical distance in pixels; positive values increase the scroll offset.
 * @param element - Optional starting element for container scrolling.
 * @returns A human-readable description of what was scrolled (or that nothing could be).
 */
export async function scrollVertically(
down: boolean,
scroll_amount: number,
element?: HTMLElement | null
) {
// Element-specific scrolling if element is provided
if (element) {
const targetElement = element
console.log(
'[SCROLL DEBUG] Starting direct container scroll for element:',
targetElement.tagName
)
let currentElement = targetElement as HTMLElement | null
let scrollSuccess = false
let scrolledElement: HTMLElement | null = null
let scrollDelta = 0
let attempts = 0
const dy = scroll_amount
// Walk up from the element itself, checking at most 10 elements
while (currentElement && attempts < 10) {
const computedStyle = window.getComputedStyle(currentElement)
const hasScrollableY = /(auto|scroll|overlay)/.test(computedStyle.overflowY)
const canScrollVertically = currentElement.scrollHeight > currentElement.clientHeight
console.log(
'[SCROLL DEBUG] Checking element:',
currentElement.tagName,
'hasScrollableY:',
hasScrollableY,
'canScrollVertically:',
canScrollVertically,
'scrollHeight:',
currentElement.scrollHeight,
'clientHeight:',
currentElement.clientHeight
)
if (hasScrollableY && canScrollVertically) {
const beforeScroll = currentElement.scrollTop
const maxScroll = currentElement.scrollHeight - currentElement.clientHeight
// NOTE(review): only a third of the requested amount is applied in this branch; the reason is not evident here
let scrollAmount = dy / 3
// Clamp to the distance that is still available in the scroll direction
if (scrollAmount > 0) {
scrollAmount = Math.min(scrollAmount, maxScroll - beforeScroll)
} else {
scrollAmount = Math.max(scrollAmount, -beforeScroll)
}
currentElement.scrollTop = beforeScroll + scrollAmount
const afterScroll = currentElement.scrollTop
const actualScrollDelta = afterScroll - beforeScroll
console.log(
'[SCROLL DEBUG] Scroll attempt:',
currentElement.tagName,
'before:',
beforeScroll,
'after:',
afterScroll,
'delta:',
actualScrollDelta
)
// Count it as a success only when the offset changed by more than half a pixel
if (Math.abs(actualScrollDelta) > 0.5) {
scrollSuccess = true
scrolledElement = currentElement
scrollDelta = actualScrollDelta
console.log(
'[SCROLL DEBUG] Successfully scrolled container:',
currentElement.tagName,
'delta:',
actualScrollDelta
)
break
}
}
// Do not climb past the document root
if (currentElement === document.body || currentElement === document.documentElement) {
break
}
currentElement = currentElement.parentElement
attempts++
}
// With an element given, the function returns here and never falls through to page-level scrolling
if (scrollSuccess) {
return `Scrolled container (${scrolledElement?.tagName}) by ${scrollDelta}px`
} else {
return `No scrollable container found for element (${targetElement.tagName})`
}
}
// Page-level scrolling (default or fallback)
const dy = scroll_amount
// A candidate container must be at least half as tall as the viewport
const bigEnough = (el: HTMLElement) => el.clientHeight >= window.innerHeight * 0.5
const canScroll = (el: HTMLElement | null) =>
el &&
/(auto|scroll|overlay)/.test(getComputedStyle(el).overflowY) &&
el.scrollHeight > el.clientHeight &&
bigEnough(el)
// Start from the focused element and climb until a scrollable ancestor or <body> is reached
let el: HTMLElement | null = document.activeElement as HTMLElement | null
while (el && !canScroll(el) && el !== document.body) el = el.parentElement
// Otherwise fall back to the first scrollable element in the document, then to the document scroller
el = canScroll(el)
? el
: Array.from(document.querySelectorAll<HTMLElement>('*')).find(canScroll) ||
(document.scrollingElement as HTMLElement) ||
(document.documentElement as HTMLElement)
if (el === document.scrollingElement || el === document.documentElement || el === document.body) {
window.scrollBy(0, dy)
return `✅ Scrolled page by ${dy}px.`
} else {
el!.scrollBy({ top: dy, behavior: 'smooth' })
await waitFor(0.1) // Animation playback
return `✅ Scrolled container (${el!.tagName}) by ${dy}px.`
}
}
/**
 * Scroll horizontally, either inside the nearest scrollable ancestor of `element`
 * or, when no element is given, on the page / its dominant scroll container.
 *
 * @param right - `true` scrolls by `+scroll_amount`, `false` by `-scroll_amount`.
 * @param scroll_amount - Horizontal distance in pixels; its sign is flipped when `right` is false.
 * @param element - Optional starting element for container scrolling.
 * @returns A human-readable description of what was scrolled (or that nothing could be).
 */
export async function scrollHorizontally(
right: boolean,
scroll_amount: number,
element?: HTMLElement | null
) {
// Element-specific scrolling if element is provided
if (element) {
const targetElement = element
console.log(
'[SCROLL DEBUG] Starting direct container scroll for element:',
targetElement.tagName
)
let currentElement = targetElement as HTMLElement | null
let scrollSuccess = false
let scrolledElement: HTMLElement | null = null
let scrollDelta = 0
let attempts = 0
const dx = right ? scroll_amount : -scroll_amount
// Walk up from the element itself, checking at most 10 elements
while (currentElement && attempts < 10) {
const computedStyle = window.getComputedStyle(currentElement)
const hasScrollableX = /(auto|scroll|overlay)/.test(computedStyle.overflowX)
const canScrollHorizontally = currentElement.scrollWidth > currentElement.clientWidth
console.log(
'[SCROLL DEBUG] Checking element:',
currentElement.tagName,
'hasScrollableX:',
hasScrollableX,
'canScrollHorizontally:',
canScrollHorizontally,
'scrollWidth:',
currentElement.scrollWidth,
'clientWidth:',
currentElement.clientWidth
)
if (hasScrollableX && canScrollHorizontally) {
const beforeScroll = currentElement.scrollLeft
const maxScroll = currentElement.scrollWidth - currentElement.clientWidth
// NOTE(review): only a third of the requested amount is applied in this branch; the reason is not evident here
let scrollAmount = dx / 3
// Clamp to the distance that is still available in the scroll direction
if (scrollAmount > 0) {
scrollAmount = Math.min(scrollAmount, maxScroll - beforeScroll)
} else {
scrollAmount = Math.max(scrollAmount, -beforeScroll)
}
currentElement.scrollLeft = beforeScroll + scrollAmount
const afterScroll = currentElement.scrollLeft
const actualScrollDelta = afterScroll - beforeScroll
console.log(
'[SCROLL DEBUG] Scroll attempt:',
currentElement.tagName,
'before:',
beforeScroll,
'after:',
afterScroll,
'delta:',
actualScrollDelta
)
// Count it as a success only when the offset changed by more than half a pixel
if (Math.abs(actualScrollDelta) > 0.5) {
scrollSuccess = true
scrolledElement = currentElement
scrollDelta = actualScrollDelta
console.log(
'[SCROLL DEBUG] Successfully scrolled container:',
currentElement.tagName,
'delta:',
actualScrollDelta
)
break
}
}
// Do not climb past the document root
if (currentElement === document.body || currentElement === document.documentElement) {
break
}
currentElement = currentElement.parentElement
attempts++
}
// With an element given, the function returns here and never falls through to page-level scrolling
if (scrollSuccess) {
return `Scrolled container (${scrolledElement?.tagName}) horizontally by ${scrollDelta}px`
} else {
return `No horizontally scrollable container found for element (${targetElement.tagName})`
}
}
// Page-level scrolling (default or fallback)
const dx = right ? scroll_amount : -scroll_amount
// A candidate container must be at least half as wide as the viewport
const bigEnough = (el: HTMLElement) => el.clientWidth >= window.innerWidth * 0.5
const canScroll = (el: HTMLElement | null) =>
el &&
/(auto|scroll|overlay)/.test(getComputedStyle(el).overflowX) &&
el.scrollWidth > el.clientWidth &&
bigEnough(el)
// Start from the focused element and climb until a scrollable ancestor or <body> is reached
let el: HTMLElement | null = document.activeElement as HTMLElement | null
while (el && !canScroll(el) && el !== document.body) el = el.parentElement
// Otherwise fall back to the first scrollable element in the document, then to the document scroller
el = canScroll(el)
? el
: Array.from(document.querySelectorAll<HTMLElement>('*')).find(canScroll) ||
(document.scrollingElement as HTMLElement) ||
(document.documentElement as HTMLElement)
if (el === document.scrollingElement || el === document.documentElement || el === document.body) {
window.scrollBy(dx, 0)
return `✅ Scrolled page horizontally by ${dx}px`
} else {
el!.scrollBy({ left: dx, behavior: 'smooth' })
await waitFor(0.1) // Animation playback
return `✅ Scrolled container (${el!.tagName}) horizontally by ${dx}px`
}
}

208
src/tools/index.ts Normal file
View File

@@ -0,0 +1,208 @@
/**
* Internal tools for PageAgent.
* @note Adapted from browser-use
*/
import { Tool, tool } from 'ai'
import zod from 'zod'
import type { PageAgent } from '@/PageAgent'
import {
clickElement,
getElementByIndex,
getSystemInfo,
inputTextElement,
scrollHorizontally,
scrollVertically,
selectOptionElement,
waitFor,
} from './actions'
// Debug aid: expose every export of ./actions on the global `window.utils`.
import * as utils from './actions'
// @ts-expect-error debug only
window.utils = utils
/**
 * Registry of the internal tools for PageAgent, keyed by tool name.
 * Each `tools.set(...)` call in this module adds one entry.
 */
export const tools = new Map<string, Tool>()
// tools.set(
// 'get_current_html',
// tool({
// description: 'Get the current (updated) simplified HTML of the page',
// inputSchema: zod.object({}),
// execute: function (this: PageAgent) {
// this.updateTree()
// return this.simplifiedHTML
// },
// })
// )
// `done` tool: lets the model report the final summary and success flag.
tools.set(
  'done',
  tool({
    description:
      'Complete task - provide a summary of results for the user. Set success=True if task completed successfully, false otherwise. Text should be your response to the user summarizing results.',
    inputSchema: zod.object({
      text: zod.string(),
      success: zod.boolean().default(true),
    }),
    // Deliberately empty: per the note below, the main loop handles this tool.
    execute(this: PageAgent, input) {
      // @note main loop will handle this one
      // this.onDone(input.text, input.success)
    },
  })
)
// `wait` tool: pauses for the requested number of seconds (1-10, default 1).
tools.set(
  'wait',
  tool({
    description:
      'Wait for x seconds. default 1s (max 10 seconds, min 1 second). This can be used to wait until the page or data is fully loaded.',
    inputSchema: zod.object({
      seconds: zod.number().min(1).max(10).default(1),
    }),
    async execute(this: PageAgent, { seconds }) {
      // Time elapsed since `lastTimeUpdate` is deducted from the requested wait,
      // never going below zero.
      const lastUpdate = this.lastTimeUpdate
      const elapsedSeconds = (Date.now() - lastUpdate) / 1000
      const actualWaitTime = Math.max(0, seconds - elapsedSeconds)
      console.log(`actualWaitTime: ${actualWaitTime} seconds`)
      await waitFor(actualWaitTime)
      const systemInfo = await getSystemInfo()
      return `✅ Waited for ${seconds} seconds.` + systemInfo
    },
  })
)
// `ask_user` tool: forwards a question to the panel and returns the user's answer.
tools.set(
  'ask_user',
  tool({
    description:
      'Ask the user a question and wait for their answer. Use this if you need more information or clarification.',
    inputSchema: zod.object({
      question: zod.string(),
    }),
    async execute(this: PageAgent, { question }) {
      const answer = await this.panel.askUser(question)
      const systemInfo = await getSystemInfo()
      return `✅ Received user answer: ${answer}` + systemInfo
    },
  })
)
// `click_element_by_index` tool: clicks the interactive element with the given index.
tools.set(
  'click_element_by_index',
  tool({
    description: 'Click element by index',
    inputSchema: zod.object({
      index: zod.int().min(0),
    }),
    async execute(this: PageAgent, { index }) {
      const element = getElementByIndex(this, index)
      // Label used in the result message: the element's recorded text, else its index.
      const label = this.elementTextMap.get(index) || index
      await clickElement(element)
      // @workaround: Handle links that open in new tabs
      const opensNewTab = element instanceof HTMLAnchorElement && element.target === '_blank'
      if (opensNewTab) {
        return `⚠️ Clicked link that opens in a new tab (${label}). You are not capable of reading new tabs.`
      }
      const systemInfo = await getSystemInfo()
      return `✅ Clicked element (${label}).` + systemInfo
    },
  })
)
// `input_text` tool: types text into the interactive element with the given index.
tools.set(
  'input_text',
  tool({
    description: 'Click and input text into a input interactive element',
    inputSchema: zod.object({
      index: zod.int().min(0),
      text: zod.string(),
    }),
    async execute(this: PageAgent, { index, text }) {
      const element = getElementByIndex(this, index)
      // Label used in the result message: the element's recorded text, else its index.
      const label = this.elementTextMap.get(index) || index
      await inputTextElement(element, text)
      const systemInfo = await getSystemInfo()
      return `✅ Input text (${text}) into element (${label}).` + systemInfo
    },
  })
)
// `select_dropdown_option` tool: selects an option, identified by its text,
// in the interactive element with the given index.
tools.set(
  'select_dropdown_option',
  tool({
    description:
      'Select dropdown option for interactive element index by the text of the option you want to select',
    inputSchema: zod.object({
      index: zod.int().min(0),
      text: zod.string(),
    }),
    async execute(this: PageAgent, { index, text }) {
      const element = getElementByIndex(this, index)
      // Label used in the result message: the element's recorded text, else its index.
      const label = this.elementTextMap.get(index) || index
      await selectOptionElement(element as any, text)
      const systemInfo = await getSystemInfo()
      return `✅ Selected option (${text}) in element (${label}).` + systemInfo
    },
  })
)
/**
 * `scroll` tool: vertical scrolling of the page or of the element at `index`.
 * @note Reference from browser-use
 */
tools.set(
'scroll',
tool({
description:
'Scroll the page by specified number of pages (set down=True to scroll down, down=False to scroll up, num_pages=number of pages to scroll like 0.5 for half page, 1.0 for one page, etc.). Optional index parameter to scroll within a specific element or its scroll container (works well for dropdowns and custom UI components). Optional pixels parameter to scroll by a specific number of pixels instead of pages.',
inputSchema: zod.object({
down: zod.boolean().default(true),
num_pages: zod.number().min(0).max(10).optional().default(0.1),
pixels: zod.number().int().min(0).optional(),
index: zod.number().int().min(0).optional(),
}),
execute: async function (this: PageAgent, input) {
const { down, num_pages, index, pixels } = input
// A truthy `pixels` wins over `num_pages`; otherwise the amount is a fraction of the viewport height.
// NOTE(review): the `pixels` branch is always non-negative while the `num_pages` branch
// bakes the direction into the sign. Only one of these can match what scrollVertically
// expects (it also receives `down`) - confirm against its implementation.
const scroll_amount = pixels ? pixels : num_pages * (down ? 1 : -1) * window.innerHeight
// No index means no target element (null is passed).
const element = index !== undefined ? getElementByIndex(this, index) : null
return (await scrollVertically(down, scroll_amount, element)) + (await getSystemInfo())
},
})
)
// `scroll_horizontally` tool: horizontal scrolling of the page or of the element at `index`.
tools.set(
  'scroll_horizontally',
  tool({
    description:
      'Scroll the page or element horizontally (set right=True to scroll right, right=False to scroll left, pixels=number of pixels to scroll). Optional index parameter to scroll within a specific element or its scroll container (works well for wide tables).',
    inputSchema: zod.object({
      right: zod.boolean().default(true),
      pixels: zod.number().int().min(0),
      index: zod.number().int().min(0).optional(),
    }),
    execute: async function (this: PageAgent, input) {
      const { right, pixels, index } = input
      // Pass the unsigned magnitude: scrollHorizontally derives the direction from
      // `right` itself (dx = right ? scroll_amount : -scroll_amount). Signing the
      // amount here as well negated it twice, so right=false scrolled to the right.
      const element = index !== undefined ? getElementByIndex(this, index) : null
      return (await scrollHorizontally(right, pixels, element)) + (await getSystemInfo())
    },
  })
)
// @todo get_dropdown_options
// @todo select_dropdown_option
// @todo send_keys
// @todo upload_file
// @todo go_back
// @todo extract_structured_data

598
src/ui/Panel.module.css Normal file
View File

@@ -0,0 +1,598 @@
/* Root of the agent panel: fixed near the bottom centre of the viewport, initially transparent and shifted down 20px. */
.wrapper {
position: fixed;
bottom: 100px;
left: 50%;
transform: translateX(-50%) translateY(20px);
opacity: 0;
z-index: 2147483642; /* One layer above SimulatorMask */
box-sizing: border-box;
overflow: visible;
* {
box-sizing: border-box;
}
--width: 360px;
--height: 40px;
--border-radius: 12px;
--side-space: 12px; /* Spacing on both sides of the control bar */
--history-width: calc(var(--width) - var(--side-space) * 2);
--color-1: rgb(57, 182, 255);
--color-2: rgb(189, 69, 251);
--color-3: rgb(255, 87, 51);
--color-4: rgb(255, 214, 0);
width: var(--width);
height: var(--height);
transition: all 0.3s ease-in-out;
/* Responsive design */
/* NOTE(review): Panel.ts sets an inline `transform: translateX(-50%) ...` when showing/hiding the panel; */
/* an inline style takes precedence over the `transform: none` below - confirm narrow-screen positioning. */
@media (max-width: 480px) {
width: calc(100vw - 40px);
left: 20px;
transform: none;
}
/* Blurred two-colour gradient glow behind the panel; its two pseudo-elements run the same sweep 1s apart. */
.background {
position: absolute;
inset: -2px -8px;
border-radius: calc(var(--border-radius) + 4px);
filter: blur(16px);
overflow: hidden;
/* mix-blend-mode: lighten; */
/* display: none; */
&::before {
content: '';
z-index: -1;
pointer-events: none;
position: absolute;
width: 100%;
height: 100%;
/* left: -100%; */
left: 0;
top: 0;
background-image: linear-gradient(
to bottom left,
var(--color-1),
var(--color-2),
var(--color-1)
);
animation: mask-running 2s linear infinite;
}
&::after {
content: '';
z-index: -1;
pointer-events: none;
position: absolute;
width: 100%;
height: 100%;
left: 0;
top: 0;
background-image: linear-gradient(
to bottom left,
var(--color-2),
var(--color-1),
var(--color-2)
);
animation: mask-running 2s linear infinite;
animation-delay: 1s;
}
}
}
/* Horizontal sweep from fully left (-100%) to fully right (100%); used by the .background pseudo-elements. */
@keyframes mask-running {
from {
transform: translateX(-100%);
}
to {
transform: translateX(100%);
}
}
/* Control bar: status indicator and text on the left, control buttons on the right */
.header {
display: flex;
align-items: center;
justify-content: space-between;
padding: 8px 12px;
user-select: none;
position: absolute;
inset: 0;
cursor: pointer;
flex-shrink: 0; /* Prevent the header from being shrunk */
background: rgba(0, 0, 0, 0.5);
backdrop-filter: blur(10px);
border-radius: var(--border-radius);
background-clip: padding-box;
box-shadow:
0 0 0px 2px rgba(255, 255, 255, 0.4),
0 0 5px 1px rgba(255, 255, 255, 0.3);
.statusSection {
display: flex;
align-items: center;
gap: 8px;
flex: 1;
min-height: 24px; /* Ensure vertical centering */
.indicator {
width: 6px;
height: 6px;
border-radius: 50%;
background: rgba(255, 255, 255, 0.5);
flex-shrink: 0;
animation: none; /* No animation by default */
/* Running states - animated */
&.thinking {
background: rgb(57, 182, 255);
animation: pulse 0.8s ease-in-out infinite;
}
&.tool_executing {
background: rgb(189, 69, 251);
animation: pulse 0.6s ease-in-out infinite;
}
&.retry {
background: rgb(255, 214, 0);
animation: retryPulse 1s ease-in-out infinite;
}
/* Idle states - no animation */
&.completed,
&.input,
&.output {
background: rgb(34, 197, 94);
animation: none;
}
&.error {
background: rgb(239, 68, 68);
animation: none;
}
}
.statusText {
color: white;
font-size: 12px;
line-height: 1;
font-weight: 500;
transition: all 0.3s ease-in-out;
position: relative;
overflow: hidden;
display: flex;
align-items: center;
min-height: 24px; /* Ensure vertical centering */
&.fadeOut {
animation: statusTextFadeOut 0.3s ease forwards;
}
&.fadeIn {
animation: statusTextFadeIn 0.3s ease forwards;
}
}
}
.controls {
display: flex;
align-items: center;
gap: 4px;
.controlButton {
width: 24px;
height: 24px;
border: none;
border-radius: 4px;
background: rgba(255, 255, 255, 0.1);
color: white;
cursor: pointer;
display: flex;
align-items: center;
justify-content: center;
font-size: 12px;
line-height: 1;
&:hover {
background: rgba(255, 255, 255, 0.2);
}
}
.pauseButton {
font-weight: 600;
&.paused {
background: rgba(34, 197, 94, 0.2); /* Green background indicates the agent can be resumed */
color: rgb(34, 197, 94);
&:hover {
background: rgba(34, 197, 94, 0.3);
}
}
}
.stopButton {
background: rgba(239, 68, 68, 0.2);
color: rgb(255, 41, 41);
font-weight: 600;
&:hover {
background: rgba(239, 68, 68, 0.3);
}
}
}
}
/* Status text entrance: fades in while sliding up from 5px below. */
@keyframes statusTextFadeIn {
0% {
opacity: 0;
transform: translateY(5px);
}
100% {
opacity: 1;
transform: translateY(0);
}
}
/* Status text exit: fades out while sliding up by 5px. */
@keyframes statusTextFadeOut {
0% {
opacity: 1;
transform: translateY(0);
}
100% {
opacity: 0;
transform: translateY(-5px);
}
}
/* History drawer positioned above the control bar; hidden until an ancestor carries .expanded. */
.historySectionWrapper {
position: absolute;
width: var(--history-width);
bottom: var(--height);
left: var(--side-space);
z-index: -2;
padding-top: 0px;
visibility: collapse;
overflow: hidden;
transition: all 0.2s;
background: rgba(2, 0, 20, 0.5);
/* background: rgba(186, 186, 186, 0.2); */
backdrop-filter: blur(10px);
text-shadow: 0 0 1px rgba(0, 0, 0, 0.2);
border-top-left-radius: calc(var(--border-radius) + 4px);
border-top-right-radius: calc(var(--border-radius) + 4px);
/* border: 2px solid rgba(255, 255, 255, 0.8); */
border: 2px solid rgba(255, 255, 255, 0.4);
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.6);
/* @media (prefers-color-scheme: dark) {
box-shadow:
0 8px 32px 0 rgba(0, 0, 0, 0.85),
0 2px 12px 0 rgba(57, 182, 255, 0.1);
} */
.expanded & {
padding-top: 8px;
visibility: visible;
}
/* Scrollable list of history items; its max-height animates from 0 to 400px when expanded. */
.historySection {
position: relative;
overflow-y: auto;
overscroll-behavior: contain;
scrollbar-width: none;
max-height: 0;
padding-inline: 8px;
transition: max-height 0.2s;
.expanded & {
max-height: 400px;
}
/* One history entry; the modifier classes below recolour it per step type. */
.historyItem {
/* backdrop-filter: blur(10px); */
padding: 8px 10px;
margin-bottom: 6px;
background: linear-gradient(135deg, rgba(255, 255, 255, 0.08), rgba(255, 255, 255, 0.03));
border-radius: 8px;
border-left: 2px solid rgba(57, 182, 255, 0.5);
font-size: 12px;
color: white;
/* color: black; */
line-height: 1.3;
position: relative;
overflow: hidden;
/* Subtle inner shadow */
box-shadow:
inset 0 1px 0 rgba(255, 255, 255, 0.1),
0 1px 3px rgba(0, 0, 0, 0.1);
&::before {
content: '';
position: absolute;
top: 0;
left: 0;
right: 0;
height: 1px;
background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent);
}
&:hover {
background: linear-gradient(135deg, rgba(255, 255, 255, 0.12), rgba(255, 255, 255, 0.06));
/* transform: translateY(-1px); */
box-shadow:
inset 0 1px 0 rgba(255, 255, 255, 0.15),
0 2px 4px rgba(0, 0, 0, 0.15);
}
&:last-child {
margin-bottom: 10px;
}
&.completed,
&.input,
&.output {
border-left-color: rgb(34, 197, 94);
background: linear-gradient(135deg, rgba(34, 197, 94, 0.1), rgba(34, 197, 94, 0.05));
}
&.error {
border-left-color: rgb(239, 68, 68);
background: linear-gradient(135deg, rgba(239, 68, 68, 0.1), rgba(239, 68, 68, 0.05));
}
&.retry {
border-left-color: rgb(255, 214, 0);
background: linear-gradient(135deg, rgba(255, 214, 0, 0.1), rgba(255, 214, 0, 0.05));
}
/* Highlight a successful `done` result */
&.doneSuccess {
background: linear-gradient(
135deg,
rgba(34, 197, 94, 0.25),
rgba(34, 197, 94, 0.15),
rgba(34, 197, 94, 0.08)
);
border: none;
border-left: 4px solid rgb(34, 197, 94);
box-shadow:
0 4px 12px rgba(34, 197, 94, 0.3),
inset 0 1px 0 rgba(255, 255, 255, 0.2),
0 0 20px rgba(34, 197, 94, 0.1);
font-weight: 600;
color: rgb(220, 252, 231);
padding: 10px 12px;
margin-bottom: 8px;
border-radius: 8px;
position: relative;
overflow: hidden;
&::before {
background: linear-gradient(90deg, transparent, rgba(34, 197, 94, 0.4), transparent);
}
&::after {
content: '';
position: absolute;
top: 0;
left: -100%;
width: 100%;
height: 100%;
background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.1), transparent);
animation: shimmer 2s ease-in-out infinite;
}
.historyContent {
.statusIcon {
font-size: 16px;
animation: celebrate 0.8s ease-in-out;
filter: drop-shadow(0 2px 4px rgba(34, 197, 94, 0.5));
}
}
}
/* Highlight a failed `done` result */
&.doneError {
background: linear-gradient(
135deg,
rgba(239, 68, 68, 0.25),
rgba(239, 68, 68, 0.15),
rgba(239, 68, 68, 0.08)
);
border: none;
border-left: 4px solid rgb(239, 68, 68);
box-shadow:
0 4px 12px rgba(239, 68, 68, 0.3),
inset 0 1px 0 rgba(255, 255, 255, 0.2),
0 0 20px rgba(239, 68, 68, 0.1);
font-weight: 600;
color: rgb(254, 226, 226);
padding: 10px 12px;
margin-bottom: 8px;
border-radius: 8px;
position: relative;
overflow: hidden;
&::before {
background: linear-gradient(90deg, transparent, rgba(239, 68, 68, 0.4), transparent);
}
.historyContent {
.statusIcon {
font-size: 16px;
filter: drop-shadow(0 2px 4px rgba(239, 68, 68, 0.5));
}
}
}
/* Icon + text row; `pre-wrap` keeps line breaks that appear in the step text. */
.historyContent {
display: flex;
align-items: center;
gap: 8px;
word-break: break-all;
white-space: pre-wrap;
/* overflow-x: auto; */
.statusIcon {
font-size: 12px;
flex-shrink: 0;
line-height: 1;
transition: all 0.3s ease;
}
}
/* Small dimmed metadata line below the content. */
.historyMeta {
font-size: 10px;
color: rgba(255, 255, 255, 0.6);
/* color: rgb(61, 61, 61); */
margin-top: 8px;
line-height: 1;
}
}
}
}
/* Animation keyframes - faster blinking */
@keyframes pulse {
0%,
100% {
opacity: 1;
transform: scale(1);
}
50% {
opacity: 0.4;
transform: scale(1.3);
}
}
/* Retry animation - rotating pulse */
@keyframes retryPulse {
0%,
100% {
opacity: 1;
transform: scale(1) rotate(0deg);
}
25% {
opacity: 0.6;
transform: scale(1.2) rotate(90deg);
}
50% {
opacity: 0.8;
transform: scale(1.1) rotate(180deg);
}
75% {
opacity: 0.6;
transform: scale(1.2) rotate(270deg);
}
}
/* Celebration animation */
@keyframes celebrate {
0%,
100% {
transform: scale(1);
}
25% {
transform: scale(1.2) rotate(-5deg);
}
75% {
transform: scale(1.2) rotate(5deg);
}
}
/* Shimmer (gloss) effect for the `done` card */
@keyframes shimmer {
0% {
left: -100%;
}
100% {
left: 100%;
}
}
/* Input area styles: a drawer positioned below the control bar, collapsed to zero height by .hidden */
.inputSectionWrapper {
position: absolute;
width: var(--history-width);
top: var(--height);
left: var(--side-space);
z-index: -1;
visibility: visible;
overflow: hidden;
height: 48px;
transition: all 0.2s;
background: rgba(186, 186, 186, 0.2);
backdrop-filter: blur(10px);
border-bottom-left-radius: calc(var(--border-radius) + 4px);
border-bottom-right-radius: calc(var(--border-radius) + 4px);
border: 2px solid rgba(255, 255, 255, 0.3);
box-shadow: 0 1px 16px rgba(0, 0, 0, 0.4);
&.hidden {
visibility: collapse;
height: 0;
}
.inputSection {
display: flex;
align-items: center;
gap: 4px;
padding: 8px 8px;
/* Single-line text input that fills the row. */
.taskInput {
flex: 1;
background: rgba(255, 255, 255, 0.4);
border: 1px solid rgba(255, 255, 255, 0.3);
border-radius: 10px;
padding-inline: 10px;
color: rgb(20, 20, 20);
font-size: 12px;
height: 28px;
line-height: 1;
outline: none;
transition: all 0.2s ease;
/* text-shadow: 0 0 2px rgba(255, 255, 255, 0.8); */
/* border-color: rgba(57, 182, 255, 0.3); */
&::placeholder {
color: rgb(53, 53, 53);
}
&:focus {
background: rgba(255, 255, 255, 0.8);
border-color: rgba(57, 182, 255, 0.6);
box-shadow: 0 0 0 2px rgba(57, 182, 255, 0.2);
}
}
}
}

548
src/ui/Panel.ts Normal file
View File

@@ -0,0 +1,548 @@
import type { PageAgent } from '@/PageAgent'
import type { I18n } from '@/i18n'
import { truncate } from '@/utils'
import type { EventBus } from '@/utils/bus'
import { type Step, UIState } from './UIState'
import styles from './Panel.module.css'
/**
* Agent control panel
*/
export class Panel {
#wrapper: HTMLElement
#indicator: HTMLElement
#statusText: HTMLElement
#historySection: HTMLElement
#expandButton: HTMLElement
#pauseButton: HTMLElement
#stopButton: HTMLElement
#inputSection: HTMLElement
#taskInput: HTMLInputElement
#bus: EventBus
#state = new UIState()
#isExpanded = false
#pageAgent: PageAgent
#userAnswerResolver: ((input: string) => void) | null = null
#isWaitingForUserAnswer: boolean = false
/** Read-only access to the panel's root element. */
get wrapper(): HTMLElement {
return this.#wrapper
}
/**
 * Builds the panel DOM, caches references to its parts, wires the DOM event
 * handlers and subscribes to the agent's `panel:*` bus events.
 */
constructor(pageAgent: PageAgent) {
  this.#pageAgent = pageAgent
  this.#bus = pageAgent.bus
  this.#wrapper = this.#createWrapper()

  // Look up a panel part inside the wrapper by its CSS-module class name.
  const part = <T extends HTMLElement = HTMLElement>(className: string): T =>
    this.#wrapper.querySelector<T>(`.${className}`)!

  this.#indicator = part(styles.indicator)
  this.#statusText = part(styles.statusText)
  this.#historySection = part(styles.historySection)
  this.#expandButton = part(styles.expandButton)
  this.#pauseButton = part(styles.pauseButton)
  this.#stopButton = part(styles.stopButton)
  this.#inputSection = part(styles.inputSectionWrapper)
  this.#taskInput = part<HTMLInputElement>(styles.taskInput)

  this.#setupEventListeners()
  // this.#expand() // debug
  this.#showInputArea()

  const bus = this.#bus
  bus.on('panel:show', () => this.#show())
  bus.on('panel:hide', () => this.#hide())
  bus.on('panel:reset', () => this.#reset())
  bus.on('panel:update', (stepData) => this.#update(stepData))
  bus.on('panel:expand', () => this.#expand())
  bus.on('panel:collapse', () => this.#collapse())
}
/**
 * Shows a question to the user and resolves with the text they submit.
 * The promise is settled by `#handleUserAnswer` through the stored resolver.
 */
async askUser(question: string): Promise<string> {
  const answer = new Promise<string>((resolve) => {
    // Enter the `waiting for user answer` state
    this.#isWaitingForUserAnswer = true
    this.#userAnswerResolver = resolve
  })

  // Record the question as an `output` step
  this.#update({ type: 'output', displayText: `询问: ${question}` })

  // Make sure the history (and therefore the question) is visible
  if (!this.#isExpanded) this.#expand()

  this.#showInputArea(this.#pageAgent.i18n.t('ui.panel.userAnswerPrompt'))
  return answer
}
/**
 * Dispose panel: leaves the waiting-for-answer state and removes the
 * wrapper element from the DOM.
 */
dispose(): void {
// NOTE(review): a resolver stored by askUser() is not called here, so a pending
// askUser() promise stays unsettled after dispose - confirm this is intended.
this.#isWaitingForUserAnswer = false
this.wrapper.remove()
}
/**
 * Records a new step and refreshes the header text, status indicator,
 * history list and input-area visibility accordingly.
 */
async #update(stepData: Omit<Step, 'id' | 'stepNumber' | 'timestamp'>): Promise<void> {
  const step = this.#state.addStep(stepData)

  // The header shows a truncated version; animate only when it actually changes
  const headerText = truncate(step.displayText, 20)
  const textChanged = this.#statusText.textContent !== headerText
  if (textChanged) await this.#animateTextChange(headerText)

  this.#updateStatusIndicator(step.type)
  this.#updateHistory()

  // Open the history automatically once the task has finished (either way)
  const finished = step.type === 'completed' || step.type === 'error'
  if (finished && !this.#isExpanded) this.#expand()

  // Input area visibility follows the current state
  if (this.#shouldShowInputArea()) this.#showInputArea()
  else this.#hideInputArea()
}
/**
 * Makes the panel visible and moves it to its resting position.
 */
#show(): void {
  const { style } = this.wrapper
  style.display = 'block'
  // Reading offsetHeight forces a reflow so the following changes animate
  void this.wrapper.offsetHeight
  style.opacity = '1'
  style.transform = 'translateX(-50%) translateY(0)'
}
/**
 * Hide the panel: resets opacity/transform to the hidden values and removes
 * it from layout (`display: none` is applied in the same call).
 */
#hide(): void {
  const { style } = this.wrapper
  style.opacity = '0'
  style.transform = 'translateX(-50%) translateY(20px)'
  style.display = 'none'
}
/**
 * Reset state: clears recorded steps, restores the ready text and `thinking`
 * indicator, collapses the history, un-pauses the agent, leaves the
 * waiting-for-answer state and shows the input area again.
 */
#reset(): void {
this.#state.reset()
this.#statusText.textContent = this.#pageAgent.i18n.t('ui.panel.ready')
this.#updateStatusIndicator('thinking')
this.#updateHistory()
this.#collapse()
// Reset pause state
this.#pageAgent.paused = false
this.#updatePauseButton()
// Reset user input state (the stored resolver is dropped without being called)
this.#isWaitingForUserAnswer = false
this.#userAnswerResolver = null
// Show input area
this.#showInputArea()
}
/**
 * Flips the agent's `paused` flag and reflects the new state in the pause
 * button, the status text and the status indicator.
 */
#togglePause(): void {
  const nowPaused = !this.#pageAgent.paused
  this.#pageAgent.paused = nowPaused
  this.#updatePauseButton()

  if (!nowPaused) {
    // Resumed: show the execution state again
    this.#statusText.textContent = '继续执行'
    this.#updateStatusIndicator('tool_executing')
    return
  }
  // Paused: reuse the existing `thinking` indicator state
  this.#statusText.textContent = '暂停中,稍后'
  this.#updateStatusIndicator('thinking')
}
/**
 * Syncs the pause button's glyph, tooltip and `paused` class with the
 * agent's current `paused` flag.
 */
#updatePauseButton(): void {
  const paused = this.#pageAgent.paused
  const button = this.#pauseButton
  button.textContent = paused ? '▶' : '⏸︎'
  button.title = paused ? '继续' : '暂停'
  button.classList.toggle(styles.paused, paused)
}
/**
 * Terminate the agent: records an `error` step announcing the termination,
 * then disposes the page agent.
 */
#stopAgent(): void {
// Update status display (#update is async and is not awaited here)
this.#update({
type: 'error',
displayText: '任务已终止',
})
this.#pageAgent.dispose()
}
/**
 * Submits the trimmed content of the input field: as the answer to a pending
 * question when one is outstanding, otherwise as a new task for the agent.
 * Blank input is ignored.
 */
#submitTask() {
  const input = this.#taskInput.value.trim()
  if (input === '') return

  this.#hideInputArea()

  if (!this.#isWaitingForUserAnswer) {
    this.#pageAgent.execute(input)
    return
  }
  // A question from askUser() is pending: treat the text as its answer
  this.#handleUserAnswer(input)
}
/**
 * Handles the user's answer: logs it as an `input` step, leaves the
 * waiting state and settles the pending askUser() promise, if any.
 */
#handleUserAnswer(input: string): void {
  // Show the answer in the history
  this.#update({ type: 'input', displayText: `用户回答: ${input}` })

  this.#isWaitingForUserAnswer = false

  // Hand the answer back to askUser() and drop the one-shot resolver
  this.#userAnswerResolver?.(input)
  this.#userAnswerResolver = null
}
/**
 * Shows the input area with an empty field and focuses it shortly after.
 * @param placeholder Placeholder text; a default prompt is used when omitted or empty.
 */
#showInputArea(placeholder?: string): void {
  const field = this.#taskInput
  field.value = ''
  field.placeholder = placeholder || '输入新任务,详细描述步骤,回车提交'
  this.#inputSection.classList.remove(styles.hidden)
  // Focus is deferred by 100ms
  setTimeout(() => this.#taskInput.focus(), 100)
}
/**
 * Hide the input area by adding the `hidden` class to its wrapper.
 */
#hideInputArea(): void {
this.#inputSection.classList.add(styles.hidden)
}
/**
 * Decides whether the input area should be visible: while waiting for a user
 * answer, before any step exists, or after the last step completed or errored.
 */
#shouldShowInputArea(): boolean {
  if (this.#isWaitingForUserAnswer) return true

  const steps = this.#state.getAllSteps()
  // No steps yet means the panel is in its initial state
  if (steps.length === 0) return true

  const { type } = steps[steps.length - 1]
  return type === 'completed' || type === 'error'
}
/**
 * Creates the panel's root element, fills it with the panel markup
 * (background, history drawer with a placeholder item, header with status and
 * control buttons, hidden input drawer) and appends it to `document.body`.
 * @returns The created wrapper element.
 */
#createWrapper(): HTMLElement {
const wrapper = document.createElement('div')
wrapper.id = 'page-agent-runtime_agent-panel'
wrapper.className = `${styles.wrapper} ${styles.collapsed}`
wrapper.setAttribute('data-browser-use-ignore', 'true')
// The expand button is rendered without a glyph; #expand/#collapse set its text later.
wrapper.innerHTML = `
<div class="${styles.background}"></div>
<div class="${styles.historySectionWrapper}">
<div class="${styles.historySection}">
${this.#createHistoryItem({
id: 'placeholder',
stepNumber: 0,
timestamp: new Date(),
type: 'thinking',
displayText: '等待任务开始...',
})}
</div>
</div>
<div class="${styles.header}">
<div class="${styles.statusSection}">
<div class="${styles.indicator} ${styles.thinking}"></div>
<div class="${styles.statusText}">准备就绪</div>
</div>
<div class="${styles.controls}">
<button class="${styles.controlButton} ${styles.expandButton}" title="展开历史">
</button>
<button class="${styles.controlButton} ${styles.pauseButton}" title="暂停">
⏸︎
</button>
<button class="${styles.controlButton} ${styles.stopButton}" title="终止">
X
</button>
</div>
</div>
<div class="${styles.inputSectionWrapper} ${styles.hidden}">
<div class="${styles.inputSection}">
<input
type="text"
class="${styles.taskInput}"
maxlength="200"
/>
</div>
</div>
`
document.body.appendChild(wrapper)
return wrapper
}
/**
 * Attaches the panel's DOM listeners: header click toggles the history, the
 * three control buttons run their actions, Enter in the input submits it.
 */
#setupEventListeners(): void {
  // Header click toggles the history unless it landed on a control button
  const header = this.wrapper.querySelector(`.${styles.header}`)!
  header.addEventListener('click', (event) => {
    const clickedControl = (event.target as HTMLElement).closest(`.${styles.controlButton}`)
    if (!clickedControl) this.#toggle()
  })

  // Control buttons run their action and keep the click from bubbling further
  const bindButton = (button: HTMLElement, action: () => void): void => {
    button.addEventListener('click', (event) => {
      event.stopPropagation()
      action()
    })
  }
  bindButton(this.#expandButton, () => this.#toggle())
  bindButton(this.#pauseButton, () => this.#togglePause())
  bindButton(this.#stopButton, () => this.#stopAgent())

  // Enter submits the input; key events during IME composition are ignored
  this.#taskInput.addEventListener('keydown', (event) => {
    if (event.isComposing || event.key !== 'Enter') return
    event.preventDefault()
    this.#submitTask()
  })

  // Clicks inside the input area do not bubble
  this.#inputSection.addEventListener('click', (event) => event.stopPropagation())
}
/** Switches the history between expanded and collapsed. */
#toggle(): void {
  if (!this.#isExpanded) {
    this.#expand()
    return
  }
  this.#collapse()
}
/** Marks the panel as expanded: swaps the wrapper's state classes and shows the up glyph on the expand button. */
#expand(): void {
this.#isExpanded = true
this.wrapper.classList.remove(styles.collapsed)
this.wrapper.classList.add(styles.expanded)
this.#expandButton.textContent = '▲'
}
/** Marks the panel as collapsed: swaps the wrapper's state classes and shows the down glyph on the expand button. */
#collapse(): void {
this.#isExpanded = false
this.wrapper.classList.remove(styles.expanded)
this.wrapper.classList.add(styles.collapsed)
this.#expandButton.textContent = '▼'
}
/**
 * Replaces the status text with a fade-out / fade-in transition.
 * Resolves once the fade-in class has been removed (150ms + 300ms later).
 */
async #animateTextChange(newText: string): Promise<void> {
  const pause = (ms: number) => new Promise<void>((done) => setTimeout(done, ms))
  const classes = this.#statusText.classList

  // Start fading out the current text
  classes.add(styles.fadeOut)
  await pause(150) // Half the duration of fade out animation

  // Swap the text, then fade the new text in
  this.#statusText.textContent = newText
  classes.remove(styles.fadeOut)
  classes.add(styles.fadeIn)
  await pause(300)

  classes.remove(styles.fadeIn)
}
/**
 * Sets the status indicator to the given step type.
 * @param type Step type; its name is used as the key of the CSS-module class to apply.
 */
#updateStatusIndicator(type: Step['type']): void {
// Clear all status classes
this.#indicator.className = styles.indicator
// Add corresponding status class
this.#indicator.classList.add(styles[type])
}
/**
 * Re-renders the history list from the recorded steps (last 10 only) and
 * scrolls it to the newest entry.
 */
#updateHistory(): void {
  const recentSteps = this.#state.getAllSteps().slice(-10)
  const markup = recentSteps.map((step) => this.#createHistoryItem(step)).join('')
  this.#historySection.innerHTML = markup
  this.#scrollToBottom()
}
/** Scrolls the history list to its bottom on the next timer tick. */
#scrollToBottom(): void {
// Execute in next event loop to ensure DOM update completion
setTimeout(() => {
this.#historySection.scrollTop = this.#historySection.scrollHeight
}, 0)
}
/**
 * Renders one history entry as an HTML string.
 *
 * The result is assigned to `innerHTML` by the callers, and `displayText`
 * carries free-form text (user answers, questions, tool results), so it is
 * HTML-escaped before interpolation to prevent markup/script injection.
 *
 * @param step Step to render.
 * @returns Markup for a `.historyItem` element.
 */
#createHistoryItem(step: Step): string {
  // Replace HTML metacharacters with numeric character references
  const escapeHtml = (value: unknown): string =>
    String(value).replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`)

  const time = step.timestamp.toLocaleTimeString('zh-CN', {
    hour12: false,
    hour: '2-digit',
    minute: '2-digit',
    second: '2-digit',
  })

  let typeClass = ''
  let statusIcon = ''
  // Set styles and icons based on step type
  if (step.type === 'completed') {
    // Check if this is a result from done tool
    if (step.toolName === 'done') {
      // @todo not right
      // Judge success or failure based on result
      const isSuccess =
        !step.toolResult ||
        (!step.toolResult.includes('失败') && !step.toolResult.includes('错误'))
      typeClass = isSuccess ? styles.doneSuccess : styles.doneError
      statusIcon = isSuccess ? '🎉' : '❌'
    } else {
      typeClass = styles.completed
      statusIcon = '✅'
    }
  } else if (step.type === 'error') {
    typeClass = styles.error
    statusIcon = '❌'
  } else if (step.type === 'tool_executing') {
    statusIcon = '⚙️'
  } else if (step.type === 'output') {
    typeClass = styles.output
    statusIcon = '🤖'
  } else if (step.type === 'input') {
    typeClass = styles.input
    statusIcon = '🎯'
  } else if (step.type === 'retry') {
    typeClass = styles.retry
    statusIcon = '🔄'
  } else {
    statusIcon = '🧠'
  }

  return `
    <div class="${styles.historyItem} ${typeClass}">
      <div class="${styles.historyContent}">
        <span class="${styles.statusIcon}">${statusIcon}</span>
        <span>${escapeHtml(step.displayText)}</span>
      </div>
      <div class="${styles.historyMeta}">
        步骤 ${step.stepNumber} · ${time}
        ${step.duration ? ` · ${step.duration}ms` : ''}
      </div>
    </div>
  `
}
}
/**
 * Returns the localized text shown while a tool is executing.
 *
 * @param toolName Name of the tool being run.
 * @param args Tool arguments; only the fields needed by the matching message are read.
 * @param i18n Translator used to resolve the `ui.tools.*` message keys.
 * @returns The translated message; unknown tools get the generic `executing` message.
 */
export function getToolExecutingText(toolName: string, args: any, i18n: I18n): string {
  if (toolName === 'click_element_by_index') {
    return i18n.t('ui.tools.clicking', { index: args.index })
  }
  if (toolName === 'input_text') {
    return i18n.t('ui.tools.inputting', { index: args.index })
  }
  if (toolName === 'select_dropdown_option') {
    return i18n.t('ui.tools.selecting', { text: args.text })
  }
  if (toolName === 'scroll') {
    return i18n.t('ui.tools.scrolling')
  }
  if (toolName === 'wait') {
    return i18n.t('ui.tools.waiting', { seconds: args.seconds })
  }
  if (toolName === 'done') {
    return i18n.t('ui.tools.done')
  }
  return i18n.t('ui.tools.executing', { toolName })
}
/**
 * Returns the localized text shown after a tool has completed.
 *
 * @param toolName Name of the tool that ran.
 * @param args Tool arguments; only the fields needed by the matching message are read.
 * @param i18n Translator used to resolve the `ui.tools.*` message keys.
 * @returns The translated message, or `null` for `done` and for any other tool
 *   without a dedicated completion message.
 */
export function getToolCompletedText(toolName: string, args: any, i18n: I18n): string | null {
  if (toolName === 'click_element_by_index') {
    return i18n.t('ui.tools.clicked', { index: args.index })
  }
  if (toolName === 'input_text') {
    return i18n.t('ui.tools.inputted', { text: args.text })
  }
  if (toolName === 'select_dropdown_option') {
    return i18n.t('ui.tools.selected', { text: args.text })
  }
  if (toolName === 'scroll') {
    return i18n.t('ui.tools.scrolled')
  }
  if (toolName === 'wait') {
    return i18n.t('ui.tools.waited')
  }
  // `done` and every other tool have no completion text
  return null
}

View File

@@ -0,0 +1,10 @@
/* Full-viewport overlay for the simulator mask; not displayed by default. */
.wrapper {
position: fixed;
inset: 0;
z-index: 2147483641; /* Ensure it sits above all elements except the panel */
/* pointer-events: none; */
cursor: not-allowed;
overflow: hidden;
display: none;
}

172
src/ui/SimulatorMask.ts Normal file
View File

@@ -0,0 +1,172 @@
import { Motion } from 'ai-motion'
import { isPageDark } from '@/utils/checkDarkMode'
import styles from './SimulatorMask.module.css'
import cursorStyles from './cursor.module.css'
export class SimulatorMask {
wrapper = document.createElement('div')
motion = new Motion({
mode: isPageDark() ? 'dark' : 'light',
styles: {
position: 'absolute',
inset: '0',
},
})
#cursor = document.createElement('div')
#currentCursorX = 0
#currentCursorY = 0
#targetCursorX = 0
#targetCursorY = 0
/**
 * Sets up the mask element (id, class, motion layer), blocks user input events
 * on it, creates the cursor, attaches the mask to `document.body`, starts the
 * cursor animation loop and listens for the `PageAgent::*` pointer events.
 */
constructor() {
  this.wrapper.id = 'page-agent-runtime_simulator-mask'
  this.wrapper.className = styles.wrapper
  this.wrapper.setAttribute('data-browser-use-ignore', 'true')
  this.wrapper.appendChild(this.motion.element)
  this.motion.autoResize(this.wrapper)

  // Capture all mouse, keyboard, and wheel events on the mask
  const swallow = (event: Event): void => {
    event.stopPropagation()
    event.preventDefault()
  }
  const blockedEvents = ['click', 'mousedown', 'mouseup', 'mousemove', 'wheel', 'keydown', 'keyup']
  for (const eventName of blockedEvents) {
    this.wrapper.addEventListener(eventName, swallow)
  }

  // Create AI cursor
  this.#createCursor()
  // this.show()
  document.body.appendChild(this.wrapper)
  this.#moveCursorToTarget()

  window.addEventListener('PageAgent::MovePointerTo', (event: Event) => {
    const { x, y } = (event as CustomEvent).detail
    this.setCursorPosition(x, y)
  })
  window.addEventListener('PageAgent::ClickPointer', () => {
    this.triggerClickAnimation()
  })
}
/**
 * Builds the cursor element out of three stacked layers and adds it to the mask.
 */
#createCursor() {
  this.#cursor.className = cursorStyles.cursor

  // Layers are appended in this order: ripple effect, filling, border
  const layerClasses = [
    cursorStyles.cursorRipple,
    cursorStyles.cursorFilling,
    cursorStyles.cursorBorder,
  ]
  for (const layerClass of layerClasses) {
    const layer = document.createElement('div')
    layer.className = layerClass
    this.#cursor.appendChild(layer)
  }

  this.wrapper.appendChild(this.#cursor)
}
#moveCursorToTarget() {
const newX = this.#currentCursorX + (this.#targetCursorX - this.#currentCursorX) * 0.2
const newY = this.#currentCursorY + (this.#targetCursorY - this.#currentCursorY) * 0.2
const xDistance = Math.abs(newX - this.#targetCursorX)
if (xDistance > 0) {
if (xDistance < 2) {
this.#currentCursorX = this.#targetCursorX
} else {
this.#currentCursorX = newX
}
this.#cursor.style.left = `${this.#currentCursorX}px`
}
const yDistance = Math.abs(newY - this.#targetCursorY)
if (yDistance > 0) {
if (yDistance < 2) {
this.#currentCursorY = this.#targetCursorY
} else {
this.#currentCursorY = newY
}
this.#cursor.style.top = `${this.#currentCursorY}px`
}
requestAnimationFrame(() => this.#moveCursorToTarget())
}
setCursorPosition(x: number, y: number) {
this.#targetCursorX = x
this.#targetCursorY = y
}
triggerClickAnimation() {
this.#cursor.classList.remove(cursorStyles.clicking)
// Force reflow to restart animation
void this.#cursor.offsetHeight
this.#cursor.classList.add(cursorStyles.clicking)
}
show() {
this.motion.start()
this.motion.fadeIn()
this.wrapper.style.display = 'block'
// Initialize cursor position
this.#currentCursorX = window.innerWidth / 2
this.#currentCursorY = window.innerHeight / 2
this.#targetCursorX = this.#currentCursorX
this.#targetCursorY = this.#currentCursorY
this.#cursor.style.left = `${this.#currentCursorX}px`
this.#cursor.style.top = `${this.#currentCursorY}px`
}
hide() {
this.motion.fadeOut()
this.motion.pause()
this.#cursor.classList.remove(cursorStyles.clicking)
setTimeout(() => {
this.wrapper.style.display = 'none'
}, 800) // Match the animation duration
}
dispose() {
this.motion.dispose()
this.wrapper.remove()
}
}

93
src/ui/UIState.ts Normal file
View File

@@ -0,0 +1,93 @@
/**
* Agent execution state management
*/
/**
 * One entry in the step log kept by `UIState`.
 *
 * `id`, `stepNumber` and `timestamp` are filled in by `UIState.addStep`;
 * callers supply the remaining fields.
 */
export interface Step {
id: string
// 1-based position of the step, taken from UIState's running counter
stepNumber: number
timestamp: Date
// Kind of step; UIState derives the overall agent status from it
type: 'thinking' | 'tool_executing' | 'completed' | 'error' | 'output' | 'input' | 'retry'
// Tool execution related
toolName?: string
toolArgs?: any
toolResult?: any
// Display data
displayText: string
// NOTE(review): unit is not visible here (presumably milliseconds) — confirm
duration?: number
}
/** Overall state reported by `UIState.getStatus()`. */
export type AgentStatus = 'idle' | 'running' | 'paused' | 'completed' | 'error'
/**
 * In-memory log of agent steps plus an overall status that follows the type
 * of the most recently added step.
 */
export class UIState {
	private steps: Step[] = []
	private currentStep: Step | null = null
	private status: AgentStatus = 'idle'
	private stepCounter = 0

	/**
	 * Appends a step, assigning its id, sequence number and timestamp, makes it
	 * the current step and refreshes the overall status.
	 * @returns The stored step object.
	 */
	addStep(stepData: Omit<Step, 'id' | 'stepNumber' | 'timestamp'>): Step {
		const id = this.generateId()
		this.stepCounter += 1

		const created: Step = {
			id,
			stepNumber: this.stepCounter,
			timestamp: new Date(),
			...stepData,
		}

		this.steps.push(created)
		this.currentStep = created

		// Update overall status
		this.updateStatus(created.type)

		return created
	}

	/**
	 * Merges `updates` into the current step in place.
	 * @returns The mutated current step, or null when no step has been added.
	 */
	updateCurrentStep(updates: Partial<Step>): Step | null {
		const target = this.currentStep
		if (!target) {
			return null
		}
		return Object.assign(target, updates)
	}

	/** Returns the most recently added step, or null. */
	getCurrentStep(): Step | null {
		return this.currentStep
	}

	/** Returns a shallow copy of the step list. */
	getAllSteps(): Step[] {
		return this.steps.slice()
	}

	/** Returns the overall status. */
	getStatus(): AgentStatus {
		return this.status
	}

	/** Clears all steps and returns to the initial idle state. */
	reset(): void {
		this.stepCounter = 0
		this.status = 'idle'
		this.currentStep = null
		this.steps = []
	}

	/** Maps a step type onto the overall status; unknown types leave it untouched. */
	private updateStatus(stepType: Step['type']): void {
		if (stepType === 'completed') {
			this.status = 'completed'
		} else if (stepType === 'error') {
			this.status = 'error'
		} else if (
			stepType === 'thinking' ||
			stepType === 'tool_executing' ||
			stepType === 'output' ||
			stepType === 'input' ||
			stepType === 'retry'
		) {
			this.status = 'running'
		}
	}

	/** Builds an id of the form `step_<epoch ms>_<random base36 suffix>`. */
	private generateId(): string {
		const stamp = Date.now()
		const suffix = Math.random().toString(36).slice(2, 11)
		return `step_${stamp}_${suffix}`
	}
}

91
src/ui/cursor.module.css Normal file
View File

@@ -0,0 +1,91 @@
/* AI cursor styles */
/* Cursor root: sized by --cursor-size (75px fallback), ignores pointer input,
   and is shifted up/left by 30% of its own size relative to its left/top. */
.cursor {
position: absolute;
width: var(--cursor-size, 75px);
height: var(--cursor-size, 75px);
pointer-events: none;
z-index: 10000;
transform: translate(-30%, -30%);
animation: cursor-enter 300ms ease-out forwards;
}
/* Gradient layer clipped to the mask image; pulses via cursor-breathe */
.cursorBorder {
position: absolute;
inset: 0;
background: linear-gradient(45deg, rgb(57, 182, 255), rgb(189, 69, 251));
mask-image: url(https://img.alicdn.com/imgextra/i1/O1CN01YHLVYR1LvqWIyo5kH_!!6000000001362-2-tps-202-202.png);
mask-size: 100% 100%;
mask-repeat: no-repeat;
animation: cursor-breathe 2s ease-in-out infinite;
}
/* Static image layer stretched over the whole cursor box */
.cursorFilling {
position: absolute;
inset: 0;
background: url(https://img.alicdn.com/imgextra/i3/O1CN01JZOqOS1Tu1sIKbPLW_!!6000000002441-2-tps-202-202.png);
background-size: 100% 100%;
background-repeat: no-repeat;
}
/* Container whose ::after pseudo-element draws the click ripple */
.cursorRipple {
position: absolute;
inset: 0;
pointer-events: none;
}
/* Ripple ring, rendered only while the cursor carries the `clicking` class.
   The -30% offsets mirror the translate(-30%, -30%) applied to .cursor. */
.cursor.clicking .cursorRipple::after {
content: '';
position: absolute;
width: 100%;
height: 100%;
left: -30%;
top: -30%;
border: 4px solid rgba(57, 182, 255, 1);
border-radius: 50%;
animation: cursor-ripple 300ms ease-out forwards;
}
/* Cursor animation keyframes */
@keyframes cursor-breathe {
0%,
100% {
transform: scale(1);
opacity: 0.9;
}
50% {
transform: scale(1.05);
opacity: 1;
}
}
/* NOTE(review): no rule in this file references cursor-rotate — confirm whether it is still needed */
@keyframes cursor-rotate {
0% {
transform: rotate(0deg);
}
100% {
transform: rotate(360deg);
}
}
/* Entry animation; repeats the translate of .cursor so the offset is kept while scaling */
@keyframes cursor-enter {
0% {
transform: translate(-30%, -30%) scale(0.5);
opacity: 0;
}
100% {
transform: translate(-30%, -30%) scale(1);
opacity: 1;
}
}
/* Click ripple: grows from nothing to twice its size while fading out */
@keyframes cursor-ripple {
0% {
transform: scale(0);
opacity: 1;
}
100% {
transform: scale(2);
opacity: 0;
}
}

View File

@@ -0,0 +1,64 @@
import styles from './motion.module.css'
/**
 * Builds the DOM tree for the CSS-based motion effect.
 *
 * The root contains a color group followed by a border group, each holding
 * three layers (A, B, C).
 *
 * @returns The root element plus `show`/`hide` functions that restart the
 * entry and exit animations respectively.
 */
export function createMotion() {
	const root = document.createElement('div')
	root.className = styles.wrapper

	// Appends one group element to the root and fills it with layers A, B and C.
	const addGroup = (groupClass: string, layerClass: string) => {
		const group = document.createElement('div')
		group.className = groupClass
		root.appendChild(group)

		for (const variant of [styles.layerA, styles.layerB, styles.layerC]) {
			const layer = document.createElement('div')
			layer.className = layerClass + ' ' + variant
			group.appendChild(layer)
		}
	}

	addGroup(styles.colorWrapper, styles.colorLayer)
	addGroup(styles.borderWrapper, styles.borderLayer)

	// Drops both state classes, then re-adds `target` so its animation starts over.
	const replay = (other: string, target: string) => {
		root.classList.remove(other)
		root.classList.remove(target)
		// Force reflow to restart animation
		void root.offsetHeight
		root.classList.add(target)
	}

	return {
		element: root,
		show() {
			replay(styles.exit, styles.entry)
		},
		hide() {
			replay(styles.entry, styles.exit)
		},
	}
}

View File

@@ -0,0 +1,397 @@
/* Root of the motion effect: fills its positioned ancestor, ignores pointer
   input, and declares the palette and blend mode read through var() by the
   layer rules in this file. */
.wrapper {
position: absolute;
inset: 0;
pointer-events: none;
transform-origin: center;
--color-1: rgb(57, 182, 255);
--color-2: rgb(189, 69, 251);
--color-3: rgb(255, 87, 51);
--color-4: rgb(255, 214, 0);
--blend-mode: screen;
}
.colorLayer {
position: absolute;
inset: 0;
/* Lightening blend mode */
/* mix-blend-mode: screen; */
/* mix-blend-mode: overlay; */
/* mix-blend-mode: multiply; */
/* NOTE(review): `add` does not look like a valid mix-blend-mode keyword
   (the additive keyword is `plus-lighter`) — confirm the intended value */
mix-blend-mode: add;
/* Border mask - transparent in the middle, opaque at the edges */
mask-image: url(https://img.alicdn.com/imgextra/i2/O1CN01iW1wfX1C0ICvoPbTq_!!6000000000018-2-tps-512-512.png);
mask-repeat: no-repeat;
mask-size: calc(100% + 10px) calc(100% + 10px);
}
.borderWrapper {
position: absolute;
inset: 0;
/* filter: blur(10px); */
}
/* Layer masked down to a 2px frame: one gradient keeps the left/right edges,
   the other keeps the top/bottom edges. */
.borderLayer {
position: absolute;
inset: 0;
/* Lightening blend mode */
/* mix-blend-mode: overlay; */
/* NOTE(review): `add` does not look like a valid mix-blend-mode keyword — confirm */
mix-blend-mode: add;
mask-image:
linear-gradient(
to right,
black 0px,
black 2px,
transparent 2px,
transparent calc(100% - 2px),
black calc(100% - 2px),
black 100%
),
linear-gradient(
to top,
black 0px,
black 2px,
transparent 2px,
transparent calc(100% - 2px),
black calc(100% - 2px),
black 100%
);
mask-composite: add;
mask-repeat: no-repeat;
mask-size: 100% 100%;
/* filter: blur(100px); */
}
/* Four colour layers. Each one offsets the mask when combined with
   .colorLayer and draws a heavily blurred, rotating colour bar through its
   ::after pseudo-element. They differ only in colour, mask offset, rotation
   direction, duration and delay.
   NOTE(review): the createMotion script shown with this stylesheet only
   assigns layerA/layerB/layerC, so these classes may be unused — confirm. */
.blueLayer {
&.colorLayer {
mask-position: left -5px top -5px;
}
&::after {
content: '';
position: absolute;
/* inset: 0; */
width: calc(max(100vw, 100vh) * 1.5);
height: 600px;
top: calc(50% - 300px);
left: 50%;
filter: blur(100px);
background: rgb(57, 182, 255);
animation: rotate-clockwise 4s linear infinite;
animation-delay: -3s;
}
}
.purpleLayer {
&.colorLayer {
mask-position: left -3px top -7px;
}
&::after {
content: '';
position: absolute;
/* inset: 0; */
width: calc(max(100vw, 100vh) * 1.5);
height: 600px;
top: calc(50% - 300px);
left: 50%;
filter: blur(100px);
background: rgb(189, 69, 251);
animation: rotate-clockwise 4s linear infinite;
animation-delay: -2s;
}
}
.orangeLayer {
/* opacity: 0.5; */
&.colorLayer {
mask-position: left -7px top -2px;
}
&::after {
content: '';
position: absolute;
/* inset: 0; */
width: calc(max(100vw, 100vh) * 1.5);
height: 600px;
top: calc(50% - 300px);
left: 50%;
filter: blur(100px);
background: rgb(255, 87, 51);
animation: rotate-counter-clockwise 3s linear infinite;
animation-delay: -2s;
}
}
.yellowLayer {
/* opacity: 0.5; */
&.colorLayer {
mask-position: left -6px top -4px;
}
&::after {
content: '';
position: absolute;
/* inset: 0; */
width: calc(max(100vw, 100vh) * 1.5);
height: 600px;
top: calc(50% - 300px);
left: 50%;
filter: blur(100px);
background: rgb(255, 214, 0);
animation: rotate-counter-clockwise 4s linear infinite;
animation-delay: -1s;
}
}
/* Rotation animations. translateX(-50%) is repeated in every frame so the
   horizontal centering of an element placed at left: 50% is kept while it
   rotates. */
@keyframes rotate-clockwise {
0% {
transform: translateX(-50%) rotate(0deg);
}
100% {
transform: translateX(-50%) rotate(360deg);
}
}
@keyframes rotate-counter-clockwise {
0% {
transform: translateX(-50%) rotate(0deg);
}
100% {
transform: translateX(-50%) rotate(-360deg);
}
}
/* Entry zoom: from 110% down to 100%. The exit rules play it in reverse. */
@keyframes wrapper-entry {
from {
transform: scale(1.1);
}
to {
transform: scale(1);
}
}
/*
Palette reference (same values as --color-1 .. --color-4 on .wrapper):
rgb(57, 182, 255)
rgb(189, 69, 251)
rgb(255, 87, 51)
rgb(255, 214, 0)
*/
/* Slides an element to the right by exactly its own width */
@keyframes mask-running {
from {
transform: translateX(0%);
}
to {
transform: translateX(100%);
}
}
/* Same travel as mask-running, in the opposite direction */
@keyframes mask-running-reverse {
from {
transform: translateX(100%);
}
to {
transform: translateX(0%);
}
}
/* Colour group. The nested .colorLayer rule has higher specificity than the
   top-level .colorLayer rule, so its blend mode and mask-size take effect
   for layers inside this wrapper. */
.colorWrapper {
position: absolute;
inset: 0;
.colorLayer {
position: absolute;
inset: 0;
mix-blend-mode: var(--blend-mode);
/* Border mask - transparent in the middle, opaque at the edges */
mask-image: url(https://img.alicdn.com/imgextra/i2/O1CN01iW1wfX1C0ICvoPbTq_!!6000000000018-2-tps-512-512.png);
mask-repeat: no-repeat;
mask-size: 100% 100%;
}
}
/* Border group: switches --blend-mode to `lighten` for its descendants and
   masks each layer with a border image (standard and -webkit- properties). */
.borderWrapper {
position: absolute;
inset: 0;
--blend-mode: lighten;
.borderLayer {
position: absolute;
inset: 0;
mix-blend-mode: var(--blend-mode);
mask-border: url(https://img.alicdn.com/imgextra/i3/O1CN01bFjRug1yssyWEUbKL_!!6000000006635-2-tps-256-256.png)
25;
-webkit-mask-box-image: url(https://img.alicdn.com/imgextra/i3/O1CN01bFjRug1yssyWEUbKL_!!6000000006635-2-tps-256-256.png)
25;
mask-repeat: no-repeat;
mask-size: 100% 100%;
background-color: var(--color-2);
}
}
/* Entry state: both groups zoom in over 0.8s */
.entry .colorWrapper,
.entry .borderWrapper {
animation: wrapper-entry 0.8s ease-in-out forwards;
}
/* Exit state: the same keyframes played in reverse */
.exit .colorWrapper,
.exit .borderWrapper {
animation: wrapper-entry 0.8s ease-in-out reverse forwards;
}
/* Layer A: two identical --color-1 gradient strips. ::before starts one full
   width to the left of ::after, and both slide right by one width every 2s
   (mask-running). */
.layerA {
position: absolute;
inset: 0;
&::before {
mix-blend-mode: var(--blend-mode);
content: '';
display: block;
position: absolute;
width: 100%;
height: 100%;
left: -100%;
top: 0;
background-image: linear-gradient(
to right bottom,
transparent,
var(--color-1),
transparent,
var(--color-1),
transparent
);
animation: mask-running 2s linear infinite;
}
&::after {
mix-blend-mode: var(--blend-mode);
content: '';
display: block;
position: absolute;
width: 100%;
height: 100%;
left: 0;
top: 0;
background-image: linear-gradient(
to right bottom,
transparent,
var(--color-1),
transparent,
var(--color-1),
transparent
);
animation: mask-running 2s linear infinite;
}
}
/* Layer B: same two-strip construction as layer A, using --color-2, a
   "to right top" gradient and the reverse slide with a 3s period. */
.layerB {
position: absolute;
inset: 0;
&::before {
mix-blend-mode: var(--blend-mode);
content: '';
display: block;
position: absolute;
width: 100%;
height: 100%;
left: -100%;
top: 0;
background: linear-gradient(
to right top,
transparent,
var(--color-2),
transparent,
var(--color-2),
transparent
);
animation: mask-running-reverse 3s linear infinite;
}
&::after {
mix-blend-mode: var(--blend-mode);
content: '';
display: block;
position: absolute;
width: 100%;
height: 100%;
left: 0;
top: 0;
background: linear-gradient(
to right top,
transparent,
var(--color-2),
transparent,
var(--color-2),
transparent
);
animation: mask-running-reverse 3s linear infinite;
}
}
/* Layer C: half-opacity --color-3 strips sliding right with a 1s period */
.layerC {
position: absolute;
inset: 0;
opacity: 0.5;
&::before {
mix-blend-mode: var(--blend-mode);
content: '';
display: block;
position: absolute;
width: 100%;
height: 100%;
left: -100%;
top: 0;
background: linear-gradient(
to right top,
transparent,
var(--color-3),
transparent,
var(--color-3),
transparent
);
animation: mask-running 1s linear infinite;
}
&::after {
mix-blend-mode: var(--blend-mode);
content: '';
display: block;
position: absolute;
width: 100%;
height: 100%;
left: 0;
top: 0;
background: linear-gradient(
to right top,
transparent,
var(--color-3),
transparent,
var(--color-3),
transparent
);
animation: mask-running 1s linear infinite;
}
}

5
src/ui/motion-css/readme Normal file
View File

@@ -0,0 +1,5 @@
This is the CSS implementation of ai-motion.
Easy to use, but performance is terrible: it causes full-screen glitching in some browsers.
Use it only in a small area.

17
src/utils/assert.ts Normal file
View File

@@ -0,0 +1,17 @@
import chalk from 'chalk'
/**
 * Minimal assertion helper: does nothing when `condition` is truthy,
 * otherwise throws.
 * @param condition - Value that must be truthy
 * @param message - Error message; defaults to `'Assertion failed'`
 * @param silent - When truthy, skips the `console.error` output before throwing
 * @throws Error carrying the message when `condition` is falsy
 */
export function assert(condition: unknown, message?: string, silent?: boolean): asserts condition {
	if (condition) return

	const errorMessage = message ?? 'Assertion failed'
	if (!silent) {
		console.error(chalk.red(`❌ assert: ${errorMessage}`))
	}
	throw new Error(errorMessage)
}

128
src/utils/bus.ts Normal file
View File

@@ -0,0 +1,128 @@
/**
* Type-safe event bus for decoupling PageAgent and Panel
*/
import type { Step } from '@/ui/UIState'
/**
* Event mapping definitions
*
* Maps every built-in event name to the shape of its payload (`params`).
* Events whose `params` is `undefined` are emitted and handled without an
* argument.
* @note Event bus callbacks must be repeatable without errors
*/
export interface PageAgentEventMap {
// Panel control events
// call panel.show()
'panel:show': { params: undefined }
// call panel.hide()
'panel:hide': { params: undefined }
// call panel.reset()
'panel:reset': { params: undefined }
// call panel.update()
'panel:update': { params: Omit<Step, 'id' | 'stepNumber' | 'timestamp'> }
// call panel.expand()
'panel:expand': { params: undefined }
// call panel.collapse()
'panel:collapse': { params: undefined }
// PageAgent status events
// 'agent:beforeUpdate': { params: undefined }
// 'agent:afterUpdate': { params: undefined }
// 'agent:execute': { params: { task: string } }
// 'agent:done': { params: { text: string; success: boolean } }
// 'agent:paused': { params: undefined }
// 'agent:resumed': { params: undefined }
// 'agent:disposed': { params: undefined }
// 'agent:error': { params: { error: string | Error } }
// Task status change events
// 'task:start': { params: { task: string } }
// 'task:step': { params: Omit<AgentStep, 'id' | 'stepNumber' | 'timestamp'> }
// 'task:complete': { params: { text: string; success: boolean } }
// 'task:error': { params: { error: string | Error } }
// Index signature for dynamic event names
// [key: string]: { params: any }
}
/**
* Event handler type definitions
*
* Resolves to a zero-argument callback for events whose `params` is
* `undefined`, and to a callback receiving the payload otherwise.
*/
export type EventHandler<T extends keyof PageAgentEventMap> =
PageAgentEventMap[T]['params'] extends undefined
? () => void
: (params: PageAgentEventMap[T]['params']) => void
/**
* Async event handler type definitions
*
* Same shape as `EventHandler`, but the callback returns a `Promise<void>`.
*/
export type AsyncEventHandler<T extends keyof PageAgentEventMap> =
PageAgentEventMap[T]['params'] extends undefined
? () => Promise<void>
: (params: PageAgentEventMap[T]['params']) => Promise<void>
/**
 * Type-safe event bus
 * @note Mainly used to decouple logic and UI
 * @note All modules of a PageAgent instance share the same EventBus instance for communication
 * @note Use with caution if delivery guarantee is needed for logic communication
 * @note `on` `once` `emit` methods handle built-in events with type protection, use `addEventListener` for other events
 * @note `on` and `once` return a function that removes the listener again
 */
class EventBus extends EventTarget {
	/**
	 * Registers `handler` behind a DOM listener that unwraps the payload stored
	 * by `emit` (the first element of `detail`).
	 * @returns A function that removes the registered listener.
	 */
	#listen<T extends keyof PageAgentEventMap>(
		event: T,
		handler: EventHandler<T & keyof PageAgentEventMap>,
		once: boolean
	): () => void {
		const listener = (e: Event) => {
			const customEvent = e as CustomEvent
			const params = customEvent.detail?.[0]
			return handler(params)
		}
		this.addEventListener(event, listener, { once })
		// The wrapper is private to this call, so hand back the only way to detach it.
		return () => this.removeEventListener(event, listener)
	}

	/**
	 * Listen to built-in events
	 * @returns Unsubscribe function; calling it more than once is harmless.
	 */
	on<T extends keyof PageAgentEventMap>(
		event: T,
		handler: EventHandler<T & keyof PageAgentEventMap>
	): () => void {
		return this.#listen<T>(event, handler, false)
	}

	/**
	 * Listen to built-in events (one-time)
	 * @returns Unsubscribe function that cancels the listener if it has not fired yet.
	 */
	once<T extends keyof PageAgentEventMap>(
		event: T,
		handler: EventHandler<T & keyof PageAgentEventMap>
	): () => void {
		return this.#listen<T>(event, handler, true)
	}

	/**
	 * Emit built-in events
	 *
	 * The arguments are stored as an array in `detail`; listeners registered
	 * through `on`/`once` receive its first element.
	 */
	emit<T extends keyof PageAgentEventMap>(
		event: T,
		...args: PageAgentEventMap[T]['params'] extends undefined
			? []
			: [PageAgentEventMap[T]['params']]
	): void {
		this.dispatchEvent(new CustomEvent(event, { detail: args }))
	}
}
// Buses created so far, keyed by channel name.
const buses = new Map<string, EventBus>()

/**
 * Returns the event bus of `channel`, creating and caching it on first use.
 * Repeated calls with the same channel return the same instance.
 */
export function getEventBus(channel: string) {
	let bus = buses.get(channel)
	if (!bus) {
		bus = new EventBus()
		buses.set(channel, bus)
	}
	return bus
}
export type { EventBus }

110
src/utils/checkDarkMode.ts Normal file
View File

@@ -0,0 +1,110 @@
/**
 * Looks for conventional dark-mode markers on the page: a well-known class
 * name on `<html>` or `<body>`, or a `data-theme` attribute on `<html>` that
 * contains "dark" (case-insensitive).
 * @returns True if any marker is present.
 */
function hasDarkModeClass() {
	const darkClassNames = ['dark', 'dark-mode', 'theme-dark', 'night', 'night-mode']
	const root = document.documentElement
	const body = document.body

	// Check class names on <html> and <body>
	const hasDarkClass = darkClassNames.some(
		(name) => root.classList.contains(name) || body.classList.contains(name)
	)
	if (hasDarkClass) {
		return true
	}

	// Some sites use data attributes
	const theme = root.getAttribute('data-theme')
	return theme?.toLowerCase().includes('dark') ? true : false
}
/**
 * Extracts the red, green and blue channels from the first comma-separated
 * `rgb(...)` / `rgba(...)` occurrence in a string. Any alpha component is
 * ignored.
 * @param colorString - e.g. "rgb(34, 34, 34)" or "rgba(0, 0, 0, 0.5)"
 * @returns The integer channels, or null when the string has no such pattern.
 */
function parseRgbColor(colorString: string) {
	const channels = colorString.match(/rgba?\((\d+),\s*(\d+),\s*(\d+)/)
	if (channels === null) {
		return null // Not a valid rgb/rgba string
	}
	const [, red, green, blue] = channels
	return {
		r: Number.parseInt(red, 10),
		g: Number.parseInt(green, 10),
		b: Number.parseInt(blue, 10),
	}
}
/**
 * Decides whether a CSS colour counts as dark by comparing its weighted
 * channel sum (0.299 R + 0.587 G + 0.114 B) with a threshold.
 * @param colorString - CSS colour string such as "rgb(50, 50, 50)"
 * @param threshold - Value between 0 and 255; colours whose luminance is below it are dark. Default is 128.
 * @returns True for a dark colour; false for empty, transparent or unparseable input.
 */
function isColorDark(colorString: string, threshold = 128) {
	const isTransparent =
		!colorString || colorString === 'transparent' || colorString.startsWith('rgba(0, 0, 0, 0)')
	if (isTransparent) {
		return false // Transparent is not dark
	}

	const channels = parseRgbColor(colorString)
	if (!channels) {
		return false // Could not parse color
	}

	// Calculate perceived luminance using the standard formula
	const { r, g, b } = channels
	return 0.299 * r + 0.587 * g + 0.114 * b < threshold
}
/**
 * Determines whether the page background is dark from the computed
 * background colours of `<body>` and `<html>`.
 *
 * `<body>` decides when its colour is dark. `<html>` is consulted only when
 * the body background is transparent.
 * @returns True if the effective background colour is dark.
 */
function isBackgroundDark() {
	// We check both <html> and <body> because some pages set the color on <html>
	const rootStyle = window.getComputedStyle(document.documentElement)
	const bodyStyle = window.getComputedStyle(document.body)

	const rootBackground = rootStyle.backgroundColor
	const bodyBackground = bodyStyle.backgroundColor

	if (isColorDark(bodyBackground)) {
		return true
	}

	// The body's background might be transparent, in which case we should
	// fall back to the html element's background.
	const bodyIsTransparent =
		bodyBackground === 'transparent' || bodyBackground.startsWith('rgba(0, 0, 0, 0)')
	return bodyIsTransparent ? isColorDark(rootBackground) : false
}
/**
 * Heuristically determines whether the page currently uses a dark theme.
 *
 * Two strategies are tried in order, stopping at the first hit:
 * 1. common dark-mode classes / `data-theme` attribute,
 * 2. the computed background colour.
 * @returns True if the page is likely dark.
 */
export function isPageDark() {
	// @TODO add more checks here, e.g., analyzing text color,
	// or checking the background of major layout elements like <main> or #app.
	return hasDarkModeClass() || isBackgroundDark()
}

37
src/utils/errors.ts Normal file
View File

@@ -0,0 +1,37 @@
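/**
* # Error Handling
*
* @kind Abort Error
*
* No handling is required; logging it is enough.
*
* @kind Tool Execution Error
*
* An error thrown while a tool is executing. The arguments are valid but not necessarily
* sensible; the error may also be caused by changes elsewhere in the page environment.
* Retrying is pointless. The error must be shown on screen and returned to the model so
* that the model can deal with it in its next tool call.
*
* @kind Tool Input Error
*
* Very common with non-OpenAI models. The error must be shown on screen and the call retried.
* When it is caught:
* - InvalidToolInputError and NoSuchToolError are repaired automatically by ai-sdk
* - The docs do not say whether this counts toward the retry limit
* - The repair strategy can be customized
* - @see https://ai-sdk.dev/docs/ai-sdk-core/generating-structured-data#repairing-invalid-or-malformed-json
* - JSONParseError has to be caught where generateText is called
*
* Three possible retry strategies:
* 1. Call again and stress that the output must conform to the schema
* 2. Add the error to the history and let the model resolve it itself in the next call
* 3. Define a dedicated schema-repair model: send it the schema and the invalid data and
*    require it to return data that matches the schema
*
* If the error persists after retrying, end the task as failed.
*
* @kind LLM API Error
*
* Even when a service claims to be compatible with the OpenAI API, the format of its error
* responses is usually custom.
* It is therefore hard to determine the real error type from the response body, and hard to
* build thorough error handling.
* All that can be done is to catch the error and show it on screen.
* If ai-sdk recognizes the error, it retries on its own.
* If it does not, the task can only end as failed.
*/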

80
src/utils/index.ts Normal file
View File

@@ -0,0 +1,80 @@
/**
 * Wait until condition becomes true
 *
 * `check` is evaluated once immediately and then every 100ms.
 * @param check Condition to poll
 * @param timeout Timeout in milliseconds, default is one hour
 * @returns Resolves to `true` as soon as `check` returns true
 * @throws Error when the timeout elapses first; rethrows whatever `check` throws
 */
export async function waitUntil(
	check: () => boolean,
	timeout = 60 * 60 * 1000
): Promise<boolean> {
	if (check()) return true

	return new Promise((resolve, reject) => {
		const start = Date.now()
		const interval = setInterval(() => {
			try {
				if (check()) {
					clearInterval(interval)
					resolve(true)
				} else if (Date.now() - start > timeout) {
					clearInterval(interval)
					reject(new Error('Timeout waiting for condition to become true'))
				}
			} catch (error) {
				// Without this the interval would keep firing and the promise would never settle.
				clearInterval(interval)
				reject(error)
			}
		}, 100)
	})
}
//
/**
 * Shortens `text` to at most `maxLength` characters and appends `...` when
 * anything was cut off; shorter text is returned unchanged.
 */
export function truncate(text: string, maxLength: number): string {
	const fits = text.length <= maxLength
	return fits ? text : `${text.substring(0, maxLength)}...`
}
//
/**
 * Strips leading and trailing whitespace from every `\n`-separated line of
 * `text`, keeping the number of lines unchanged.
 */
export function trimLines(text: string): string {
	const cleaned: string[] = []
	for (const line of text.split('\n')) {
		cleaned.push(line.trim())
	}
	return cleaned.join('\n')
}
//
/**
 * Produces a short random base-36 string.
 *
 * When `existingIDs` is given, new candidates are drawn until one is found
 * that is not in the list.
 * @param existingIDs IDs the result must not collide with
 * @throws Error after more than 1000 colliding redraws
 */
export function randomID(existingIDs?: string[]): string {
	const draw = () => Math.random().toString(36).substring(2, 11)

	let candidate = draw()
	if (!existingIDs) {
		return candidate
	}

	const MAX_TRY = 1000
	for (let attempt = 1; existingIDs.includes(candidate); attempt++) {
		candidate = draw()
		if (attempt > MAX_TRY) {
			throw new Error('randomID: too many try')
		}
	}
	return candidate
}
//
// Window-level registry of the IDs handed out by `uid()`. It is created once
// and reused if it already exists on `window`
// (presumably so that several copies of this module share one list — confirm).
if (!window.__PAGE_AGENT_IDS__) {
window.__PAGE_AGENT_IDS__ = []
}
const ids = window.__PAGE_AGENT_IDS__
/**
 * Generates a random ID that does not collide with any ID previously returned
 * by this function, and records it in the shared registry.
 * @note Unique within this window.
 */
export function uid() {
	const fresh = randomID(ids)
	ids.push(fresh)
	return fresh
}