Files
page-agent/src/PageAgent.ts
2025-09-29 16:33:15 +08:00

525 lines
14 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved.
*/
import { tool } from 'ai'
import type { LanguageModelUsage, ToolSet } from 'ai'
import chalk from 'chalk'
import zod from 'zod'
import type { PageAgentConfig } from './config'
import { MACRO_TOOL_NAME, MAX_STEPS, VIEWPORT_EXPANSION } from './config/constants'
import * as dom from './dom'
import { FlatDomTree, InteractiveElementDomNode } from './dom/dom_tree/type'
import { getPageInfo } from './dom/getPageInfo'
import { I18n } from './i18n'
import { LLM } from './llms'
import { patchReact } from './patches/react'
import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
import { tools } from './tools'
import { Panel, getToolCompletedText, getToolExecutingText } from './ui/Panel'
import { SimulatorMask } from './ui/SimulatorMask'
import { trimLines, uid, waitUntil } from './utils'
import { assert } from './utils/assert'
import { getEventBus } from './utils/bus'
export type { PageAgentConfig }
/**
 * The reasoning fields the model reports alongside every action.
 * They are stored per step and replayed to the model as `<agent_history>`.
 */
export interface AgentBrain {
  // thinking?: string
  /** The model's assessment of how the previous step went. */
  evaluation_previous_goal: string
  /** Notes the model wants carried forward to later steps. */
  memory: string
  /** What the model intends to achieve with the current action. */
  next_goal: string
}
/**
 * Record of a single agent step: what the model reasoned, which action it
 * chose, and the usage reported for the LLM call.
 */
export interface AgentHistory {
  /** Reasoning fields returned by the model for this step. */
  brain: AgentBrain
  /** The action selected by the model and what executing it produced. */
  action: {
    /** Name of the tool that was selected. */
    name: string
    /** Input the model supplied for that tool. */
    input: any
    /** Value returned by the tool's execution. */
    output: any
  }
  /** Usage figures reported for the LLM call of this step. */
  usage: LanguageModelUsage
}
/**
 * Final outcome of a task run.
 */
export interface ExecutionResult {
  /** Whether the task finished successfully. */
  success: boolean
  /** Result text on success, or the failure description otherwise. */
  data: string
  /** Every step recorded during the run, in execution order. */
  history: AgentHistory[]
}
/**
 * In-page agent that repeatedly asks an LLM for the next action, executes
 * that action against the current document, and records every step until
 * the model calls `done`, the step limit is reached, or an error occurs.
 *
 * Dispatches plain `beforeUpdate` / `afterUpdate` events around every DOM
 * tree refresh.
 */
export class PageAgent extends EventTarget {
  /** Configuration supplied at construction time. */
  config: PageAgentConfig
  /** Unique id of this instance; used to obtain its event bus and passed to the LLM. */
  id = uid()
  /** Event bus obtained for this instance's id; panel updates are emitted on it. */
  bus = getEventBus(this.id)
  /** Translator for user-facing strings. */
  i18n: I18n
  /** While true, the step loop and tool execution wait before continuing. */
  paused = false
  /** Set by `dispose()`; a disposed agent refuses to execute new tasks. */
  disposed = false
  /** The task currently (or most recently) being executed. */
  task = ''

  #llm: LLM
  /** Accumulated seconds spent in consecutive `wait` tool calls. */
  #totalWaitTime = 0
  /** Cancels the in-flight run; replaced at the start of every `execute()`. */
  #abortController = new AbortController()

  /** Corresponds to eval_page in browser-use */
  flatTree: FlatDomTree | null = null
  /**
   * All highlighted index-mapped interactive elements
   * Corresponds to DOMState.selector_map in browser-use
   */
  selectorMap = new Map<number, InteractiveElementDomNode>()
  /** highlight index -> element text */
  elementTextMap = new Map<number, string>()
  /** Corresponds to clickable_elements_to_string in browser-use */
  simplifiedHTML = '<EMPTY>'
  /** last time the tree was updated */
  lastTimeUpdate = 0
  /** Corresponds to actions in browser-use */
  tools = new Map(tools)
  /** Fullscreen mask */
  mask = new SimulatorMask()
  /** Interactive panel */
  panel = new Panel(this)
  /** History records */
  history: AgentHistory[] = []

  /**
   * @param config Agent configuration; defaults to an empty object.
   */
  constructor(config: PageAgentConfig = {}) {
    super()
    this.config = config
    this.#llm = new LLM(this.config, this.id)
    this.i18n = new I18n(this.config.language)
    patchReact(this)
  }

  /**
   * Run a task to completion.
   *
   * Shows the mask and panel, cancels any previous run, then executes steps
   * until the model selects the `done` action or `MAX_STEPS` steps have run.
   * Failures inside the loop are caught and reported through the returned
   * result rather than thrown.
   *
   * @param task Natural-language description of what to do.
   * @returns The outcome together with the recorded step history.
   * @throws Error if `task` is empty or the agent has been disposed.
   */
  async execute(task: string): Promise<ExecutionResult> {
    if (!task) throw new Error('Task is required')
    // dispose() tears down the mask and panel that are used below.
    if (this.disposed) throw new Error('PageAgent has been disposed')

    this.task = task

    // Show mask and panel
    this.mask.show()
    this.bus.emit('panel:show')
    this.bus.emit('panel:reset')
    this.bus.emit('panel:update', {
      type: 'input',
      displayText: task,
    })

    // Cancel whatever the previous run may still be doing and start fresh.
    this.#abortController.abort()
    this.#abortController = new AbortController()
    this.history = []

    try {
      for (let step = 1; ; step++) {
        const { action } = await this.#runStep(step)

        // Check `done` before the step limit so that finishing on the very
        // last allowed step still counts as a completed task.
        if (action.name === 'done') {
          const success = Boolean(action.input?.success)
          const text = action.input?.text || 'no text provided'
          console.log(chalk.green.bold('Task completed'), success, text)
          this.#onDone(text, success)
          return { success, data: text, history: this.history }
        }

        // At most MAX_STEPS steps are executed, matching the
        // "Step N of MAX_STEPS" line sent to the model.
        if (step >= MAX_STEPS) {
          const message = 'Step count exceeded maximum limit'
          this.#onDone(message, false)
          return { success: false, data: message, history: this.history }
        }
      }
    } catch (error: unknown) {
      console.error('Task failed', error)
      const message = String(error)
      this.#onDone(message, false)
      return { success: false, data: message, history: this.history }
    }
  }

  /**
   * Execute one step: ask the LLM for the next macro-tool call (which also
   * executes the selected tool), then append the outcome to `history`.
   *
   * The console group opened for the step is always closed, even when the
   * step throws.
   *
   * @param stepNumber 1-based step number, used for logging only.
   * @returns The history record that was appended.
   * @throws Error('AbortError') if the run was aborted before the step started.
   */
  async #runStep(stepNumber: number): Promise<AgentHistory> {
    console.group(`step: ${stepNumber}`)
    try {
      // abort
      if (this.#abortController.signal.aborted) throw new Error('AbortError')
      // pause
      await waitUntil(() => !this.paused)

      // Update status to thinking
      console.log(chalk.blue('Thinking...'))
      this.bus.emit('panel:update', {
        type: 'thinking',
        displayText: this.i18n.t('ui.panel.thinking'),
      })

      const result = await this.#llm.invoke(
        [
          { role: 'system', content: this.#getSystemPrompt() },
          { role: 'user', content: this.#assembleUserPrompt() },
        ],
        this.#packMacroTool(),
        this.#abortController.signal
      )

      const { input, output } = result.toolResult

      // The schema marks these fields optional; normalise to '' so the
      // history never contains (and the prompt never prints) `undefined`.
      const brain: AgentBrain = {
        evaluation_previous_goal: input.evaluation_previous_goal ?? '',
        memory: input.memory ?? '',
        next_goal: input.next_goal ?? '',
      }

      // `action` has the shape { [toolName]: toolInput }.
      const actionName = Object.keys(input.action)[0]
      const record: AgentHistory = {
        brain,
        action: {
          name: actionName,
          input: input.action[actionName],
          output,
        },
        usage: result.usage,
      }
      this.history.push(record)

      console.log(chalk.green('Step finished:'), actionName)
      return record
    } finally {
      console.groupEnd()
    }
  }

  /**
   * Merge all tools into a single MacroTool with the following input:
   * - evaluation_previous_goal: string
   * - memory: string
   * - next_goal: string
   * - action: { toolName: toolInput }
   * where action must be selected from tools defined in this.tools
   * (a `thinking` field is currently disabled in the schema).
   *
   * @topic Whether to merge everything into a single tool
   * @facts
   * - We need the model to return its reasoning (evaluation/memory/goal, etc.) at every step
   * - browser-use merges everything into one huge tool
   * ```json
   * {
   *   "memory": "...",
   *   "goal": "...",
   *   "actions": [
   *     {
   *       "name": "...",
   *       "args": "..."
   *     }
   *     // ...
   *   ]
   * }
   * ```
   * - qwen currently requires the function name to be specified to guarantee a tool call
   * @reasoning
   * - The system should not be designed around qwen's shortcomings
   * - More complex tools are more error-prone
   * - Separate tools make it easier to use ai-sdk's retry mechanism and to handle errors
   * - This data cannot be produced in an extra step: performance would be too poor,
   *   and things like the goal must be generated together with the call
   * @options
   * - Plan @A
   *   - Do exactly what browser-use does: merge into one big tool that must be called every time
   *   - Makes the tool definition very complex and raises the error rate
   * - Plan @B
   *   - Call two tools each time, one of which outputs the reasoning
   *   - Hard to enforce such a complex rule through prompting
   * - Plan @C
   *   - Automatically add fixed reasoning/memory/goal inputs to every tool, and
   *     automatically intercept and extract them
   *   - Makes the tool definitions very long
   * @conclusion
   * - Use @A
   *
   * @returns A tool set containing only the macro tool.
   */
  #packMacroTool(): ToolSet {
    const registeredTools = this.tools

    // @note A discriminated-union schema ({ name: literal, input }) was tried
    // first; its success rate was ~0, the model seems unable to understand it.
    // A plain union of single-key objects ({ [toolName]: toolInput }) is used instead.
    const actionSchemas = Array.from(registeredTools.entries()).map(([toolName, toolDef]) =>
      zod.object({
        [toolName]: toolDef.inputSchema,
      })
    )
    const actionSchema = zod.union(actionSchemas)

    return {
      [MACRO_TOOL_NAME]: tool({
        inputSchema: zod.object({
          // thinking: zod.string().optional(),
          evaluation_previous_goal: zod.string().optional(),
          memory: zod.string().optional(),
          next_goal: zod.string().optional(),
          action: actionSchema,
        }),
        execute: async (input, options) => {
          // abort
          if (this.#abortController.signal.aborted) throw new Error('AbortError')
          // pause
          await waitUntil(() => !this.paused)

          console.log(chalk.blue.bold('MacroTool execute'), input)

          const action = input.action!
          const toolName = Object.keys(action)[0]
          const toolInput = action[toolName]

          // Optional fields fall back to '' so the panel never shows "undefined".
          const brain = trimLines(
            [
              `✅: ${input.evaluation_previous_goal ?? ''}`,
              `💾: ${input.memory ?? ''}`,
              `🎯: ${input.next_goal ?? ''}`,
              '',
            ].join('\n')
          )
          console.log(brain)
          this.bus.emit('panel:update', {
            type: 'thinking',
            displayText: brain,
          })

          // Find the corresponding tool
          const selectedTool = registeredTools.get(toolName)
          assert(
            selectedTool,
            `Tool ${toolName} not found. (@note should have been caught before this!!!)`
          )

          console.log(chalk.blue.bold(`Executing tool: ${toolName}`), toolInput, options)
          this.bus.emit('panel:update', {
            type: 'tool_executing',
            toolName,
            toolArgs: toolInput,
            displayText: getToolExecutingText(toolName, toolInput, this.i18n),
          })

          const startTime = Date.now()
          // Execute tool with this agent as `this`, passing options through
          let result = await selectedTool.execute!.call(this, toolInput, options)
          const duration = Date.now() - startTime
          console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result)

          if (toolName === 'wait') {
            // Tell the model how long it has been waiting across consecutive
            // `wait` calls, and discourage further waiting past 3 seconds.
            this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000)
            result += `\n<sys> You have waited ${this.#totalWaitTime} seconds accumulatively.`
            if (this.#totalWaitTime >= 3)
              result += '\nDo NOT wait any longer unless you have a good reason.\n'
            result += '</sys>'
          } else {
            // For other tools, reset wait time
            this.#totalWaitTime = 0
          }

          // Briefly display execution result
          const displayResult = getToolCompletedText(toolName, toolInput, this.i18n)
          if (displayResult)
            this.bus.emit('panel:update', {
              type: 'tool_executing',
              toolName,
              toolArgs: toolInput,
              toolResult: result,
              displayText: displayResult,
              duration,
            })

          // Wait a moment to let user see the result
          await new Promise((resolve) => setTimeout(resolve, 100))

          return result
        },
      }),
    }
  }

  /**
   * Get system prompt, dynamically replace language settings based on configured language
   *
   * @returns The system prompt with its "Default working language" line set
   * to Chinese for `zh-CN` and English for every other language.
   */
  #getSystemPrompt(): string {
    const targetLanguage = this.config.language === 'zh-CN' ? '中文' : 'English'
    return SYSTEM_PROMPT.replace(
      /Default working language: \*\*.*?\*\*/,
      `Default working language: **${targetLanguage}**`
    )
  }

  /**
   * Build the user prompt for the next LLM call. It consists of, in order:
   * `<agent_history>` (one `<step_N>` per recorded step), `<agent_state>`
   * (user request and step info) and `<browser_state>`.
   *
   * Lines are joined explicitly so the prompt text does not depend on how
   * this source file is indented.
   */
  #assembleUserPrompt(): string {
    let prompt = ''

    // <agent_history>
    //  - <step_>
    prompt += '<agent_history>\n'
    this.history.forEach((entry, index) => {
      const stepTag = `step_${index + 1}`
      prompt += [
        `<${stepTag}>`,
        `Evaluation of Previous Step: ${entry.brain.evaluation_previous_goal}`,
        `Memory: ${entry.brain.memory}`,
        `Next Goal: ${entry.brain.next_goal}`,
        `Action Results: ${entry.action.output}`,
        `</${stepTag}>`,
        '',
      ].join('\n')
    })
    prompt += '</agent_history>\n\n'

    // <agent_state>
    //  - <user_request>
    //  - <step_info>
    prompt += [
      '<agent_state>',
      '<user_request>',
      `${this.task}`,
      '</user_request>',
      '<step_info>',
      `Step ${this.history.length + 1} of ${MAX_STEPS} max possible steps`,
      `Current date and time: ${new Date().toISOString()}`,
      '</step_info>',
      '</agent_state>',
      '',
    ].join('\n')

    // <browser_state>
    prompt += this.#getBrowserState()

    return trimLines(prompt)
  }

  /**
   * Finish the current run: remove highlights, report the final text and the
   * completion marker to the panel, hide the mask and abort the controller.
   *
   * @param text Final message shown in the panel.
   * @param success Selects the `output` (true) or `error` (false) panel entry.
   */
  #onDone(text: string, success = true) {
    dom.cleanUpHighlights()

    // Update panel status
    this.bus.emit('panel:update', {
      type: success ? 'output' : 'error',
      displayText: text,
    })

    // Task completed
    this.bus.emit('panel:update', {
      type: 'completed',
      displayText: this.i18n.t('ui.panel.taskCompleted'),
    })

    this.mask.hide()
    this.#abortController.abort()
  }

  /**
   * Refresh the DOM tree and describe the current page for the model:
   * title, URL, viewport/page metrics and the simplified interactive HTML,
   * framed by start/end-of-page or "scroll to see more" markers.
   *
   * @returns The `<browser_state>` section of the user prompt.
   */
  #getBrowserState(): string {
    const pageUrl = window.location.href
    const pageTitle = document.title
    const pi = getPageInfo()
    // VIEWPORT_EXPANSION === -1 means the whole page is listed, not just the viewport.
    const isFullPage = VIEWPORT_EXPANSION === -1

    this.#updateTree()

    const elementsHeading = isFullPage
      ? 'Interactive elements from top layer of the current page (full page):'
      : 'Interactive elements from top layer of the current page inside the viewport:'

    let prompt = trimLines(
      [
        '<browser_state>',
        `Current Page: [${pageTitle}](${pageUrl})`,
        `Page info: ${pi.viewport_width}x${pi.viewport_height}px viewport, ${pi.page_width}x${pi.page_height}px total page size, ${pi.pages_above.toFixed(1)} pages above, ${pi.pages_below.toFixed(1)} pages below, ${pi.total_pages.toFixed(1)} total pages, at ${(pi.current_page_position * 100).toFixed(0)}% of page`,
        `${elementsHeading}`,
        '',
      ].join('\n')
    )

    // Page header info (offsets of 4px or less are treated as "at the edge")
    const hasContentAbove = pi.pixels_above > 4
    if (hasContentAbove && !isFullPage) {
      prompt += `... ${pi.pixels_above} pixels above (${pi.pages_above.toFixed(1)} pages) - scroll to see more ...\n`
    } else {
      prompt += `[Start of page]\n`
    }

    // Current viewport info
    prompt += this.simplifiedHTML
    prompt += `\n`

    // Page footer info
    const hasContentBelow = pi.pixels_below > 4
    if (hasContentBelow && !isFullPage) {
      prompt += `... ${pi.pixels_below} pixels below (${pi.pages_below.toFixed(1)} pages) - scroll to see more ...\n`
    } else {
      prompt += `[End of page]\n`
    }

    prompt += `</browser_state>\n`
    return prompt
  }

  /**
   * Run `work` while the mask ignores pointer events, and always switch the
   * mask back to `pointer-events: auto` afterwards, even if `work` throws.
   */
  #withMaskPointerEventsDisabled<T>(work: () => T): T {
    const maskStyle = this.mask.wrapper.style
    maskStyle.pointerEvents = 'none'
    try {
      return work()
    } finally {
      maskStyle.pointerEvents = 'auto'
    }
  }

  /**
   * Update document tree
   *
   * Rebuilds `flatTree`, `simplifiedHTML`, `selectorMap` and `elementTextMap`
   * from the live document, dispatching `beforeUpdate` / `afterUpdate`
   * events around the refresh.
   */
  #updateTree() {
    this.dispatchEvent(new Event('beforeUpdate'))
    this.lastTimeUpdate = Date.now()

    dom.cleanUpHighlights()

    // NOTE(review): pointer events are disabled on the mask while the tree is
    // extracted, presumably so the mask does not interfere with extraction.
    const flatTree = this.#withMaskPointerEventsDisabled(() =>
      dom.getFlatTree({
        ...this.config,
        interactiveBlacklist: [
          ...(this.config.interactiveBlacklist || []),
          // Elements explicitly opted out via attribute
          ...document.querySelectorAll('[data-page-agent-not-interactive]').values(),
        ],
      })
    )
    this.flatTree = flatTree

    this.simplifiedHTML = dom.flatTreeToString(flatTree, this.config.include_attributes)

    this.selectorMap.clear()
    this.selectorMap = dom.getSelectorMap(flatTree)

    this.elementTextMap.clear()
    this.elementTextMap = dom.getElementTextMap(this.simplifiedHTML)

    this.dispatchEvent(new Event('afterUpdate'))
  }

  /**
   * Release everything held by the agent: highlights, cached tree and maps,
   * panel, mask and history, and abort any in-flight run. After this call
   * `execute()` rejects.
   */
  dispose() {
    console.log('Disposing PageAgent...')
    this.disposed = true
    dom.cleanUpHighlights()
    this.flatTree = null
    this.selectorMap.clear()
    this.elementTextMap.clear()
    this.panel.dispose()
    this.mask.dispose()
    this.history = []
    this.#abortController.abort('PageAgent disposed')
  }
}