/**
 * Copyright (C) 2025 Alibaba Group Holding Limited
 * All rights reserved.
 */
import { tool } from 'ai'
import type { LanguageModelUsage, ToolSet } from 'ai'
import chalk from 'chalk'
import zod from 'zod'

import type { PageAgentConfig } from './config'
import { MACRO_TOOL_NAME, MAX_STEPS, VIEWPORT_EXPANSION } from './config/constants'
import * as dom from './dom'
import { FlatDomTree, InteractiveElementDomNode } from './dom/dom_tree/type'
import { getPageInfo } from './dom/getPageInfo'
import { I18n } from './i18n'
import { LLM } from './llms'
import { patchReact } from './patches/react'
import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
import { tools } from './tools'
import { Panel, getToolCompletedText, getToolExecutingText } from './ui/Panel'
import { SimulatorMask } from './ui/SimulatorMask'
import { trimLines, uid, waitUntil } from './utils'
import { assert } from './utils/assert'
import { getEventBus } from './utils/bus'

export type { PageAgentConfig }

/**
 * The reflection fields the model fills in on every step, next to the action
 * it chooses.
 */
export interface AgentBrain {
	// thinking?: string
	evaluation_previous_goal: string
	memory: string
	next_goal: string
}

/**
 * Record of one completed step: what the model reported, which tool it called
 * (with that tool's input and output) and the token usage of the LLM call.
 */
export interface AgentHistory {
	brain: AgentBrain
	action: {
		name: string
		input: any
		output: any
	}
	usage: LanguageModelUsage
}

/**
 * Outcome of {@link PageAgent.execute}.
 */
export interface ExecutionResult {
	/** Whether the task ended with a successful `done` action. */
	success: boolean
	/** Final text from the `done` action, or the failure message. */
	data: string
	/** Every step recorded during the run. */
	history: AgentHistory[]
}

/**
 * Drives an LLM through a step loop: each step sends the system prompt plus the
 * assembled history / task / page state, and the model answers by calling a
 * single "macro tool" that wraps every tool registered in {@link PageAgent.tools}.
 */
export class PageAgent extends EventTarget {
	config: PageAgentConfig
	id = uid()
	bus = getEventBus(this.id)
	i18n: I18n
	paused = false
	disposed = false
	task = ''

	#llm: LLM
	/** Seconds accumulated by consecutive `wait` tool calls; reset by any other tool. */
	#totalWaitTime = 0
	#abortController = new AbortController()

	/** Corresponds to eval_page in browser-use */
	flatTree: FlatDomTree | null = null

	/**
	 * All highlighted index-mapped interactive elements
	 * Corresponds to DOMState.selector_map in browser-use
	 */
	selectorMap = new Map<number, InteractiveElementDomNode>()

	/** highlight index -> element text */
	elementTextMap = new Map<number, string>()

	/** Corresponds to clickable_elements_to_string in browser-use */
	simplifiedHTML = ''

	/** last time the tree was updated */
	lastTimeUpdate = 0

	/** Corresponds to actions in browser-use */
	tools = new Map(tools)

	/** Fullscreen mask */
	mask = new SimulatorMask()

	/** Interactive panel */
	panel = new Panel(this)

	/** History records */
	history: AgentHistory[] = []

	constructor(config: PageAgentConfig = {}) {
		super()

		this.config = config
		this.#llm = new LLM(this.config, this.id)
		this.i18n = new I18n(this.config.language)

		patchReact(this)
	}

	/**
	 * Run `task` until the model calls the `done` action, the step budget
	 * (`MAX_STEPS`) is used up, or a step throws.
	 *
	 * @param task - Natural-language task; must be non-empty.
	 * @returns The outcome. Failures inside the step loop (including aborts) are
	 *   reported as `success: false` with the stringified error in `data`
	 *   rather than being rethrown.
	 * @throws Error when `task` is empty.
	 */
	async execute(task: string): Promise<ExecutionResult> {
		if (!task) throw new Error('Task is required')
		this.task = task

		// Show mask and panel
		this.mask.show()
		this.bus.emit('panel:show')
		this.bus.emit('panel:reset')
		this.bus.emit('panel:update', {
			type: 'input',
			displayText: task,
		})

		// Cancel whatever the previous controller was guarding and start this run
		// with a fresh, un-aborted signal.
		this.#abortController.abort()
		this.#abortController = new AbortController()

		this.history = []

		try {
			for (let step = 1; ; step++) {
				const { action } = await this.#runStep(step)

				// `done` is checked before the step budget so that a task finishing
				// on its last allowed step is reported with its real outcome.
				if (action.name === 'done') {
					const success = Boolean(action.input.success)
					const text = action.input.text || 'no text provided'
					console.log(chalk.green.bold('Task completed'), success, text)
					this.#onDone(text, success)
					return { success, data: text, history: this.history }
				}

				// The prompt announces "Step N of MAX_STEPS", so stop once exactly
				// MAX_STEPS steps have run.
				if (step >= MAX_STEPS) {
					const message = 'Step count exceeded maximum limit'
					this.#onDone(message, false)
					return { success: false, data: message, history: this.history }
				}
			}
		} catch (error: unknown) {
			console.error('Task failed', error)
			const message = String(error)
			this.#onDone(message, false)
			return { success: false, data: message, history: this.history }
		}
	}

	/**
	 * Run one think/act step inside its own console group and append the result
	 * to `this.history`.
	 *
	 * @param stepNumber - 1-based step number, used only for the console group label.
	 * @returns The history entry that was appended.
	 * @throws Error('AbortError') when the current abort signal is already aborted;
	 *   anything thrown by the LLM invocation also propagates.
	 */
	async #runStep(stepNumber: number): Promise<AgentHistory> {
		console.group(`step: ${stepNumber}`)
		try {
			// abort
			if (this.#abortController.signal.aborted) throw new Error('AbortError')
			// pause
			await waitUntil(() => !this.paused)

			// Update status to thinking
			console.log(chalk.blue('Thinking...'))
			this.bus.emit('panel:update', {
				type: 'thinking',
				displayText: this.i18n.t('ui.panel.thinking'),
			})

			const result = await this.#llm.invoke(
				[
					{
						role: 'system',
						content: this.#getSystemPrompt(),
					},
					{
						role: 'user',
						content: this.#assembleUserPrompt(),
					},
				],
				// tools,
				this.#packMacroTool(),
				this.#abortController.signal
			)

			const toolResult = result.toolResult
			const input = toolResult.input
			const output = toolResult.output

			const brain = {
				thinking: input.thinking,
				evaluation_previous_goal: input.evaluation_previous_goal,
				memory: input.memory,
				next_goal: input.next_goal,
			}

			// `action` is a single-key object: { [toolName]: toolInput }
			const actionName = Object.keys(input.action)[0]
			const action = {
				name: actionName,
				input: input.action[actionName],
				output: output,
			}

			const record: AgentHistory = { brain, action, usage: result.usage }
			this.history.push(record)

			console.log(chalk.green('Step finished:'), actionName)
			return record
		} finally {
			// Close the group on failure too, so later console output is not nested
			// under a step that never finished.
			console.groupEnd()
		}
	}

	/**
	 * Merge all tools into a single MacroTool with the following input:
	 * - thinking: string
	 * - evaluation_previous_goal: string
	 * - memory: string
	 * - next_goal: string
	 * - action: { toolName: toolInput }
	 * where action must be selected from tools defined in this.tools
	 *
	 * @returns A tool set containing only the macro tool, keyed by `MACRO_TOOL_NAME`.
	 */
	#packMacroTool(): ToolSet {
		// Local names deliberately differ from the imported `tool` / `tools`.
		const registry = this.tools

		// discriminated version
		// @note Success rate ~0, model seems unable to understand discriminated union
		// // Create discriminated union schemas from tools
		// const actionSchemas = Array.from(tools.entries()).map(([toolName, tool]) => {
		// 	return zod.object({
		// 		name: zod.literal(toolName),
		// 		input: tool.inputSchema,
		// 	})
		// })
		// const actionSchema = zod.discriminatedUnion('name', actionSchemas as any)

		// union version
		const actionSchemas = Array.from(registry.entries()).map(([toolName, definition]) => {
			return zod.object({
				[toolName]: definition.inputSchema,
			})
		})

		// Ensure at least one tool exists
		assert(actionSchemas.length, 'No tools available to create macro tool')

		const actionSchema = zod.union(actionSchemas)

		return {
			[MACRO_TOOL_NAME]: tool({
				// description: 'Output the result of the agent',
				inputSchema: zod.object({
					// thinking: zod.string().optional(),
					evaluation_previous_goal: zod.string().optional(),
					memory: zod.string().optional(),
					next_goal: zod.string().optional(),
					action: actionSchema,
				}),
				execute: async (input, options) => {
					// abort
					if (this.#abortController.signal.aborted) throw new Error('AbortError')
					// pause
					await waitUntil(() => !this.paused)

					console.log(chalk.blue.bold('MacroTool execute'), input)
					const action = input.action!

					// `action` is a single-key object: { [toolName]: toolInput }
					const toolName = Object.keys(action)[0]
					const toolInput = action[toolName]

					const brainText = trimLines(`✅: ${input.evaluation_previous_goal}
						💾: ${input.memory}
						🎯: ${input.next_goal}
					`)

					console.log(brainText)
					this.bus.emit('panel:update', {
						type: 'thinking',
						displayText: brainText,
					})

					// Find the corresponding tool
					const targetTool = registry.get(toolName)
					assert(
						targetTool,
						`Tool ${toolName} not found. (@note should have been caught before this!!!)`
					)

					console.log(chalk.blue.bold(`Executing tool: ${toolName}`), toolInput, options)
					this.bus.emit('panel:update', {
						type: 'tool_executing',
						toolName,
						toolArgs: toolInput,
						displayText: getToolExecutingText(toolName, toolInput, this.i18n),
					})

					const startTime = Date.now()

					// Execute tool with this agent as `this`, passing options parameter
					let result = await targetTool.execute!.bind(this)(toolInput, options)

					const duration = Date.now() - startTime
					console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result)

					if (toolName === 'wait') {
						// Tell the model how long it has been waiting in a row so it
						// does not keep stalling.
						this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000)
						result += `\n You have waited ${this.#totalWaitTime} seconds accumulatively.`
						if (this.#totalWaitTime >= 3)
							result += '\nDo NOT wait any longer unless you have a good reason.\n'
					} else {
						// For other tools, reset wait time
						this.#totalWaitTime = 0
					}

					// Briefly display execution result
					const displayResult = getToolCompletedText(toolName, toolInput, this.i18n)
					if (displayResult)
						this.bus.emit('panel:update', {
							type: 'tool_executing',
							toolName,
							toolArgs: toolInput,
							toolResult: result,
							displayText: displayResult,
							duration,
						})

					// Wait a moment to let user see the result
					await new Promise((resolve) => setTimeout(resolve, 100))

					return result
				},
			}),
		}
	}

	/**
	 * Get system prompt, dynamically replace language settings based on configured language
	 *
	 * Only the first `Default working language: **...**` occurrence is rewritten;
	 * any configured language other than `'zh-CN'` selects English.
	 */
	#getSystemPrompt(): string {
		const targetLanguage = this.config.language === 'zh-CN' ? '中文' : 'English'

		return SYSTEM_PROMPT.replace(
			/Default working language: \*\*.*?\*\*/,
			`Default working language: **${targetLanguage}**`
		)
	}

	/**
	 * Build the user message for the next step: one entry per recorded step,
	 * then the task with step counter and timestamp, then the current browser
	 * state. The whole text is passed through `trimLines` before being returned.
	 */
	#assembleUserPrompt(): string {
		let prompt = ''

		// Agent history: one entry per completed step
		prompt += '\n'

		for (const entry of this.history) {
			prompt += `
				Evaluation of Previous Step: ${entry.brain.evaluation_previous_goal}
				Memory: ${entry.brain.memory}
				Next Goal: ${entry.brain.next_goal}
				Action Results: ${entry.action.output}
			`
		}

		prompt += '\n\n'

		// Agent state: user request and step info
		prompt += `
			${this.task}
			Step ${this.history.length + 1} of ${MAX_STEPS} max possible steps
			Current date and time: ${new Date().toISOString()}
		`

		// Browser state (also refreshes the DOM tree and highlights)
		prompt += this.#getBrowserState()

		return trimLines(prompt)
	}

	/**
	 * Finish the current run: remove highlights, show the final text in the panel
	 * (as output or as error), emit the "completed" panel update, hide the mask
	 * and abort the run's controller.
	 *
	 * @param text - Text shown in the panel.
	 * @param success - Selects the `output` (true) or `error` (false) panel entry.
	 */
	#onDone(text: string, success = true) {
		dom.cleanUpHighlights()

		// Update panel status
		this.bus.emit('panel:update', {
			type: success ? 'output' : 'error',
			displayText: text,
		})

		// Task completed
		this.bus.emit('panel:update', {
			type: 'completed',
			displayText: this.i18n.t('ui.panel.taskCompleted'),
		})

		this.mask.hide()

		this.#abortController.abort()
	}

	/**
	 * Describe the current page for the model: title/URL, viewport and scroll
	 * metrics, and the simplified list of interactive elements, framed by
	 * "pixels above/below" hints or start/end-of-page markers.
	 *
	 * Side effect: refreshes the DOM tree via `#updateTree()`. Page metrics are
	 * read before that refresh.
	 */
	#getBrowserState(): string {
		const pageUrl = window.location.href
		const pageTitle = document.title
		const pi = getPageInfo()

		this.#updateTree()

		// VIEWPORT_EXPANSION === -1 is treated as "whole page": no scroll hints.
		const isFullPage = VIEWPORT_EXPANSION === -1

		let prompt = trimLines(`
			Current Page: [${pageTitle}](${pageUrl})
			Page info: ${pi.viewport_width}x${pi.viewport_height}px viewport, ${pi.page_width}x${pi.page_height}px total page size, ${pi.pages_above.toFixed(1)} pages above, ${pi.pages_below.toFixed(1)} pages below, ${pi.total_pages.toFixed(1)} total pages, at ${(pi.current_page_position * 100).toFixed(0)}% of page
			${isFullPage ? 'Interactive elements from top layer of the current page (full page):' : 'Interactive elements from top layer of the current page inside the viewport:'}
		`)

		// Page header info (offsets of 4px or less count as "at the edge")
		const has_content_above = pi.pixels_above > 4
		if (has_content_above && !isFullPage) {
			prompt += `... ${pi.pixels_above} pixels above (${pi.pages_above.toFixed(1)} pages) - scroll to see more ...\n`
		} else {
			prompt += `[Start of page]\n`
		}

		// Current viewport info
		prompt += this.simplifiedHTML
		prompt += `\n`

		// Page footer info
		const has_content_below = pi.pixels_below > 4
		if (has_content_below && !isFullPage) {
			prompt += `... ${pi.pixels_below} pixels below (${pi.pages_below.toFixed(1)} pages) - scroll to see more ...\n`
		} else {
			prompt += `[End of page]\n`
		}

		prompt += `\n`

		return prompt
	}

	/**
	 * Update document tree
	 *
	 * Rebuilds `flatTree`, `simplifiedHTML`, `selectorMap` and `elementTextMap`,
	 * dispatching `beforeUpdate` / `afterUpdate` events around the work.
	 */
	#updateTree() {
		this.dispatchEvent(new Event('beforeUpdate'))

		this.lastTimeUpdate = Date.now()

		dom.cleanUpHighlights()

		// The mask is made click-through while the tree is collected; restore it
		// even if collection throws so the mask is not left click-through.
		this.mask.wrapper.style.pointerEvents = 'none'
		try {
			this.flatTree = dom.getFlatTree({
				...this.config,
				interactiveBlacklist: [
					...(this.config.interactiveBlacklist || []),
					...document.querySelectorAll('[data-page-agent-not-interactive]').values(),
				],
			})
		} finally {
			this.mask.wrapper.style.pointerEvents = 'auto'
		}

		this.simplifiedHTML = dom.flatTreeToString(this.flatTree, this.config.include_attributes)

		this.selectorMap.clear()
		this.selectorMap = dom.getSelectorMap(this.flatTree)

		this.elementTextMap.clear()
		this.elementTextMap = dom.getElementTextMap(this.simplifiedHTML)

		this.dispatchEvent(new Event('afterUpdate'))
	}

	/**
	 * Release everything the agent holds: highlights, cached tree and maps,
	 * panel, mask and history, then abort the current controller with the
	 * reason `'PageAgent disposed'`.
	 */
	dispose() {
		console.log('Disposing PageAgent...')
		this.disposed = true
		dom.cleanUpHighlights()
		this.flatTree = null
		this.selectorMap.clear()
		this.elementTextMap.clear()
		this.panel.dispose()
		this.mask.dispose()
		this.history = []
		this.#abortController.abort('PageAgent disposed')
	}
}