refactor: mv getBrowserState to Controller; simplify Agent

This commit is contained in:
Simon
2026-01-15 20:17:50 +08:00
parent aefc3cfb89
commit b4c82b7833
3 changed files with 76 additions and 89 deletions

View File

@@ -110,7 +110,6 @@ export class PageAgent extends EventTarget {
taskId = '' taskId = ''
#llm: LLM #llm: LLM
#totalWaitTime = 0
#abortController = new AbortController() #abortController = new AbortController()
#llmRetryListener: ((e: Event) => void) | null = null #llmRetryListener: ((e: Event) => void) | null = null
#llmErrorListener: ((e: Event) => void) | null = null #llmErrorListener: ((e: Event) => void) | null = null
@@ -119,6 +118,9 @@ export class PageAgent extends EventTarget {
/** PageController for DOM operations */ /** PageController for DOM operations */
pageController: PageController pageController: PageController
/** Accumulated wait time in seconds, used by wait tool to track total waiting */
totalWaitTime = 0
/** History event stream */ /** History event stream */
history: HistoryEvent[] = [] history: HistoryEvent[] = []
@@ -373,20 +375,14 @@ export class PageAgent extends EventTarget {
const startTime = Date.now() const startTime = Date.now()
// Execute tool, bind `this` to PageAgent // Execute tool, bind `this` to PageAgent
let result = await tool.execute.bind(this)(toolInput) const result = await tool.execute.bind(this)(toolInput)
const duration = Date.now() - startTime const duration = Date.now() - startTime
console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result) console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result)
if (toolName === 'wait') { // Reset wait time for non-wait tools
this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000) if (toolName !== 'wait') {
result += `\n<sys> You have waited ${this.#totalWaitTime} seconds accumulatively.` this.totalWaitTime = 0
if (this.#totalWaitTime >= 3)
result += '\nDo NOT wait any longer unless you have a good reason.\n'
result += '</sys>'
} else {
// For other tools, reset wait time
this.#totalWaitTime = 0
} }
// Briefly display execution result // Briefly display execution result
@@ -539,51 +535,22 @@ export class PageAgent extends EventTarget {
} }
async #getBrowserState(): Promise<string> { async #getBrowserState(): Promise<string> {
const pageUrl = await this.pageController.getCurrentUrl() const state = await this.pageController.getBrowserState()
const pageTitle = await this.pageController.getPageTitle()
const pi = await this.pageController.getPageInfo()
const viewportExpansion = await this.pageController.getViewportExpansion()
await this.pageController.updateTree()
let simplifiedHTML = await this.pageController.getSimplifiedHTML()
let content = state.content
if (this.config.transformPageContent) { if (this.config.transformPageContent) {
simplifiedHTML = await this.config.transformPageContent(simplifiedHTML) content = await this.config.transformPageContent(content)
} }
let prompt = trimLines(`<browser_state> return trimLines(`<browser_state>
Current Page: [${pageTitle}](${pageUrl}) Current Page: [${state.title}](${state.url})
Page info: ${pi.viewport_width}x${pi.viewport_height}px viewport, ${pi.page_width}x${pi.page_height}px total page size, ${pi.pages_above.toFixed(1)} pages above, ${pi.pages_below.toFixed(1)} pages below, ${pi.total_pages.toFixed(1)} total pages, at ${(pi.current_page_position * 100).toFixed(0)}% of page ${state.header}
${content}
${viewportExpansion === -1 ? 'Interactive elements from top layer of the current page (full page):' : 'Interactive elements from top layer of the current page inside the viewport:'} ${state.footer}
</browser_state>
`) `)
// Page header info
const has_content_above = pi.pixels_above > 4
if (has_content_above && viewportExpansion !== -1) {
prompt += `... ${pi.pixels_above} pixels above (${pi.pages_above.toFixed(1)} pages) - scroll to see more ...\n`
} else {
prompt += `[Start of page]\n`
}
// Current viewport info
prompt += simplifiedHTML
prompt += `\n`
// Page footer info
const has_content_below = pi.pixels_below > 4
if (has_content_below && viewportExpansion !== -1) {
prompt += `... ${pi.pixels_below} pixels below (${pi.pages_below.toFixed(1)} pages) - scroll to see more ...\n`
} else {
prompt += `[End of page]\n`
}
prompt += `</browser_state>\n`
return prompt
} }
dispose(reason?: string) { dispose(reason?: string) {

View File

@@ -57,6 +57,15 @@ tools.set(
const actualWaitTime = Math.max(0, input.seconds - (Date.now() - lastTimeUpdate) / 1000) const actualWaitTime = Math.max(0, input.seconds - (Date.now() - lastTimeUpdate) / 1000)
console.log(`actualWaitTime: ${actualWaitTime} seconds`) console.log(`actualWaitTime: ${actualWaitTime} seconds`)
await waitFor(actualWaitTime) await waitFor(actualWaitTime)
this.totalWaitTime += input.seconds
if (this.totalWaitTime >= 3) {
this.pushObservation(
`You have waited ${this.totalWaitTime} seconds accumulatively. Do NOT wait any longer unless you have a good reason.`
)
}
return `✅ Waited for ${input.seconds} seconds.` return `✅ Waited for ${input.seconds} seconds.`
}, },
}) })

View File

@@ -32,6 +32,20 @@ export interface PageControllerConfig extends dom.DomConfig {
enableMask?: boolean enableMask?: boolean
} }
/**
 * Structured browser state for LLM consumption.
 *
 * Produced by `PageController.getBrowserState()`. The agent places `header`,
 * `content` and `footer`, in that order, inside its `<browser_state>` prompt
 * (passing `content` through `transformPageContent` first when configured).
 */
export interface BrowserState {
/** URL of the current page, read from `window.location.href` */
url: string
/** Title of the current document, read from `document.title` */
title: string
/**
 * Page info line, interactive-elements label and top scroll hint, joined by blank lines
 * (e.g. "Page info: 1920x1080px...\n\nInteractive elements from top layer ...:\n\n[Start of page]")
 */
header: string
/** Simplified HTML of interactive elements, captured after the DOM tree is refreshed */
content: string
/** Page footer hint (e.g. "... 300 pixels below ..." or "[End of page]") */
footer: string
}
interface ActionResult { interface ActionResult {
success: boolean success: boolean
message: string message: string
@@ -93,42 +107,6 @@ export class PageController extends EventTarget {
return window.location.href return window.location.href
} }
/**
* Get current page title
*/
async getPageTitle(): Promise<string> {
return document.title
}
/**
* Get page scroll and size info
*/
async getPageInfo() {
return getPageInfo()
}
/**
* Get the simplified HTML representation of the page.
* This is used by LLM to understand the page structure.
*/
async getSimplifiedHTML(): Promise<string> {
return this.simplifiedHTML
}
/**
* Get text description for an element by index
*/
async getElementText(index: number): Promise<string | undefined> {
return this.elementTextMap.get(index)
}
/**
* Get total number of indexed interactive elements
*/
async getElementCount(): Promise<number> {
return this.selectorMap.size
}
/** /**
* Get last tree update timestamp * Get last tree update timestamp
*/ */
@@ -137,10 +115,43 @@ export class PageController extends EventTarget {
} }
/** /**
* Get the viewport expansion setting * Get structured browser state for LLM consumption.
* Automatically calls updateTree() to refresh the DOM state.
*/ */
async getViewportExpansion(): Promise<number> { async getBrowserState(): Promise<BrowserState> {
return this.config.viewportExpansion ?? VIEWPORT_EXPANSION const url = window.location.href
const title = document.title
const pi = getPageInfo()
const viewportExpansion = this.config.viewportExpansion ?? VIEWPORT_EXPANSION
await this.updateTree()
const content = this.simplifiedHTML
// Build header: page info + scroll position hint
const pageInfoLine = `Page info: ${pi.viewport_width}x${pi.viewport_height}px viewport, ${pi.page_width}x${pi.page_height}px total page size, ${pi.pages_above.toFixed(1)} pages above, ${pi.pages_below.toFixed(1)} pages below, ${pi.total_pages.toFixed(1)} total pages, at ${(pi.current_page_position * 100).toFixed(0)}% of page`
const elementsLabel =
viewportExpansion === -1
? 'Interactive elements from top layer of the current page (full page):'
: 'Interactive elements from top layer of the current page inside the viewport:'
const hasContentAbove = pi.pixels_above > 4
const scrollHintAbove =
hasContentAbove && viewportExpansion !== -1
? `... ${pi.pixels_above} pixels above (${pi.pages_above.toFixed(1)} pages) - scroll to see more ...`
: '[Start of page]'
const header = `${pageInfoLine}\n\n${elementsLabel}\n\n${scrollHintAbove}`
// Build footer: scroll position hint
const hasContentBelow = pi.pixels_below > 4
const footer =
hasContentBelow && viewportExpansion !== -1
? `... ${pi.pixels_below} pixels below (${pi.pages_below.toFixed(1)} pages) - scroll to see more ...`
: '[End of page]'
return { url, title, header, content, footer }
} }
// ======= DOM Tree Operations ======= // ======= DOM Tree Operations =======