From b4c82b78332791819c22e07b4a94c9a94b36c784 Mon Sep 17 00:00:00 2001
From: Simon <10131203+gaomeng1900@users.noreply.github.com>
Date: Thu, 15 Jan 2026 20:17:50 +0800
Subject: [PATCH] refactor: mv `getBrowserState` to `Controller`; simplify
`Agent`
---
packages/page-agent/src/PageAgent.ts | 67 ++++----------
packages/page-agent/src/tools/index.ts | 9 ++
.../page-controller/src/PageController.ts | 89 +++++++++++--------
3 files changed, 76 insertions(+), 89 deletions(-)
diff --git a/packages/page-agent/src/PageAgent.ts b/packages/page-agent/src/PageAgent.ts
index 64d4e18..c1402ec 100644
--- a/packages/page-agent/src/PageAgent.ts
+++ b/packages/page-agent/src/PageAgent.ts
@@ -110,7 +110,6 @@ export class PageAgent extends EventTarget {
taskId = ''
#llm: LLM
- #totalWaitTime = 0
#abortController = new AbortController()
#llmRetryListener: ((e: Event) => void) | null = null
#llmErrorListener: ((e: Event) => void) | null = null
@@ -119,6 +118,9 @@ export class PageAgent extends EventTarget {
/** PageController for DOM operations */
pageController: PageController
+ /** Accumulated wait time in seconds, used by wait tool to track total waiting */
+ totalWaitTime = 0
+
/** History event stream */
history: HistoryEvent[] = []
@@ -373,20 +375,14 @@ export class PageAgent extends EventTarget {
const startTime = Date.now()
// Execute tool, bind `this` to PageAgent
- let result = await tool.execute.bind(this)(toolInput)
+ const result = await tool.execute.bind(this)(toolInput)
const duration = Date.now() - startTime
console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result)
- if (toolName === 'wait') {
- this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000)
- result += `\n You have waited ${this.#totalWaitTime} seconds accumulatively.`
- if (this.#totalWaitTime >= 3)
- result += '\nDo NOT wait any longer unless you have a good reason.\n'
- result += ''
- } else {
- // For other tools, reset wait time
- this.#totalWaitTime = 0
+ // Reset wait time for non-wait tools
+ if (toolName !== 'wait') {
+ this.totalWaitTime = 0
}
// Briefly display execution result
@@ -539,51 +535,22 @@ export class PageAgent extends EventTarget {
}
async #getBrowserState(): Promise<string> {
- const pageUrl = await this.pageController.getCurrentUrl()
- const pageTitle = await this.pageController.getPageTitle()
- const pi = await this.pageController.getPageInfo()
- const viewportExpansion = await this.pageController.getViewportExpansion()
-
- await this.pageController.updateTree()
-
- let simplifiedHTML = await this.pageController.getSimplifiedHTML()
+ const state = await this.pageController.getBrowserState() // structured state; the controller calls updateTree() itself
+ let content = state.content // NOTE(review): content is now interpolated inside trimLines() below; the removed code appended it after trimLines() - confirm its leading whitespace is preserved
if (this.config.transformPageContent) {
- simplifiedHTML = await this.config.transformPageContent(simplifiedHTML)
+ content = await this.config.transformPageContent(content)
}
- let prompt = trimLines(`
- Current Page: [${pageTitle}](${pageUrl})
-
- Page info: ${pi.viewport_width}x${pi.viewport_height}px viewport, ${pi.page_width}x${pi.page_height}px total page size, ${pi.pages_above.toFixed(1)} pages above, ${pi.pages_below.toFixed(1)} pages below, ${pi.total_pages.toFixed(1)} total pages, at ${(pi.current_page_position * 100).toFixed(0)}% of page
-
- ${viewportExpansion === -1 ? 'Interactive elements from top layer of the current page (full page):' : 'Interactive elements from top layer of the current page inside the viewport:'}
+ return trimLines(`
+ Current Page: [${state.title}](${state.url})
+
+ ${state.header}
+ ${content}
+ ${state.footer}
+
`)
-
- // Page header info
- const has_content_above = pi.pixels_above > 4
- if (has_content_above && viewportExpansion !== -1) {
- prompt += `... ${pi.pixels_above} pixels above (${pi.pages_above.toFixed(1)} pages) - scroll to see more ...\n`
- } else {
- prompt += `[Start of page]\n`
- }
-
- // Current viewport info
- prompt += simplifiedHTML
- prompt += `\n`
-
- // Page footer info
- const has_content_below = pi.pixels_below > 4
- if (has_content_below && viewportExpansion !== -1) {
- prompt += `... ${pi.pixels_below} pixels below (${pi.pages_below.toFixed(1)} pages) - scroll to see more ...\n`
- } else {
- prompt += `[End of page]\n`
- }
-
- prompt += `\n`
-
- return prompt
}
dispose(reason?: string) {
diff --git a/packages/page-agent/src/tools/index.ts b/packages/page-agent/src/tools/index.ts
index 9245991..4f18afc 100644
--- a/packages/page-agent/src/tools/index.ts
+++ b/packages/page-agent/src/tools/index.ts
@@ -57,6 +57,15 @@ tools.set(
const actualWaitTime = Math.max(0, input.seconds - (Date.now() - lastTimeUpdate) / 1000)
console.log(`actualWaitTime: ${actualWaitTime} seconds`)
await waitFor(actualWaitTime)
+
+ this.totalWaitTime += input.seconds
+
+ if (this.totalWaitTime >= 3) {
+ this.pushObservation(
+ `You have waited ${this.totalWaitTime} seconds accumulatively. Do NOT wait any longer unless you have a good reason.`
+ )
+ }
+
return `✅ Waited for ${input.seconds} seconds.`
},
})
diff --git a/packages/page-controller/src/PageController.ts b/packages/page-controller/src/PageController.ts
index df4a60a..30f750e 100644
--- a/packages/page-controller/src/PageController.ts
+++ b/packages/page-controller/src/PageController.ts
@@ -32,6 +32,20 @@ export interface PageControllerConfig extends dom.DomConfig {
enableMask?: boolean
}
+/**
+ * Structured browser state for LLM consumption; kept as separate parts so a caller can transform `content` before assembling the prompt
+ */
+export interface BrowserState {
+ url: string // window.location.href at capture time
+ title: string // document.title at capture time
+ /** Page info line, interactive-elements label, and top scroll hint ("... N pixels above ..." or "[Start of page]"), separated by blank lines */
+ header: string
+ /** Simplified HTML of interactive elements */
+ content: string
+ /** Page footer hint (e.g. "... 300 pixels below ..." or "[End of page]") */
+ footer: string
+}
+
interface ActionResult {
success: boolean
message: string
@@ -93,42 +107,6 @@ export class PageController extends EventTarget {
return window.location.href
}
- /**
- * Get current page title
- */
- async getPageTitle(): Promise {
- return document.title
- }
-
- /**
- * Get page scroll and size info
- */
- async getPageInfo() {
- return getPageInfo()
- }
-
- /**
- * Get the simplified HTML representation of the page.
- * This is used by LLM to understand the page structure.
- */
- async getSimplifiedHTML(): Promise {
- return this.simplifiedHTML
- }
-
- /**
- * Get text description for an element by index
- */
- async getElementText(index: number): Promise {
- return this.elementTextMap.get(index)
- }
-
- /**
- * Get total number of indexed interactive elements
- */
- async getElementCount(): Promise {
- return this.selectorMap.size
- }
-
/**
* Get last tree update timestamp
*/
@@ -137,10 +115,43 @@ export class PageController extends EventTarget {
}
/**
- * Get the viewport expansion setting
+ * Get structured browser state for LLM consumption.
+ * Calls updateTree() to refresh the DOM state; url, title and page info are read before that refresh.
*/
- async getViewportExpansion(): Promise {
- return this.config.viewportExpansion ?? VIEWPORT_EXPANSION
+ async getBrowserState(): Promise<BrowserState> {
+ const url = window.location.href
+ const title = document.title
+ const pi = getPageInfo()
+ const viewportExpansion = this.config.viewportExpansion ?? VIEWPORT_EXPANSION // -1 => "(full page)" label and no scroll hints
+
+ await this.updateTree()
+
+ const content = this.simplifiedHTML // read after the updateTree() call above
+
+ // Build header: page info line, elements label, top scroll hint (blank-line separated)
+ const pageInfoLine = `Page info: ${pi.viewport_width}x${pi.viewport_height}px viewport, ${pi.page_width}x${pi.page_height}px total page size, ${pi.pages_above.toFixed(1)} pages above, ${pi.pages_below.toFixed(1)} pages below, ${pi.total_pages.toFixed(1)} total pages, at ${(pi.current_page_position * 100).toFixed(0)}% of page`
+
+ const elementsLabel =
+ viewportExpansion === -1
+ ? 'Interactive elements from top layer of the current page (full page):'
+ : 'Interactive elements from top layer of the current page inside the viewport:'
+
+ const hasContentAbove = pi.pixels_above > 4 // 4px or less above is reported as start of page
+ const scrollHintAbove =
+ hasContentAbove && viewportExpansion !== -1
+ ? `... ${pi.pixels_above} pixels above (${pi.pages_above.toFixed(1)} pages) - scroll to see more ...`
+ : '[Start of page]'
+
+ const header = `${pageInfoLine}\n\n${elementsLabel}\n\n${scrollHintAbove}`
+
+ // Build footer: bottom scroll hint, or end-of-page marker
+ const hasContentBelow = pi.pixels_below > 4 // 4px or less below is reported as end of page
+ const footer =
+ hasContentBelow && viewportExpansion !== -1
+ ? `... ${pi.pixels_below} pixels below (${pi.pages_below.toFixed(1)} pages) - scroll to see more ...`
+ : '[End of page]'
+
+ return { url, title, header, content, footer }
}
// ======= DOM Tree Operations =======