diff --git a/packages/page-agent/src/PageAgent.ts b/packages/page-agent/src/PageAgent.ts index 64d4e18..c1402ec 100644 --- a/packages/page-agent/src/PageAgent.ts +++ b/packages/page-agent/src/PageAgent.ts @@ -110,7 +110,6 @@ export class PageAgent extends EventTarget { taskId = '' #llm: LLM - #totalWaitTime = 0 #abortController = new AbortController() #llmRetryListener: ((e: Event) => void) | null = null #llmErrorListener: ((e: Event) => void) | null = null @@ -119,6 +118,9 @@ export class PageAgent extends EventTarget { /** PageController for DOM operations */ pageController: PageController + /** Accumulated wait time in seconds, used by wait tool to track total waiting */ + totalWaitTime = 0 + /** History event stream */ history: HistoryEvent[] = [] @@ -373,20 +375,14 @@ export class PageAgent extends EventTarget { const startTime = Date.now() // Execute tool, bind `this` to PageAgent - let result = await tool.execute.bind(this)(toolInput) + const result = await tool.execute.bind(this)(toolInput) const duration = Date.now() - startTime console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result) - if (toolName === 'wait') { - this.#totalWaitTime += Math.round(toolInput.seconds + duration / 1000) - result += `\n You have waited ${this.#totalWaitTime} seconds accumulatively.` - if (this.#totalWaitTime >= 3) - result += '\nDo NOT wait any longer unless you have a good reason.\n' - result += '' - } else { - // For other tools, reset wait time - this.#totalWaitTime = 0 + // Reset wait time for non-wait tools + if (toolName !== 'wait') { + this.totalWaitTime = 0 } // Briefly display execution result @@ -539,51 +535,22 @@ export class PageAgent extends EventTarget { } async #getBrowserState(): Promise { - const pageUrl = await this.pageController.getCurrentUrl() - const pageTitle = await this.pageController.getPageTitle() - const pi = await this.pageController.getPageInfo() - const viewportExpansion = await this.pageController.getViewportExpansion() - - await this.pageController.updateTree() - - let simplifiedHTML = await this.pageController.getSimplifiedHTML() + const state = await this.pageController.getBrowserState() + let content = state.content if (this.config.transformPageContent) { - simplifiedHTML = await this.config.transformPageContent(simplifiedHTML) + content = await this.config.transformPageContent(content) } - let prompt = trimLines(` - Current Page: [${pageTitle}](${pageUrl}) - - Page info: ${pi.viewport_width}x${pi.viewport_height}px viewport, ${pi.page_width}x${pi.page_height}px total page size, ${pi.pages_above.toFixed(1)} pages above, ${pi.pages_below.toFixed(1)} pages below, ${pi.total_pages.toFixed(1)} total pages, at ${(pi.current_page_position * 100).toFixed(0)}% of page - - ${viewportExpansion === -1 ? 'Interactive elements from top layer of the current page (full page):' : 'Interactive elements from top layer of the current page inside the viewport:'} + return trimLines(` + Current Page: [${state.title}](${state.url}) + + ${state.header} + ${content} + ${state.footer} + `) - - // Page header info - const has_content_above = pi.pixels_above > 4 - if (has_content_above && viewportExpansion !== -1) { - prompt += `... ${pi.pixels_above} pixels above (${pi.pages_above.toFixed(1)} pages) - scroll to see more ...\n` - } else { - prompt += `[Start of page]\n` - } - - // Current viewport info - prompt += simplifiedHTML - prompt += `\n` - - // Page footer info - const has_content_below = pi.pixels_below > 4 - if (has_content_below && viewportExpansion !== -1) { - prompt += `... ${pi.pixels_below} pixels below (${pi.pages_below.toFixed(1)} pages) - scroll to see more ...\n` - } else { - prompt += `[End of page]\n` - } - - prompt += `\n` - - return prompt } dispose(reason?: string) { diff --git a/packages/page-agent/src/tools/index.ts b/packages/page-agent/src/tools/index.ts index 9245991..4f18afc 100644 --- a/packages/page-agent/src/tools/index.ts +++ b/packages/page-agent/src/tools/index.ts @@ -57,6 +57,15 @@ tools.set( const actualWaitTime = Math.max(0, input.seconds - (Date.now() - lastTimeUpdate) / 1000) console.log(`actualWaitTime: ${actualWaitTime} seconds`) await waitFor(actualWaitTime) + + this.totalWaitTime += input.seconds + + if (this.totalWaitTime >= 3) { + this.pushObservation( + `You have waited ${this.totalWaitTime} seconds accumulatively. Do NOT wait any longer unless you have a good reason.` + ) + } + return `✅ Waited for ${input.seconds} seconds.` }, }) diff --git a/packages/page-controller/src/PageController.ts b/packages/page-controller/src/PageController.ts index df4a60a..30f750e 100644 --- a/packages/page-controller/src/PageController.ts +++ b/packages/page-controller/src/PageController.ts @@ -32,6 +32,20 @@ export interface PageControllerConfig extends dom.DomConfig { enableMask?: boolean } +/** + * Structured browser state for LLM consumption + */ +export interface BrowserState { + url: string + title: string + /** Page info + scroll position hint (e.g. "Page info: 1920x1080px...\n[Start of page]") */ + header: string + /** Simplified HTML of interactive elements */ + content: string + /** Page footer hint (e.g. "... 300 pixels below ..." or "[End of page]") */ + footer: string +} + interface ActionResult { success: boolean message: string @@ -93,42 +107,6 @@ export class PageController extends EventTarget { return window.location.href } - /** - * Get current page title - */ - async getPageTitle(): Promise { - return document.title - } - - /** - * Get page scroll and size info - */ - async getPageInfo() { - return getPageInfo() - } - - /** - * Get the simplified HTML representation of the page. - * This is used by LLM to understand the page structure. - */ - async getSimplifiedHTML(): Promise { - return this.simplifiedHTML - } - - /** - * Get text description for an element by index - */ - async getElementText(index: number): Promise { - return this.elementTextMap.get(index) - } - - /** - * Get total number of indexed interactive elements - */ - async getElementCount(): Promise { - return this.selectorMap.size - } - /** * Get last tree update timestamp */ @@ -137,10 +115,43 @@ export class PageController extends EventTarget { } /** - * Get the viewport expansion setting + * Get structured browser state for LLM consumption. + * Automatically calls updateTree() to refresh the DOM state. */ - async getViewportExpansion(): Promise { - return this.config.viewportExpansion ?? VIEWPORT_EXPANSION + async getBrowserState(): Promise { + const url = window.location.href + const title = document.title + const pi = getPageInfo() + const viewportExpansion = this.config.viewportExpansion ?? VIEWPORT_EXPANSION + + await this.updateTree() + + const content = this.simplifiedHTML + + // Build header: page info + scroll position hint + const pageInfoLine = `Page info: ${pi.viewport_width}x${pi.viewport_height}px viewport, ${pi.page_width}x${pi.page_height}px total page size, ${pi.pages_above.toFixed(1)} pages above, ${pi.pages_below.toFixed(1)} pages below, ${pi.total_pages.toFixed(1)} total pages, at ${(pi.current_page_position * 100).toFixed(0)}% of page` + + const elementsLabel = + viewportExpansion === -1 + ? 'Interactive elements from top layer of the current page (full page):' + : 'Interactive elements from top layer of the current page inside the viewport:' + + const hasContentAbove = pi.pixels_above > 4 + const scrollHintAbove = + hasContentAbove && viewportExpansion !== -1 + ? `... ${pi.pixels_above} pixels above (${pi.pages_above.toFixed(1)} pages) - scroll to see more ...` + : '[Start of page]' + + const header = `${pageInfoLine}\n\n${elementsLabel}\n\n${scrollHintAbove}` + + // Build footer: scroll position hint + const hasContentBelow = pi.pixels_below > 4 + const footer = + hasContentBelow && viewportExpansion !== -1 + ? `... ${pi.pixels_below} pixels below (${pi.pages_below.toFixed(1)} pages) - scroll to see more ...` + : '[End of page]' + + return { url, title, header, content, footer } } // ======= DOM Tree Operations =======