refactor(PageController): implement PageController

This commit is contained in:
Simon
2025-12-05 16:18:01 +08:00
parent ad19a26a57
commit 683602bb6b
33 changed files with 823 additions and 363 deletions

View File

@@ -39,10 +39,10 @@
},
"homepage": "https://alibaba.github.io/page-agent/",
"scripts": {
"build": "MODE=lib vite build && MODE=umd vite build",
"build": "npm run build:lib && npm run build:umd",
"build:lib": "MODE=lib vite build",
"build:umd": "MODE=umd vite build",
"build:watch": "MODE=lib vite build --watch",
"build:watch": "MODE=umd vite build --watch",
"prepublishOnly": "node -e \"const fs=require('fs');['README.md','LICENSE'].forEach(f=>fs.copyFileSync('../../'+f,f))\"",
"postpublish": "node -e \"['README.md','LICENSE'].forEach(f=>{try{require('fs').unlinkSync(f)}catch{}})\""
},
@@ -52,8 +52,6 @@
"zod": "^4.1.12"
},
"devDependencies": {
"@microsoft/api-extractor": "^7.55.1",
"unplugin-dts": "^1.0.0-beta.6",
"vite-plugin-css-injected-by-js": "^3.5.2"
"@page-agent/page-controller": "*"
}
}

View File

@@ -2,17 +2,14 @@
* Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved.
*/
import { PageController } from '@page-agent/page-controller'
import chalk from 'chalk'
import zod from 'zod'
import type { PageAgentConfig } from './config'
import { MAX_STEPS, VIEWPORT_EXPANSION } from './config/constants'
import * as dom from './dom'
import { FlatDomTree, InteractiveElementDomNode } from './dom/dom_tree/type'
import { getPageInfo } from './dom/getPageInfo'
import { MAX_STEPS } from './config/constants'
import { I18n } from './i18n'
import { LLM, type Tool } from './llms'
import { patchReact } from './patches/react'
import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
import { tools } from './tools'
import { Panel, getToolCompletedText, getToolExecutingText } from './ui/Panel'
@@ -87,19 +84,8 @@ export class PageAgent extends EventTarget {
#totalWaitTime = 0
#abortController = new AbortController()
/** Corresponds to eval_page in browser-use */
flatTree: FlatDomTree | null = null
/**
* All highlighted index-mapped interactive elements
* Corresponds to DOMState.selector_map in browser-use
*/
selectorMap = new Map<number, InteractiveElementDomNode>()
/** highlight index -> element text */
elementTextMap = new Map<number, string>()
/** Corresponds to clickable_elements_to_string in browser-use */
simplifiedHTML = '<EMPTY>'
/** last time the tree was updated */
lastTimeUpdate = 0
/** PageController for DOM operations */
pageController: PageController
/** Fullscreen mask */
mask = new SimulatorMask()
@@ -115,6 +101,9 @@ export class PageAgent extends EventTarget {
this.panel = new Panel(this)
this.tools = new Map(tools)
// Initialize PageController with config
this.pageController = new PageController(this.config)
if (this.config.customTools) {
for (const [name, tool] of Object.entries(this.config.customTools)) {
if (tool === null) {
@@ -129,8 +118,6 @@ export class PageAgent extends EventTarget {
this.tools.delete('execute_javascript')
}
patchReact(this)
window.addEventListener('beforeunload', (e) => {
if (!this.disposed) this.dispose('PAGE_UNLOADING')
})
@@ -175,7 +162,7 @@ export class PageAgent extends EventTarget {
while (true) {
await onBeforeStep.call(this, step)
console.group(`step: ${step + 1}`)
console.group(`step: ${step}`)
// abort
if (this.#abortController.signal.aborted) throw new Error('AbortError')
@@ -197,7 +184,7 @@ export class PageAgent extends EventTarget {
},
{
role: 'user',
content: this.#assembleUserPrompt(),
content: await this.#assembleUserPrompt(),
},
],
{ AgentOutput: this.#packMacroTool() },
@@ -392,7 +379,7 @@ export class PageAgent extends EventTarget {
return systemPrompt
}
#assembleUserPrompt(): string {
async #assembleUserPrompt(): Promise<string> {
let prompt = ''
// <agent_history>
@@ -430,13 +417,13 @@ export class PageAgent extends EventTarget {
// <browser_state>
prompt += this.#getBrowserState()
prompt += await this.#getBrowserState()
return trimLines(prompt)
}
#onDone(text: string, success = true) {
dom.cleanUpHighlights()
this.pageController.cleanUpHighlights()
// Update panel status
this.bus.emit('panel:update', {
@@ -455,37 +442,42 @@ export class PageAgent extends EventTarget {
this.#abortController.abort()
}
#getBrowserState(): string {
const pageUrl = window.location.href
const pageTitle = document.title
const pi = getPageInfo()
async #getBrowserState(): Promise<string> {
const pageUrl = await this.pageController.getCurrentUrl()
const pageTitle = await this.pageController.getPageTitle()
const pi = await this.pageController.getPageInfo()
const viewportExpansion = await this.pageController.getViewportExpansion()
this.#updateTree()
this.mask.wrapper.style.pointerEvents = 'none'
await this.pageController.updateTree()
this.mask.wrapper.style.pointerEvents = 'auto'
const simplifiedHTML = await this.pageController.getSimplifiedHTML()
let prompt = trimLines(`<browser_state>
Current Page: [${pageTitle}](${pageUrl})
Page info: ${pi.viewport_width}x${pi.viewport_height}px viewport, ${pi.page_width}x${pi.page_height}px total page size, ${pi.pages_above.toFixed(1)} pages above, ${pi.pages_below.toFixed(1)} pages below, ${pi.total_pages.toFixed(1)} total pages, at ${(pi.current_page_position * 100).toFixed(0)}% of page
${VIEWPORT_EXPANSION === -1 ? 'Interactive elements from top layer of the current page (full page):' : 'Interactive elements from top layer of the current page inside the viewport:'}
${viewportExpansion === -1 ? 'Interactive elements from top layer of the current page (full page):' : 'Interactive elements from top layer of the current page inside the viewport:'}
`)
// Page header info
const has_content_above = pi.pixels_above > 4
if (has_content_above && VIEWPORT_EXPANSION !== -1) {
if (has_content_above && viewportExpansion !== -1) {
prompt += `... ${pi.pixels_above} pixels above (${pi.pages_above.toFixed(1)} pages) - scroll to see more ...\n`
} else {
prompt += `[Start of page]\n`
}
// Current viewport info
prompt += this.simplifiedHTML
prompt += simplifiedHTML
prompt += `\n`
// Page footer info
const has_content_below = pi.pixels_below > 4
if (has_content_below && VIEWPORT_EXPANSION !== -1) {
if (has_content_below && viewportExpansion !== -1) {
prompt += `... ${pi.pixels_below} pixels below (${pi.pages_below.toFixed(1)} pages) - scroll to see more ...\n`
} else {
prompt += `[End of page]\n`
@@ -496,37 +488,10 @@ export class PageAgent extends EventTarget {
return prompt
}
/**
* Update document tree
*/
#updateTree() {
this.dispatchEvent(new Event('beforeUpdate'))
this.lastTimeUpdate = Date.now()
dom.cleanUpHighlights()
this.mask.wrapper.style.pointerEvents = 'none'
this.flatTree = dom.getFlatTree({
...this.config,
interactiveBlacklist: [
...(this.config.interactiveBlacklist || []),
...document.querySelectorAll('[data-page-agent-not-interactive]').values(),
],
})
this.mask.wrapper.style.pointerEvents = 'auto'
this.simplifiedHTML = dom.flatTreeToString(this.flatTree, this.config.include_attributes)
this.selectorMap.clear()
this.selectorMap = dom.getSelectorMap(this.flatTree)
this.elementTextMap.clear()
this.elementTextMap = dom.getElementTextMap(this.simplifiedHTML)
this.dispatchEvent(new Event('afterUpdate'))
}
dispose(reason?: string) {
console.log('Disposing PageAgent...')
this.disposed = true
dom.cleanUpHighlights()
this.flatTree = null
this.selectorMap.clear()
this.elementTextMap.clear()
this.pageController.dispose()
this.panel.dispose()
this.mask.dispose()
this.history = []

View File

@@ -1,10 +1,3 @@
/**
* @note Since isTopElement depends on elementFromPoint,
* it returns null when out of viewport, this feature has no practical use, only differ between -1 and 0
*/
// export const VIEWPORT_EXPANSION = 100
export const VIEWPORT_EXPANSION = -1
// Dev environment: use .env config if available, otherwise fallback to testing api
export const DEFAULT_MODEL_NAME: string =
import.meta.env.DEV && import.meta.env.LLM_MODEL_NAME

View File

@@ -1,5 +1,6 @@
import type { PageControllerConfig } from '@page-agent/page-controller'
import type { AgentHistory, ExecutionResult, PageAgent } from '../PageAgent'
import type { DomConfig } from '../dom'
import type { SupportedLanguage } from '../i18n'
import type { PageAgentTool } from '../tools'
import {
@@ -94,7 +95,7 @@ export interface AgentConfig {
experimentalPreventNewPage?: boolean
}
export type PageAgentConfig = LLMConfig & AgentConfig & DomConfig
export type PageAgentConfig = LLMConfig & AgentConfig & PageControllerConfig
export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> {
return {

View File

@@ -1,5 +1,5 @@
/// <reference types="vite/client" />
import type { PageAgent } from './src/PageAgent'
import type { PageAgent } from './PageAgent'
declare module '*.module.css' {
const classes: Record<string, string>

View File

@@ -5,21 +5,7 @@
import zod, { type z } from 'zod'
import type { PageAgent } from '../PageAgent'
import {
clickElement,
getElementByIndex,
getSystemInfo,
inputTextElement,
scrollHorizontally,
scrollVertically,
selectOptionElement,
waitFor,
} from './actions'
// debug
import * as utils from './actions'
// @ts-expect-error debug only
window.utils = utils
import { waitFor } from '../utils'
/**
* Internal tool definition that has access to PageAgent `this` context
@@ -41,18 +27,6 @@ export function tool<TParams>(options: PageAgentTool<TParams>): PageAgentTool<TP
*/
export const tools = new Map<string, PageAgentTool>()
// tools.set(
// 'get_current_html',
// tool({
// description: 'Get the current (updated) simplified HTML of the page',
// inputSchema: zod.object({}),
// execute: function (this: PageAgent) {
// this.updateTree()
// return this.simplifiedHTML
// },
// })
// )
tools.set(
'done',
tool({
@@ -79,11 +53,11 @@ tools.set(
seconds: zod.number().min(1).max(10).default(1),
}),
execute: async function (this: PageAgent, input) {
const lastTimeUpdate = this.lastTimeUpdate
const lastTimeUpdate = await this.pageController.getLastUpdateTime()
const actualWaitTime = Math.max(0, input.seconds - (Date.now() - lastTimeUpdate) / 1000)
console.log(`actualWaitTime: ${actualWaitTime} seconds`)
await waitFor(actualWaitTime)
return `✅ Waited for ${input.seconds} seconds.` + (await getSystemInfo())
return `✅ Waited for ${input.seconds} seconds.`
},
})
)
@@ -98,7 +72,7 @@ tools.set(
}),
execute: async function (this: PageAgent, input) {
const answer = await this.panel.askUser(input.question)
return `✅ Received user answer: ${answer}` + (await getSystemInfo())
return `✅ Received user answer: ${answer}`
},
})
)
@@ -111,16 +85,8 @@ tools.set(
index: zod.int().min(0),
}),
execute: async function (this: PageAgent, input) {
const element = getElementByIndex(this, input.index)
const elemText = this.elementTextMap.get(input.index)
await clickElement(element)
// @workaround: Handle links that open in new tabs
if (element instanceof HTMLAnchorElement && element.target === '_blank') {
return `⚠️ Clicked link that opens in a new tab (${elemText ? elemText : input.index}). You are not capable of reading new tabs.`
}
return `✅ Clicked element (${elemText ? elemText : input.index}).` + (await getSystemInfo())
const result = await this.pageController.clickElement(input.index)
return result.message
},
})
)
@@ -134,13 +100,8 @@ tools.set(
text: zod.string(),
}),
execute: async function (this: PageAgent, input) {
const element = getElementByIndex(this, input.index)
const elemText = this.elementTextMap.get(input.index)
await inputTextElement(element, input.text)
return (
`✅ Input text (${input.text}) into element (${elemText ? elemText : input.index}).` +
(await getSystemInfo())
)
const result = await this.pageController.inputText(input.index, input.text)
return result.message
},
})
)
@@ -155,13 +116,8 @@ tools.set(
text: zod.string(),
}),
execute: async function (this: PageAgent, input) {
const element = getElementByIndex(this, input.index)
const elemText = this.elementTextMap.get(input.index)
await selectOptionElement(element as HTMLSelectElement, input.text)
return (
`✅ Selected option (${input.text}) in element (${elemText ? elemText : input.index}).` +
(await getSystemInfo())
)
const result = await this.pageController.selectOption(input.index, input.text)
return result.message
},
})
)
@@ -181,13 +137,11 @@ tools.set(
index: zod.number().int().min(0).optional(),
}),
execute: async function (this: PageAgent, input) {
const { down, num_pages, index, pixels } = input
const scroll_amount = pixels ? pixels : num_pages * (down ? 1 : -1) * window.innerHeight
const element = index !== undefined ? getElementByIndex(this, index) : null
return (await scrollVertically(down, scroll_amount, element)) + (await getSystemInfo())
const result = await this.pageController.scroll({
...input,
numPages: input.num_pages,
})
return result.message
},
})
)
@@ -203,13 +157,8 @@ tools.set(
index: zod.number().int().min(0).optional(),
}),
execute: async function (this: PageAgent, input) {
const { right, pixels, index } = input
const scroll_amount = pixels * (right ? 1 : -1)
const element = index !== undefined ? getElementByIndex(this, index) : null
return (await scrollHorizontally(right, scroll_amount, element)) + (await getSystemInfo())
const result = await this.pageController.scrollHorizontally(input)
return result.message
},
})
)
@@ -223,14 +172,8 @@ tools.set(
script: zod.string(),
}),
execute: async function (this: PageAgent, input) {
try {
// Wrap script in async function to support await
const asyncFunction = eval(`(async () => { ${input.script} })`)
const result = await asyncFunction()
return `✅ Executed JavaScript. Result: ${result}` + (await getSystemInfo())
} catch (error) {
return `❌ Error executing JavaScript: ${error}` + (await getSystemInfo())
}
const result = await this.pageController.executeJavascript(input.script)
return result.message
},
})
)

View File

@@ -23,8 +23,6 @@ export interface PageAgentEventMap {
'panel:collapse': { params: undefined }
// PageAgent status events
// 'agent:beforeUpdate': { params: undefined }
// 'agent:afterUpdate': { params: undefined }
// 'agent:execute': { params: { task: string } }
// 'agent:done': { params: { text: string; success: boolean } }
// 'agent:paused': { params: undefined }

View File

@@ -20,6 +20,10 @@ export async function waitUntil(check: () => boolean, timeout = 60 * 60_1000): P
})
}
/**
 * Pause the current async flow for the given duration.
 *
 * @param seconds - How long to wait, in seconds (fractions allowed).
 * @returns A promise that resolves with no value once the timer fires.
 */
export async function waitFor(seconds: number): Promise<void> {
	const delayMs = seconds * 1000
	await new Promise((done) => {
		setTimeout(done, delayMs)
	})
}
//
export function truncate(text: string, maxLength: number): string {

View File

@@ -0,0 +1,9 @@
{
"extends": "./tsconfig.json",
"compilerOptions": {
// @workaround DTS bug:
// the dts plugin does not work with monorepo path mapping,
// so path mapping is disabled for declaration generation only.
"paths": {}
}
}

View File

@@ -1,10 +1,16 @@
{
"extends": "../../tsconfig.json",
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"composite": true,
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.tsbuildinfo",
"noEmit": false,
"outDir": "./dist",
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.tsbuildinfo"
"allowImportingTsExtensions": false,
"baseUrl": ".",
"outDir": "dist",
"paths": {
"@page-agent/page-controller": ["../page-controller/src/PageController.ts"]
}
},
"include": ["src", "env.d.ts"]
"include": ["**/*.ts"],
"exclude": ["dist", "node_modules"],
"references": [{ "path": "../page-controller" }]
}

View File

@@ -17,7 +17,8 @@ const __dirname = dirname(fileURLToPath(import.meta.url))
const libConfig = {
clearScreen: false,
plugins: [
dts({ tsconfigPath: './tsconfig.json', bundleTypes: true }),
dts({ tsconfigPath: './tsconfig.dts.json', bundleTypes: true }),
// dts({ tsconfigPath: './tsconfig.json', bundleTypes: true, compilerOptions: { paths: {} } }),
cssInjectedByJsPlugin({ relativeCSSInjection: true }),
],
publicDir: false,
@@ -33,7 +34,7 @@ const libConfig = {
},
outDir: resolve(__dirname, 'dist', 'lib'),
rollupOptions: {
external: ['ai', 'ai-motion', 'chalk', 'zod'],
// Rollup compares string entries against import IDs exactly (no glob support),
// so '@page-agent/*' would never match. A RegExp externalizes the whole scope.
// NOTE(review): externalized workspace packages must be resolvable by consumers,
// i.e. listed under "dependencies" (not only "devDependencies") — confirm in package.json.
external: ['ai', 'ai-motion', 'chalk', 'zod', /^@page-agent\//],
},
minify: false,
sourcemap: true,
@@ -54,6 +55,11 @@ const umdConfig = {
esbuild: {
keepNames: true,
},
resolve: {
alias: {
'@page-agent/page-controller': resolve(__dirname, '../page-controller/src/PageController.ts'),
},
},
build: {
lib: {
entry: resolve(__dirname, 'src/entry.ts'),

View File

@@ -0,0 +1,41 @@
{
"name": "@page-agent/page-controller",
"private": false,
"version": "0.0.6",
"type": "module",
"main": "./dist/lib/page-controller.js",
"module": "./dist/lib/page-controller.js",
"types": "./dist/lib/PageController.d.ts",
"exports": {
".": {
"types": "./dist/lib/PageController.d.ts",
"import": "./dist/lib/page-controller.js",
"default": "./dist/lib/page-controller.js"
}
},
"files": [
"dist/",
"README.md",
"LICENSE"
],
"description": "Page controller for page-agent - DOM operations and element interactions",
"keywords": [
"page-agent",
"dom",
"browser-automation",
"web-automation"
],
"author": "Simon<gaomeng1900>",
"license": "MIT",
"repository": {
"type": "git",
"url": "https://github.com/alibaba/page-agent.git"
},
"homepage": "https://alibaba.github.io/page-agent/",
"scripts": {
"build": "vite build",
"build:watch": "vite build --watch",
"prepublishOnly": "node -e \"const fs=require('fs');['README.md','LICENSE'].forEach(f=>fs.copyFileSync('../../'+f,f))\"",
"postpublish": "node -e \"['README.md','LICENSE'].forEach(f=>{try{require('fs').unlinkSync(f)}catch{}})\""
}
}

View File

@@ -0,0 +1,339 @@
/**
* Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved.
*
* PageController - Manages DOM operations and element interactions.
* Designed to be independent of LLM and can be tested in unit tests.
* All public methods are async for potential remote calling support.
*/
import {
clickElement,
getElementByIndex,
inputTextElement,
scrollHorizontally,
scrollVertically,
selectOptionElement,
} from './actions'
import { VIEWPORT_EXPANSION } from './constants'
import * as dom from './dom'
import type { FlatDomTree, InteractiveElementDomNode } from './dom/dom_tree/type'
import { getPageInfo } from './dom/getPageInfo'
import { patchReact } from './patches/react'
/**
 * Configuration for PageController.
 *
 * Extends `dom.DomConfig`, so every DOM-extraction option is accepted here too;
 * the whole object is spread into `dom.getFlatTree` on each `updateTree()` call.
 */
export interface PageControllerConfig extends dom.DomConfig {
/**
 * Optional override for the viewport expansion reported by
 * `getViewportExpansion()`; when omitted, the `VIEWPORT_EXPANSION`
 * constant is used instead.
 */
viewportExpansion?: number
}
/**
 * Outcome of a single PageController action.
 *
 * Exported so that callers of the public action methods can name the type
 * of the value they receive.
 */
export interface ActionResult {
	/** `true` when the action ran to completion, `false` when it threw. */
	success: boolean
	/** Human/LLM-readable description of what happened (or of the failure). */
	message: string
}
/**
 * PageController manages DOM state and element interactions.
 * It provides async methods for all DOM operations, keeping state isolated.
 *
 * @lifecycle
 * - beforeUpdate: Emitted before the DOM tree is updated.
 * - afterUpdate: Emitted after the DOM tree update finishes, including when
 *   the update throws, so listeners can always undo what they did in
 *   beforeUpdate.
 */
export class PageController extends EventTarget {
	private readonly config: PageControllerConfig

	/** Corresponds to eval_page in browser-use */
	private flatTree: FlatDomTree | null = null

	/**
	 * All highlighted index-mapped interactive elements
	 * Corresponds to DOMState.selector_map in browser-use
	 */
	private selectorMap = new Map<number, InteractiveElementDomNode>()

	/** Index -> element text description mapping */
	private elementTextMap = new Map<number, string>()

	/**
	 * Simplified HTML for LLM consumption.
	 * Corresponds to clickable_elements_to_string in browser-use
	 */
	private simplifiedHTML = '<EMPTY>'

	/** Timestamp (ms since epoch) of the last tree update; 0 when never updated. */
	private lastTimeUpdate = 0

	/**
	 * @param config - DOM extraction options plus the optional viewport
	 * expansion override. Defaults to an empty configuration.
	 */
	constructor(config: PageControllerConfig = {}) {
		super()
		this.config = config
		patchReact(this)
	}

	// ======= State Queries =======

	/** Get current page URL. */
	async getCurrentUrl(): Promise<string> {
		return window.location.href
	}

	/** Get current page title. */
	async getPageTitle(): Promise<string> {
		return document.title
	}

	/** Get page scroll and size info. */
	async getPageInfo() {
		return getPageInfo()
	}

	/**
	 * Get the simplified HTML representation produced by the last
	 * `updateTree()` call (`'<EMPTY>'` before the first update).
	 */
	async getSimplifiedHTML(): Promise<string> {
		return this.simplifiedHTML
	}

	/** Get text description for an element by index, if one is known. */
	async getElementText(index: number): Promise<string | undefined> {
		return this.elementTextMap.get(index)
	}

	/** Get total number of indexed interactive elements. */
	async getElementCount(): Promise<number> {
		return this.selectorMap.size
	}

	/** Get last tree update timestamp (ms since epoch, 0 if never updated). */
	async getLastUpdateTime(): Promise<number> {
		return this.lastTimeUpdate
	}

	/** Get the viewport expansion setting, falling back to the default constant. */
	async getViewportExpansion(): Promise<number> {
		return this.config.viewportExpansion ?? VIEWPORT_EXPANSION
	}

	// ======= DOM Tree Operations =======

	/**
	 * Update DOM tree, returns simplified HTML for LLM.
	 * This is the main method to refresh the page state.
	 *
	 * Emits `beforeUpdate` first and always emits `afterUpdate` last, even when
	 * extraction throws. Cached state is only replaced once every derived
	 * value has been computed, so a failure leaves the previous state intact.
	 *
	 * @returns The freshly generated simplified HTML.
	 */
	async updateTree(): Promise<string> {
		this.dispatchEvent(new Event('beforeUpdate'))

		try {
			this.lastTimeUpdate = Date.now()

			dom.cleanUpHighlights()

			// Elements tagged with the marker attribute are excluded in addition
			// to whatever the caller configured.
			const blacklist = [
				...(this.config.interactiveBlacklist || []),
				...document.querySelectorAll('[data-page-agent-not-interactive]').values(),
			]

			const flatTree = dom.getFlatTree({
				...this.config,
				interactiveBlacklist: blacklist,
			})
			const simplifiedHTML = dom.flatTreeToString(flatTree, this.config.include_attributes)
			const selectorMap = dom.getSelectorMap(flatTree)
			const elementTextMap = dom.getElementTextMap(simplifiedHTML)

			// Commit everything together so the cached views never disagree.
			this.flatTree = flatTree
			this.simplifiedHTML = simplifiedHTML
			this.selectorMap = selectorMap
			this.elementTextMap = elementTextMap

			return simplifiedHTML
		} finally {
			this.dispatchEvent(new Event('afterUpdate'))
		}
	}

	/** Clean up all element highlights. */
	async cleanUpHighlights(): Promise<void> {
		dom.cleanUpHighlights()
	}

	// ======= Element Actions =======

	/**
	 * Click element by index.
	 *
	 * @param index - Highlight index from the last `updateTree()` call.
	 * @returns Success with a description of the click, or a failure result if
	 * the index is unknown or the click throws.
	 */
	async clickElement(index: number): Promise<ActionResult> {
		return this.attempt('Failed to click element', async () => {
			const element = getElementByIndex(this.selectorMap, index)
			const label = this.elementTextMap.get(index) ?? index

			await clickElement(element)

			// Links that open in a new tab cannot be followed by the agent.
			if (element instanceof HTMLAnchorElement && element.target === '_blank') {
				return `✅ Clicked element (${label}). ⚠️ Link opens in a new tab. You are not capable of reading new tabs.`
			}

			return `✅ Clicked element (${label}).`
		})
	}

	/**
	 * Input text into element by index.
	 *
	 * @param index - Highlight index from the last `updateTree()` call.
	 * @param text - Text to enter.
	 */
	async inputText(index: number, text: string): Promise<ActionResult> {
		return this.attempt('Failed to input text', async () => {
			const element = getElementByIndex(this.selectorMap, index)
			const label = this.elementTextMap.get(index) ?? index

			await inputTextElement(element, text)

			return `✅ Input text (${text}) into element (${label}).`
		})
	}

	/**
	 * Select dropdown option by index and option text.
	 *
	 * @param index - Highlight index of the select element.
	 * @param optionText - Text of the option to select.
	 */
	async selectOption(index: number, optionText: string): Promise<ActionResult> {
		return this.attempt('Failed to select option', async () => {
			const element = getElementByIndex(this.selectorMap, index)
			const label = this.elementTextMap.get(index) ?? index

			await selectOptionElement(element as HTMLSelectElement, optionText)

			return `✅ Selected option (${optionText}) in element (${label}).`
		})
	}

	/**
	 * Scroll vertically.
	 *
	 * @param options.down - Scroll direction.
	 * @param options.numPages - Number of viewport heights to scroll when no
	 * usable `pixels` value is given.
	 * @param options.pixels - Explicit scroll distance; `0` or `undefined`
	 * falls back to `numPages`.
	 * @param options.index - Optional highlight index of a container to scroll
	 * instead of the page.
	 */
	async scroll(options: {
		down: boolean
		numPages: number
		pixels?: number
		index?: number
	}): Promise<ActionResult> {
		return this.attempt('Failed to scroll', async () => {
			const { down, numPages, pixels, index } = options

			// Truthiness check is deliberate: a `pixels` of 0 would be a no-op
			// scroll, so it is treated like "not provided" and page-based
			// scrolling is used instead.
			// NOTE(review): an explicit `pixels` value is passed through unsigned;
			// presumably `scrollVertically` applies the direction via `down` — confirm.
			const scrollAmount = pixels ? pixels : numPages * (down ? 1 : -1) * window.innerHeight

			const element = index !== undefined ? getElementByIndex(this.selectorMap, index) : null

			return scrollVertically(down, scrollAmount, element)
		})
	}

	/**
	 * Scroll horizontally.
	 *
	 * @param options.right - Scroll direction.
	 * @param options.pixels - Scroll distance; negated when scrolling left.
	 * @param options.index - Optional highlight index of a container to scroll
	 * instead of the page.
	 */
	async scrollHorizontally(options: {
		right: boolean
		pixels: number
		index?: number
	}): Promise<ActionResult> {
		return this.attempt('Failed to scroll horizontally', async () => {
			const { right, pixels, index } = options

			const scrollAmount = pixels * (right ? 1 : -1)

			const element = index !== undefined ? getElementByIndex(this.selectorMap, index) : null

			return scrollHorizontally(right, scrollAmount, element)
		})
	}

	/**
	 * Execute arbitrary JavaScript on the page.
	 *
	 * SECURITY: the script is evaluated with `eval` and runs with the full
	 * privileges of the host page. Only pass scripts from a trusted source.
	 *
	 * @param script - Statements to run; wrapped in an async function so
	 * `await` and `return` are allowed.
	 * @returns Success with the stringified return value, or a failure result
	 * carrying the thrown error.
	 */
	async executeJavascript(script: string): Promise<ActionResult> {
		return this.attempt('Error executing JavaScript', async () => {
			// Wrap script in async function to support await
			const asyncFunction = eval(`(async () => { ${script} })`) as () => Promise<unknown>
			const result = await asyncFunction()

			return `✅ Executed JavaScript. Result: ${PageController.describeValue(result)}`
		})
	}

	/**
	 * Dispose and clean up resources, resetting all cached state to its
	 * initial values.
	 */
	dispose(): void {
		dom.cleanUpHighlights()
		this.flatTree = null
		this.selectorMap.clear()
		this.elementTextMap.clear()
		this.simplifiedHTML = '<EMPTY>'
		this.lastTimeUpdate = 0
	}

	// ======= Internal helpers =======

	/**
	 * Run an action and convert its outcome into an ActionResult: the returned
	 * string becomes the success message, a thrown error becomes a failure
	 * message prefixed with `failureLabel`.
	 */
	private async attempt(
		failureLabel: string,
		action: () => Promise<string>
	): Promise<ActionResult> {
		try {
			return { success: true, message: await action() }
		} catch (error) {
			return { success: false, message: `❌ ${failureLabel}: ${error}` }
		}
	}

	/**
	 * Render a script result as text. Plain objects and arrays are serialized
	 * as JSON so they do not collapse to "[object Object]"; everything else
	 * uses its default string conversion.
	 */
	private static describeValue(value: unknown): string {
		if (value !== null && typeof value === 'object') {
			const proto: unknown = Object.getPrototypeOf(value)
			const isPlainData = Array.isArray(value) || proto === Object.prototype || proto === null

			if (isPlainData) {
				try {
					return JSON.stringify(value)
				} catch {
					// Not serializable (e.g. circular reference); fall through.
				}
			}
		}

		return String(value)
	}
}

View File

@@ -2,26 +2,14 @@
* Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved.
*/
import type { PageAgent } from '../PageAgent'
import type { InteractiveElementDomNode } from './dom/dom_tree/type'
// ======= general utils =======
export async function waitFor(seconds: number): Promise<void> {
async function waitFor(seconds: number): Promise<void> {
await new Promise((resolve) => setTimeout(resolve, seconds * 1000))
}
let currentUrl = window.location.href
export async function getSystemInfo() {
// If current URL is already up to date, no need to add message
if (currentUrl === window.location.href) return ''
await waitFor(0.3) // Wait a bit longer for page to load
currentUrl = window.location.href
return `\n<sys> Current URL changed to: ${currentUrl} </sys>`
}
// ======= dom utils =======
export async function movePointerToElement(element: HTMLElement) {
@@ -35,10 +23,13 @@ export async function movePointerToElement(element: HTMLElement) {
}
/**
* Get the HTMLElement by index from the selectorMap in PageAgent.
* Get the HTMLElement by index from a selectorMap.
*/
export function getElementByIndex(pageAgent: PageAgent, index: number): HTMLElement {
const interactiveNode = pageAgent.selectorMap.get(index)
export function getElementByIndex(
selectorMap: Map<number, InteractiveElementDomNode>,
index: number
): HTMLElement {
const interactiveNode = selectorMap.get(index)
if (!interactiveNode) {
throw new Error(`No interactive element found at index ${index}`)
}
@@ -170,7 +161,6 @@ export async function selectOptionElement(selectElement: HTMLSelectElement, opti
await waitFor(0.1) // Wait to ensure change event processing completes
}
// eslint-disable-next-line @typescript-eslint/require-await
export async function scrollIntoViewIfNeeded(element: HTMLElement) {
const el = element as any
if (el.scrollIntoViewIfNeeded) {

View File

@@ -0,0 +1,16 @@
/**
* Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved.
*/
/**
 * Viewport expansion for DOM tree extraction.
 * - `-1` means full page (no viewport restriction)
 * - `0` means viewport only
 * - positive values expand the viewport by that many pixels
 *
 * @note isTopElement depends on elementFromPoint, which returns null for
 * points outside the viewport. As a result, positive expansion values have no
 * practical use; the only meaningful distinction is between -1 and 0.
 */
// export const VIEWPORT_EXPANSION = 100
export const VIEWPORT_EXPANSION = -1

View File

@@ -1,5 +1,5 @@
import { VIEWPORT_EXPANSION } from '../config/constants'
import domTree from './dom_tree/index'
import { VIEWPORT_EXPANSION } from '../constants'
import domTree from './dom_tree/index.js'
import {
ElementDomNode,
FlatDomTree,

View File

@@ -1,4 +1,4 @@
import type { PageAgent } from '../PageAgent'
import type { PageController } from '../PageController'
const clearFunctions = [] as (() => void)[]
@@ -11,9 +11,9 @@ function fixAntdSelect() {
// for (const select of selects) {}
}
export function patchAntd(pageAgent: PageAgent) {
pageAgent.addEventListener('beforeUpdate', fixAntdSelect)
pageAgent.addEventListener('afterUpdate', () => {
export function patchAntd(pageController: PageController) {
pageController.addEventListener('beforeUpdate', fixAntdSelect)
pageController.addEventListener('afterUpdate', () => {
for (const fn of clearFunctions) fn()
clearFunctions.length = 0
})

View File

@@ -1,7 +1,7 @@
import type { PageAgent } from '../PageAgent'
import type { PageController } from '../PageController'
// Find common React root elements and add data-page-agent-not-interactive attribute
export function patchReact(pageAgent: PageAgent) {
export function patchReact(pageController: PageController) {
const reactRootElements = document.querySelectorAll(
'[data-reactroot], [data-reactid], [data-react-checksum], #root, #app, [id^="root-"], [id^="app-"], #adex-wrapper, #adex-root'
)

View File

@@ -0,0 +1,12 @@
{
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.tsbuildinfo",
"noEmit": false,
"allowImportingTsExtensions": false,
"baseUrl": ".",
"outDir": "dist"
},
"include": ["**/*.ts", "**/*.js"],
"exclude": ["dist", "node_modules"]
}

View File

@@ -0,0 +1,41 @@
// @ts-check
import chalk from 'chalk'
import { dirname, resolve } from 'path'
import dts from 'unplugin-dts/vite'
import { fileURLToPath } from 'url'
import { defineConfig } from 'vite'
import cssInjectedByJsPlugin from 'vite-plugin-css-injected-by-js'
// ES modules have no built-in __dirname; derive it from this file's URL.
const __dirname = dirname(fileURLToPath(import.meta.url))
// Banner so the package being built is visible in the build output.
console.log(chalk.cyan(`📦 Building @page-agent/page-controller`))
/**
 * Vite library build for @page-agent/page-controller.
 * Produces a single ES-module bundle named `page-controller` under dist/lib,
 * built from src/PageController.ts.
 */
export default defineConfig({
clearScreen: false,
plugins: [
// Declaration generation driven by this package's tsconfig.json, with type bundling enabled.
dts({ tsconfigPath: './tsconfig.json', bundleTypes: true }),
// NOTE(review): presumably inlines emitted CSS into the JS output — confirm against plugin docs.
cssInjectedByJsPlugin({ relativeCSSInjection: true }),
],
// No static public directory is copied into the output.
publicDir: false,
esbuild: {
keepNames: true,
},
build: {
lib: {
entry: resolve(__dirname, 'src/PageController.ts'),
name: 'PageController',
fileName: 'page-controller',
// ES module output only.
formats: ['es'],
},
outDir: resolve(__dirname, 'dist', 'lib'),
rollupOptions: {
// Nothing is externalized: every import is bundled into the output.
external: [],
},
minify: false,
sourcemap: true,
cssCodeSplit: true,
},
define: {
// Replace process.env.NODE_ENV at build time with the literal "production".
'process.env.NODE_ENV': '"production"',
},
})

View File

@@ -9,10 +9,8 @@
"preview": "vite preview",
"typecheck": "tsc --noEmit"
},
"dependencies": {
"page-agent": "*"
},
"devDependencies": {
"page-agent": "*",
"@tailwindcss/vite": "^4.1.14",
"@types/react": "^19.2.2",
"@types/react-dom": "^19.2.1",

View File

@@ -9,7 +9,7 @@ export default function Configuration() {
className="mb-8"
language="typescript"
code={`// config
type PageAgentConfig = LLMConfig & AgentConfig & DomConfig
type PageAgentConfig = LLMConfig & AgentConfig & PageControllerConfig
interface LLMConfig {
baseURL?: string
@@ -93,12 +93,13 @@ interface AgentConfig {
experimentalPreventNewPage?: boolean
}
interface DomConfig {
interface PageControllerConfig {
interactiveBlacklist?: (Element | (() => Element))[]
interactiveWhitelist?: (Element | (() => Element))[]
include_attributes?: string[]
highlightOpacity?: number
highlightLabelOpacity?: number
viewportExpansion?: number
}
`}

View File

@@ -1,17 +1,21 @@
{
"extends": "../../tsconfig.json",
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"composite": true,
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.tsbuildinfo",
"baseUrl": "./",
"noEmit": false,
"allowImportingTsExtensions": false,
"baseUrl": ".",
"outDir": "dist",
"paths": {
// Self root
"@/*": ["src/*"],
// Simplified monorepo solution (raw npm workspace with hoisting)
"@page-agent/page-controller": ["../page-controller/src/PageController.ts"],
"page-agent": ["../page-agent/src/PageAgent.ts"]
}
},
"include": ["src", "env.d.ts"],
"references": [{ "path": "../page-agent" }]
"include": ["**/*.ts", "**/*.tsx"],
"exclude": ["dist", "node_modules"],
"references": [{ "path": "../page-controller" }, { "path": "../page-agent" }]
}

View File

@@ -18,7 +18,8 @@ export default defineConfig({
// Self root
'@': resolve(__dirname, 'src'),
// Simplified monorepo solution (raw npm workspace with hoisting)
// Monorepo packages (always bundle local code instead of npm versions)
'@page-agent/page-controller': resolve(__dirname, '../page-controller/src/PageController.ts'),
'page-agent': resolve(__dirname, '../page-agent/src/PageAgent.ts'),
},
},