refactor(PageController): implement PageController

This commit is contained in:
Simon
2025-12-05 16:18:01 +08:00
parent ad19a26a57
commit 683602bb6b
33 changed files with 823 additions and 363 deletions

View File

@@ -0,0 +1,41 @@
{
"name": "@page-agent/page-controller",
"private": false,
"version": "0.0.6",
"type": "module",
"main": "./dist/lib/page-controller.js",
"module": "./dist/lib/page-controller.js",
"types": "./dist/lib/PageController.d.ts",
"exports": {
".": {
"types": "./dist/lib/PageController.d.ts",
"import": "./dist/lib/page-controller.js",
"default": "./dist/lib/page-controller.js"
}
},
"files": [
"dist/",
"README.md",
"LICENSE"
],
"description": "Page controller for page-agent - DOM operations and element interactions",
"keywords": [
"page-agent",
"dom",
"browser-automation",
"web-automation"
],
"author": "Simon<gaomeng1900>",
"license": "MIT",
"repository": {
"type": "git",
"url": "https://github.com/alibaba/page-agent.git"
},
"homepage": "https://alibaba.github.io/page-agent/",
"scripts": {
"build": "vite build",
"build:watch": "vite build --watch",
"prepublishOnly": "node -e \"const fs=require('fs');['README.md','LICENSE'].forEach(f=>fs.copyFileSync('../../'+f,f))\"",
"postpublish": "node -e \"['README.md','LICENSE'].forEach(f=>{try{require('fs').unlinkSync(f)}catch{}})\""
}
}

View File

@@ -0,0 +1,339 @@
/**
* Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved.
*
* PageController - Manages DOM operations and element interactions.
* Designed to be independent of LLM and can be tested in unit tests.
* All public methods are async for potential remote calling support.
*/
import {
clickElement,
getElementByIndex,
inputTextElement,
scrollHorizontally,
scrollVertically,
selectOptionElement,
} from './actions'
import { VIEWPORT_EXPANSION } from './constants'
import * as dom from './dom'
import type { FlatDomTree, InteractiveElementDomNode } from './dom/dom_tree/type'
import { getPageInfo } from './dom/getPageInfo'
import { patchReact } from './patches/react'
/**
 * Configuration for PageController.
 * Extends the DOM extraction options (`dom.DomConfig`) with controller-level settings.
 */
export interface PageControllerConfig extends dom.DomConfig {
/**
 * Viewport expansion setting. When omitted, `getViewportExpansion()` falls back to
 * the `VIEWPORT_EXPANSION` constant.
 */
viewportExpansion?: number
}
/**
 * Outcome returned by the PageController action methods.
 * Those methods catch errors and report them through this shape instead of throwing.
 */
interface ActionResult {
/** `true` when the action completed without throwing, `false` otherwise. */
success: boolean
/** Human-readable description of what happened, or of why the action failed. */
message: string
}
/**
 * PageController manages DOM state and element interactions.
 * It provides async methods for all DOM operations, keeping state isolated.
 *
 * Action methods (click, input, select, scroll, executeJavascript) never reject:
 * any thrown error is caught and reported as `{ success: false, message }`.
 *
 * @lifecycle
 * - beforeUpdate: Emitted before the DOM tree is updated.
 * - afterUpdate: Emitted after the DOM tree is updated.
 */
export class PageController extends EventTarget {
	private config: PageControllerConfig

	/** Corresponds to eval_page in browser-use */
	private flatTree: FlatDomTree | null = null

	/**
	 * All highlighted index-mapped interactive elements
	 * Corresponds to DOMState.selector_map in browser-use
	 */
	private selectorMap = new Map<number, InteractiveElementDomNode>()

	/** Index -> element text description mapping */
	private elementTextMap = new Map<number, string>()

	/**
	 * Simplified HTML for LLM consumption.
	 * Corresponds to clickable_elements_to_string in browser-use
	 */
	private simplifiedHTML = '<EMPTY>'

	/** Timestamp (`Date.now()`) of the last tree update; 0 until `updateTree()` has run. */
	private lastTimeUpdate = 0

	/**
	 * @param config Controller and DOM extraction options. Defaults to an empty config.
	 *
	 * Constructing a controller also applies the React patch (`patchReact`), which
	 * touches `document`, so a DOM must be available.
	 */
	constructor(config: PageControllerConfig = {}) {
		super()
		this.config = config
		patchReact(this)
	}

	// ======= State Queries =======

	/**
	 * Get current page URL
	 */
	async getCurrentUrl(): Promise<string> {
		return window.location.href
	}

	/**
	 * Get current page title
	 */
	async getPageTitle(): Promise<string> {
		return document.title
	}

	/**
	 * Get page scroll and size info
	 */
	async getPageInfo() {
		return getPageInfo()
	}

	/**
	 * Get the simplified HTML representation of the page.
	 * This is used by LLM to understand the page structure.
	 *
	 * @returns The string produced by the last `updateTree()` call, or `'<EMPTY>'`
	 * if the tree has not been built yet (or the controller was disposed).
	 */
	async getSimplifiedHTML(): Promise<string> {
		return this.simplifiedHTML
	}

	/**
	 * Get text description for an element by index
	 *
	 * @returns The description, or `undefined` when the index is unknown.
	 */
	async getElementText(index: number): Promise<string | undefined> {
		return this.elementTextMap.get(index)
	}

	/**
	 * Get total number of indexed interactive elements
	 */
	async getElementCount(): Promise<number> {
		return this.selectorMap.size
	}

	/**
	 * Get last tree update timestamp
	 */
	async getLastUpdateTime(): Promise<number> {
		return this.lastTimeUpdate
	}

	/**
	 * Get the viewport expansion setting
	 *
	 * @returns The configured value, or `VIEWPORT_EXPANSION` when none was configured.
	 */
	async getViewportExpansion(): Promise<number> {
		return this.config.viewportExpansion ?? VIEWPORT_EXPANSION
	}

	// ======= DOM Tree Operations =======

	/**
	 * Update DOM tree, returns simplified HTML for LLM.
	 * This is the main method to refresh the page state.
	 *
	 * Dispatches `beforeUpdate` first and `afterUpdate` once every derived
	 * structure (flat tree, simplified HTML, selector map, text map) is rebuilt.
	 */
	async updateTree(): Promise<string> {
		this.dispatchEvent(new Event('beforeUpdate'))

		this.lastTimeUpdate = Date.now()
		dom.cleanUpHighlights()

		// Merge the configured blacklist with every element explicitly marked
		// as non-interactive through the data attribute.
		const blacklist = [
			...(this.config.interactiveBlacklist || []),
			...document.querySelectorAll('[data-page-agent-not-interactive]').values(),
		]

		this.flatTree = dom.getFlatTree({
			...this.config,
			interactiveBlacklist: blacklist,
		})
		this.simplifiedHTML = dom.flatTreeToString(this.flatTree, this.config.include_attributes)

		// Empty the previous maps before replacing them so stale nodes are released.
		this.selectorMap.clear()
		this.selectorMap = dom.getSelectorMap(this.flatTree)
		this.elementTextMap.clear()
		this.elementTextMap = dom.getElementTextMap(this.simplifiedHTML)

		this.dispatchEvent(new Event('afterUpdate'))
		return this.simplifiedHTML
	}

	/**
	 * Clean up all element highlights
	 */
	async cleanUpHighlights(): Promise<void> {
		dom.cleanUpHighlights()
	}

	// ======= Element Actions =======

	/**
	 * Click element by index
	 *
	 * @param index Index of the element in the current selector map.
	 * @returns A result whose message names the element (its text description when
	 * known, otherwise the index) and warns when the link opens in a new tab.
	 */
	async clickElement(index: number): Promise<ActionResult> {
		try {
			const element = getElementByIndex(this.selectorMap, index)
			const elemText = this.elementTextMap.get(index)
			await clickElement(element)

			// Handle links that open in new tabs
			if (element instanceof HTMLAnchorElement && element.target === '_blank') {
				return {
					success: true,
					message: `✅ Clicked element (${elemText ?? index}). ⚠️ Link opens in a new tab. You are not capable of reading new tabs.`,
				}
			}

			return {
				success: true,
				message: `✅ Clicked element (${elemText ?? index}).`,
			}
		} catch (error) {
			return {
				success: false,
				message: `❌ Failed to click element: ${error}`,
			}
		}
	}

	/**
	 * Input text into element by index
	 *
	 * @param index Index of the element in the current selector map.
	 * @param text Text to enter.
	 */
	async inputText(index: number, text: string): Promise<ActionResult> {
		try {
			const element = getElementByIndex(this.selectorMap, index)
			const elemText = this.elementTextMap.get(index)
			await inputTextElement(element, text)

			return {
				success: true,
				message: `✅ Input text (${text}) into element (${elemText ?? index}).`,
			}
		} catch (error) {
			return {
				success: false,
				message: `❌ Failed to input text: ${error}`,
			}
		}
	}

	/**
	 * Select dropdown option by index and option text
	 *
	 * @param index Index of the element in the current selector map.
	 * @param optionText Text of the option to select.
	 */
	async selectOption(index: number, optionText: string): Promise<ActionResult> {
		try {
			const element = getElementByIndex(this.selectorMap, index)
			const elemText = this.elementTextMap.get(index)
			// NOTE(review): the element is only cast, not checked, to be a <select>;
			// presumably selectOptionElement validates it — confirm.
			await selectOptionElement(element as HTMLSelectElement, optionText)

			return {
				success: true,
				message: `✅ Selected option (${optionText}) in element (${elemText ?? index}).`,
			}
		} catch (error) {
			return {
				success: false,
				message: `❌ Failed to select option: ${error}`,
			}
		}
	}

	/**
	 * Scroll vertically
	 *
	 * @param options.down Scroll direction: `true` scrolls down, `false` scrolls up.
	 * @param options.numPages Distance expressed in multiples of `window.innerHeight`;
	 * used only when `pixels` is not provided.
	 * @param options.pixels Explicit distance in pixels (a non-negative magnitude).
	 * Takes precedence over `numPages`.
	 * @param options.index Optional index of an element handed to the scroll helper
	 * as the scroll target; when omitted, `null` is passed instead.
	 * @returns On success, the message produced by the scroll helper.
	 */
	async scroll(options: {
		down: boolean
		numPages: number
		pixels?: number
		index?: number
	}): Promise<ActionResult> {
		try {
			const { down, numPages, pixels, index } = options

			// Resolve the distance first, then apply the direction, so that an explicit
			// `pixels` value also honours `down` (same convention as scrollHorizontally,
			// which always passes a signed amount).
			const distance = pixels ?? numPages * window.innerHeight
			const scrollAmount = down ? distance : -distance

			const element = index !== undefined ? getElementByIndex(this.selectorMap, index) : null
			const message = await scrollVertically(down, scrollAmount, element)

			return {
				success: true,
				message,
			}
		} catch (error) {
			return {
				success: false,
				message: `❌ Failed to scroll: ${error}`,
			}
		}
	}

	/**
	 * Scroll horizontally
	 *
	 * @param options.right Scroll direction: `true` scrolls right, `false` scrolls left.
	 * @param options.pixels Distance in pixels; the sign is derived from `right`.
	 * @param options.index Optional index of an element handed to the scroll helper
	 * as the scroll target; when omitted, `null` is passed instead.
	 * @returns On success, the message produced by the scroll helper.
	 */
	async scrollHorizontally(options: {
		right: boolean
		pixels: number
		index?: number
	}): Promise<ActionResult> {
		try {
			const { right, pixels, index } = options
			const scrollAmount = pixels * (right ? 1 : -1)

			const element = index !== undefined ? getElementByIndex(this.selectorMap, index) : null
			const message = await scrollHorizontally(right, scrollAmount, element)

			return {
				success: true,
				message,
			}
		} catch (error) {
			return {
				success: false,
				message: `❌ Failed to scroll horizontally: ${error}`,
			}
		}
	}

	/**
	 * Execute arbitrary JavaScript on the page
	 *
	 * The script is used as the body of an async arrow function, so it may use
	 * `await` and must `return` a value for it to appear in the result message.
	 *
	 * NOTE(security): this runs `eval` on the given string with full access to the
	 * page. Callers must only pass scripts from a trusted source.
	 */
	async executeJavascript(script: string): Promise<ActionResult> {
		try {
			// Wrap script in async function to support await
			const asyncFunction = eval(`(async () => { ${script} })`)
			const result = await asyncFunction()

			return {
				success: true,
				message: `✅ Executed JavaScript. Result: ${result}`,
			}
		} catch (error) {
			return {
				success: false,
				message: `❌ Error executing JavaScript: ${error}`,
			}
		}
	}

	/**
	 * Dispose and clean up resources
	 *
	 * Removes highlights and resets the cached tree, both index maps and the
	 * simplified HTML to their initial empty state.
	 */
	dispose(): void {
		dom.cleanUpHighlights()
		this.flatTree = null
		this.selectorMap.clear()
		this.elementTextMap.clear()
		this.simplifiedHTML = '<EMPTY>'
	}
}

View File

@@ -2,26 +2,14 @@
* Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved.
*/
import type { PageAgent } from '../PageAgent'
import type { InteractiveElementDomNode } from './dom/dom_tree/type'
// ======= general utils =======
export async function waitFor(seconds: number): Promise<void> {
async function waitFor(seconds: number): Promise<void> {
await new Promise((resolve) => setTimeout(resolve, seconds * 1000))
}
// Last URL that was observed; initialised when this module is loaded.
let currentUrl = window.location.href
/**
 * Report a URL change since the previous check.
 *
 * @returns An empty string when the URL is unchanged; otherwise a `<sys>` message
 * containing the new URL (the new URL is remembered for the next call).
 */
export async function getSystemInfo() {
// If current URL is already up to date, no need to add message
if (currentUrl === window.location.href) return ''
await waitFor(0.3) // Wait a bit longer for page to load
// Re-read the location after the wait so the reported URL is the current one
currentUrl = window.location.href
return `\n<sys> Current URL changed to: ${currentUrl} </sys>`
}
// ======= dom utils =======
export async function movePointerToElement(element: HTMLElement) {
@@ -35,10 +23,13 @@ export async function movePointerToElement(element: HTMLElement) {
}
/**
* Get the HTMLElement by index from the selectorMap in PageAgent.
* Get the HTMLElement by index from a selectorMap.
*/
export function getElementByIndex(pageAgent: PageAgent, index: number): HTMLElement {
const interactiveNode = pageAgent.selectorMap.get(index)
export function getElementByIndex(
selectorMap: Map<number, InteractiveElementDomNode>,
index: number
): HTMLElement {
const interactiveNode = selectorMap.get(index)
if (!interactiveNode) {
throw new Error(`No interactive element found at index ${index}`)
}
@@ -170,7 +161,6 @@ export async function selectOptionElement(selectElement: HTMLSelectElement, opti
await waitFor(0.1) // Wait to ensure change event processing completes
}
// eslint-disable-next-line @typescript-eslint/require-await
export async function scrollIntoViewIfNeeded(element: HTMLElement) {
const el = element as any
if (el.scrollIntoViewIfNeeded) {

View File

@@ -0,0 +1,16 @@
/**
* Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved.
*/
/**
 * Default viewport expansion for DOM tree extraction.
 *
 * - `-1`: full page (no viewport restriction)
 * - `0`: viewport only
 * - positive values: expand the viewport by that many pixels
 *
 * @note isTopElement depends on elementFromPoint, which returns null for points
 * outside the viewport. Positive expansion values therefore have no practical
 * effect; only `-1` and `0` behave differently.
 */
// export const VIEWPORT_EXPANSION = 100
export const VIEWPORT_EXPANSION = -1

View File

@@ -1,5 +1,5 @@
import { VIEWPORT_EXPANSION } from '../config/constants'
import domTree from './dom_tree/index'
import { VIEWPORT_EXPANSION } from '../constants'
import domTree from './dom_tree/index.js'
import {
ElementDomNode,
FlatDomTree,

View File

@@ -0,0 +1,20 @@
import type { PageController } from '../PageController'
/**
 * Pending cleanup callbacks. The 'afterUpdate' listener registered by patchAntd
 * invokes each one and then empties the list.
 */
const clearFunctions: Array<() => void> = []
/**
 * antd's select is structured as a div wrapping an input. All of the information
 * lives on the input tag, but the input is not visible and does not appear in the
 * cleaned tree, so the intent is to lift it up here.
 *
 * Work in progress: the matching inputs are collected, but nothing is done with
 * them yet.
 */
function fixAntdSelect() {
	const comboboxInputs = Array.from(document.querySelectorAll('input[role="combobox"]'))
	// TODO: iterate over comboboxInputs and patch each one.
}
/**
 * Hook the antd workarounds into a PageController's update lifecycle.
 *
 * - On 'beforeUpdate', `fixAntdSelect` runs.
 * - On 'afterUpdate', every queued cleanup callback runs and the queue is emptied.
 *
 * @param pageController Controller whose lifecycle events are listened to.
 */
export function patchAntd(pageController: PageController) {
	const flushCleanups = () => {
		for (const cleanup of clearFunctions) {
			cleanup()
		}
		clearFunctions.length = 0
	}

	pageController.addEventListener('beforeUpdate', fixAntdSelect)
	pageController.addEventListener('afterUpdate', flushCleanups)
}

View File

@@ -0,0 +1,16 @@
import type { PageController } from '../PageController'
/**
 * Find common React root elements and mark them with the
 * data-page-agent-not-interactive attribute.
 *
 * The lookup runs once, at call time, against the current document.
 *
 * @param pageController Currently unused.
 */
export function patchReact(pageController: PageController) {
	const rootSelector = [
		'[data-reactroot]',
		'[data-reactid]',
		'[data-react-checksum]',
		'#root',
		'#app',
		'[id^="root-"]',
		'[id^="app-"]',
		'#adex-wrapper',
		'#adex-root',
	].join(', ')

	document.querySelectorAll(rootSelector).forEach((rootElement) => {
		rootElement.setAttribute('data-page-agent-not-interactive', 'true')
	})
}
/**
 * @todo Interaction detection (heavy, might have false negatives): if an element's width/height
 * equals the body's offsetWidth/offsetHeight, treat it as a root element and as non-interactive.
 * React often attaches many event listeners to root elements, which causes false positives.
 */

View File

@@ -0,0 +1,12 @@
{
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.tsbuildinfo",
"noEmit": false,
"allowImportingTsExtensions": false,
"baseUrl": ".",
"outDir": "dist"
},
"include": ["**/*.ts", "**/*.js"],
"exclude": ["dist", "node_modules"]
}

View File

@@ -0,0 +1,41 @@
// @ts-check
import chalk from 'chalk'
import { dirname, resolve } from 'path'
import dts from 'unplugin-dts/vite'
import { fileURLToPath } from 'url'
import { defineConfig } from 'vite'
import cssInjectedByJsPlugin from 'vite-plugin-css-injected-by-js'
// ES modules have no built-in __dirname, so derive it from this file's URL.
const __dirname = dirname(fileURLToPath(import.meta.url))

console.log(chalk.cyan(`📦 Building @page-agent/page-controller`))

// Absolute paths of the library entry point and of the emitted bundle directory.
const libraryEntry = resolve(__dirname, 'src/PageController.ts')
const libraryOutDir = resolve(__dirname, 'dist', 'lib')

/**
 * Vite library build for @page-agent/page-controller: a single, unminified ES
 * bundle with source maps and bundled type declarations.
 */
export default defineConfig({
	clearScreen: false,
	publicDir: false,
	plugins: [
		dts({ tsconfigPath: './tsconfig.json', bundleTypes: true }),
		cssInjectedByJsPlugin({ relativeCSSInjection: true }),
	],
	esbuild: {
		keepNames: true,
	},
	define: {
		'process.env.NODE_ENV': '"production"',
	},
	build: {
		outDir: libraryOutDir,
		minify: false,
		sourcemap: true,
		cssCodeSplit: true,
		lib: {
			entry: libraryEntry,
			name: 'PageController',
			fileName: 'page-controller',
			formats: ['es'],
		},
		rollupOptions: {
			// Nothing is declared external.
			external: [],
		},
	},
})