diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..658ad59 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,94 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [1.0.0] - 2026-01-19 + +### 🎉 First Stable Release + +PageAgent is now ready for production use. The API is stable and breaking changes will follow semantic versioning. + +### Features + +#### Core + +- **PageAgent** - Main entry class with built-in UI Panel +- **PageAgentCore** - Headless agent class for custom UI or programmatic use +- **DOM Analysis** - Text-based DOM extraction with high-intensity dehydration +- **LLM Support** - Works with OpenAI, Claude, DeepSeek, Qwen, and other OpenAI-compatible APIs +- **Tool System** - Built-in tools for click, input, scroll, select, and more +- **Custom Tools** - Extend agent capabilities with your own tools (experimental) +- **Lifecycle Hooks** - Hook into agent execution (experimental) +- **Instructions System** - System-level and page-level instructions to guide agent behavior +- **Data Masking** - Transform page content before sending to LLM + +#### Page Controller + +- **Element Interactions** - Click, input text, select options, scroll +- **Visual Mask** - Blocks user interaction during automation +- **DOM Tree Extraction** - Efficient page structure extraction for LLM consumption + +#### UI + +- **Interactive Panel** - Real-time task progress and agent thinking display +- **Ask User Tool** - Agent can ask users for clarification +- **i18n Support** - English and Chinese localization + +### Configuration + +```typescript +interface PageAgentConfig { + // LLM Configuration (required) + baseURL: string + apiKey: string + model: string + temperature?: number + maxRetries?: number + customFetch?: typeof fetch + + // Agent 
Configuration + language?: 'en-US' | 'zh-CN' + maxSteps?: number // default: 20 + customTools?: Record // experimental + instructions?: InstructionsConfig + transformPageContent?: (content: string) => string | Promise<string> + experimentalScriptExecutionTool?: boolean // default: false + + // Lifecycle Hooks (experimental) + onBeforeTask?: (agent) => void + onAfterTask?: (agent, result) => void + onBeforeStep?: (agent, stepCount) => void + onAfterStep?: (agent, history) => void + onDispose?: (agent, reason?) => void + + // Page Controller Configuration + enableMask?: boolean // default: true + viewportExpansion?: number + interactiveBlacklist?: Element[] + interactiveWhitelist?: Element[] +} +``` + +### Packages + +| Package | Description | +| ----------------------------- | ---------------------------------- | +| `page-agent` | Main entry with UI Panel | +| `@page-agent/core` | Core agent logic without UI | +| `@page-agent/llms` | LLM client with retry logic | +| `@page-agent/page-controller` | DOM operations and visual feedback | +| `@page-agent/ui` | Panel and i18n | + +### Known Limitations + +- Single-page application only (cannot navigate across pages) +- No visual recognition (relies on DOM structure) +- Limited interaction support (no hover, drag-drop, canvas operations) +- See [Limitations](https://alibaba.github.io/page-agent/#/docs/introduction/limitations) for details + +### Acknowledgments + +This project builds upon the excellent work of [browser-use](https://github.com/browser-use/browser-use). DOM processing components and prompts are adapted from browser-use (MIT License). 
diff --git a/packages/core/src/PageAgentCore.ts b/packages/core/src/PageAgentCore.ts index baa00ce..dd98daf 100644 --- a/packages/core/src/PageAgentCore.ts +++ b/packages/core/src/PageAgentCore.ts @@ -21,7 +21,7 @@ import { MacroToolInput, MacroToolResult, } from './types' -import { normalizeResponse, trimLines, uid } from './utils' +import { normalizeResponse, trimLines, uid, waitFor } from './utils' import { assert } from './utils/assert' export { type PageAgentConfig } @@ -184,12 +184,12 @@ export class PageAgentCore extends EventTarget { this.tools.delete('ask_user') } - const onBeforeStep = this.config.onBeforeStep || (() => void 0) - const onAfterStep = this.config.onAfterStep || (() => void 0) - const onBeforeTask = this.config.onBeforeTask || (() => void 0) - const onAfterTask = this.config.onAfterTask || (() => void 0) + const onBeforeStep = this.config.onBeforeStep + const onAfterStep = this.config.onAfterStep + const onBeforeTask = this.config.onBeforeTask + const onAfterTask = this.config.onAfterTask - await onBeforeTask.call(this) + await onBeforeTask?.(this) // Show mask await this.pageController.showMask() @@ -215,7 +215,7 @@ export class PageAgentCore extends EventTarget { while (true) { await this.#generateObservations(step) - await onBeforeStep.call(this, step) + await onBeforeStep?.(this, step) console.group(`step: ${step}`) @@ -271,7 +271,7 @@ export class PageAgentCore extends EventTarget { console.log(chalk.green('Step finished:'), actionName) console.groupEnd() - await onAfterStep.call(this, this.history) + await onAfterStep?.(this, this.history) step++ if (step > this.config.maxSteps) { @@ -281,7 +281,7 @@ export class PageAgentCore extends EventTarget { data: 'Step count exceeded maximum limit', history: this.history, } - await onAfterTask.call(this, result) + await onAfterTask?.(this, result) return result } if (actionName === 'done') { @@ -294,7 +294,7 @@ export class PageAgentCore extends EventTarget { data: text, history: this.history, 
} - await onAfterTask.call(this, result) + await onAfterTask?.(this, result) return result } } @@ -308,7 +308,7 @@ export class PageAgentCore extends EventTarget { data: errorMessage, history: this.history, } - await onAfterTask.call(this, result) + await onAfterTask?.(this, result) return result } } @@ -473,6 +473,7 @@ export class PageAgentCore extends EventTarget { if (currentURL !== this.states.lastURL) { this.pushObservation(`Page navigated to → ${currentURL}`) this.states.lastURL = currentURL + await waitFor(500) // wait for page to stabilize } // Warn about remaining steps @@ -584,6 +585,6 @@ export class PageAgentCore extends EventTarget { // Emit dispose event for UI cleanup this.dispatchEvent(new Event('dispose')) - this.config.onDispose?.call(this, reason) + this.config.onDispose?.(this, reason) } } diff --git a/packages/core/src/config/index.ts b/packages/core/src/config/index.ts index 3f3cbc8..9c75267 100644 --- a/packages/core/src/config/index.ts +++ b/packages/core/src/config/index.ts @@ -69,20 +69,52 @@ export interface AgentConfig { getPageInstructions?: (url: string) => string | undefined | null } - // lifecycle hooks - // @todo: use event instead of hooks - // @todo: remove `this` binding, pass agent as explicit parameter instead - - onBeforeStep?: (this: PageAgentCore, stepCnt: number) => Promise | void - onAfterStep?: (this: PageAgentCore, history: HistoricalEvent[]) => Promise | void - onBeforeTask?: (this: PageAgentCore) => Promise | void - onAfterTask?: (this: PageAgentCore, result: ExecutionResult) => Promise | void + /** + * Lifecycle hooks for task execution. + * @experimental API may change in future versions. + * + * All hooks receive the agent instance as first parameter. + */ /** - * @note this hook can block the disposal process - * @todo remove `this` binding, pass agent as explicit parameter instead + * Called before each step execution. 
+ * @experimental + * @param agent - The PageAgentCore instance + * @param stepCount - Current step number (0-indexed) */ - onDispose?: (this: PageAgentCore, reason?: string) => void + onBeforeStep?: (agent: PageAgentCore, stepCount: number) => Promise<void> | void + + /** + * Called after each step execution. + * @experimental + * @param agent - The PageAgentCore instance + * @param history - Current history of events + */ + onAfterStep?: (agent: PageAgentCore, history: HistoricalEvent[]) => Promise<void> | void + + /** + * Called before task execution starts. + * @experimental + * @param agent - The PageAgentCore instance + */ + onBeforeTask?: (agent: PageAgentCore) => Promise<void> | void + + /** + * Called after task execution completes (success or failure). + * @experimental + * @param agent - The PageAgentCore instance + * @param result - The execution result + */ + onAfterTask?: (agent: PageAgentCore, result: ExecutionResult) => Promise<void> | void + + /** + * Called when the agent is disposed. + * @experimental + * @note This hook is invoked synchronously and its return value is not awaited, so long-running synchronous work in it delays disposal. + * @param agent - The PageAgentCore instance + * @param reason - Optional reason for disposal + */ + onDispose?: (agent: PageAgentCore, reason?: string) => void // page behavior hooks @@ -109,21 +141,6 @@ export interface AgentConfig { * } */ transformPageContent?: (content: string) => Promise<string> | string - - /** - * TODO: @unimplemented - * hook when action causes a new page to be opened - * @note PageAgent will try to detect new pages and decide if it's caused by an action. But not very reliable. - * @todo remove `this` binding, pass agent as explicit parameter instead - */ - // onNewPageOpen?: (this: PageAgent, url: string) => Promise<void> | void - - /** - * TODO: @unimplemented - * try to navigate to a new page instead of opening a new tab/window. - * @note will unload the current page when a action tries to open a new page. so that things keep in the same tab/window. 
- */ - // experimentalPreventNewPage?: boolean } export type PageAgentConfig = LLMConfig & AgentConfig & PageControllerConfig diff --git a/packages/website/src/pages/docs/advanced/page-agent-core/page.tsx b/packages/website/src/pages/docs/advanced/page-agent-core/page.tsx index 4f53ae1..4be8d22 100644 --- a/packages/website/src/pages/docs/advanced/page-agent-core/page.tsx +++ b/packages/website/src/pages/docs/advanced/page-agent-core/page.tsx @@ -213,35 +213,48 @@ const result = await agent.execute('Fill in the form with test data')`} {/* Lifecycle Hooks */}

{isZh ? '生命周期钩子' : 'Lifecycle Hooks'}

+
+

+ ⚠️ {isZh ? '警告' : 'Warning'}:{' '} + {isZh + ? '这些接口高度实验性,可能在未来版本中发生变化。建议优先使用事件系统(Events)来监听 Agent 状态。' + : 'These APIs are highly experimental and may change in future versions. Prefer using the Events system for monitoring agent state.'} +

+
+

+ {isZh + ? '所有生命周期钩子都接收 agent 实例作为第一个参数,便于在回调中访问 Agent 状态和方法。' + : 'All lifecycle hooks receive the agent instance as first parameter, making it easy to access agent state and methods in callbacks.'} +

void | Promise', + type: '(agent: PageAgentCore, stepCount: number) => void | Promise', description: isZh ? '每个步骤执行前调用' : 'Called before each step execution', status: 'experimental', }, { name: 'onAfterStep', - type: '(history: HistoricalEvent[]) => void | Promise', + type: '(agent: PageAgentCore, history: HistoricalEvent[]) => void | Promise', description: isZh ? '每个步骤执行后调用' : 'Called after each step execution', status: 'experimental', }, { name: 'onBeforeTask', - type: '() => void | Promise', + type: '(agent: PageAgentCore) => void | Promise', description: isZh ? '任务开始前调用' : 'Called before task starts', status: 'experimental', }, { name: 'onAfterTask', - type: '(result: ExecutionResult) => void | Promise', + type: '(agent: PageAgentCore, result: ExecutionResult) => void | Promise', description: isZh ? '任务结束后调用' : 'Called after task ends', status: 'experimental', }, { name: 'onDispose', - type: '(reason?: string) => void', + type: '(agent: PageAgentCore, reason?: string) => void', description: isZh ? 'Agent 销毁时调用' : 'Called when agent is disposed', status: 'experimental', },