refactor: rename page-agent to page-agent-core

This commit is contained in:
Simon
2026-01-19 16:06:07 +08:00
parent 09c3084629
commit c9f049a733
21 changed files with 8433 additions and 8543 deletions

View File

@@ -0,0 +1,595 @@
/**
* Copyright (C) 2025 Alibaba Group Holding Limited
* All rights reserved.
*/
import { LLM, type Tool } from '@page-agent/llms'
import { PageController } from '@page-agent/page-controller'
import chalk from 'chalk'
import zod from 'zod'
import { type PageAgentConfig } from './config'
import { MAX_STEPS } from './config/constants'
import SYSTEM_PROMPT from './prompts/system_prompt.md?raw'
import { tools } from './tools'
import {
AgentActivity,
AgentReflection,
AgentStatus,
AgentStep,
ExecutionResult,
HistoricalEvent,
MacroToolInput,
MacroToolResult,
} from './types'
import { normalizeResponse, trimLines, uid } from './utils'
import { assert } from './utils/assert'
export { type PageAgentConfig }
export { tool, type PageAgentTool } from './tools'
/**
* AI agent for browser DOM automation.
*
* @remarks
* ## Event System
* - `statuschange` - Agent status transitions (idle → running → completed/error)
* - `historychange` - History events updated (persistent, part of agent memory)
* - `activity` - Real-time activity feedback (transient, for UI only)
* - `dispose` - Agent cleanup triggered
*
* ## Information Streams
* 1. **History Events** (`history` array)
* - Persistent event stream that forms agent's memory
* - Included in LLM context across steps
* - Types: steps, observations, user takeovers, llm errors
*
* 2. **Activity Events** (via `activity` event)
* - Transient UI feedback during task execution
* - NOT included in LLM context
* - Types: thinking, executing, executed, retrying, error
*/
export class PageAgentCore extends EventTarget {
	/** Combined LLM, agent and page-controller options this instance was constructed with. */
	config: PageAgentConfig
	/** Identifier of this agent instance, generated once at construction. */
	id = uid()
	/** Per-instance tool registry: a copy of the built-in tools with `config.customTools` applied. */
	tools: typeof tools
	/** Becomes `true` once {@link dispose} has been called. */
	disposed = false
	/** Task text passed to the most recent {@link execute} call. */
	task = ''
	/** Identifier regenerated at the start of every {@link execute} call. */
	taskId = ''

	/** Agent execution status */
	#status: AgentStatus = 'idle'

	/**
	 * Callback for when agent needs user input (ask_user tool)
	 * If not set, ask_user tool will be disabled
	 * @example onAskUser: (q) => window.prompt(q) || ''
	 */
	onAskUser?: (question: string) => Promise<string>

	/** LLM client; its `retry` and `error` events are mirrored into activity and history. */
	#llm: LLM
	/** Cancels the in-flight task; replaced at the start of every {@link execute} call. */
	#abortController = new AbortController()

	/** PageController for DOM operations */
	pageController: PageController

	/** Runtime states for tracking across steps */
	states = {
		/** Accumulated wait time in seconds, used by wait tool */
		totalWaitTime: 0,
		/** Last known URL for detecting navigation */
		lastURL: '',
	}

	/** History events */
	history: HistoricalEvent[] = []

	/**
	 * Create an agent.
	 *
	 * Builds the LLM client, copies the built-in tools, applies `config.customTools`
	 * (a `null` entry removes the tool of that name, any other entry adds or overrides it)
	 * and removes `execute_javascript` unless `config.experimentalScriptExecutionTool` is set.
	 *
	 * @param config - Agent configuration. `config.pageController`, when provided, is used
	 * instead of constructing a new PageController.
	 */
	constructor(config: PageAgentConfig) {
		super()

		this.config = config
		this.#llm = new LLM(this.config)
		// Copy so per-instance additions/removals never touch the shared built-in map
		this.tools = new Map(tools)

		// Initialize PageController with config (mask enabled by default)
		this.pageController =
			this.config.pageController ??
			new PageController({
				...this.config,
				enableMask: this.config.enableMask ?? true,
			})

		// Listen to LLM retry events
		this.#llm.addEventListener('retry', (e) => {
			const { attempt, maxAttempts } = (e as CustomEvent).detail
			this.emitActivity({ type: 'retrying', attempt, maxAttempts })
			// Also push to history for panel rendering
			this.history.push({
				type: 'error',
				errorType: 'retry',
				message: `LLM retry attempt ${attempt} of ${maxAttempts}`,
				attempt,
				maxAttempts,
			})
			this.#emitHistoryChange()
		})

		this.#llm.addEventListener('error', (e) => {
			const { error } = (e as CustomEvent).detail
			const message = String(error)
			this.emitActivity({ type: 'error', message })
			// Also push to history for panel rendering
			this.history.push({
				type: 'error',
				errorType: 'error',
				message,
			})
			this.#emitHistoryChange()
		})

		if (this.config.customTools) {
			for (const [name, tool] of Object.entries(this.config.customTools)) {
				if (tool === null) {
					this.tools.delete(name)
					continue
				}
				this.tools.set(name, tool)
			}
		}

		if (!this.config.experimentalScriptExecutionTool) {
			this.tools.delete('execute_javascript')
		}
	}

	/** Get current agent status */
	get status(): AgentStatus {
		return this.#status
	}

	/** Emit statuschange event */
	#emitStatusChange(): void {
		this.dispatchEvent(new Event('statuschange'))
	}

	/** Emit historychange event */
	#emitHistoryChange(): void {
		this.dispatchEvent(new Event('historychange'))
	}

	/**
	 * Emit activity event - for transient UI feedback
	 * @param activity - Current agent activity
	 */
	emitActivity(activity: AgentActivity): void {
		this.dispatchEvent(new CustomEvent('activity', { detail: activity }))
	}

	/** Update status and emit event (no event when the status is unchanged) */
	#setStatus(status: AgentStatus): void {
		if (this.#status !== status) {
			this.#status = status
			this.#emitStatusChange()
		}
	}

	/**
	 * Push a persistent observation to the history event stream.
	 * This will be visible in <agent_history> and remain in memory across steps.
	 */
	pushObservation(content: string): void {
		this.history.push({ type: 'observation', content })
		this.#emitHistoryChange()
	}

	/**
	 * Run `task` to completion.
	 *
	 * Each iteration records observations, asks the LLM for its next action through the
	 * macro tool and appends the resulting step to {@link history}. The loop ends when the
	 * LLM calls `done`, when `MAX_STEPS` steps have run, or when an error (including an
	 * abort) is thrown inside the loop.
	 *
	 * @param task - The user request; must be non-empty.
	 * @returns The outcome of the task. Failures inside the loop are reported as
	 * `success: false` with the stringified error as `data` rather than being thrown.
	 * @throws Error when `task` is empty. Errors from `onBeforeTask` and `showMask()` also
	 * propagate, because they run before the guarded loop.
	 */
	async execute(task: string): Promise<ExecutionResult> {
		if (!task) throw new Error('Task is required')
		this.task = task
		this.taskId = uid()

		// Disable ask_user tool if onAskUser is not set.
		// The tool is removed from this instance's registry, not just skipped for this run.
		if (!this.onAskUser) {
			this.tools.delete('ask_user')
		}

		const onBeforeStep = this.config.onBeforeStep || (() => void 0)
		const onAfterStep = this.config.onAfterStep || (() => void 0)
		const onBeforeTask = this.config.onBeforeTask || (() => void 0)
		const onAfterTask = this.config.onAfterTask || (() => void 0)

		await onBeforeTask.call(this)

		// Show mask
		await this.pageController.showMask()

		// Cancel whatever the previous controller was guarding, then start with a fresh one
		this.#abortController.abort()
		this.#abortController = new AbortController()

		this.history = []
		this.#setStatus('running')
		this.#emitHistoryChange()

		// Reset states
		this.states = {
			totalWaitTime: 0,
			lastURL: '',
		}

		// Shared tail of every exit path: clean up, build the result, notify the hook.
		const finish = async (data: string, success: boolean): Promise<ExecutionResult> => {
			this.#onDone(data, success)
			const result: ExecutionResult = {
				success,
				data,
				history: this.history,
			}
			await onAfterTask.call(this, result)
			return result
		}

		try {
			let step = 0

			while (true) {
				await this.#generateObservations(step)
				await onBeforeStep.call(this, step)

				const action = await this.#runStep(step)

				await onAfterStep.call(this, this.history)

				step++

				// Check `done` before the step limit so that a `done` issued on the last
				// allowed step is reported with its own result instead of as a limit failure.
				if (action.name === 'done') {
					const success = action.input?.success ?? false
					const text = action.input?.text || 'no text provided'
					console.log(chalk.green.bold('Task completed'), success, text)
					return await finish(text, success)
				}

				// Steps are numbered 0..MAX_STEPS-1, matching "Step N of MAX_STEPS" in the
				// prompt and the remaining-steps warnings in #generateObservations.
				if (step >= MAX_STEPS) {
					return await finish('Step count exceeded maximum limit', false)
				}
			}
		} catch (error: unknown) {
			// NOTE: an error thrown by onAfterTask inside the loop also lands here, in which
			// case the task is reported as failed and the hook is invoked a second time.
			console.error('Task failed', error)
			const errorMessage = String(error)
			this.emitActivity({ type: 'error', message: errorMessage })
			return await finish(errorMessage, false)
		}
	}

	/**
	 * Run a single think-and-act step and record it in history.
	 *
	 * The console group opened for the step is always closed, even when the step throws
	 * (abort, LLM failure, tool failure), so later log output is not left nested.
	 *
	 * @param step - Zero-based step index, used only for the console group label.
	 * @returns The action that was executed in this step.
	 * @throws Error('AbortError') when the task was aborted before the LLM call; anything
	 * thrown by the LLM invocation propagates unchanged.
	 */
	async #runStep(step: number): Promise<AgentStep['action']> {
		console.group(`step: ${step}`)
		try {
			// abort
			if (this.#abortController.signal.aborted) throw new Error('AbortError')

			// Thinking
			console.log(chalk.blue('Thinking...'))
			this.emitActivity({ type: 'thinking' })

			const result = await this.#llm.invoke(
				[
					{
						role: 'system',
						content: this.#getSystemPrompt(),
					},
					{
						role: 'user',
						content: await this.#assembleUserPrompt(),
					},
				],
				{ AgentOutput: this.#packMacroTool() },
				this.#abortController.signal,
				{
					toolChoiceName: 'AgentOutput',
					normalizeResponse,
				}
			)

			const macroResult = result.toolResult as MacroToolResult
			const { input, output } = macroResult

			const reflection: Partial<AgentReflection> = {
				evaluation_previous_goal: input.evaluation_previous_goal,
				memory: input.memory,
				next_goal: input.next_goal,
			}

			// The action object has exactly one key: the name of the selected tool
			const actionName = Object.keys(input.action)[0]
			const action: AgentStep['action'] = {
				name: actionName,
				input: input.action[actionName],
				output: output,
			}

			this.history.push({
				type: 'step',
				reflection,
				action,
				usage: result.usage,
			} as AgentStep)
			this.#emitHistoryChange()

			console.log(chalk.green('Step finished:'), actionName)
			return action
		} finally {
			console.groupEnd()
		}
	}

	/**
	 * Merge all tools into a single MacroTool with the following input:
	 * - evaluation_previous_goal: string
	 * - memory: string
	 * - next_goal: string
	 * - action: { toolName: toolInput }
	 * where action must be selected from tools defined in this.tools
	 *
	 * Executing the macro tool runs the selected tool with `this` bound to the agent,
	 * emits `executing` / `executed` activities around it and returns both the input
	 * and the tool's output.
	 */
	#packMacroTool(): Tool<MacroToolInput, MacroToolResult> {
		const registry = this.tools

		// One `{ [toolName]: inputSchema }` object per tool, described by the tool's description
		const actionSchemas = Array.from(registry.entries()).map(([toolName, tool]) => {
			return zod.object({ [toolName]: tool.inputSchema }).describe(tool.description)
		})

		const actionSchema = zod.union(
			actionSchemas as unknown as [zod.ZodType, zod.ZodType, ...zod.ZodType[]]
		)

		const macroToolSchema = zod.object({
			// thinking: zod.string().optional(),
			evaluation_previous_goal: zod.string().optional(),
			memory: zod.string().optional(),
			next_goal: zod.string().optional(),
			action: actionSchema,
		})

		return {
			description: 'You MUST call this tool every step. Outputs your reflections and next action.',
			inputSchema: macroToolSchema as zod.ZodType<MacroToolInput>,
			execute: async (input: MacroToolInput): Promise<MacroToolResult> => {
				// abort
				if (this.#abortController.signal.aborted) throw new Error('AbortError')

				console.log(chalk.blue.bold('MacroTool execute'), input)
				const action = input.action

				const toolName = Object.keys(action)[0]
				const toolInput = action[toolName]

				// Build reflection text, only include non-empty fields
				const reflectionLines: string[] = []
				if (input.evaluation_previous_goal)
					reflectionLines.push(`✅: ${input.evaluation_previous_goal}`)
				if (input.memory) reflectionLines.push(`💾: ${input.memory}`)
				if (input.next_goal) reflectionLines.push(`🎯: ${input.next_goal}`)

				const reflectionText = reflectionLines.join('\n')
				if (reflectionText) {
					console.log(reflectionText)
				}

				// Find the corresponding tool
				const tool = registry.get(toolName)
				assert(tool, `Tool ${toolName} not found. (@note should have been caught before this!!!)`)

				console.log(chalk.blue.bold(`Executing tool: ${toolName}`), toolInput)

				// Emit executing activity
				this.emitActivity({ type: 'executing', tool: toolName, input: toolInput })

				const startTime = Date.now()

				// Execute tool with `this` set to the agent
				const result = await tool.execute.call(this, toolInput)

				const duration = Date.now() - startTime
				console.log(chalk.green.bold(`Tool (${toolName}) executed for ${duration}ms`), result)

				// Emit executed activity
				this.emitActivity({
					type: 'executed',
					tool: toolName,
					input: toolInput,
					output: result,
					duration,
				})

				// Reset wait time for non-wait tools
				if (toolName !== 'wait') {
					this.states.totalWaitTime = 0
				}

				// Return structured result
				return {
					input,
					output: result,
				}
			},
		}
	}

	/**
	 * Get system prompt, dynamically replace language settings based on configured language
	 * (`zh-CN` selects 中文, every other value selects English).
	 */
	#getSystemPrompt(): string {
		const targetLanguage = this.config.language === 'zh-CN' ? '中文' : 'English'
		return SYSTEM_PROMPT.replace(
			/Default working language: \*\*.*?\*\*/,
			`Default working language: **${targetLanguage}**`
		)
	}

	/**
	 * Get instructions from config and format as XML block
	 *
	 * @returns An `<instructions>` block followed by a blank line, or an empty string when
	 * neither system nor page instructions are available. A throwing
	 * `getPageInstructions` callback is logged and treated as "no page instructions".
	 */
	async #getInstructions(): Promise<string> {
		const { instructions } = this.config
		if (!instructions) return ''

		const systemInstructions = instructions.system?.trim()

		const url = await this.pageController.getCurrentUrl()
		let pageInstructions: string | undefined

		if (instructions.getPageInstructions) {
			try {
				pageInstructions = instructions.getPageInstructions(url)?.trim()
			} catch (error) {
				console.error(
					chalk.red('[PageAgent] Failed to execute getPageInstructions callback:'),
					error
				)
			}
		}

		if (!systemInstructions && !pageInstructions) return ''

		let result = '<instructions>\n'

		if (systemInstructions) {
			result += `<system_instructions>\n${systemInstructions}\n</system_instructions>\n`
		}

		if (pageInstructions) {
			result += `<page_instructions>\n${pageInstructions}\n</page_instructions>\n`
		}

		result += '</instructions>\n\n'

		return result
	}

	/**
	 * Generate observations before each step
	 * - URL change detection
	 * - Too many steps warning
	 * @param stepCount - Zero-based index of the step about to run.
	 * @todo loop detection
	 * @todo console error
	 */
	async #generateObservations(stepCount: number): Promise<void> {
		// Detect URL change
		const currentURL = await this.pageController.getCurrentUrl()
		if (currentURL !== this.states.lastURL) {
			this.pushObservation(`Page navigated to → ${currentURL}`)
			this.states.lastURL = currentURL
		}

		// Warn about remaining steps
		const remaining = MAX_STEPS - stepCount
		if (remaining === 5) {
			this.pushObservation(
				`⚠️ Only ${remaining} steps remaining. Consider wrapping up or calling done with partial results.`
			)
		} else if (remaining === 2) {
			this.pushObservation(
				`⚠️ Critical: Only ${remaining} steps left! You must finish the task or call done immediately.`
			)
		}
	}

	/**
	 * Build the user message for the next LLM call: optional `<instructions>`, then
	 * `<agent_state>`, `<agent_history>` and `<browser_state>`.
	 *
	 * Reflection fields the LLM omitted are rendered as empty text rather than the
	 * literal string "undefined". Error events are deliberately left out of the history.
	 */
	async #assembleUserPrompt(): Promise<string> {
		let prompt = ''

		// <instructions> (optional)
		prompt += await this.#getInstructions()

		// <agent_state>
		//  - <user_request>
		//  - <step_info>
		// <agent_state>
		const stepCount = this.history.filter((e) => e.type === 'step').length

		prompt += `<agent_state>
<user_request>
${this.task}
</user_request>
<step_info>
Step ${stepCount + 1} of ${MAX_STEPS} max possible steps
Current date and time: ${new Date().toISOString()}
</step_info>
</agent_state>
`

		// <agent_history>
		//  - <step_N> for steps
		//  - <sys> for observations and system messages
		prompt += '\n<agent_history>\n'

		let stepIndex = 0
		for (const event of this.history) {
			if (event.type === 'step') {
				stepIndex++
				// Reflection fields are optional in the macro tool schema
				prompt += `<step_${stepIndex}>
Evaluation of Previous Step: ${event.reflection.evaluation_previous_goal ?? ''}
Memory: ${event.reflection.memory ?? ''}
Next Goal: ${event.reflection.next_goal ?? ''}
Action Results: ${event.action.output}
</step_${stepIndex}>
`
			} else if (event.type === 'observation') {
				prompt += `<sys>${event.content}</sys>\n`
			} else if (event.type === 'user_takeover') {
				prompt += `<sys>User took over control and made changes to the page.</sys>\n`
			} else if (event.type === 'error') {
				// Error events are mainly for panel rendering, not included in LLM context
				// to avoid polluting the agent's reasoning with transient errors
			}
		}

		prompt += '</agent_history>\n\n'

		// <browser_state>
		prompt += await this.#getBrowserState()

		return trimLines(prompt)
	}

	/**
	 * End-of-task cleanup: remove highlights, hide the mask, publish the final status
	 * and abort the task's controller.
	 *
	 * @param text - Final message of the task (currently not used by the cleanup itself).
	 * @param success - Selects the final status: `completed` when true, `error` otherwise.
	 */
	#onDone(text: string, success = true) {
		this.pageController.cleanUpHighlights()
		this.pageController.hideMask() // No await - fire and forget
		this.#setStatus(success ? 'completed' : 'error')
		this.#abortController.abort()
	}

	/**
	 * Render the current page as a `<browser_state>` block, passing the page content
	 * through `config.transformPageContent` first when that hook is configured.
	 */
	async #getBrowserState(): Promise<string> {
		const state = await this.pageController.getBrowserState()

		let content = state.content
		if (this.config.transformPageContent) {
			content = await this.config.transformPageContent(content)
		}

		return trimLines(`<browser_state>
Current Page: [${state.title}](${state.url})
${state.header}
${content}
${state.footer}
</browser_state>
`)
	}

	/**
	 * Tear the agent down: dispose the page controller, clear history, abort any
	 * in-flight task, dispatch the `dispose` event and finally call `config.onDispose`.
	 *
	 * @param reason - Optional reason, used as the abort reason (defaults to
	 * 'PageAgent disposed') and forwarded to `config.onDispose`.
	 */
	dispose(reason?: string) {
		console.log('Disposing PageAgent...')
		this.disposed = true
		this.pageController.dispose()
		this.history = []
		this.#abortController.abort(reason ?? 'PageAgent disposed')

		// Emit dispose event for UI cleanup
		this.dispatchEvent(new Event('dispose'))

		this.config.onDispose?.call(this, reason)
	}
}

View File

@@ -0,0 +1,2 @@
// Agent-specific constants (LLM constants moved to @page-agent/llms)
/**
 * Step budget for a single agent task.
 * Used by the agent for the step-limit check in its main loop, for the
 * "steps remaining" observations, and in the `<step_info>` line of the prompt.
 */
export const MAX_STEPS = 20

View File

@@ -0,0 +1,130 @@
import type { LLMConfig } from '@page-agent/llms'
import type { PageController, PageControllerConfig } from '@page-agent/page-controller'
import type { PageAgentCore } from '../PageAgentCore'
import type { PageAgentTool } from '../tools'
import type { ExecutionResult, HistoricalEvent } from '../types'
export type { LLMConfig }
/**
 * Supported languages.
 * The agent also uses this value to pick the default working language written
 * into the system prompt (`zh-CN` selects Chinese, anything else English).
 */
export type SupportedLanguage = 'en-US' | 'zh-CN'
export interface AgentConfig {
// theme?: 'light' | 'dark'
/**
 * Language setting. The agent writes the matching default working language
 * into the system prompt: `zh-CN` selects Chinese, any other value (or none) English.
 */
language?: SupportedLanguage
/**
* Custom tools to extend PageAgent capabilities
* @experimental
* @note You can also override or remove internal tools by using the same name.
* @note A `null` value removes the tool with that name; any other value adds or replaces it.
* @see PageAgentTool
*
* @example
* // override internal tool
* import { tool } from 'page-agent'
* const customTools = {
* ask_user: tool({
* description:
* 'Ask the user or parent model a question and wait for their answer. Use this if you need more information or clarification.',
* inputSchema: zod.object({
* question: zod.string(),
* }),
* execute: async function (this: PageAgentCore, input) {
* const answer = await do_some_thing(input.question)
* return "✅ Received user answer: " + answer
* },
* })
* }
*
* @example
* // remove internal tool
* const customTools = {
* ask_user: null // never ask user questions
* }
*/
customTools?: Record<string, PageAgentTool | null>
/**
* Instructions to guide the agent's behavior.
* Rendered as an `<instructions>` block at the top of the user prompt;
* the block is omitted when both parts are empty.
*/
instructions?: {
/**
* Global system-level instructions, applied to all tasks
*/
system?: string
/**
* Dynamic page-level instructions callback
* Called before each step to get instructions for the current page
* @param url - Current page URL as reported by the page controller
* @returns Instructions string, or undefined/null to skip
* @note If the callback throws, the error is logged and the page instructions are skipped.
*/
getPageInstructions?: (url: string) => string | undefined | null
}
// lifecycle hooks
// @todo: use event instead of hooks
// @todo: remove `this` binding, pass agent as explicit parameter instead
/** Awaited before each step, after that step's observations were generated. `stepCnt` is zero-based. */
onBeforeStep?: (this: PageAgentCore, stepCnt: number) => Promise<void> | void
/** Awaited after each step has been recorded; receives the agent's full history array. */
onAfterStep?: (this: PageAgentCore, history: HistoricalEvent[]) => Promise<void> | void
/** Awaited at the start of `execute`, before the mask is shown. */
onBeforeTask?: (this: PageAgentCore) => Promise<void> | void
/** Awaited with the task's result before `execute` returns it. */
onAfterTask?: (this: PageAgentCore, result: ExecutionResult) => Promise<void> | void
/**
* Called synchronously at the end of `dispose`, after the `dispose` event was dispatched.
* @note this hook can block the disposal process
* @todo remove `this` binding, pass agent as explicit parameter instead
*/
onDispose?: (this: PageAgentCore, reason?: string) => void
// page behavior hooks
/**
* @experimental
* Enable the experimental script execution tool that allows executing generated JavaScript code on the page.
* Unless this is truthy, the `execute_javascript` tool is removed from the agent's tools.
* @note Can cause unpredictable side effects.
* @note May bypass some safe guards and data-masking mechanisms.
*/
experimentalScriptExecutionTool?: boolean
/**
* Transform page content before sending to LLM.
* Called after DOM extraction and simplification, before LLM invocation.
* Use cases: inspect extraction results, modify page info, mask sensitive data.
*
* @param content - Simplified page content that will be sent to LLM
* @returns Transformed content
*
* @example
* // Mask phone numbers
* transformPageContent: async (content) => {
* return content.replace(/1[3-9]\d{9}/g, '***********')
* }
*/
transformPageContent?: (content: string) => Promise<string> | string
/**
* @experimental
* Custom PageController instance to control page navigation and actions
* @note If not provided, a default PageController will be created
* (built from this config, with `enableMask` defaulting to `true`)
*/
pageController?: PageController
/**
* TODO: @unimplemented
* hook when action causes a new page to be opened
* @note PageAgent will try to detect new pages and decide if it's caused by an action. But not very reliable.
* @todo remove `this` binding, pass agent as explicit parameter instead
*/
// onNewPageOpen?: (this: PageAgent, url: string) => Promise<void> | void
/**
* TODO: @unimplemented
* try to navigate to a new page instead of opening a new tab/window.
* @note will unload the current page when an action tries to open a new page, so that things stay in the same tab/window.
*/
// experimentalPreventNewPage?: boolean
}
/**
 * Full configuration accepted by the agent: the intersection of the LLM options,
 * the agent options declared above, and the page-controller options.
 */
export type PageAgentConfig = LLMConfig & AgentConfig & PageControllerConfig

6
packages/core/src/env.d.ts vendored Normal file
View File

@@ -0,0 +1,6 @@
/// <reference types="vite/client" />
/**
 * Ambient declaration for imports whose specifier ends in `.md?raw`
 * (e.g. the system prompt): the default export is typed as a string.
 */
declare module '*.md?raw' {
const content: string
export default content
}

View File

@@ -0,0 +1 @@
system_prompt.md

View File

@@ -0,0 +1,155 @@
You are an AI agent designed to operate in an iterative loop to automate browser tasks. Your ultimate goal is accomplishing the task provided in <user_request>.
<intro>
You excel at following tasks:
1. Navigating complex websites and extracting precise information
2. Automating form submissions and interactive web actions
3. Gathering and saving information
4. Operating effectively in an agent loop
5. Efficiently performing diverse web tasks
</intro>
<language_settings>
- Default working language: **中文**
- Use the language that user is using. Return in user's language.
</language_settings>
<input>
At every step, your input will consist of:
1. <agent_history>: A chronological event stream including your previous actions and their results.
2. <agent_state>: Current <user_request> and <step_info>.
3. <browser_state>: Current URL, interactive elements indexed for actions, and visible page content.
</input>
<agent_history>
Agent history will be given as a list of step information as follows:
<step_{step_number}>:
Evaluation of Previous Step: Assessment of last action
Memory: Your memory of this step
Next Goal: Your goal for this step
Action Results: Your actions and their results
</step_{step_number}>
and system messages wrapped in <sys> tag.
</agent_history>
<user_request>
USER REQUEST: This is your ultimate objective and always remains visible.
- This has the highest priority. Make the user happy.
- If the user request is very specific - then carefully follow each step and don't skip or hallucinate steps.
- If the task is open ended you can plan yourself how to get it done.
</user_request>
<browser_state>
1. Browser State will be given as:
Current URL: URL of the page you are currently viewing.
Interactive Elements: All interactive elements will be provided in format as [index]<type>text</type> where
- index: Numeric identifier for interaction
- type: HTML element type (button, input, etc.)
- text: Element description
Examples:
[33]<div>User form</div>
\t*[35]<button aria-label='Submit form'>Submit</button>
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements tagged with `*[` are the new clickable elements that appeared on the website since the last step - if url has not changed.
- Pure text elements without [] are not interactive.
</browser_state>
<browser_rules>
Strictly follow these rules while using the browser and navigating the web:
- Only interact with elements that have a numeric [index] assigned.
- Only use indexes that are explicitly provided.
- If the page changes after, for example, an input text action, analyze if you need to interact with new elements, e.g. selecting the right option from the list.
- By default, only elements in the visible viewport are listed. Use scrolling actions if you suspect relevant content is offscreen which you need to interact with. Scroll ONLY if there are more pixels below or above the page.
- You can scroll by a specific number of pages using the num_pages parameter (e.g., 0.5 for half page, 2.0 for two pages).
- All scrollable elements are marked with a `data-scrollable` attribute, including the scrollable distance in every direction. You can scroll *the element* in case some of its area is overflowed.
- If a captcha appears, tell the user you cannot solve captchas. Finish the task and ask the user to solve it.
- If expected elements are missing, try scrolling, or navigating back.
- If the page is not fully loaded, use the `wait` action.
- Do not repeat one action for more than 3 times unless some conditions changed.
- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
- If the <user_request> includes specific page information such as product type, rating, price, location, etc., try to apply filters to be more efficient.
- The <user_request> is the ultimate goal. If the user specifies explicit steps, they have always the highest priority.
- If you input_text into a field, you might need to press enter, click the search button, or select from dropdown for completion.
- Don't login into a page if you don't have to. Don't login if you don't have the credentials.
- There are 2 types of tasks. Always first think about which type of request you are dealing with:
1. Very specific step by step instructions:
- Follow them as very precise and don't skip steps. Try to complete everything as requested.
2. Open ended tasks. Plan yourself, be creative in achieving them.
- If you get stuck e.g. with logins or captcha in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes a login prompt pops up accidentally even though some part of the page is still accessible, or you can get some information via web search.
</browser_rules>
<capability>
- You can only handle single page app. Do not jump out of current page.
- Do not click on a link if it will open in a new page (e.g. <a target="_blank">).
- It is ok to fail the task.
- User can be wrong. If the request of user is not achievable, inappropriate or you do not have enough information or tools to achieve it. Tell user to make a better request.
- Webpage can be broken. All webpages or apps have bugs. Some bug will make it hard for your job. It's encouraged to tell user the problem of current page. Your feedbacks (including failing) are valuable for user.
- Trying too hard can be harmful. Repeating some action back and forth or pushing for a complex procedure with little knowledge can cause unwanted results and harmful side effects. The user would rather you complete the task with a failure.
- If you do not have knowledge for the current webpage or task. You must require user to give specific instructions and detailed steps.
</capability>
<task_completion_rules>
You must call the `done` action in any of the following cases:
- When you have fully completed the USER REQUEST.
- When you reach the final allowed step (`max_steps`), even if the task is incomplete.
- When you feel stuck or unable to solve user request. Or user request is not clear or contains inappropriate content.
- If it is ABSOLUTELY IMPOSSIBLE to continue.
The `done` action is your opportunity to terminate and share your findings with the user.
- Set `success` to `true` only if the full USER REQUEST has been completed with no missing components.
- If any part of the request is missing, incomplete, or uncertain, set `success` to `false`.
- You can use the `text` field of the `done` action to communicate your findings and to provide a coherent reply to the user and fulfill the USER REQUEST.
- You are ONLY ALLOWED to call `done` as a single action. Don't call it together with other actions.
- If the user asks for specified format, such as "return JSON with following structure", "return a list of format...", MAKE sure to use the right format in your answer.
- If the user asks for a structured output, your `done` action's schema may be modified. Take this schema into account when solving the task!
</task_completion_rules>
<reasoning_rules>
Exhibit the following reasoning patterns to successfully achieve the <user_request>:
- Reason about <agent_history> to track progress and context toward <user_request>.
- Analyze the most recent "Next Goal" and "Action Results" in <agent_history> and clearly state what you previously tried to achieve.
- Analyze all relevant items in <agent_history> and <browser_state> to understand your state.
- Explicitly judge success/failure/uncertainty of the last action. Never assume an action succeeded just because it appears to be executed in your last step in <agent_history>. If the expected change is missing, mark the last action as failed (or uncertain) and plan a recovery.
- Analyze whether you are stuck, e.g. when you repeat the same actions multiple times without any progress. Then consider alternative approaches e.g. scrolling for more context or ask user for help.
- Ask user for help if you have any difficulty. Keep user in the loop.
- If you see information relevant to <user_request>, plan saving the information to memory.
- Always reason about the <user_request>. Make sure to carefully analyze the specific steps and information required. E.g. specific filters, specific form fields, specific information to search. Make sure to always compare the current trajectory with the user request and think carefully if that's how the user requested it.
</reasoning_rules>
<examples>
Here are examples of good output patterns. Use them as reference but never copy them directly.
<evaluation_examples>
- Positive Examples:
"evaluation_previous_goal": "Successfully navigated to the product page and found the target information. Verdict: Success"
"evaluation_previous_goal": "Clicked the login button and user authentication form appeared. Verdict: Success"
</evaluation_examples>
<memory_examples>
"memory": "Found many pending reports that need to be analyzed in the main page. Successfully processed the first 2 reports on quarterly sales data and moving on to inventory analysis and customer feedback reports."
</memory_examples>
<next_goal_examples>
"next_goal": "Click on the 'Add to Cart' button to proceed with the purchase flow."
"next_goal": "Extract details from the first item on the page."
</next_goal_examples>
</examples>
<output>
You must ALWAYS respond with a valid JSON in this exact format:
{
"evaluation_previous_goal": "Concise one-sentence analysis of your last action. Clearly state success, failure, or uncertain.",
"memory": "1-3 concise sentences of specific memory of this step and overall progress. You should put here everything that will help you track progress in future steps. Like counting pages visited, items found, etc.",
"next_goal": "State the next immediate goal and action to achieve it, in one clear sentence.",
"action":{"one_action_name": {// action-specific parameter}}
}
</output>

View File

@@ -0,0 +1,198 @@
/**
* Internal tools for PageAgent.
* @note Adapted from browser-use
*/
import zod, { type z } from 'zod'
import type { PageAgentCore } from '../PageAgentCore'
import { waitFor } from '../utils'
/**
* Internal tool definition that has access to PageAgent `this` context
*
* @typeParam TParams - Shape of the arguments `execute` receives, as described by `inputSchema`.
*/
export interface PageAgentTool<TParams = any> {
// name: string
/** Description of the tool; the agent attaches it to the tool's entry in the action schema. */
description: string
/** Zod schema for the tool's arguments. */
inputSchema: z.ZodType<TParams>
/**
* Run the tool. Declared with a `this` parameter: the agent invokes it with `this` set to
* the agent instance, so implementations must be `function` expressions, not arrow functions.
* The resolved string is recorded as the step's action output.
*/
execute: (this: PageAgentCore, args: TParams) => Promise<string>
}
/**
 * Identity helper for declaring a tool.
 *
 * It returns the very object it was given; its only purpose is to let TypeScript
 * check the definition against `PageAgentTool` and infer `TParams`.
 *
 * @param options - The tool definition.
 * @returns The same `options` object, unchanged.
 */
export function tool<TParams>(options: PageAgentTool<TParams>): PageAgentTool<TParams> {
	const definition: PageAgentTool<TParams> = options
	return definition
}
/**
* Internal tools for PageAgent, keyed by tool name.
* Each agent instance copies this map at construction, so per-agent
* additions and removals do not modify it.
* Note: Using any to allow different parameter types for each tool
*/
export const tools = new Map<string, PageAgentTool>()
// `done` — the action the LLM selects to finish the task.
// The tool itself does nothing: the agent's main loop recognises the action name
// and reads `text` / `success` from the action input.
tools.set(
	'done',
	tool({
		description:
			'Complete task - provide a summary of results for the user. Set success=True if task completed successfully, false otherwise. Text should be your response to the user summarizing results.',
		inputSchema: zod.object({
			text: zod.string(),
			success: zod.boolean().default(true),
		}),
		execute: async function (this: PageAgentCore, _input) {
			// @note main loop will handle this one
			// this.onDone(input.text, input.success)
			// An async function already wraps its return value in a promise.
			return 'Task completed'
		},
	})
)
// `wait` — pause so the page or its data can finish loading.
// Time that has already passed since the page controller's last recorded update is
// subtracted from the requested duration; the requested duration is what gets
// accumulated in `states.totalWaitTime`.
tools.set(
	'wait',
	tool({
		description:
			'Wait for x seconds. default 1s (max 10 seconds, min 1 second). This can be used to wait until the page or data is fully loaded.',
		inputSchema: zod.object({
			seconds: zod.number().min(1).max(10).default(1),
		}),
		execute: async function (this: PageAgentCore, input) {
			const requestedSeconds = input.seconds

			const lastUpdatedAt = await this.pageController.getLastUpdateTime()
			const elapsedSeconds = (Date.now() - lastUpdatedAt) / 1000
			const actualWaitTime = Math.max(0, requestedSeconds - elapsedSeconds)
			console.log(`actualWaitTime: ${actualWaitTime} seconds`)
			await waitFor(actualWaitTime)

			this.states.totalWaitTime += requestedSeconds

			// Nudge the LLM once the accumulated wait reaches 3 seconds
			const waitedTooLong = this.states.totalWaitTime >= 3
			if (waitedTooLong) {
				this.pushObservation(
					`You have waited ${this.states.totalWaitTime} seconds accumulatively. Do NOT wait any longer unless you have a good reason.`
				)
			}

			return `✅ Waited for ${requestedSeconds} seconds.`
		},
	})
)
// `ask_user` — forward a question to the agent's `onAskUser` callback and
// report the answer back to the LLM. Throws when no callback is configured.
tools.set(
	'ask_user',
	tool({
		description:
			'Ask the user a question and wait for their answer. Use this if you need more information or clarification.',
		inputSchema: zod.object({
			question: zod.string(),
		}),
		execute: async function (this: PageAgentCore, input) {
			if (this.onAskUser) {
				const answer = await this.onAskUser(input.question)
				return `User answered: ${answer}`
			}
			throw new Error('ask_user tool requires onAskUser callback to be set')
		},
	})
)
tools.set(
	'click_element_by_index',
	tool({
		description: 'Click element by index',
		inputSchema: zod.object({
			index: zod.int().min(0),
		}),
		// Delegates to the page controller and returns its result message.
		execute: async function (this: PageAgentCore, { index }) {
			const { message } = await this.pageController.clickElement(index)
			return message
		},
	})
)
tools.set(
	'input_text',
	tool({
		description: 'Click and input text into a input interactive element',
		inputSchema: zod.object({
			index: zod.int().min(0),
			text: zod.string(),
		}),
		// Delegates to the page controller and returns its result message.
		execute: async function (this: PageAgentCore, { index, text }) {
			const { message } = await this.pageController.inputText(index, text)
			return message
		},
	})
)
tools.set(
	'select_dropdown_option',
	tool({
		description:
			'Select dropdown option for interactive element index by the text of the option you want to select',
		inputSchema: zod.object({
			index: zod.int().min(0),
			text: zod.string(),
		}),
		// Delegates to the page controller and returns its result message.
		execute: async function (this: PageAgentCore, { index, text }) {
			const { message } = await this.pageController.selectOption(index, text)
			return message
		},
	})
)
/**
* @note Reference from browser-use
*/
tools.set(
	'scroll',
	tool({
		description:
			'Scroll the page by specified number of pages (set down=True to scroll down, down=False to scroll up, num_pages=number of pages to scroll like 0.5 for half page, 1.0 for one page, etc.). Optional index parameter to scroll within a specific element or its scroll container (works well for dropdowns and custom UI components). Optional pixels parameter to scroll by a specific number of pixels instead of pages.',
		inputSchema: zod.object({
			down: zod.boolean().default(true),
			num_pages: zod.number().min(0).max(10).optional().default(0.1),
			pixels: zod.number().int().min(0).optional(),
			index: zod.number().int().min(0).optional(),
		}),
		execute: async function (this: PageAgentCore, input) {
			// The whole input is forwarded as-is; `num_pages` is additionally
			// exposed under the camelCase key `numPages`.
			const scrollOptions = { ...input, numPages: input.num_pages }
			const { message } = await this.pageController.scroll(scrollOptions)
			return message
		},
	})
)
tools.set(
	'scroll_horizontally',
	tool({
		description:
			'Scroll the page or element horizontally (set right=True to scroll right, right=False to scroll left, pixels=number of pixels to scroll). Optional index parameter to scroll within a specific element or its scroll container (works well for wide tables).',
		inputSchema: zod.object({
			right: zod.boolean().default(true),
			pixels: zod.number().int().min(0),
			index: zod.number().int().min(0).optional(),
		}),
		// The validated input object is passed to the page controller unchanged.
		execute: async function (this: PageAgentCore, input) {
			const { message } = await this.pageController.scrollHorizontally(input)
			return message
		},
	})
)
tools.set(
	'execute_javascript',
	tool({
		description:
			'Execute JavaScript code on the current page. Supports async/await syntax. Use with caution!',
		inputSchema: zod.object({
			script: zod.string(),
		}),
		// Hands the script to the page controller and returns its result message.
		execute: async function (this: PageAgentCore, { script }) {
			const { message } = await this.pageController.executeJavascript(script)
			return message
		},
	})
)
// @todo get_dropdown_options
// @todo select_dropdown_option
// @todo send_keys
// @todo upload_file
// @todo go_back
// @todo extract_structured_data

109
packages/core/src/types.ts Normal file
View File

@@ -0,0 +1,109 @@
/**
* Agent reflection state - the reflection-before-action model
*
* Every tool call must first reflect on:
* - evaluation_previous_goal: How well did the previous action achieve its goal?
* - memory: Key information to remember for future steps
* - next_goal: What should be accomplished in the next action?
*/
export interface AgentReflection {
/** How well the previous action achieved its goal. */
evaluation_previous_goal: string
/** Key information to remember for future steps. */
memory: string
/** What the next action should accomplish. */
next_goal: string
}
/**
* MacroTool input structure
*
* This is the core abstraction that enforces the "reflection-before-action" mental model.
* Before executing any action, the LLM must output its reasoning state.
*
* All reflection fields are inherited as optional (`Partial<AgentReflection>`);
* only `action` is required.
*/
export interface MacroToolInput extends Partial<AgentReflection> {
/** The action to execute. Its shape is not constrained by this type. */
action: Record<string, any>
}
/**
* MacroTool output structure: the input it was given paired with its string output.
*/
export interface MacroToolResult {
/** The reflection + action input the MacroTool received. */
input: MacroToolInput
/** The string output produced for that input. */
output: string
}
/**
* A single agent step with reflection and action
*
* One member of the `HistoricalEvent` union, discriminated by `type: 'step'`.
*/
export interface AgentStep {
/** Discriminant tag. */
type: 'step'
/** Reflection recorded for this step; every field is optional. */
reflection: Partial<AgentReflection>
/** The action taken in this step. */
action: {
/** Name of the action. */
name: string
/** Input given to the action (untyped). */
input: any
/** String output of the action. */
output: string
}
/** Token usage figures recorded for this step. */
usage: {
promptTokens: number
completionTokens: number
totalTokens: number
/** Optional; may be absent. */
cachedTokens?: number
/** Optional; may be absent. */
reasoningTokens?: number
}
}
/**
* Persistent observation event (stays in memory)
*
* One member of the `HistoricalEvent` union, discriminated by `type: 'observation'`.
*/
export interface ObservationEvent {
/** Discriminant tag. */
type: 'observation'
/** Text of the observation. */
content: string
}
/**
* User takeover event
*
* One member of the `HistoricalEvent` union; it carries no payload beyond its tag.
*/
export interface UserTakeoverEvent {
/** Discriminant tag. */
type: 'user_takeover'
}
/**
* Error event (retry or error from LLM)
*
* One member of the `HistoricalEvent` union, discriminated by `type: 'error'`.
*/
export interface ErrorEvent {
/** Discriminant tag. */
type: 'error'
/** Whether this records a retry or an error. */
errorType: 'retry' | 'error'
/** Error message text. */
message: string
/** Optional attempt number (presumably only set for retries — confirm against the emitter). */
attempt?: number
/** Optional maximum number of attempts (presumably only set for retries — confirm against the emitter). */
maxAttempts?: number
}
/**
* Union type for all history events
*
* Each member has a distinct literal `type` field
* ('step' | 'observation' | 'user_takeover' | 'error'), so the union can be
* narrowed by checking `type`.
*/
export type HistoricalEvent = AgentStep | ObservationEvent | UserTakeoverEvent | ErrorEvent
/**
* Agent execution status
*
* One of four string literals: 'idle', 'running', 'completed' or 'error'.
*/
export type AgentStatus = 'idle' | 'running' | 'completed' | 'error'
/**
* Agent activity - transient state for immediate UI feedback.
*
* Unlike historical events (which are persisted), activities are ephemeral
* and represent "what the agent is doing right now". UI components should
* listen to 'activity' events to show real-time feedback.
*
* Note: There is no 'idle' activity - absence of activity events means idle.
*
* Variants are discriminated by their literal `type` field.
*/
export type AgentActivity =
// No payload.
| { type: 'thinking' }
// Tool name and the input passed to it.
| { type: 'executing'; tool: string; input: unknown }
// Same fields as 'executing', plus the tool's string output and a duration
// (unit is not stated here — presumably milliseconds; confirm against the emitter).
| { type: 'executed'; tool: string; input: unknown; output: string; duration: number }
// Current attempt number and the maximum number of attempts.
| { type: 'retrying'; attempt: number; maxAttempts: number }
// Error message text.
| { type: 'error'; message: string }
/**
* Result of an execution: a success flag, a string payload and the history events.
*/
export interface ExecutionResult {
/** Whether the execution succeeded. */
success: boolean
/** String payload of the result (presumably the final text for the user — confirm against the producer). */
data: string
/** History events attached to this result. */
history: HistoricalEvent[]
}

View File

@@ -0,0 +1,17 @@
import chalk from 'chalk'
/**
 * Minimal runtime assertion that also narrows `condition` for the type checker.
 *
 * @param condition - Value expected to be truthy
 * @param message - Error message; defaults to 'Assertion failed'
 * @param silent - When truthy, the failure is not logged via `console.error` before throwing
 * @throws Error if condition is falsy
 */
export function assert(condition: unknown, message?: string, silent?: boolean): asserts condition {
	if (condition) return

	const errorMessage = message ?? 'Assertion failed'
	if (!silent) {
		console.error(chalk.red(`❌ assert: ${errorMessage}`))
	}
	throw new Error(errorMessage)
}

View File

@@ -0,0 +1,157 @@
import chalk from 'chalk'
/**
 * Normalize LLM response and fix common format issues.
 *
 * Handles:
 * - No tool_calls but JSON in message.content (fallback)
 * - Model returns action name as tool call instead of AgentOutput
 * - Arguments wrapped as double JSON string
 * - Nested function call format
 * - Missing action field (fallback to wait)
 * - etc.
 *
 * @param response - Raw chat-completion style response; only `choices[0]` is inspected
 * @returns A copy of `response` whose first choice carries exactly one tool call
 * named `AgentOutput`, with the repaired arguments serialized as a JSON string
 * @throws Error when there is no choice or message, when neither a tool call nor
 * JSON content is present, or when the arguments do not resolve to a JSON object
 */
export function normalizeResponse(response: any): any {
	let resolvedArguments = null as any

	const choice = (response as { choices?: Choice[] }).choices?.[0]
	if (!choice) throw new Error('No choices in response')

	const message = choice.message
	if (!message) throw new Error('No message in choice')

	const toolCall = message.tool_calls?.[0]

	// fix level and location of arguments
	if (toolCall?.function?.arguments) {
		resolvedArguments = safeJsonParse(toolCall.function.arguments)

		// case: sometimes the model only returns the action level
		if (toolCall.function.name && toolCall.function.name !== 'AgentOutput') {
			console.log(chalk.yellow(`[normalizeResponse] #1: fixing tool_call`))
			resolvedArguments = { action: safeJsonParse(resolvedArguments) }
		}
	} else {
		// case: sometimes the model returns json in content instead of tool_calls
		if (message.content) {
			const content = message.content.trim()
			const jsonInContent = retrieveJsonFromString(content)
			if (jsonInContent) {
				resolvedArguments = safeJsonParse(jsonInContent)

				// case: sometimes the content json includes upper level wrapper
				if (resolvedArguments?.name === 'AgentOutput') {
					console.log(chalk.yellow(`[normalizeResponse] #2: fixing tool_call`))
					resolvedArguments = safeJsonParse(resolvedArguments.arguments)
				}

				// case: sometimes even 2-levels of wrapping
				if (resolvedArguments?.type === 'function') {
					console.log(chalk.yellow(`[normalizeResponse] #3: fixing tool_call`))
					resolvedArguments = safeJsonParse(resolvedArguments.function.arguments)
				}

				// case: and sometimes action level only
				// todo: needs better detection logic
				if (
					!resolvedArguments?.action &&
					!resolvedArguments?.evaluation_previous_goal &&
					!resolvedArguments?.memory &&
					!resolvedArguments?.next_goal &&
					!resolvedArguments?.thinking
				) {
					console.log(chalk.yellow(`[normalizeResponse] #4: fixing tool_call`))
					resolvedArguments = { action: safeJsonParse(resolvedArguments) }
				}
			} else {
				throw new Error('No tool_call and the message content does not contain valid JSON')
			}
		} else {
			throw new Error('No tool_call nor message content is present')
		}
	}

	// fix double stringified arguments
	resolvedArguments = safeJsonParse(resolvedArguments)

	// Anything other than a plain object here (null, a number, an array, or a
	// string that never parsed) cannot carry an `action`. Fail with a clear
	// message instead of an opaque TypeError or a silently corrupted payload.
	if (
		resolvedArguments === null ||
		typeof resolvedArguments !== 'object' ||
		Array.isArray(resolvedArguments)
	) {
		throw new Error('Tool call arguments did not resolve to a JSON object')
	}

	if (resolvedArguments.action) {
		resolvedArguments.action = safeJsonParse(resolvedArguments.action)
	}

	// fix incomplete formats
	if (!resolvedArguments.action) {
		console.log(chalk.yellow(`[normalizeResponse] #5: fixing tool_call`))
		resolvedArguments.action = { name: 'wait', input: { seconds: 1 } }
	}

	// pack back to standard format
	return {
		...response,
		choices: [
			{
				...choice,
				message: {
					...message,
					tool_calls: [
						{
							...(toolCall || {}),
							function: {
								...(toolCall?.function || {}),
								name: 'AgentOutput',
								arguments: JSON.stringify(resolvedArguments),
							},
						},
					],
				},
			},
		],
	}
}
/**
 * Best-effort JSON decoding.
 *
 * Non-string values are returned untouched. Strings are trimmed and parsed;
 * if parsing fails, the original (untrimmed) string is returned instead.
 */
function safeJsonParse(input: any): any {
	if (typeof input !== 'string') return input

	try {
		return JSON.parse(input.trim())
	} catch {
		return input
	}
}
/**
 * Extract and parse a JSON object embedded in a string.
 * - The candidate is everything from the first `{` through the last `}`.
 * - Returns the parsed value when that candidate is valid JSON, otherwise null
 *   (also null when no `{ ... }` span exists).
 */
function retrieveJsonFromString(str: string): any {
	try {
		const start = str.indexOf('{')
		const end = str.lastIndexOf('}')
		// No opening brace, or no closing brace after it.
		if (start === -1 || end < start) {
			return null
		}
		return JSON.parse(str.slice(start, end + 1))
	} catch {
		return null
	}
}
/**
* Shape of one entry of `response.choices` as read by `normalizeResponse`.
* Every field is optional because the response may be malformed.
*/
interface Choice {
/** The assistant message; may be missing. */
message?: {
role?: 'assistant'
/** Plain-text content; may hold JSON when no tool call is present. */
content?: string
/** Tool calls requested by the model; only the first entry is read by `normalizeResponse`. */
tool_calls?: {
id?: string
type?: 'function'
function?: {
/** Name of the called function. */
name?: string
/** Call arguments as a string (expected to be JSON, possibly double-encoded). */
arguments?: string
}
}[]
}
index?: 0
finish_reason?: 'tool_calls'
}

View File

@@ -0,0 +1,87 @@
export { normalizeResponse } from './autoFixer'
/**
 * Wait until `check` returns true, polling every 100 ms.
 *
 * `check` is called once synchronously first, so an already-true condition
 * resolves without starting a timer.
 *
 * @param check - Synchronous predicate to poll
 * @param timeout - Maximum time to wait in milliseconds. Defaults to one hour.
 * (The previous default `60 * 60_1000` evaluated to 36,060,000 ms, about ten
 * hours, because of a misplaced numeric separator.)
 * @returns Resolves with `true` once the condition holds
 * @throws Rejects with an Error when the timeout elapses, or with whatever
 * `check` throws while polling
 */
export async function waitUntil(check: () => boolean, timeout = 60 * 60 * 1000): Promise<boolean> {
	if (check()) return true

	return new Promise((resolve, reject) => {
		const start = Date.now()
		const interval = setInterval(() => {
			try {
				if (check()) {
					clearInterval(interval)
					resolve(true)
				} else if (Date.now() - start > timeout) {
					clearInterval(interval)
					reject(new Error('Timeout waiting for condition to become true'))
				}
			} catch (error) {
				// A throwing predicate must not leave the timer running forever
				// with the promise unsettled; stop polling and surface the error.
				clearInterval(interval)
				reject(error)
			}
		}, 100)
	})
}
/**
 * Sleep for the given number of seconds.
 * @param seconds - Duration in seconds (converted to milliseconds for `setTimeout`)
 */
export async function waitFor(seconds: number): Promise<void> {
	const delayMs = seconds * 1000
	await new Promise((done) => {
		setTimeout(done, delayMs)
	})
}
//
/**
 * Shorten `text` to at most `maxLength` characters, appending '...' when it was cut.
 * Text that already fits is returned unchanged.
 */
export function truncate(text: string, maxLength: number): string {
	const fits = text.length <= maxLength
	return fits ? text : `${text.substring(0, maxLength)}...`
}
//
/**
 * Trim leading and trailing whitespace from every '\n'-separated line of `text`.
 * The number of lines is preserved.
 */
export function trimLines(text: string): string {
	const trimmed: string[] = []
	for (const line of text.split('\n')) {
		trimmed.push(line.trim())
	}
	return trimmed.join('\n')
}
//
/**
 * Generate a short random base-36 ID (up to 9 characters).
 *
 * @param existingIDs - Optional list of IDs to avoid; a new ID is drawn
 * until one is found that is not in the list
 * @returns The generated ID
 * @throws Error if no unused ID is found after more than 1000 redraws
 */
export function randomID(existingIDs?: string[]): string {
	const draw = (): string => Math.random().toString(36).substring(2, 11)
	const MAX_TRY = 1000

	let id = draw()
	if (!existingIDs) {
		return id
	}

	for (let tryCount = 1; existingIDs.includes(id); tryCount++) {
		id = draw()
		if (tryCount > MAX_TRY) {
			throw new Error('randomID: too many try')
		}
	}
	return id
}
//
// ID registry stored on `globalThis`: an existing (truthy) registry is reused,
// otherwise a new empty array is installed.
const _global = globalThis as any
_global.__PAGE_AGENT_IDS__ ||= []
const ids = _global.__PAGE_AGENT_IDS__
/**
 * Generate a random ID and record it in the shared `ids` registry.
 * @note Unique within this window: the ID is drawn so that it does not
 * collide with any ID already in the registry.
 */
export function uid() {
	const fresh = randomID(ids)
	ids.push(fresh)
	return fresh
}