diff --git a/package-lock.json b/package-lock.json index 0e9fa44..18ab1c4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11060,7 +11060,7 @@ }, "packages/extension": { "name": "@page-agent/ext", - "version": "0.1.2", + "version": "0.1.3", "hasInstallScript": true, "dependencies": { "@page-agent/core": "1.1.0", diff --git a/packages/extension/docs/extension_api.md b/packages/extension/docs/extension_api.md index f9e9058..2a0bbda 100644 --- a/packages/extension/docs/extension_api.md +++ b/packages/extension/docs/extension_api.md @@ -33,7 +33,6 @@ import type { AgentStatus, ExecutionResult, HistoricalEvent, - LLMConfig, } from '@page-agent/core' // Wait for extension injection (up to 1 second) @@ -48,18 +47,13 @@ async function waitForExtension(timeout = 1000): Promise { // Usage if (await waitForExtension()) { - const result = await window.PAGE_AGENT_EXT!.execute( - 'Click the login button', - { - baseURL: 'https://api.openai.com/v1', - apiKey: 'your-api-key', - model: 'gpt-5.2', - }, - { - onStatusChange: (status) => console.log('Status:', status), - onActivity: (activity) => console.log('Activity:', activity), - } - ) + const result = await window.PAGE_AGENT_EXT!.execute('Click the login button', { + baseURL: 'https://api.openai.com/v1', + apiKey: 'your-api-key', + model: 'gpt-5.2', + onStatusChange: (status) => console.log('Status:', status), + onActivity: (activity) => console.log('Activity:', activity), + }) console.log('Result:', result) } ``` @@ -76,7 +70,7 @@ Extension version string (e.g., `"1.0.0"`). This is exposed separately to allow Main API namespace object containing: -#### `PAGE_AGENT_EXT.execute(task, llmConfig, hooks?)` +#### `PAGE_AGENT_EXT.execute(task, config)` Execute an agent task. @@ -85,8 +79,7 @@ Execute an agent task. 
| Name | Type | Required | Description | |------|------|----------|-------------| | `task` | `string` | Yes | Task description | -| `llmConfig` | `LLMConfig` | Yes | LLM configuration | -| `hooks` | `ExecuteHooks` | No | Event callbacks | +| `config` | `ExecuteConfig` | Yes | Execution configuration (LLM settings, options, and event callbacks) | **Returns:** `Promise` @@ -104,21 +97,26 @@ import type { AgentStatus, ExecutionResult, HistoricalEvent, - LLMConfig, } from '@page-agent/core' -export interface ExecuteHooks { +export interface ExecuteConfig { + baseURL: string + apiKey: string + model: string + + /** + * Whether to include the initial tab (that holds this main world script) in the task. + * @default true + */ + includeInitialTab?: boolean + onStatusChange?: (status: AgentStatus) => void onActivity?: (activity: AgentActivity) => void onHistoryUpdate?: (history: HistoricalEvent[]) => void onDispose?: () => void } -export type Execute = ( - task: string, - llmConfig: LLMConfig, - hooks?: ExecuteHooks -) => Promise +export type Execute = (task: string, config: ExecuteConfig) => Promise ``` ### AgentStatus @@ -164,16 +162,6 @@ type HistoricalEvent = | { type: 'error'; message: string; rawResponse?: unknown } ``` -### LLMConfig - -```typescript -interface LLMConfig { - baseURL: string // e.g. 'https://api.openai.com/v1' - apiKey: string - model: string // e.g. 'gpt-5.2' -} -``` - ### ExecutionResult ```typescript @@ -205,44 +193,63 @@ if (result.success) { } ``` -### With Event Hooks +### Exclude Initial Tab + +By default, the agent includes the initial tab (where the script runs) in the task. 
Set `includeInitialTab: false` to exclude it: ```typescript -await window.PAGE_AGENT_EXT!.execute( - 'Navigate to the settings page', - llmConfig, +const result = await window.PAGE_AGENT_EXT!.execute( + 'Open a new tab and search for page-agent on GitHub', { - onStatusChange: (status) => { - updateUI({ agentStatus: status }) - }, - onActivity: (activity) => { - switch (activity.type) { - case 'thinking': - showSpinner('Agent is thinking...') - break - case 'executing': - showSpinner(`Executing: ${activity.tool}`) - break - case 'executed': - log(`${activity.tool} completed in ${activity.duration}ms`) - break - case 'error': - showError(activity.message) - break - } - }, - onHistoryUpdate: (history) => { - renderHistory(history) - }, + baseURL: 'https://api.openai.com/v1', + apiKey: process.env.OPENAI_API_KEY!, + model: 'gpt-5.2', + includeInitialTab: false, // Agent will open new tabs only } ) ``` +### With Event Callbacks + +```typescript +await window.PAGE_AGENT_EXT!.execute('Navigate to the settings page', { + baseURL: 'https://api.openai.com/v1', + apiKey: process.env.OPENAI_API_KEY!, + model: 'gpt-5.2', + onStatusChange: (status) => { + updateUI({ agentStatus: status }) + }, + onActivity: (activity) => { + switch (activity.type) { + case 'thinking': + showSpinner('Agent is thinking...') + break + case 'executing': + showSpinner(`Executing: ${activity.tool}`) + break + case 'executed': + log(`${activity.tool} completed in ${activity.duration}ms`) + break + case 'error': + showError(activity.message) + break + } + }, + onHistoryUpdate: (history) => { + renderHistory(history) + }, +}) +``` + ### Stop Execution ```typescript // Start a task -window.PAGE_AGENT_EXT!.execute('Scroll through all pages', llmConfig) +window.PAGE_AGENT_EXT!.execute('Scroll through all pages', { + baseURL: 'https://api.openai.com/v1', + apiKey: process.env.OPENAI_API_KEY!, + model: 'gpt-5.2', +}) // Later, stop it window.PAGE_AGENT_EXT!.dispose() @@ -258,24 +265,25 @@ import type { 
AgentStatus, ExecutionResult, HistoricalEvent, - LLMConfig, } from '@page-agent/core' +interface ExecuteConfig { + baseURL: string + apiKey: string + model: string + includeInitialTab?: boolean + onStatusChange?: (status: AgentStatus) => void + onActivity?: (activity: AgentActivity) => void + onHistoryUpdate?: (history: HistoricalEvent[]) => void + onDispose?: () => void +} + declare global { interface Window { PAGE_AGENT_EXT_VERSION?: string PAGE_AGENT_EXT?: { version: string - execute: ( - task: string, - llmConfig: LLMConfig, - hooks?: { - onStatusChange?: (status: AgentStatus) => void - onActivity?: (activity: AgentActivity) => void - onHistoryUpdate?: (history: HistoricalEvent[]) => void - onDispose?: () => void - } - ) => Promise + execute: (task: string, config: ExecuteConfig) => Promise dispose: () => void } } diff --git a/packages/extension/docs/extension_api_zh.md b/packages/extension/docs/extension_api_zh.md index 128b014..7cc64fd 100644 --- a/packages/extension/docs/extension_api_zh.md +++ b/packages/extension/docs/extension_api_zh.md @@ -33,7 +33,6 @@ import type { AgentStatus, ExecutionResult, HistoricalEvent, - LLMConfig, } from '@page-agent/core' // 等待插件注入(最多 1 秒) @@ -48,18 +47,13 @@ async function waitForExtension(timeout = 1000): Promise { // 使用 if (await waitForExtension()) { - const result = await window.PAGE_AGENT_EXT!.execute( - '点击登录按钮', - { - baseURL: 'https://api.openai.com/v1', - apiKey: 'your-api-key', - model: 'gpt-5.2', - }, - { - onStatusChange: (status) => console.log('状态:', status), - onActivity: (activity) => console.log('活动:', activity), - } - ) + const result = await window.PAGE_AGENT_EXT!.execute('点击登录按钮', { + baseURL: 'https://api.openai.com/v1', + apiKey: 'your-api-key', + model: 'gpt-5.2', + onStatusChange: (status) => console.log('状态:', status), + onActivity: (activity) => console.log('活动:', activity), + }) console.log('结果:', result) } ``` @@ -76,7 +70,7 @@ if (await waitForExtension()) { 主 API 命名空间对象,包含: -#### 
`PAGE_AGENT_EXT.execute(task, llmConfig, hooks?)` +#### `PAGE_AGENT_EXT.execute(task, config)` 执行 Agent 任务。 @@ -85,8 +79,7 @@ if (await waitForExtension()) { | 名称 | 类型 | 必填 | 说明 | |------|------|------|------| | `task` | `string` | 是 | 任务描述 | -| `llmConfig` | `LLMConfig` | 是 | LLM 配置 | -| `hooks` | `ExecuteHooks` | 否 | 事件回调 | +| `config` | `ExecuteConfig` | 是 | 执行配置(LLM 设置、选项和事件回调) | **返回:** `Promise` @@ -104,21 +97,26 @@ import type { AgentStatus, ExecutionResult, HistoricalEvent, - LLMConfig, } from '@page-agent/core' -export interface ExecuteHooks { +export interface ExecuteConfig { + baseURL: string + apiKey: string + model: string + + /** + * 是否将初始标签页(运行此脚本的页面)包含在任务中。 + * @default true + */ + includeInitialTab?: boolean + onStatusChange?: (status: AgentStatus) => void onActivity?: (activity: AgentActivity) => void onHistoryUpdate?: (history: HistoricalEvent[]) => void onDispose?: () => void } -export type Execute = ( - task: string, - llmConfig: LLMConfig, - hooks?: ExecuteHooks -) => Promise +export type Execute = (task: string, config: ExecuteConfig) => Promise ``` ### AgentStatus @@ -164,16 +162,6 @@ type HistoricalEvent = | { type: 'error'; message: string; rawResponse?: unknown } ``` -### LLMConfig - -```typescript -interface LLMConfig { - baseURL: string // 例如 'https://api.openai.com/v1' - apiKey: string - model: string // 例如 'gpt-5.2' -} -``` - ### ExecutionResult ```typescript @@ -205,44 +193,63 @@ if (result.success) { } ``` +### 排除初始标签页 + +默认情况下,Agent 会将初始标签页(运行脚本的页面)包含在任务中。设置 `includeInitialTab: false` 可以排除它: + +```typescript +const result = await window.PAGE_AGENT_EXT!.execute( + '打开新标签页并在 GitHub 上搜索 page-agent', + { + baseURL: 'https://api.openai.com/v1', + apiKey: process.env.OPENAI_API_KEY!, + model: 'gpt-5.2', + includeInitialTab: false, // Agent 只会打开新标签页 + } +) +``` + ### 使用事件回调 ```typescript -await window.PAGE_AGENT_EXT!.execute( - '导航到设置页面', - llmConfig, - { - onStatusChange: (status) => { - updateUI({ agentStatus: status }) - }, - 
onActivity: (activity) => { - switch (activity.type) { - case 'thinking': - showSpinner('Agent 正在思考...') - break - case 'executing': - showSpinner(`正在执行: ${activity.tool}`) - break - case 'executed': - log(`${activity.tool} 完成,耗时 ${activity.duration}ms`) - break - case 'error': - showError(activity.message) - break - } - }, - onHistoryUpdate: (history) => { - renderHistory(history) - }, - } -) +await window.PAGE_AGENT_EXT!.execute('导航到设置页面', { + baseURL: 'https://api.openai.com/v1', + apiKey: process.env.OPENAI_API_KEY!, + model: 'gpt-5.2', + onStatusChange: (status) => { + updateUI({ agentStatus: status }) + }, + onActivity: (activity) => { + switch (activity.type) { + case 'thinking': + showSpinner('Agent 正在思考...') + break + case 'executing': + showSpinner(`正在执行: ${activity.tool}`) + break + case 'executed': + log(`${activity.tool} 完成,耗时 ${activity.duration}ms`) + break + case 'error': + showError(activity.message) + break + } + }, + onHistoryUpdate: (history) => { + renderHistory(history) + }, +}) ``` ### 停止执行 ```typescript // 启动任务 -window.PAGE_AGENT_EXT!.execute('滚动浏览所有页面', llmConfig) +window.PAGE_AGENT_EXT!.execute('滚动浏览所有页面', { + baseURL: 'https://api.openai.com/v1', + apiKey: process.env.OPENAI_API_KEY!, + model: 'gpt-5.2', +}) // 稍后停止 window.PAGE_AGENT_EXT!.dispose() @@ -258,24 +265,25 @@ import type { AgentStatus, ExecutionResult, HistoricalEvent, - LLMConfig, } from '@page-agent/core' +interface ExecuteConfig { + baseURL: string + apiKey: string + model: string + includeInitialTab?: boolean + onStatusChange?: (status: AgentStatus) => void + onActivity?: (activity: AgentActivity) => void + onHistoryUpdate?: (history: HistoricalEvent[]) => void + onDispose?: () => void +} + declare global { interface Window { PAGE_AGENT_EXT_VERSION?: string PAGE_AGENT_EXT?: { version: string - execute: ( - task: string, - llmConfig: LLMConfig, - hooks?: { - onStatusChange?: (status: AgentStatus) => void - onActivity?: (activity: AgentActivity) => void - onHistoryUpdate?: 
(history: HistoricalEvent[]) => void - onDispose?: () => void - } - ) => Promise + execute: (task: string, config: ExecuteConfig) => Promise dispose: () => void } } diff --git a/packages/extension/package.json b/packages/extension/package.json index 165004c..e663d80 100644 --- a/packages/extension/package.json +++ b/packages/extension/package.json @@ -1,7 +1,7 @@ { "name": "@page-agent/ext", "private": true, - "version": "0.1.2", + "version": "0.1.3", "type": "module", "scripts": { "dev": "wxt", diff --git a/packages/extension/src/agent/MultiPageAgent.ts b/packages/extension/src/agent/MultiPageAgent.ts index c56eea5..4590008 100644 --- a/packages/extension/src/agent/MultiPageAgent.ts +++ b/packages/extension/src/agent/MultiPageAgent.ts @@ -17,7 +17,7 @@ function detectLanguage(): 'en-US' | 'zh-CN' { * - can be used from a side panel or a content script */ export class MultiPageAgent extends PageAgentCore { - constructor(config: Omit) { + constructor(config: Omit & { includeInitialTab?: boolean }) { // multi page controller const tabsController = new TabsController() const pageController = new RemotePageController(tabsController) @@ -31,6 +31,9 @@ export class MultiPageAgent extends PageAgentCore { `Default working language: **${targetLanguage}**` ) + // include initial tab for controlling + const includeInitialTab = config.includeInitialTab ?? true + /** * When the agent is in side-panel and user closed the side-panel. * There is no chance for isAgentRunning to be set false. 
@@ -47,7 +50,7 @@ export class MultiPageAgent extends PageAgentCore { customSystemPrompt: systemPrompt, onBeforeTask: async (agent) => { - await tabsController.init(agent.task) + await tabsController.init(agent.task, includeInitialTab) heartBeatInterval = window.setInterval(() => { chrome.storage.local.set({ diff --git a/packages/extension/src/agent/TabsController.ts b/packages/extension/src/agent/TabsController.ts index 8e3b3a6..ab30941 100644 --- a/packages/extension/src/agent/TabsController.ts +++ b/packages/extension/src/agent/TabsController.ts @@ -12,7 +12,7 @@ export class TabsController extends EventTarget { private task: string = '' private windowId: number | null = null - async init(task: string) { + async init(task: string, includeInitialTab: boolean = true) { this.task = task this.tabs = [] this.currentTabId = null @@ -26,17 +26,19 @@ export class TabsController extends EventTarget { }) this.initialTabId = result.tabId - this.currentTabId = result.tabId - - this.tabs.push({ - id: result.tabId, - isInitial: true, - }) if (!this.initialTabId) { throw new Error('Failed to get initial tab ID') } + if (includeInitialTab) { + this.currentTabId = this.initialTabId + this.tabs.push({ + id: result.tabId, + isInitial: true, + }) + } + await this.updateCurrentTabId(this.currentTabId) const tabChangeHandler = (message: any): void => { @@ -230,6 +232,10 @@ export class TabsController extends EventTarget { `| ${tab.id} | ${url} | ${title} | ${this.currentTabId === tab.id ? '✅' : ''} |` ) } + if (!this.tabs.length) { + summaries.push('\nNo tabs available. 
Open a tab if needed.') + } + return summaries.join('\n') } diff --git a/packages/extension/src/entrypoints/content.ts b/packages/extension/src/entrypoints/content.ts index 8855b81..0ff076c 100644 --- a/packages/extension/src/entrypoints/content.ts +++ b/packages/extension/src/entrypoints/content.ts @@ -69,11 +69,11 @@ async function exposeAgentToPage() { } try { - const { task, llmConfig } = payload + const { task, config } = payload // create when used - multiPageAgent = new MultiPageAgent(llmConfig) + multiPageAgent = new MultiPageAgent(config) // events diff --git a/packages/extension/src/entrypoints/main-world.ts b/packages/extension/src/entrypoints/main-world.ts index 04e9488..606a756 100644 --- a/packages/extension/src/entrypoints/main-world.ts +++ b/packages/extension/src/entrypoints/main-world.ts @@ -1,19 +1,24 @@ import type { AgentActivity, AgentStatus, ExecutionResult, HistoricalEvent } from '@page-agent/core' -import type { LLMConfig } from '@page-agent/llms' -export interface ExecuteHooks { +export type Execute = (task: string, config: ExecuteConfig) => Promise + +export interface ExecuteConfig { + baseURL: string + apiKey: string + model: string + + /** + * Whether to include the initial tab (that holds this main world script) in the task. 
+ * @default true + */ + includeInitialTab?: boolean + onStatusChange?: (status: AgentStatus) => void onActivity?: (activity: AgentActivity) => void onHistoryUpdate?: (history: HistoricalEvent[]) => void onDispose?: () => void } -export type Execute = ( - task: string, - llmConfig: LLMConfig, - hooks?: ExecuteHooks -) => Promise - export default defineUnlistedScript(() => { let _lastId = 0 function getId() { @@ -21,13 +26,13 @@ export default defineUnlistedScript(() => { return _lastId } - const execute: Execute = async (task, llmConfig, hooks) => { + const execute: Execute = async (task, config) => { if (typeof task !== 'string') throw new Error('Task must be a string') if (task.trim().length === 0) throw new Error('Task cannot be empty') - if (!llmConfig) throw new Error('LLM config is required') - if (!llmConfig.baseURL) throw new Error('LLM config must have a baseURL') - if (!llmConfig.apiKey) throw new Error('LLM config must have an apiKey') - if (!llmConfig.model) throw new Error('LLM config must have a model') + if (!config) throw new Error('Config is required') + if (!config.baseURL) throw new Error('Config must have a baseURL') + if (!config.apiKey) throw new Error('Config must have an apiKey') + if (!config.model) throw new Error('Config must have a model') const id = getId() @@ -40,30 +45,31 @@ export default defineUnlistedScript(() => { // events - if (data.action === 'status_change_event' && hooks?.onStatusChange) { - hooks.onStatusChange(data.payload) + if (data.action === 'status_change_event' && config.onStatusChange) { + config.onStatusChange(data.payload) return } - if (data.action === 'activity_event' && hooks?.onActivity) { - hooks.onActivity(data.payload) + if (data.action === 'activity_event' && config.onActivity) { + config.onActivity(data.payload) return } - if (data.action === 'history_change_event' && hooks?.onHistoryUpdate) { - hooks.onHistoryUpdate(data.payload) + if (data.action === 'history_change_event' && config.onHistoryUpdate) { + 
config.onHistoryUpdate(data.payload) return } - if (data.action === 'dispose_event' && hooks?.onDispose) { - hooks.onDispose() + if (data.action === 'dispose_event' && config.onDispose) { + config.onDispose() + window.removeEventListener('message', handleMessage) return } - // result - if (data.action !== 'execute_result') return + // execute_result + window.removeEventListener('message', handleMessage) if (data.error) { @@ -73,6 +79,7 @@ export default defineUnlistedScript(() => { } } + // @note will be removed on dispose or result window.addEventListener('message', handleMessage) }) @@ -81,7 +88,15 @@ export default defineUnlistedScript(() => { channel: 'PAGE_AGENT_EXT_REQUEST', id, action: 'execute', - payload: { task, llmConfig }, + payload: { + task, + config: { + baseURL: config.baseURL, + apiKey: config.apiKey, + model: config.model, + includeInitialTab: config.includeInitialTab, + }, + }, }, '*' ) diff --git a/packages/website/src/pages/docs/features/chrome-extension/page.tsx b/packages/website/src/pages/docs/features/chrome-extension/page.tsx index 552f61e..6266b0f 100644 --- a/packages/website/src/pages/docs/features/chrome-extension/page.tsx +++ b/packages/website/src/pages/docs/features/chrome-extension/page.tsx @@ -152,27 +152,24 @@ localStorage.setItem('PageAgentExtUserAuthToken', '') -

- PAGE_AGENT_EXT.execute(task, llmConfig, hooks?) -

+

PAGE_AGENT_EXT.execute(task, config)

{isZh - ? '使用 LLM 配置执行任务。返回一个 Promise,在任务完成时 resolve。可选的 hooks 参数用于监听任务执行过程中的事件。' - : 'Execute a task with LLM configuration. Returns a Promise that resolves when the task completes. Optional hooks parameter for listening to events during task execution.'} + ? '使用配置执行任务。返回一个 Promise,在任务完成时 resolve。config 参数包含 LLM 设置、选项和事件回调。' + : 'Execute a task with configuration. Returns a Promise that resolves when the task completes. Config includes LLM settings, options, and event callbacks.'}

console.log('状态变化:', status), onActivity: activity => console.log('活动:', activity), onHistoryUpdate: history => console.log('历史更新:', history), @@ -181,15 +178,14 @@ const result = await window.PAGE_AGENT_EXT.execute( ) console.log(result) // 任务执行结果` - : `// Execute a task with LLM configuration and hooks + : `// Execute a task with configuration const result = await window.PAGE_AGENT_EXT.execute( 'Search for "page-agent" on GitHub and open the first result', { baseURL: 'https://api.openai.com/v1', apiKey: 'your-api-key', - model: 'gpt-5-2' - }, - { + model: 'gpt-5.2', + // includeInitialTab: false, // Set to false to exclude initial tab onStatusChange: status => console.log('Status change:', status), onActivity: activity => console.log('Activity:', activity), onHistoryUpdate: history => console.log('History update:', history), @@ -221,41 +217,26 @@ window.PAGE_AGENT_EXT.dispose()` /> - {/* LLM Config */} + {/* ExecuteConfig */}
-

{isZh ? 'LLM 配置' : 'LLM Configuration'}

- - -
- - {/* Execute Hooks */} -
-

{isZh ? 'Execute Hooks' : 'Execute Hooks'}

+

{isZh ? '执行配置' : 'Execute Configuration'}

{isZh - ? '通过 hooks 参数,你可以监听任务执行过程中的各种事件,实现实时更新 UI、日志记录等功能。' - : 'With hooks parameter, you can listen to various events during task execution for real-time UI updates, logging, and more.'} + ? 'config 参数包含 LLM 设置、选项和事件回调,用于控制任务执行行为。' + : 'The config parameter includes LLM settings, options, and event callbacks to control task execution behavior.'}

void @@ -268,7 +249,14 @@ window.PAGE_AGENT_EXT.dispose()` // Agent 被停止时调用 onDispose?: () => void }` - : `interface ExecuteHooks { + : `interface ExecuteConfig { + baseURL: string // LLM API endpoint + apiKey: string // API key + model: string // Model name + + // Whether to include the initial tab in the task, default true + includeInitialTab?: boolean + // Called when agent status changes (idle, running, error, completed, etc.) onStatusChange?: (status: AgentStatus) => void