Merge branch 'main' into feat/free-qwen-by-default

This commit is contained in:
Simon
2026-02-27 15:51:19 +08:00
12 changed files with 647 additions and 490 deletions

922
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -35,29 +35,29 @@
"prepare": "husky"
},
"devDependencies": {
"@commitlint/cli": "^20.3.1",
"@commitlint/config-conventional": "^20.3.1",
"@commitlint/cli": "^20.4.2",
"@commitlint/config-conventional": "^20.4.2",
"@eslint/js": "^9.39.2",
"@microsoft/api-extractor": "^7.56.3",
"@tailwindcss/vite": "^4.1.18",
"@microsoft/api-extractor": "^7.57.3",
"@tailwindcss/vite": "^4.2.1",
"@trivago/prettier-plugin-sort-imports": "^6.0.2",
"@types/node": "^25.2.2",
"@types/node": "^25.3.0",
"@vitejs/plugin-react-swc": "^4.1.0",
"chalk": "^5.6.2",
"concurrently": "^9.2.1",
"dotenv": "^17.2.4",
"dotenv": "^17.3.1",
"eslint": "^9.39.2",
"eslint-config-prettier": "^10.1.8",
"eslint-plugin-react-dom": "^2.12.2",
"eslint-plugin-react-dom": "^2.13.0",
"eslint-plugin-react-hooks": "^7.0.1",
"eslint-plugin-react-refresh": "^0.5.0",
"eslint-plugin-react-x": "^2.12.2",
"eslint-plugin-react-refresh": "^0.5.2",
"eslint-plugin-react-x": "^2.13.0",
"globals": "^17.0.0",
"husky": "^9.1.7",
"lint-staged": "^16.2.4",
"prettier": "^3.8.0",
"typescript": "^5.9.3",
"typescript-eslint": "^8.55.0",
"typescript-eslint": "^8.56.1",
"unplugin-dts": "^1.0.0-beta.6",
"vite": "^7.3.1",
"vite-plugin-css-injected-by-js": "^3.5.2",

View File

@@ -5,7 +5,7 @@
import { InvokeError, LLM, type Tool } from '@page-agent/llms'
import type { BrowserState, PageController } from '@page-agent/page-controller'
import chalk from 'chalk'
import * as zod from 'zod'
import * as z from 'zod'
import { type PageAgentConfig, type SupportedLanguage } from './config'
import { DEFAULT_MAX_STEPS } from './config/constants'
@@ -248,16 +248,16 @@ export class PageAgentCore extends EventTarget {
{ role: 'user' as const, content: await this.#assembleUserPrompt() },
]
const tools = { AgentOutput: this.#packMacroTool() }
const macroTool = { AgentOutput: this.#packMacroTool() }
// invoke LLM
console.log(chalk.blue.bold('🧠 Thinking...'))
this.#emitActivity({ type: 'thinking' })
const result = await this.#llm.invoke(messages, tools, this.#abortController.signal, {
const result = await this.#llm.invoke(messages, macroTool, this.#abortController.signal, {
toolChoiceName: 'AgentOutput',
normalizeResponse,
normalizeResponse: (res) => normalizeResponse(res, this.tools),
})
// assemble history
@@ -358,24 +358,22 @@ export class PageAgentCore extends EventTarget {
const tools = this.tools
const actionSchemas = Array.from(tools.entries()).map(([toolName, tool]) => {
return zod.object({ [toolName]: tool.inputSchema }).describe(tool.description)
return z.object({ [toolName]: tool.inputSchema }).describe(tool.description)
})
const actionSchema = zod.union(
actionSchemas as unknown as [zod.ZodType, zod.ZodType, ...zod.ZodType[]]
)
const actionSchema = z.union(actionSchemas as unknown as [z.ZodType, z.ZodType, ...z.ZodType[]])
const macroToolSchema = zod.object({
// thinking: zod.string().optional(),
evaluation_previous_goal: zod.string().optional(),
memory: zod.string().optional(),
next_goal: zod.string().optional(),
const macroToolSchema = z.object({
// thinking: z.string().optional(),
evaluation_previous_goal: z.string().optional(),
memory: z.string().optional(),
next_goal: z.string().optional(),
action: actionSchema,
})
return {
description: 'You MUST call this tool every step!',
inputSchema: macroToolSchema as zod.ZodType<MacroToolInput>,
inputSchema: macroToolSchema as z.ZodType<MacroToolInput>,
execute: async (input: MacroToolInput): Promise<MacroToolResult> => {
// abort
if (this.#abortController.signal.aborted) throw new Error('AbortError')

View File

@@ -2,7 +2,7 @@
* Internal tools for PageAgent.
* @note Adapted from browser-use
*/
import * as zod from 'zod'
import * as z from 'zod'
import type { PageAgentCore } from '../PageAgentCore'
import { waitFor } from '../utils'
@@ -13,7 +13,7 @@ import { waitFor } from '../utils'
export interface PageAgentTool<TParams = any> {
// name: string
description: string
inputSchema: zod.ZodType<TParams>
inputSchema: z.ZodType<TParams>
execute: (this: PageAgentCore, args: TParams) => Promise<string>
}
@@ -32,9 +32,9 @@ tools.set(
tool({
description:
'Complete task. Text is your final response to the user — keep it concise unless the user explicitly asks for detail.',
inputSchema: zod.object({
text: zod.string(),
success: zod.boolean().default(true),
inputSchema: z.object({
text: z.string(),
success: z.boolean().default(true),
}),
execute: async function (this: PageAgentCore, input) {
// @note main loop will handle this one
@@ -47,8 +47,8 @@ tools.set(
'wait',
tool({
description: 'Wait for x seconds. Can be used to wait until the page or data is fully loaded.',
inputSchema: zod.object({
seconds: zod.number().min(1).max(10).default(1),
inputSchema: z.object({
seconds: z.number().min(1).max(10).default(1),
}),
execute: async function (this: PageAgentCore, input) {
// try to subtract LLM calling time from the actual wait time
@@ -67,8 +67,8 @@ tools.set(
tool({
description:
'Ask the user a question and wait for their answer. Use this if you need more information or clarification.',
inputSchema: zod.object({
question: zod.string(),
inputSchema: z.object({
question: z.string(),
}),
execute: async function (this: PageAgentCore, input) {
if (!this.onAskUser) {
@@ -84,8 +84,8 @@ tools.set(
'click_element_by_index',
tool({
description: 'Click element by index',
inputSchema: zod.object({
index: zod.int().min(0),
inputSchema: z.object({
index: z.int().min(0),
}),
execute: async function (this: PageAgentCore, input) {
const result = await this.pageController.clickElement(input.index)
@@ -98,9 +98,9 @@ tools.set(
'input_text',
tool({
description: 'Click and type text into an interactive input element',
inputSchema: zod.object({
index: zod.int().min(0),
text: zod.string(),
inputSchema: z.object({
index: z.int().min(0),
text: z.string(),
}),
execute: async function (this: PageAgentCore, input) {
const result = await this.pageController.inputText(input.index, input.text)
@@ -114,9 +114,9 @@ tools.set(
tool({
description:
'Select dropdown option for interactive element index by the text of the option you want to select',
inputSchema: zod.object({
index: zod.int().min(0),
text: zod.string(),
inputSchema: z.object({
index: z.int().min(0),
text: z.string(),
}),
execute: async function (this: PageAgentCore, input) {
const result = await this.pageController.selectOption(input.index, input.text)
@@ -132,11 +132,11 @@ tools.set(
'scroll',
tool({
description: 'Scroll the page vertically. Use index for scroll elements (dropdowns/custom UI).',
inputSchema: zod.object({
down: zod.boolean().default(true),
num_pages: zod.number().min(0).max(10).optional().default(0.1),
pixels: zod.number().int().min(0).optional(),
index: zod.number().int().min(0).optional(),
inputSchema: z.object({
down: z.boolean().default(true),
num_pages: z.number().min(0).max(10).optional().default(0.1),
pixels: z.number().int().min(0).optional(),
index: z.number().int().min(0).optional(),
}),
execute: async function (this: PageAgentCore, input) {
const result = await this.pageController.scroll({
@@ -156,10 +156,10 @@ tools.set(
tool({
description:
'Scroll the page horizontally, or within a specific element by index. Useful for wide tables.',
inputSchema: zod.object({
right: zod.boolean().default(true),
pixels: zod.number().int().min(0),
index: zod.number().int().min(0).optional(),
inputSchema: z.object({
right: z.boolean().default(true),
pixels: z.number().int().min(0),
index: z.number().int().min(0).optional(),
}),
execute: async function (this: PageAgentCore, input) {
const result = await this.pageController.scrollHorizontally(input)
@@ -173,8 +173,8 @@ tools.set(
tool({
description:
'Execute JavaScript code on the current page. Supports async/await syntax. Use with caution!',
inputSchema: zod.object({
script: zod.string(),
inputSchema: z.object({
script: z.string(),
}),
execute: async function (this: PageAgentCore, input) {
const result = await this.pageController.executeJavascript(input.script)

View File

@@ -1,4 +1,8 @@
import { InvokeError, InvokeErrorType } from '@page-agent/llms'
import chalk from 'chalk'
import * as z from 'zod'
import type { PageAgentTool } from '../tools'
/**
* Normalize LLM response and fix common format issues.
@@ -9,9 +13,10 @@ import chalk from 'chalk'
* - Arguments wrapped as double JSON string
* - Nested function call format
* - Missing action field (fallback to wait)
* - Primitive action input for single-field tools (e.g. `{"click_element_by_index": 2}`)
* - etc.
*/
export function normalizeResponse(response: any): any {
export function normalizeResponse(response: any, tools?: Map<string, PageAgentTool>): any {
let resolvedArguments = null as any
const choice = (response as { choices?: Choice[] }).choices?.[0]
@@ -78,6 +83,11 @@ export function normalizeResponse(response: any): any {
resolvedArguments.action = safeJsonParse(resolvedArguments.action)
}
// validate and fix action input using tool schemas
if (resolvedArguments.action && tools) {
resolvedArguments.action = validateAction(resolvedArguments.action, tools)
}
// fix incomplete formats
if (!resolvedArguments.action) {
console.log(chalk.yellow(`[normalizeResponse] #5: fixing tool_call`))
@@ -108,6 +118,55 @@ export function normalizeResponse(response: any): any {
}
}
/**
 * Validate an action object against the registered tool schemas, producing
 * clear error messages instead of letting the union schema produce
 * unreadable errors.
 *
 * Only the first key of `action` is treated as the tool name; the returned
 * object contains just that key, mapped to the schema's parsed output.
 *
 * Also coerces primitive inputs into an object for tools with an object schema:
 * e.g. `{"click_element_by_index": 2}` → `{"click_element_by_index": {"index": 2}}`.
 * The primitive is assigned to the first field that does not accept `undefined`
 * (i.e. a required field). If every field accepts `undefined` and the schema has
 * exactly one field (e.g. a single field with a default), that sole field is used,
 * so `{"wait": 3}` becomes `{"wait": {"seconds": 3}}`.
 *
 * @param action - Candidate action, expected shape `{ [toolName]: input }`.
 *   Non-object, `null`, or empty-object values are returned unchanged.
 * @param tools - Map of tool name to tool definition used for lookup and validation.
 * @returns `{ [toolName]: parsedInput }`, or `action` itself when it cannot be inspected.
 * @throws InvokeError with `InvokeErrorType.INVALID_TOOL_ARGS` when the tool name is
 *   unknown or the (possibly coerced) input fails schema validation.
 */
function validateAction(action: any, tools: Map<string, PageAgentTool>): any {
	if (typeof action !== 'object' || action === null) return action

	const toolName = Object.keys(action)[0]
	if (!toolName) return action

	const tool = tools.get(toolName)
	if (!tool) {
		const available = Array.from(tools.keys()).join(', ')
		throw new InvokeError(
			InvokeErrorType.INVALID_TOOL_ARGS,
			`Unknown action "${toolName}". Available: ${available}`
		)
	}

	let value = action[toolName]
	const schema = tool.inputSchema

	// coerce primitive input for single-field tools
	// (`value != null` skips both null and undefined: a missing value is not a primitive to wrap)
	if (schema instanceof z.ZodObject && value != null && typeof value !== 'object') {
		const shape = schema.shape as Record<string, z.ZodType>
		const fieldNames = Object.keys(shape)

		// Prefer the first required field (one that rejects `undefined`).
		const requiredKey = fieldNames.find((k) => !shape[k].safeParse(undefined).success)

		// Fields with defaults/optionals accept `undefined`, so a single-field tool such as
		// `wait` has no "required" key; fall back to its only field in that case.
		const targetKey = requiredKey ?? (fieldNames.length === 1 ? fieldNames[0] : undefined)

		if (targetKey) {
			console.log(
				chalk.yellow(`[normalizeResponse] coercing primitive action input for "${toolName}"`)
			)
			value = { [targetKey]: value }
		}
	}

	const result = schema.safeParse(value)
	if (!result.success) {
		throw new InvokeError(
			InvokeErrorType.INVALID_TOOL_ARGS,
			`Invalid input for action "${toolName}": ${z.prettifyError(result.error)}`
		)
	}

	return { [toolName]: result.data }
}
/**
* Safely parse JSON, return original input if not json.
*/

View File

@@ -17,25 +17,25 @@
"@radix-ui/react-slot": "^1.2.4",
"@radix-ui/react-switch": "^1.2.6",
"@radix-ui/react-tooltip": "^1.2.8",
"@types/chrome": "^0.1.34",
"@types/react": "^19.2.13",
"@types/chrome": "^0.1.37",
"@types/react": "^19.2.14",
"@types/react-dom": "^19.2.1",
"@wxt-dev/module-react": "^1.1.5",
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"idb": "^8.0.3",
"lucide-react": "^0.563.0",
"motion": "^12.34.0",
"lucide-react": "^0.575.0",
"motion": "^12.34.3",
"next-themes": "^0.4.6",
"react": "^19.2.4",
"react-dom": "^19.2.4",
"rough-notation": "^0.5.1",
"simple-icons": "^16.8.0",
"simple-icons": "^16.9.0",
"sonner": "^2.0.7",
"tailwind-merge": "^3.4.0",
"tailwind-merge": "^3.5.0",
"tailwindcss": "^4.1.14",
"tw-animate-css": "^1.4.0",
"wxt": "^0.20.14"
"wxt": "^0.20.18"
},
"dependencies": {
"@page-agent/core": "1.3.0",

View File

@@ -6,14 +6,14 @@
* - switch_to_tab: Switch to an existing tab
* - close_tab: Close a tab (optionally switch to another)
*/
import * as zod from 'zod'
import * as z from 'zod'
import type { TabsController } from './TabsController'
/** Tool definition compatible with PageAgentCore customTools */
interface TabTool {
description: string
inputSchema: zod.ZodType
inputSchema: z.ZodType
execute: (input: unknown) => Promise<string>
}
@@ -26,8 +26,8 @@ export function createTabTools(tabsController: TabsController): Record<string, T
open_new_tab: {
description:
'Open a new browser tab with the specified URL. The new tab becomes the current tab for all subsequent page operations.',
inputSchema: zod.object({
url: zod.string().describe('The URL to open in the new tab'),
inputSchema: z.object({
url: z.string().describe('The URL to open in the new tab'),
}),
execute: async (input: unknown) => {
const { url } = input as { url: string }
@@ -42,8 +42,8 @@ export function createTabTools(tabsController: TabsController): Record<string, T
switch_to_tab: {
description:
'Switch to an existing tab by its ID. After switching, all page operations will target the new current tab. You can only switch to tabs in the tab list shown in browser state.',
inputSchema: zod.object({
tab_id: zod.number().int().describe('The tab ID to switch to'),
inputSchema: z.object({
tab_id: z.number().int().describe('The tab ID to switch to'),
}),
execute: async (input: unknown) => {
const { tab_id } = input as { tab_id: number }
@@ -58,8 +58,8 @@ export function createTabTools(tabsController: TabsController): Record<string, T
close_tab: {
description:
'Close a tab by its ID. Cannot close the initial tab. Optionally specify which tab to switch to after closing.',
inputSchema: zod.object({
tab_id: zod.number().int().describe('The tab ID to close'),
inputSchema: z.object({
tab_id: z.number().int().describe('The tab ID to close'),
}),
execute: async (input: unknown) => {
const { tab_id } = input as { tab_id: number }

View File

@@ -1,6 +1,8 @@
/**
* OpenAI Client implementation
*/
import * as z from 'zod'
import { InvokeError, InvokeErrorType } from './errors'
import type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types'
import { modelPatch, zodToOpenAITool } from './utils'
@@ -182,7 +184,7 @@ export class OpenAIClient implements LLMClient {
// Validate with schema
const validation = tool.inputSchema.safeParse(parsedArgs)
if (!validation.success) {
console.error(validation.error)
console.error(z.prettifyError(validation.error))
throw new InvokeError(
InvokeErrorType.INVALID_TOOL_ARGS,
'Tool arguments validation failed',

View File

@@ -1 +0,0 @@
/// <reference types="vite/client" />

View File

@@ -1,9 +1,10 @@
import { OpenAIClient } from './OpenAIClient'
import { DEFAULT_TEMPERATURE, LLM_MAX_RETRIES } from './constants'
import { InvokeError } from './errors'
import { InvokeError, InvokeErrorType } from './errors'
import type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool } from './types'
export type { InvokeError, InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool }
export { InvokeError, InvokeErrorType }
export type { InvokeOptions, InvokeResult, LLMClient, LLMConfig, Message, Tool }
export function parseLLMConfig(config: LLMConfig): Required<LLMConfig> {
// Runtime validation as defensive programming (types already guarantee these)

View File

@@ -1,7 +1,7 @@
/**
* Core types for LLM integration
*/
import type { z } from 'zod'
import type * as z from 'zod'
/**
* Message format - OpenAI standard (industry standard)

View File

@@ -15,19 +15,19 @@
"@radix-ui/react-slot": "^1.2.4",
"@radix-ui/react-switch": "^1.2.6",
"@radix-ui/react-tooltip": "^1.2.8",
"@types/react": "^19.2.13",
"@types/react": "^19.2.14",
"@types/react-dom": "^19.2.1",
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"lucide-react": "^0.563.0",
"motion": "^12.34.0",
"lucide-react": "^0.575.0",
"motion": "^12.34.3",
"next-themes": "^0.4.6",
"react": "^19.2.4",
"react-dom": "^19.2.4",
"rough-notation": "^0.5.1",
"simple-icons": "^16.8.0",
"simple-icons": "^16.9.0",
"sonner": "^2.0.7",
"tailwind-merge": "^3.4.0",
"tailwind-merge": "^3.5.0",
"tailwindcss": "^4.1.14",
"tw-animate-css": "^1.4.0",
"wouter": "^3.9.0"