From 09bdf9ddafffc72398b2f70bea22fe1afc5ecdab Mon Sep 17 00:00:00 2001 From: Simon <10131203+gaomeng1900@users.noreply.github.com> Date: Wed, 4 Mar 2026 18:53:24 +0800 Subject: [PATCH] feat(core): experimental support for llms.txt (#157) * feat(core): experimental support for llms.txt * docs: experimentalLlmsTxt --- docs/CHANGELOG.md | 3 ++- packages/core/src/PageAgentCore.ts | 19 +++++++++----- packages/core/src/config/index.ts | 8 ++++++ packages/core/src/utils/index.ts | 25 +++++++++++++++++++ .../docs/advanced/page-agent-core/page.tsx | 9 +++++++ 5 files changed, 57 insertions(+), 7 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 07f0611..b9bd8c5 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -122,9 +122,10 @@ PageAgent is now ready for production use. The API is stable and breaking change - **Ask User Tool** - Agent can ask users for clarification - **i18n Support** - English and Chinese localization -### Configuration +### Configuration ```typescript +// Version 1.0.0 interface PageAgentConfig { // LLM Configuration (required) baseURL: string diff --git a/packages/core/src/PageAgentCore.ts b/packages/core/src/PageAgentCore.ts index 04f88f2..2571d67 100644 --- a/packages/core/src/PageAgentCore.ts +++ b/packages/core/src/PageAgentCore.ts @@ -21,7 +21,7 @@ import type { MacroToolInput, MacroToolResult, } from './types' -import { assert, normalizeResponse, uid, waitFor } from './utils' +import { assert, fetchLlmsTxt, normalizeResponse, uid, waitFor } from './utils' export { type PageAgentConfig } export type { SupportedLanguage } @@ -222,6 +222,7 @@ export class PageAgentCore extends EventTarget { this.history = [] this.#setStatus('running') this.#emitHistoryChange() + this.#observations = [] // Reset internal states this.#states = { totalWaitTime: 0, lastURL: '', browserState: null } @@ -462,14 +463,13 @@ export class PageAgentCore extends EventTarget { * Get instructions from config */ async #getInstructions(): Promise { - const 
{ instructions } = this.config - if (!instructions) return '' + const { instructions, experimentalLlmsTxt } = this.config - const systemInstructions = instructions.system?.trim() + const systemInstructions = instructions?.system?.trim() let pageInstructions: string | undefined const url = this.#states.browserState?.url || '' - if (instructions.getPageInstructions && url) { + if (instructions?.getPageInstructions && url) { try { pageInstructions = instructions.getPageInstructions(url)?.trim() } catch (error) { @@ -479,7 +479,10 @@ export class PageAgentCore extends EventTarget { ) } } - if (!systemInstructions && !pageInstructions) return '' + + const llmsTxt = experimentalLlmsTxt && url ? await fetchLlmsTxt(url) : undefined + + if (!systemInstructions && !pageInstructions && !llmsTxt) return '' let result = '\n' @@ -491,6 +494,10 @@ export class PageAgentCore extends EventTarget { result += `\n${pageInstructions}\n\n` } + if (llmsTxt) { + result += `\n${llmsTxt}\n\n` + } + result += '\n\n' return result diff --git a/packages/core/src/config/index.ts b/packages/core/src/config/index.ts index 9db66b1..0ad553c 100644 --- a/packages/core/src/config/index.ts +++ b/packages/core/src/config/index.ts @@ -126,6 +126,14 @@ export interface AgentConfig { */ experimentalScriptExecutionTool?: boolean + /** + * @experimental + * Fetch /llms.txt from current site origin and include as context. + * Only fetched once per origin per task. + * @default false + */ + experimentalLlmsTxt?: boolean + /** * Transform page content before sending to LLM. * Called after DOM extraction and simplification, before LLM invocation. diff --git a/packages/core/src/utils/index.ts b/packages/core/src/utils/index.ts index 7644849..e62d819 100644 --- a/packages/core/src/utils/index.ts +++ b/packages/core/src/utils/index.ts @@ -57,6 +57,31 @@ export function uid() { return id } +const llmsTxtCache = new Map() + +/** Fetch /llms.txt for a URL's origin. Cached per origin, `null` = tried and not found. 
*/
+export async function fetchLlmsTxt(url: string): Promise<string | null> {
+	// Best-effort lookup: a malformed URL must not reject and abort the caller,
+	// so parse failures are reported as "no llms.txt" (and are not cached).
+	let parsed: URL
+	try {
+		parsed = new URL(url)
+	} catch {
+		return null
+	}
+	// Only http(s) pages can serve /llms.txt. Other schemes (about:, data:, file:)
+	// have an opaque origin that serializes to the string "null", which would
+	// otherwise produce the bogus relative endpoint "null/llms.txt".
+	if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return null
+
+	const origin = parsed.origin
+	// `undefined` = never tried; `null` = tried and not found (a valid cached value).
+	const cached = llmsTxtCache.get(origin)
+	if (cached !== undefined) return cached
+
+	const endpoint = `${origin}/llms.txt`
+	let result: string | null = null
+	try {
+		console.log(chalk.gray(`[llms.txt] Fetching ${endpoint}`))
+		// Hard 3s cap so a slow or hanging server cannot stall the agent step.
+		const res = await fetch(endpoint, { signal: AbortSignal.timeout(3000) })
+		if (res.ok) {
+			result = await res.text()
+			console.log(chalk.green(`[llms.txt] Found (${result.length} chars)`))
+		} else {
+			console.log(chalk.gray(`[llms.txt] ${res.status} for ${endpoint}`))
+		}
+	} catch (e) {
+		// Network error, CORS rejection or timeout: treated the same as "not found".
+		console.log(chalk.gray(`[llms.txt] Failed for ${endpoint}`), e)
+	}
+	// Misses are cached too, so each origin is requested at most once.
+	llmsTxtCache.set(origin, result)
+	return result
+}
+
 /** * Simple assertion function that throws an error if the condition is falsy * @param condition - The condition to assert diff --git a/packages/website/src/pages/docs/advanced/page-agent-core/page.tsx b/packages/website/src/pages/docs/advanced/page-agent-core/page.tsx index 9476041..7f41371 100644 --- a/packages/website/src/pages/docs/advanced/page-agent-core/page.tsx +++ b/packages/website/src/pages/docs/advanced/page-agent-core/page.tsx @@ -185,6 +185,15 @@ const result = await agent.execute('Fill in the form with test data')`} ? '启用实验性 JavaScript 执行工具' : 'Enable experimental JavaScript execution tool', }, + { + name: 'experimentalLlmsTxt', + type: 'boolean', + defaultValue: 'false', + status: 'experimental', + description: isZh + ? '从当前站点根目录获取 /llms.txt 并作为上下文提供给 LLM,每个 origin 仅请求一次' + : 'Fetch /llms.txt from site origin and include as LLM context, fetched once per origin', + }, ]} />