feat(core): experimental support for llms.txt (#157)

* feat(core): experimental support for llms.txt

* docs: experimentalLlmsTxt
This commit is contained in:
Simon
2026-03-04 18:53:24 +08:00
committed by GitHub
parent b58d2a09ef
commit 09bdf9ddaf
5 changed files with 57 additions and 7 deletions

View File

@@ -122,9 +122,10 @@ PageAgent is now ready for production use. The API is stable and breaking change
- **Ask User Tool** - Agent can ask users for clarification - **Ask User Tool** - Agent can ask users for clarification
- **i18n Support** - English and Chinese localization - **i18n Support** - English and Chinese localization
### Configuration ### Configuration
```typescript ```typescript
// Version 1.0.0
interface PageAgentConfig { interface PageAgentConfig {
// LLM Configuration (required) // LLM Configuration (required)
baseURL: string baseURL: string

View File

@@ -21,7 +21,7 @@ import type {
MacroToolInput, MacroToolInput,
MacroToolResult, MacroToolResult,
} from './types' } from './types'
import { assert, normalizeResponse, uid, waitFor } from './utils' import { assert, fetchLlmsTxt, normalizeResponse, uid, waitFor } from './utils'
export { type PageAgentConfig } export { type PageAgentConfig }
export type { SupportedLanguage } export type { SupportedLanguage }
@@ -222,6 +222,7 @@ export class PageAgentCore extends EventTarget {
this.history = [] this.history = []
this.#setStatus('running') this.#setStatus('running')
this.#emitHistoryChange() this.#emitHistoryChange()
this.#observations = []
// Reset internal states // Reset internal states
this.#states = { totalWaitTime: 0, lastURL: '', browserState: null } this.#states = { totalWaitTime: 0, lastURL: '', browserState: null }
@@ -462,14 +463,13 @@ export class PageAgentCore extends EventTarget {
* Get instructions from config * Get instructions from config
*/ */
async #getInstructions(): Promise<string> { async #getInstructions(): Promise<string> {
const { instructions } = this.config const { instructions, experimentalLlmsTxt } = this.config
if (!instructions) return ''
const systemInstructions = instructions.system?.trim() const systemInstructions = instructions?.system?.trim()
let pageInstructions: string | undefined let pageInstructions: string | undefined
const url = this.#states.browserState?.url || '' const url = this.#states.browserState?.url || ''
if (instructions.getPageInstructions && url) { if (instructions?.getPageInstructions && url) {
try { try {
pageInstructions = instructions.getPageInstructions(url)?.trim() pageInstructions = instructions.getPageInstructions(url)?.trim()
} catch (error) { } catch (error) {
@@ -479,7 +479,10 @@ export class PageAgentCore extends EventTarget {
) )
} }
} }
if (!systemInstructions && !pageInstructions) return ''
const llmsTxt = experimentalLlmsTxt && url ? await fetchLlmsTxt(url) : undefined
if (!systemInstructions && !pageInstructions && !llmsTxt) return ''
let result = '<instructions>\n' let result = '<instructions>\n'
@@ -491,6 +494,10 @@ export class PageAgentCore extends EventTarget {
result += `<page_instructions>\n${pageInstructions}\n</page_instructions>\n` result += `<page_instructions>\n${pageInstructions}\n</page_instructions>\n`
} }
if (llmsTxt) {
result += `<llms_txt>\n${llmsTxt}\n</llms_txt>\n`
}
result += '</instructions>\n\n' result += '</instructions>\n\n'
return result return result

View File

@@ -126,6 +126,14 @@ export interface AgentConfig {
*/ */
experimentalScriptExecutionTool?: boolean experimentalScriptExecutionTool?: boolean
/**
* @experimental
* Fetch /llms.txt from current site origin and include as context.
* Only fetched once per origin per task.
* @default false
*/
experimentalLlmsTxt?: boolean
/** /**
* Transform page content before sending to LLM. * Transform page content before sending to LLM.
* Called after DOM extraction and simplification, before LLM invocation. * Called after DOM extraction and simplification, before LLM invocation.

View File

@@ -57,6 +57,31 @@ export function uid() {
return id return id
} }
/** Per-origin cache of llms.txt lookups. `null` = tried and nothing usable was retrieved. */
const llmsTxtCache = new Map<string, string | null>()

/**
 * Fetch `/llms.txt` for a URL's origin.
 *
 * The lookup is best-effort: this function never rejects. Results (including
 * failures) are cached per origin in a module-level map, so each origin is
 * requested at most once for as long as the cache entry exists.
 *
 * @param url - Any URL on the site of interest; only its origin is used.
 * @returns The llms.txt body, or `null` when the URL is malformed, is not
 *   http(s), the response is not OK, or the request fails / times out (3s).
 */
export async function fetchLlmsTxt(url: string): Promise<string | null> {
	let parsed: URL
	try {
		parsed = new URL(url)
	} catch {
		// Malformed URL: there is no origin to query, and throwing here would
		// break the caller for what is only optional context.
		return null
	}

	// Non-http(s) URLs (about:, file:, data:, blob:, ...) have an opaque origin that
	// serializes to the string "null"; `null/llms.txt` would then be fetched as a
	// path relative to the current document. Skip them entirely.
	if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return null

	const origin = parsed.origin

	// `undefined` means "never tried"; a cached `null` is a valid negative result.
	const cached = llmsTxtCache.get(origin)
	if (cached !== undefined) return cached

	const endpoint = `${origin}/llms.txt`
	let result: string | null = null

	try {
		console.log(chalk.gray(`[llms.txt] Fetching ${endpoint}`))
		const res = await fetch(endpoint, { signal: AbortSignal.timeout(3000) })
		if (res.ok) {
			result = await res.text()
			console.log(chalk.green(`[llms.txt] Found (${result.length} chars)`))
		} else {
			console.log(chalk.gray(`[llms.txt] ${res.status} for ${endpoint}`))
		}
	} catch (e: unknown) {
		// Network error, CORS rejection or timeout: log and fall through with `null`.
		console.log(chalk.gray(`[llms.txt] Failed for ${endpoint}`), e)
	}

	// Failures are cached too, so a missing or unreachable file is not re-requested.
	llmsTxtCache.set(origin, result)
	return result
}
/** /**
* Simple assertion function that throws an error if the condition is falsy * Simple assertion function that throws an error if the condition is falsy
* @param condition - The condition to assert * @param condition - The condition to assert

View File

@@ -185,6 +185,15 @@ const result = await agent.execute('Fill in the form with test data')`}
? '启用实验性 JavaScript 执行工具' ? '启用实验性 JavaScript 执行工具'
: 'Enable experimental JavaScript execution tool', : 'Enable experimental JavaScript execution tool',
}, },
{
name: 'experimentalLlmsTxt',
type: 'boolean',
defaultValue: 'false',
status: 'experimental',
description: isZh
? '从当前站点根目录获取 /llms.txt 并作为上下文提供给 LLM每个 origin 仅请求一次'
: 'Fetch /llms.txt from site origin and include as LLM context, fetched once per origin',
},
]} ]}
/> />