feat(controller): improve scroll action

This commit is contained in:
Simon
2026-04-02 22:05:47 +08:00
parent b8fd1aaebc
commit 85a33ac1a4
3 changed files with 41 additions and 12 deletions

View File

@@ -131,7 +131,8 @@ tools.set(
tools.set(
'scroll',
tool({
description: 'Scroll the page vertically. Use index for scroll elements (dropdowns/custom UI).',
description:
'Scroll vertically. Without index: scrolls the document. With index: scrolls the container at that index (or its nearest scrollable ancestor). Use index of a data-scrollable element to scroll a specific area.',
inputSchema: z.object({
down: z.boolean().default(true),
num_pages: z.number().min(0).max(10).optional().default(0.1),
@@ -155,7 +156,7 @@ tools.set(
'scroll_horizontally',
tool({
description:
'Scroll the page horizontally, or within a specific element by index. Useful for wide tables.',
'Scroll horizontally. Without index: scrolls the document. With index: scrolls the container at that index (or its nearest scrollable ancestor). Use index of a data-scrollable element to scroll a specific area.',
inputSchema: z.object({
right: z.boolean().default(true),
pixels: z.number().int().min(0),

View File

@@ -339,9 +339,20 @@ export async function scrollVertically(scroll_amount: number, element?: HTMLElem
el.scrollHeight > el.clientHeight &&
bigEnough(el)
// @deprecated Heuristic container search.
// Unreliable in multi-panel layouts. Should guide LLMs to use indexed scroll for consistency.
// TODO: remove this fallback
// try to find the nearest scrollable container
// document.activeElement is usually body.
// After a successful element.focus(), activeElement becomes the nearest focusable parent
let el: HTMLElement | null = document.activeElement as HTMLElement | null
while (el && !canScroll(el) && el !== document.body) el = el.parentElement
// Something is wrong if it falls back to global '*' search
// TODO: Return error message instead of global '*' search
el = canScroll(el)
? el
: Array.from(document.querySelectorAll<HTMLElement>('*')).find(canScroll) ||
@@ -372,6 +383,10 @@ export async function scrollVertically(scroll_amount: number, element?: HTMLElem
return `✅ Scrolled page by ${scrolled}px.`
} else {
// Container scroll
const warningMsg = `The document is not scrollable. Falling back to container scroll.`
console.log(`[PageController] ${warningMsg}`)
const scrollBefore = el!.scrollTop
const scrollMax = el!.scrollHeight - el!.clientHeight
@@ -383,18 +398,18 @@ export async function scrollVertically(scroll_amount: number, element?: HTMLElem
if (Math.abs(scrolled) < 1) {
return dy > 0
? `⚠️ Already at the bottom of container (${el!.tagName}), cannot scroll down further.`
: `⚠️ Already at the top of container (${el!.tagName}), cannot scroll up further.`
? `⚠️ ${warningMsg} Already at the bottom of container (${el!.tagName}), cannot scroll down further.`
: `⚠️ ${warningMsg} Already at the top of container (${el!.tagName}), cannot scroll up further.`
}
const reachedBottom = dy > 0 && scrollAfter >= scrollMax - 1
const reachedTop = dy < 0 && scrollAfter <= 1
if (reachedBottom)
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the bottom.`
return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the bottom.`
if (reachedTop)
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the top.`
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px.`
return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the top.`
return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px.`
}
}
@@ -456,6 +471,7 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
// Page-level scrolling (default or fallback)
const dx = scroll_amount
const bigEnough = (el: HTMLElement) => el.clientWidth >= window.innerWidth * 0.5
const canScroll = (el: HTMLElement | null) =>
el &&
@@ -463,6 +479,9 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
el.scrollWidth > el.clientWidth &&
bigEnough(el)
// @deprecated Same heuristic container search as scrollVertically.
// TODO: Remove once LLMs reliably use indexed scrolling via data-scrollable.
let el: HTMLElement | null = document.activeElement as HTMLElement | null
while (el && !canScroll(el) && el !== document.body) el = el.parentElement
@@ -497,6 +516,9 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
return `✅ Scrolled page horizontally by ${scrolled}px.`
} else {
// Container scroll
const warningMsg = `The document is not scrollable. Falling back to container scroll.`
console.log(`[PageController] ${warningMsg}`)
const scrollBefore = el!.scrollLeft
const scrollMax = el!.scrollWidth - el!.clientWidth
@@ -508,17 +530,17 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
if (Math.abs(scrolled) < 1) {
return dx > 0
? `⚠️ Already at the right edge of container (${el!.tagName}), cannot scroll right further.`
: `⚠️ Already at the left edge of container (${el!.tagName}), cannot scroll left further.`
? `⚠️ ${warningMsg} Already at the right edge of container (${el!.tagName}), cannot scroll right further.`
: `⚠️ ${warningMsg} Already at the left edge of container (${el!.tagName}), cannot scroll left further.`
}
const reachedRight = dx > 0 && scrollAfter >= scrollMax - 1
const reachedLeft = dx < 0 && scrollAfter <= 1
if (reachedRight)
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the right edge.`
return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the right edge.`
if (reachedLeft)
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the left edge.`
return `✅ Scrolled container (${el!.tagName}) horizontally by ${scrolled}px.`
return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the left edge.`
return ` ${warningMsg} Scrolled container (${el!.tagName}) horizontally by ${scrolled}px.`
}
}

View File

@@ -1378,6 +1378,12 @@ export default (
return true
}
// Scrollable containers are always distinct — the LLM needs their index for targeted scrolling.
// Check extraData (already set by isScrollableElement in isInteractiveElement) to avoid redundant layout reads.
if (extraData.get(element)?.scrollable) {
return true
}
// Default to false: if it's interactive but doesn't match above,
// assume it triggers the same action as the parent.
return false