Merge pull request #390 from alibaba/fix/controller-improve-scrolling

feat(controller): improve scroll container detection and tool guidance
This commit is contained in:
Simon
2026-04-03 17:29:38 +08:00
committed by GitHub
3 changed files with 60 additions and 18 deletions

View File

@@ -131,7 +131,8 @@ tools.set(
tools.set(
'scroll',
tool({
description: 'Scroll the page vertically. Use index for scroll elements (dropdowns/custom UI).',
description:
'Scroll vertically. Without index: scrolls the document. With index: scrolls the container at that index (or its nearest scrollable ancestor). Use index of a data-scrollable element to scroll a specific area.',
inputSchema: z.object({
down: z.boolean().default(true),
num_pages: z.number().min(0).max(10).optional().default(0.1),
@@ -155,7 +156,7 @@ tools.set(
'scroll_horizontally',
tool({
description:
'Scroll the page horizontally, or within a specific element by index. Useful for wide tables.',
'Scroll horizontally. Without index: scrolls the document. With index: scrolls the container at that index (or its nearest scrollable ancestor). Use index of a data-scrollable element to scroll a specific area.',
inputSchema: z.object({
right: z.boolean().default(true),
pixels: z.number().int().min(0),

View File

@@ -287,7 +287,10 @@ export async function scrollVertically(scroll_amount: number, element?: HTMLElem
while (currentElement && attempts < 10) {
const computedStyle = window.getComputedStyle(currentElement)
const hasScrollableY = /(auto|scroll|overlay)/.test(computedStyle.overflowY)
const hasScrollableY =
/(auto|scroll|overlay)/.test(computedStyle.overflowY) ||
(computedStyle.scrollbarWidth && computedStyle.scrollbarWidth !== 'auto') ||
(computedStyle.scrollbarGutter && computedStyle.scrollbarGutter !== 'auto')
const canScrollVertically = currentElement.scrollHeight > currentElement.clientHeight
if (hasScrollableY && canScrollVertically) {
@@ -339,9 +342,20 @@ export async function scrollVertically(scroll_amount: number, element?: HTMLElem
el.scrollHeight > el.clientHeight &&
bigEnough(el)
// @deprecated Heuristic container search.
// Unreliable in multi-panel layouts. Should guide LLMs to use indexed scroll for consistency.
// TODO: remove this fallback
// try to find the nearest scrollable container
// document.activeElement is usually body.
// After a successful element.focus(), activeElement become the nearest focusable parent
let el: HTMLElement | null = document.activeElement as HTMLElement | null
while (el && !canScroll(el) && el !== document.body) el = el.parentElement
// Something is wrong if it falls back to global '*' search
// TODO: Return error message instead of global '*' search
el = canScroll(el)
? el
: Array.from(document.querySelectorAll<HTMLElement>('*')).find(canScroll) ||
@@ -372,6 +386,10 @@ export async function scrollVertically(scroll_amount: number, element?: HTMLElem
return `✅ Scrolled page by ${scrolled}px.`
} else {
// Container scroll
const warningMsg = `The document is not scrollable. Falling back to container scroll.`
console.log(`[PageController] ${warningMsg}`)
const scrollBefore = el!.scrollTop
const scrollMax = el!.scrollHeight - el!.clientHeight
@@ -383,18 +401,18 @@ export async function scrollVertically(scroll_amount: number, element?: HTMLElem
if (Math.abs(scrolled) < 1) {
return dy > 0
? `⚠️ Already at the bottom of container (${el!.tagName}), cannot scroll down further.`
: `⚠️ Already at the top of container (${el!.tagName}), cannot scroll up further.`
? `⚠️ ${warningMsg} Already at the bottom of container (${el!.tagName}), cannot scroll down further.`
: `⚠️ ${warningMsg} Already at the top of container (${el!.tagName}), cannot scroll up further.`
}
const reachedBottom = dy > 0 && scrollAfter >= scrollMax - 1
const reachedTop = dy < 0 && scrollAfter <= 1
if (reachedBottom)
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the bottom.`
return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the bottom.`
if (reachedTop)
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the top.`
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px.`
return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the top.`
return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px.`
}
}
@@ -411,7 +429,10 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
while (currentElement && attempts < 10) {
const computedStyle = window.getComputedStyle(currentElement)
const hasScrollableX = /(auto|scroll|overlay)/.test(computedStyle.overflowX)
const hasScrollableX =
/(auto|scroll|overlay)/.test(computedStyle.overflowX) ||
(computedStyle.scrollbarWidth && computedStyle.scrollbarWidth !== 'auto') ||
(computedStyle.scrollbarGutter && computedStyle.scrollbarGutter !== 'auto')
const canScrollHorizontally = currentElement.scrollWidth > currentElement.clientWidth
if (hasScrollableX && canScrollHorizontally) {
@@ -456,6 +477,7 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
// Page-level scrolling (default or fallback)
const dx = scroll_amount
const bigEnough = (el: HTMLElement) => el.clientWidth >= window.innerWidth * 0.5
const canScroll = (el: HTMLElement | null) =>
el &&
@@ -463,6 +485,9 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
el.scrollWidth > el.clientWidth &&
bigEnough(el)
// @deprecated Same heuristic container search as scrollVertically.
// TODO: Remove once LLMs reliably use indexed scrolling via data-scrollable.
let el: HTMLElement | null = document.activeElement as HTMLElement | null
while (el && !canScroll(el) && el !== document.body) el = el.parentElement
@@ -497,6 +522,9 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
return `✅ Scrolled page horizontally by ${scrolled}px.`
} else {
// Container scroll
const warningMsg = `The document is not scrollable. Falling back to container scroll.`
console.log(`[PageController] ${warningMsg}`)
const scrollBefore = el!.scrollLeft
const scrollMax = el!.scrollWidth - el!.clientWidth
@@ -508,17 +536,17 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
if (Math.abs(scrolled) < 1) {
return dx > 0
? `⚠️ Already at the right edge of container (${el!.tagName}), cannot scroll right further.`
: `⚠️ Already at the left edge of container (${el!.tagName}), cannot scroll left further.`
? `⚠️ ${warningMsg} Already at the right edge of container (${el!.tagName}), cannot scroll right further.`
: `⚠️ ${warningMsg} Already at the left edge of container (${el!.tagName}), cannot scroll left further.`
}
const reachedRight = dx > 0 && scrollAfter >= scrollMax - 1
const reachedLeft = dx < 0 && scrollAfter <= 1
if (reachedRight)
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the right edge.`
return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the right edge.`
if (reachedLeft)
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the left edge.`
return `✅ Scrolled container (${el!.tagName}) horizontally by ${scrolled}px.`
return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the left edge.`
return ` ${warningMsg} Scrolled container (${el!.tagName}) horizontally by ${scrolled}px.`
}
}

View File

@@ -503,11 +503,16 @@ export default (
const overflowX = style.overflowX
const overflowY = style.overflowY
// Check scrollable distances
// scrollbar-width/scrollbar-gutter are only set on elements designed to scroll;
// their presence signals scroll intent even when overflow is hidden (e.g. overflow: auto on :hover)
const hasScrollbarSignal =
(style.scrollbarWidth && style.scrollbarWidth !== 'auto') ||
(style.scrollbarGutter && style.scrollbarGutter !== 'auto')
const scrollableX = overflowX === 'auto' || overflowX === 'scroll'
const scrollableY = overflowY === 'auto' || overflowY === 'scroll'
if (!scrollableX && !scrollableY) {
if (!scrollableX && !scrollableY && !hasScrollbarSignal) {
return null // Not scrollable in any direction
}
@@ -521,11 +526,11 @@ export default (
return null // Not scrollable
}
if (!scrollableY && scrollWidth < threshold) {
if (!scrollableY && !hasScrollbarSignal && scrollWidth < threshold) {
return null // Not scrollable horizontally
}
if (!scrollableX && scrollHeight < threshold) {
if (!scrollableX && !hasScrollbarSignal && scrollHeight < threshold) {
return null // Not scrollable vertically
}
@@ -547,6 +552,8 @@ export default (
scrollData: scrollData,
})
console.log('scrollData!!!', scrollData)
return scrollData
}
@@ -1378,6 +1385,12 @@ export default (
return true
}
// Scrollable containers are always distinct — the LLM needs their index for targeted scrolling.
// Check extraData (already set by isScrollableElement in isInteractiveElement) to avoid redundant layout reads.
if (extraData.get(element)?.scrollable) {
return true
}
// Default to false: if it's interactive but doesn't match above,
// assume it triggers the same action as the parent.
return false