Merge pull request #390 from alibaba/fix/controller-improve-scrolling

feat(controller): improve scroll container detection and tool guidance
This commit is contained in:
Simon
2026-04-03 17:29:38 +08:00
committed by GitHub
3 changed files with 60 additions and 18 deletions

View File

@@ -131,7 +131,8 @@ tools.set(
tools.set( tools.set(
'scroll', 'scroll',
tool({ tool({
description: 'Scroll the page vertically. Use index for scroll elements (dropdowns/custom UI).', description:
'Scroll vertically. Without index: scrolls the document. With index: scrolls the container at that index (or its nearest scrollable ancestor). Use index of a data-scrollable element to scroll a specific area.',
inputSchema: z.object({ inputSchema: z.object({
down: z.boolean().default(true), down: z.boolean().default(true),
num_pages: z.number().min(0).max(10).optional().default(0.1), num_pages: z.number().min(0).max(10).optional().default(0.1),
@@ -155,7 +156,7 @@ tools.set(
'scroll_horizontally', 'scroll_horizontally',
tool({ tool({
description: description:
'Scroll the page horizontally, or within a specific element by index. Useful for wide tables.', 'Scroll horizontally. Without index: scrolls the document. With index: scrolls the container at that index (or its nearest scrollable ancestor). Use index of a data-scrollable element to scroll a specific area.',
inputSchema: z.object({ inputSchema: z.object({
right: z.boolean().default(true), right: z.boolean().default(true),
pixels: z.number().int().min(0), pixels: z.number().int().min(0),

View File

@@ -287,7 +287,10 @@ export async function scrollVertically(scroll_amount: number, element?: HTMLElem
while (currentElement && attempts < 10) { while (currentElement && attempts < 10) {
const computedStyle = window.getComputedStyle(currentElement) const computedStyle = window.getComputedStyle(currentElement)
const hasScrollableY = /(auto|scroll|overlay)/.test(computedStyle.overflowY) const hasScrollableY =
/(auto|scroll|overlay)/.test(computedStyle.overflowY) ||
(computedStyle.scrollbarWidth && computedStyle.scrollbarWidth !== 'auto') ||
(computedStyle.scrollbarGutter && computedStyle.scrollbarGutter !== 'auto')
const canScrollVertically = currentElement.scrollHeight > currentElement.clientHeight const canScrollVertically = currentElement.scrollHeight > currentElement.clientHeight
if (hasScrollableY && canScrollVertically) { if (hasScrollableY && canScrollVertically) {
@@ -339,9 +342,20 @@ export async function scrollVertically(scroll_amount: number, element?: HTMLElem
el.scrollHeight > el.clientHeight && el.scrollHeight > el.clientHeight &&
bigEnough(el) bigEnough(el)
// @deprecated Heuristic container search.
// Unreliable in multi-panel layouts. Should guide LLMs to use indexed scroll for consistency.
// TODO: remove this fallback
// try to find the nearest scrollable container
// document.activeElement is usually body.
// After a successful element.focus(), activeElement become the nearest focusable parent
let el: HTMLElement | null = document.activeElement as HTMLElement | null let el: HTMLElement | null = document.activeElement as HTMLElement | null
while (el && !canScroll(el) && el !== document.body) el = el.parentElement while (el && !canScroll(el) && el !== document.body) el = el.parentElement
// Something is wrong if it falls back to global '*' search
// TODO: Return error message instead of global '*' search
el = canScroll(el) el = canScroll(el)
? el ? el
: Array.from(document.querySelectorAll<HTMLElement>('*')).find(canScroll) || : Array.from(document.querySelectorAll<HTMLElement>('*')).find(canScroll) ||
@@ -372,6 +386,10 @@ export async function scrollVertically(scroll_amount: number, element?: HTMLElem
return `✅ Scrolled page by ${scrolled}px.` return `✅ Scrolled page by ${scrolled}px.`
} else { } else {
// Container scroll // Container scroll
const warningMsg = `The document is not scrollable. Falling back to container scroll.`
console.log(`[PageController] ${warningMsg}`)
const scrollBefore = el!.scrollTop const scrollBefore = el!.scrollTop
const scrollMax = el!.scrollHeight - el!.clientHeight const scrollMax = el!.scrollHeight - el!.clientHeight
@@ -383,18 +401,18 @@ export async function scrollVertically(scroll_amount: number, element?: HTMLElem
if (Math.abs(scrolled) < 1) { if (Math.abs(scrolled) < 1) {
return dy > 0 return dy > 0
? `⚠️ Already at the bottom of container (${el!.tagName}), cannot scroll down further.` ? `⚠️ ${warningMsg} Already at the bottom of container (${el!.tagName}), cannot scroll down further.`
: `⚠️ Already at the top of container (${el!.tagName}), cannot scroll up further.` : `⚠️ ${warningMsg} Already at the top of container (${el!.tagName}), cannot scroll up further.`
} }
const reachedBottom = dy > 0 && scrollAfter >= scrollMax - 1 const reachedBottom = dy > 0 && scrollAfter >= scrollMax - 1
const reachedTop = dy < 0 && scrollAfter <= 1 const reachedTop = dy < 0 && scrollAfter <= 1
if (reachedBottom) if (reachedBottom)
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the bottom.` return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the bottom.`
if (reachedTop) if (reachedTop)
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the top.` return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the top.`
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px.` return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px.`
} }
} }
@@ -411,7 +429,10 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
while (currentElement && attempts < 10) { while (currentElement && attempts < 10) {
const computedStyle = window.getComputedStyle(currentElement) const computedStyle = window.getComputedStyle(currentElement)
const hasScrollableX = /(auto|scroll|overlay)/.test(computedStyle.overflowX) const hasScrollableX =
/(auto|scroll|overlay)/.test(computedStyle.overflowX) ||
(computedStyle.scrollbarWidth && computedStyle.scrollbarWidth !== 'auto') ||
(computedStyle.scrollbarGutter && computedStyle.scrollbarGutter !== 'auto')
const canScrollHorizontally = currentElement.scrollWidth > currentElement.clientWidth const canScrollHorizontally = currentElement.scrollWidth > currentElement.clientWidth
if (hasScrollableX && canScrollHorizontally) { if (hasScrollableX && canScrollHorizontally) {
@@ -456,6 +477,7 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
// Page-level scrolling (default or fallback) // Page-level scrolling (default or fallback)
const dx = scroll_amount const dx = scroll_amount
const bigEnough = (el: HTMLElement) => el.clientWidth >= window.innerWidth * 0.5 const bigEnough = (el: HTMLElement) => el.clientWidth >= window.innerWidth * 0.5
const canScroll = (el: HTMLElement | null) => const canScroll = (el: HTMLElement | null) =>
el && el &&
@@ -463,6 +485,9 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
el.scrollWidth > el.clientWidth && el.scrollWidth > el.clientWidth &&
bigEnough(el) bigEnough(el)
// @deprecated Same heuristic container search as scrollVertically.
// TODO: Remove once LLMs reliably use indexed scrolling via data-scrollable.
let el: HTMLElement | null = document.activeElement as HTMLElement | null let el: HTMLElement | null = document.activeElement as HTMLElement | null
while (el && !canScroll(el) && el !== document.body) el = el.parentElement while (el && !canScroll(el) && el !== document.body) el = el.parentElement
@@ -497,6 +522,9 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
return `✅ Scrolled page horizontally by ${scrolled}px.` return `✅ Scrolled page horizontally by ${scrolled}px.`
} else { } else {
// Container scroll // Container scroll
const warningMsg = `The document is not scrollable. Falling back to container scroll.`
console.log(`[PageController] ${warningMsg}`)
const scrollBefore = el!.scrollLeft const scrollBefore = el!.scrollLeft
const scrollMax = el!.scrollWidth - el!.clientWidth const scrollMax = el!.scrollWidth - el!.clientWidth
@@ -508,17 +536,17 @@ export async function scrollHorizontally(scroll_amount: number, element?: HTMLEl
if (Math.abs(scrolled) < 1) { if (Math.abs(scrolled) < 1) {
return dx > 0 return dx > 0
? `⚠️ Already at the right edge of container (${el!.tagName}), cannot scroll right further.` ? `⚠️ ${warningMsg} Already at the right edge of container (${el!.tagName}), cannot scroll right further.`
: `⚠️ Already at the left edge of container (${el!.tagName}), cannot scroll left further.` : `⚠️ ${warningMsg} Already at the left edge of container (${el!.tagName}), cannot scroll left further.`
} }
const reachedRight = dx > 0 && scrollAfter >= scrollMax - 1 const reachedRight = dx > 0 && scrollAfter >= scrollMax - 1
const reachedLeft = dx < 0 && scrollAfter <= 1 const reachedLeft = dx < 0 && scrollAfter <= 1
if (reachedRight) if (reachedRight)
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the right edge.` return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the right edge.`
if (reachedLeft) if (reachedLeft)
return `✅ Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the left edge.` return ` ${warningMsg} Scrolled container (${el!.tagName}) by ${scrolled}px. Reached the left edge.`
return `✅ Scrolled container (${el!.tagName}) horizontally by ${scrolled}px.` return ` ${warningMsg} Scrolled container (${el!.tagName}) horizontally by ${scrolled}px.`
} }
} }

View File

@@ -503,11 +503,16 @@ export default (
const overflowX = style.overflowX const overflowX = style.overflowX
const overflowY = style.overflowY const overflowY = style.overflowY
// Check scrollable distances // scrollbar-width/scrollbar-gutter are only set on elements designed to scroll;
// their presence signals scroll intent even when overflow is hidden (e.g. overflow: auto on :hover)
const hasScrollbarSignal =
(style.scrollbarWidth && style.scrollbarWidth !== 'auto') ||
(style.scrollbarGutter && style.scrollbarGutter !== 'auto')
const scrollableX = overflowX === 'auto' || overflowX === 'scroll' const scrollableX = overflowX === 'auto' || overflowX === 'scroll'
const scrollableY = overflowY === 'auto' || overflowY === 'scroll' const scrollableY = overflowY === 'auto' || overflowY === 'scroll'
if (!scrollableX && !scrollableY) { if (!scrollableX && !scrollableY && !hasScrollbarSignal) {
return null // Not scrollable in any direction return null // Not scrollable in any direction
} }
@@ -521,11 +526,11 @@ export default (
return null // Not scrollable return null // Not scrollable
} }
if (!scrollableY && scrollWidth < threshold) { if (!scrollableY && !hasScrollbarSignal && scrollWidth < threshold) {
return null // Not scrollable horizontally return null // Not scrollable horizontally
} }
if (!scrollableX && scrollHeight < threshold) { if (!scrollableX && !hasScrollbarSignal && scrollHeight < threshold) {
return null // Not scrollable vertically return null // Not scrollable vertically
} }
@@ -547,6 +552,8 @@ export default (
scrollData: scrollData, scrollData: scrollData,
}) })
console.log('scrollData!!!', scrollData)
return scrollData return scrollData
} }
@@ -1378,6 +1385,12 @@ export default (
return true return true
} }
// Scrollable containers are always distinct — the LLM needs their index for targeted scrolling.
// Check extraData (already set by isScrollableElement in isInteractiveElement) to avoid redundant layout reads.
if (extraData.get(element)?.scrollable) {
return true
}
// Default to false: if it's interactive but doesn't match above, // Default to false: if it's interactive but doesn't match above,
// assume it triggers the same action as the parent. // assume it triggers the same action as the parent.
return false return false