Fix overlapping images when merging screenshots (#2729)
This commit is contained in:
@@ -88,7 +88,7 @@ async def _scrolling_screenshots_helper(
|
|||||||
draw_boxes: bool = False,
|
draw_boxes: bool = False,
|
||||||
max_number: int = settings.MAX_NUM_SCREENSHOTS,
|
max_number: int = settings.MAX_NUM_SCREENSHOTS,
|
||||||
mode: ScreenshotMode = ScreenshotMode.DETAILED,
|
mode: ScreenshotMode = ScreenshotMode.DETAILED,
|
||||||
) -> list[bytes]:
|
) -> tuple[list[bytes], list[int]]:
|
||||||
skyvern_page = await SkyvernFrame.create_instance(frame=page)
|
skyvern_page = await SkyvernFrame.create_instance(frame=page)
|
||||||
# page is the main frame and the index must be 0
|
# page is the main frame and the index must be 0
|
||||||
assert isinstance(skyvern_page.frame, Page)
|
assert isinstance(skyvern_page.frame, Page)
|
||||||
@@ -101,6 +101,7 @@ async def _scrolling_screenshots_helper(
|
|||||||
draw_boxes = False
|
draw_boxes = False
|
||||||
|
|
||||||
screenshots: list[bytes] = []
|
screenshots: list[bytes] = []
|
||||||
|
positions: list[int] = []
|
||||||
if await skyvern_page.is_window_scrollable():
|
if await skyvern_page.is_window_scrollable():
|
||||||
scroll_y_px_old = -30.0
|
scroll_y_px_old = -30.0
|
||||||
scroll_y_px = await skyvern_page.scroll_to_top(draw_boxes=draw_boxes, frame=frame, frame_index=frame_index)
|
scroll_y_px = await skyvern_page.scroll_to_top(draw_boxes=draw_boxes, frame=frame, frame_index=frame_index)
|
||||||
@@ -110,6 +111,7 @@ async def _scrolling_screenshots_helper(
|
|||||||
while abs(scroll_y_px_old - scroll_y_px) > 25 and len(screenshots) < max_number:
|
while abs(scroll_y_px_old - scroll_y_px) > 25 and len(screenshots) < max_number:
|
||||||
screenshot = await _current_viewpoint_screenshot_helper(page=skyvern_page.frame, mode=mode)
|
screenshot = await _current_viewpoint_screenshot_helper(page=skyvern_page.frame, mode=mode)
|
||||||
screenshots.append(screenshot)
|
screenshots.append(screenshot)
|
||||||
|
positions.append(int(scroll_y_px))
|
||||||
scroll_y_px_old = scroll_y_px
|
scroll_y_px_old = scroll_y_px
|
||||||
LOG.debug("Scrolling to next page", url=url, num_screenshots=len(screenshots))
|
LOG.debug("Scrolling to next page", url=url, num_screenshots=len(screenshots))
|
||||||
scroll_y_px = await skyvern_page.scroll_to_next_page(
|
scroll_y_px = await skyvern_page.scroll_to_next_page(
|
||||||
@@ -138,11 +140,48 @@ async def _scrolling_screenshots_helper(
|
|||||||
LOG.debug("Page is not scrollable", url=url, num_screenshots=len(screenshots))
|
LOG.debug("Page is not scrollable", url=url, num_screenshots=len(screenshots))
|
||||||
screenshot = await _current_viewpoint_screenshot_helper(page=skyvern_page.frame, mode=mode)
|
screenshot = await _current_viewpoint_screenshot_helper(page=skyvern_page.frame, mode=mode)
|
||||||
screenshots.append(screenshot)
|
screenshots.append(screenshot)
|
||||||
|
positions.append(0)
|
||||||
|
|
||||||
if draw_boxes:
|
if draw_boxes:
|
||||||
await skyvern_page.remove_bounding_boxes()
|
await skyvern_page.remove_bounding_boxes()
|
||||||
|
|
||||||
return screenshots
|
return screenshots, positions
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_images_by_position(images: list[Image.Image], positions: list[int]) -> Image.Image:
|
||||||
|
"""Merge screenshots vertically using scroll positions to remove overlaps."""
|
||||||
|
if not images:
|
||||||
|
raise ValueError("no images to merge")
|
||||||
|
if len(images) != len(positions):
|
||||||
|
raise ValueError("images and positions length mismatch")
|
||||||
|
|
||||||
|
if len(images) == 1:
|
||||||
|
return images[0]
|
||||||
|
|
||||||
|
max_width = max(img.width for img in images)
|
||||||
|
|
||||||
|
merged_height = images[0].height
|
||||||
|
for i in range(1, len(images)):
|
||||||
|
merged_height += positions[i] - positions[i - 1]
|
||||||
|
|
||||||
|
merged_img = Image.new("RGB", (max_width, merged_height), color=(255, 255, 255))
|
||||||
|
|
||||||
|
current_y = 0
|
||||||
|
merged_img.paste(images[0], (0, current_y))
|
||||||
|
current_y += images[0].height
|
||||||
|
|
||||||
|
for i in range(1, len(images)):
|
||||||
|
step = positions[i] - positions[i - 1]
|
||||||
|
overlap = images[i].height - step
|
||||||
|
if overlap > 0:
|
||||||
|
cropped = images[i].crop((0, overlap, images[i].width, images[i].height))
|
||||||
|
else:
|
||||||
|
cropped = images[i]
|
||||||
|
|
||||||
|
merged_img.paste(cropped, (0, current_y))
|
||||||
|
current_y += cropped.height
|
||||||
|
|
||||||
|
return merged_img
|
||||||
|
|
||||||
|
|
||||||
class SkyvernFrame:
|
class SkyvernFrame:
|
||||||
@@ -195,7 +234,9 @@ class SkyvernFrame:
|
|||||||
LOG.debug("Page is fully loaded, agent is about to generate the full page screenshot")
|
LOG.debug("Page is fully loaded, agent is about to generate the full page screenshot")
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
async with asyncio.timeout(timeout):
|
async with asyncio.timeout(timeout):
|
||||||
screenshots = await _scrolling_screenshots_helper(page=page, mode=mode, max_number=scrolling_number)
|
screenshots, positions = await _scrolling_screenshots_helper(
|
||||||
|
page=page, mode=mode, max_number=scrolling_number
|
||||||
|
)
|
||||||
images = []
|
images = []
|
||||||
|
|
||||||
for screenshot in screenshots:
|
for screenshot in screenshots:
|
||||||
@@ -203,15 +244,7 @@ class SkyvernFrame:
|
|||||||
img.load()
|
img.load()
|
||||||
images.append(img)
|
images.append(img)
|
||||||
|
|
||||||
total_height = sum(img.height for img in images)
|
merged_img = _merge_images_by_position(images, positions)
|
||||||
max_width = max(img.width for img in images)
|
|
||||||
|
|
||||||
merged_img = Image.new("RGB", (max_width, total_height), color=(255, 255, 255))
|
|
||||||
|
|
||||||
current_y = 0
|
|
||||||
for img in images:
|
|
||||||
merged_img.paste(img, (0, current_y))
|
|
||||||
current_y += img.height
|
|
||||||
|
|
||||||
buffer = BytesIO()
|
buffer = BytesIO()
|
||||||
merged_img.save(buffer, format="PNG")
|
merged_img.save(buffer, format="PNG")
|
||||||
@@ -241,9 +274,10 @@ class SkyvernFrame:
|
|||||||
if not scroll:
|
if not scroll:
|
||||||
return [await _current_viewpoint_screenshot_helper(page=page, mode=ScreenshotMode.DETAILED)]
|
return [await _current_viewpoint_screenshot_helper(page=page, mode=ScreenshotMode.DETAILED)]
|
||||||
|
|
||||||
return await _scrolling_screenshots_helper(
|
screenshots, _ = await _scrolling_screenshots_helper(
|
||||||
page=page, url=url, max_number=max_number, draw_boxes=draw_boxes, mode=ScreenshotMode.DETAILED
|
page=page, url=url, max_number=max_number, draw_boxes=draw_boxes, mode=ScreenshotMode.DETAILED
|
||||||
)
|
)
|
||||||
|
return screenshots
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
async def create_instance(cls, frame: Page | Frame) -> SkyvernFrame:
|
async def create_instance(cls, frame: Page | Frame) -> SkyvernFrame:
|
||||||
|
|||||||
Reference in New Issue
Block a user