From eec13add7d490ff544ef9f7c12df829c7fbfebe4 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Thu, 25 Jul 2024 20:17:14 -0400 Subject: [PATCH 01/10] wip --- experiments/cursor.py | 93 ++++++++++++++++++++ experiments/cursor_joystick.py | 155 +++++++++++++++++++++++++++++++++ experiments/cursor_search.py | 149 +++++++++++++++++++++++++++++++ openadapt/drivers/openai.py | 2 + openadapt/replay.py | 4 +- openadapt/utils.py | 36 +++++--- 6 files changed, 428 insertions(+), 11 deletions(-) create mode 100644 experiments/cursor.py create mode 100644 experiments/cursor_joystick.py create mode 100644 experiments/cursor_search.py diff --git a/experiments/cursor.py b/experiments/cursor.py new file mode 100644 index 000000000..30b01e7d4 --- /dev/null +++ b/experiments/cursor.py @@ -0,0 +1,93 @@ +from pprint import pformat +from typing import Tuple, List, Dict +import json +import os + +from PIL import Image, ImageDraw +import matplotlib.pyplot as plt + +from openadapt.config import config +from openadapt.drivers import openai, anthropic, google +from openadapt.utils import parse_code_snippet + +DRIVER = openai +HISTORY_SIZE = 5 + +def draw_concentric_circles(image: Image.Image, coords: Dict[str, int], colors: List[str], radii: List[int]) -> Image.Image: + """Draw concentric circles on the image at the specified coordinates.""" + draw = ImageDraw.Draw(image) + x, y = coords['x'], coords['y'] + + for color, radius in zip(colors, radii): + draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color) + + return image + +def display_images(images: List[Image.Image]) -> None: + """Display all images in the same window.""" + fig, axs = plt.subplots(1, len(images), figsize=(15, 5)) + if len(images) == 1: + axs = [axs] + for ax, img in zip(axs, images): + ax.imshow(img) + ax.axis('off') + plt.show(block=False) + +def main(): + image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png") + image = Image.open(image_file_path).convert("RGB") + images = [] + all_coords = [] + exceptions = [] + target = "Cell B3" + + while True: + prompt = f"The attached image size is {image.size}." + + if all_coords: + prompt += f" The images have red circles at coordinates:" + for coord in all_coords: + coord.pop('direction', None) + prompt += f"\n {coord}" + prompt += "\n, in the order they are attached." + prompt += f" Locate the pixel coordinates of the target: {target}. Respond with a Python dict only: {{ 'x': int, 'y': int, 'direction': '' }}." + if all_coords: + prompt += "To move the circle to the right, increase the 'x' coordinate, and decrease it to move to the left. To move the circle down, increase the 'y' coordinate, and decrease it to move up." + #if all_coords: + #prompt += " If the red dot is in the correct location in the last image I gave you, respond with the last pair of coordinates I gave you. Otherwise, consider the images and corresponding coordinate locations I gave you to provide an accurate location of the target." + #prompt += f" IT IS IMPERATIVE THAT IF THE RED DOT IS ALREADY IN THE TARGET, YOU DO NOT PROVIDE UPDATED COORDINATES, BUT RE-USE THE CORRECT ONES. Remember, the target is {target}. IF THE RED DOT IS **NOT** ALREADY IN THE TARGET, YOU MUST PROVIDE UPDATED COORDINATES." + if exceptions: + prompt += f"Last time you tried this, your response resulted in the following exception(s): {exceptions}." + print(prompt) + response = DRIVER.prompt( + prompt=prompt, + system_prompt="You are an expert GUI interpreter. You are precise and discerning, and you strive for accuracy. You do not make the same mistake twice.", + images=images or [image], + detail="high", + ) + try: + coords = parse_code_snippet(response) + except Exception as exc: + exceptions.append(exc) + continue + else: + exceptions = [] + last_coords = all_coords[-1] if all_coords else None + print(f"{coords=} {last_coords=}") + if last_coords == coords: + break + all_coords.append(coords) + image_with_dot = draw_concentric_circles(image.copy(), coords, ["red", "yellow"], [25, 15, 5]) + image_with_dot.show() + images.append(image_with_dot) + + if HISTORY_SIZE: + all_coords = all_coords[-HISTORY_SIZE:] + images = images[-HISTORY_SIZE:] + + #display_images(images) + plt.show() + +if __name__ == "__main__": + main() + diff --git a/experiments/cursor_joystick.py b/experiments/cursor_joystick.py new file mode 100644 index 000000000..f59a14956 --- /dev/null +++ b/experiments/cursor_joystick.py @@ -0,0 +1,155 @@ +from pprint import pformat +from typing import Tuple, List, Dict +import json +import os + +from loguru import logger +from PIL import Image, ImageDraw +import matplotlib.pyplot as plt + +from openadapt.config import config +from openadapt.drivers import openai, anthropic, google +from openadapt.utils import parse_code_snippet + +DRIVER = openai +HISTORY_SIZE = 1 + +import numpy as np +from scipy.spatial.distance import cdist +from sklearn.cluster import KMeans + +def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]: + """Calculate the RGB color maximally different from every color in a given PIL image. + + Args: + image: The PIL image object. + sample_size: The number of colors to sample from the image. + n_clusters: The number of clusters to use for KMeans clustering. + + Returns: + A tuple representing the RGB color maximally different from all colors in the image. + """ + img = image.convert('RGB') + np_img = np.array(img).reshape(-1, 3) + + if len(np_img) > sample_size: + np.random.seed(42) # For reproducibility + np.random.shuffle(np_img) + sampled_colors = np_img[:sample_size] + else: + sampled_colors = np_img + + kmeans = KMeans(n_clusters=n_clusters, random_state=42) + kmeans.fit(sampled_colors) + cluster_centers = kmeans.cluster_centers_ + + all_colors = np.array([[r, g, b] for r in range(0, 256, 8) for g in range(0, 256, 8) for b in range(0, 256, 8)]) + distances = cdist(all_colors, cluster_centers, metric='euclidean') + sum_distances = np.sum(distances, axis=1) + max_dist_index = np.argmax(sum_distances) + + return tuple(all_colors[max_dist_index].astype(int)) + +def draw_concentric_circles(image: Image.Image, coords: Dict[str, int], colors: List[str], radii: List[int]) -> Image.Image: + """Draw concentric circles on the image at the specified coordinates.""" + draw = ImageDraw.Draw(image) + x, y = coords['x'], coords['y'] + + for color, radius in zip(colors, radii): + draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color) + + return image + +def display_images(images: List[Image.Image]) -> None: + """Display all images in the same window.""" + fig, axs = plt.subplots(1, len(images), figsize=(15, 5)) + if len(images) == 1: + axs = [axs] + for ax, img in zip(axs, images): + ax.imshow(img) + ax.axis('off') + plt.show(block=False) + +def update_coords(coords: Dict[str, int], directions: Dict[str, str], width: int, height: int) -> Dict[str, int]: + """Update coordinates based on directions and magnitude.""" + for direction, magnitude in directions.items(): + step_size = { + 'large': 0.5, + 'medium': 0.125, + 'small': 0.0625 + }[magnitude] + + if direction == 'left': + coords['x'] -= int(width * step_size) + elif direction == 'right': + coords['x'] += int(width * step_size) + elif direction == 'up': + coords['y'] -= int(height * step_size) + elif direction == 'down': + coords['y'] += int(height * step_size) + + return coords + +def main(): + image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png") + image = Image.open(image_file_path).convert("RGB") + + width, height = image.size + coords = {'x': width // 2, 'y': height // 2} + images = [] + all_coords = [] + exceptions = [] + target = "Cell B3" + iteration = 1 + + color = maximally_different_color(image) + logger.info(f"{color=}") + + while True: + all_coords.append(dict(coords)) + image_with_dot = draw_concentric_circles(image.copy(), coords, [color, "red"], [25, 15, 5]) + image_with_dot.show() + images.append(image_with_dot) + + prompt = f"The attached image size is {image.size}." + + if all_coords: + prompt += f" The attached images have red circles at coordinates:" + for coord in all_coords: + prompt += f"\n {coord}" + prompt += "\n, in the order they are attached." + prompt += f" Identify the magnitude and direction to move the last red dot towards the target: {target}. Respond with a single Python dict of the form {{'left' | 'right' | 'up' | 'down': 'large' | 'medium' | 'small'}}." + prompt += f" If the red dot is on the target, respond with an empty dict." + if exceptions: + prompt += f"Last time you tried this, your response resulted in the following exception(s): {exceptions}." + logger.info(f"prompt=\n{prompt}") + response = DRIVER.prompt( + prompt=prompt, + system_prompt="You are an expert GUI interpreter. You are precise and discerning, and you strive for accuracy. You do not make the same mistake twice.", + images=images or [image], + detail="high", + ) + try: + directions = parse_code_snippet(response) + except Exception as exc: + exceptions.append(exc) + all_coords = all_coords[:-1] + continue + else: + exceptions = [] + + coords = update_coords(coords, directions, width, height) + iteration += 1 + + if all_coords and all_coords[-1] == coords: + break + + if HISTORY_SIZE: + all_coords = all_coords[-HISTORY_SIZE:] + images = images[-HISTORY_SIZE:] + + plt.show() + +if __name__ == "__main__": + main() + diff --git a/experiments/cursor_search.py b/experiments/cursor_search.py new file mode 100644 index 000000000..e570822f0 --- /dev/null +++ b/experiments/cursor_search.py @@ -0,0 +1,149 @@ +from pprint import pformat +from typing import Tuple, List, Dict +import json +import os + +from loguru import logger +from PIL import Image, ImageDraw +import matplotlib.pyplot as plt + +from openadapt.config import config +from openadapt.drivers import openai, anthropic, google +from openadapt.utils import parse_code_snippet + +DRIVER = openai +HISTORY_SIZE = 1 + +import numpy as np +from scipy.spatial.distance import cdist +from sklearn.cluster import KMeans + + +def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]: + """Calculate the RGB color maximally different from every color in a given PIL image. + + Args: + image: The PIL image object. + sample_size: The number of colors to sample from the image. + n_clusters: The number of clusters to use for KMeans clustering. + + Returns: + A tuple representing the RGB color maximally different from all colors in the image. + """ + img = image.convert('RGB') + np_img = np.array(img).reshape(-1, 3) + + if len(np_img) > sample_size: + np.random.seed(42) # For reproducibility + np.random.shuffle(np_img) + sampled_colors = np_img[:sample_size] + else: + sampled_colors = np_img + + kmeans = KMeans(n_clusters=n_clusters, random_state=42) + kmeans.fit(sampled_colors) + cluster_centers = kmeans.cluster_centers_ + + all_colors = np.array([[r, g, b] for r in range(0, 256, 8) for g in range(0, 256, 8) for b in range(0, 256, 8)]) + distances = cdist(all_colors, cluster_centers, metric='euclidean') + sum_distances = np.sum(distances, axis=1) + max_dist_index = np.argmax(sum_distances) + + return tuple(all_colors[max_dist_index].astype(int)) + +def draw_concentric_circles(image: Image.Image, coords: Dict[str, int], colors: List[str], radii: List[int]) -> Image.Image: + """Draw concentric circles on the image at the specified coordinates.""" + draw = ImageDraw.Draw(image) + x, y = coords['x'], coords['y'] + + for color, radius in zip(colors, radii): + draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color) + + return image + +def display_images(images: List[Image.Image]) -> None: + """Display all images in the same window.""" + fig, axs = plt.subplots(1, len(images), figsize=(15, 5)) + if len(images) == 1: + axs = [axs] + for ax, img in zip(axs, images): + ax.imshow(img) + ax.axis('off') + plt.show(block=False) + +def update_coords(coords: Dict[str, int], directions: Dict[str, str], width: int, height: int, iteration: int) -> Dict[str, int]: + """Update coordinates based on directions and current iteration.""" + if directions.get('x') == 'left': + coords['x'] -= width // (2 ** iteration) + elif directions.get('x') == 'right': + coords['x'] += width // (2 ** iteration) + + if directions.get('y') == 'up': + coords['y'] -= height // (2 ** iteration) + elif directions.get('y') == 'down': + coords['y'] += height // (2 ** iteration) + + return coords + +def main(): + image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png") + image = Image.open(image_file_path).convert("RGB") + + width, height = image.size + coords = {'x': width // 2, 'y': height // 2} + images = [] + all_coords = [] + exceptions = [] + target = "Cell B3" + iteration = 1 + + color = maximally_different_color(image) + logger.info(f"{color=}") + + while True: + all_coords.append(dict(coords)) + image_with_dot = draw_concentric_circles(image.copy(), coords, ["red", color], [25, 15, 5]) + image_with_dot.show() + images.append(image_with_dot) + + prompt = f"The attached image size is {image.size}." + + if all_coords: + prompt += f" The attached images have red circles at coordinates:" + for coord in all_coords: + prompt += f"\n {coord}" + prompt += "\n, in the order they are attached." + prompt += f" Identify the direction to move the last red dot towards the target: {target}. Respond with a single Python dict only: {{ 'x': 'left' | 'right', 'y': 'up' | 'down' }}." + prompt += f" If the red dot is on the target, respond with an empty dict." + if exceptions: + prompt += f"Last time you tried this, your response resulted in the following exception(s): {exceptions}." + logger.info(f"prompt=\n{prompt}") + response = DRIVER.prompt( + prompt=prompt, + system_prompt="You are an expert GUI interpreter. You are precise and discerning, and you strive for accuracy. You do not make the same mistake twice.", + images=images or [image], + detail="high", + ) + try: + directions = parse_code_snippet(response) + except Exception as exc: + exceptions.append(exc) + all_coords = all_coords[:-1] + continue + else: + exceptions = [] + + coords = update_coords(coords, directions, width, height, iteration) + iteration += 1 + + if all_coords and all_coords[-1] == coords: + break + + if HISTORY_SIZE: + all_coords = all_coords[-HISTORY_SIZE:] + images = images[-HISTORY_SIZE:] + + plt.show() + +if __name__ == "__main__": + main() diff --git a/openadapt/drivers/openai.py b/openadapt/drivers/openai.py index f81e23747..b6c780174 100644 --- a/openadapt/drivers/openai.py +++ b/openadapt/drivers/openai.py @@ -181,6 +181,7 @@ def prompt( prompt: str, system_prompt: str | None = None, images: list[Image.Image] | None = None, + model: str = MODEL_NAME, max_tokens: int | None = None, detail: str = "high", ) -> str: @@ -205,6 +206,7 @@ def prompt( prompt, system_prompt, images, + model=model, max_tokens=max_tokens, detail=detail, ) diff --git a/openadapt/replay.py b/openadapt/replay.py index 5b597f3e2..106a64511 100644 --- a/openadapt/replay.py +++ b/openadapt/replay.py @@ -21,6 +21,8 @@ from openadapt.db import crud from openadapt.error_reporting import configure_error_reporting from openadapt.models import Recording +from openadapt.strategies import BaseReplayStrategy + LOG_LEVEL = "INFO" @@ -70,7 +72,7 @@ def replay( logger.info(f"{strategy_name=}") - strategy_class_by_name = utils.get_strategy_class_by_name() + strategy_class_by_name = utils.get_subclass_by_name(BaseReplayStrategy) if strategy_name not in strategy_class_by_name: strategy_names = [ name diff --git a/openadapt/utils.py b/openadapt/utils.py index dc2795f15..3417180e8 100644 --- a/openadapt/utils.py +++ b/openadapt/utils.py @@ -6,7 +6,7 @@ from functools import wraps from io import BytesIO from logging import StreamHandler -from typing import Any, Callable +from typing import Any, Callable, Type import ast import base64 import importlib.metadata @@ -425,16 +425,33 @@ def take_screenshot() -> Image.Image: return image -def get_strategy_class_by_name() -> dict: - """Get a dictionary of strategy classes by their names. +def get_subclass_by_name(base_class: Type) -> dict: + """Get a dictionary of subclasses by their names, recursively. + + Args: + base_class (Type): The base class to find subclasses for. Returns: - dict: A dictionary of strategy classes. + dict: A dictionary where keys are subclass names and values are the subclass types. """ - from openadapt.strategies import BaseReplayStrategy + def get_all_subclasses(cls: Type) -> list[Type]: + """ + Get all subclasses of a given class, recursively. - strategy_classes = BaseReplayStrategy.__subclasses__() - class_by_name = {cls.__name__: cls for cls in strategy_classes} + Args: + cls (Type): The class to find subclasses for. + + Returns: + list[Type]: A list of all subclasses of the given class. + """ + subclasses = cls.__subclasses__() + all_subclasses = subclasses.copy() + for subclass in subclasses: + all_subclasses.extend(get_all_subclasses(subclass)) + return all_subclasses + + subclasses = get_all_subclasses(base_class) + class_by_name = {cls.__name__: cls for cls in subclasses} logger.debug(f"{class_by_name=}") return class_by_name @@ -606,14 +623,13 @@ def parse_code_snippet(snippet: str) -> dict: rval = ast.literal_eval(code_content) except Exception as exc: logger.exception(exc) - import ipdb - - ipdb.set_trace() + #import ipdb; ipdb.set_trace() # TODO: handle this raise return rval +# TODO: support multiple blocks def extract_code_block(text: str) -> str: """Extract the text enclosed by the outermost backticks. From 11824e445494f9b31b6977b1237b97053a60ff7f Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Thu, 25 Jul 2024 20:20:35 -0400 Subject: [PATCH 02/10] experiments/cursor_joystick_history.py getting there --- experiments/cursor_joystick_history.py | 193 +++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 experiments/cursor_joystick_history.py diff --git a/experiments/cursor_joystick_history.py b/experiments/cursor_joystick_history.py new file mode 100644 index 000000000..ca9142531 --- /dev/null +++ b/experiments/cursor_joystick_history.py @@ -0,0 +1,193 @@ +from pprint import pformat +from typing import Tuple, List, Dict +import json +import os + +from loguru import logger +from PIL import Image, ImageDraw +import matplotlib.pyplot as plt + +from openadapt.config import config +from openadapt.drivers import openai, anthropic, google +from openadapt.utils import parse_code_snippet + +DRIVER = openai#anthropic +HISTORY_SIZE = 10 + +import numpy as np +from scipy.spatial.distance import cdist +from sklearn.cluster import KMeans + +def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]: + """Calculate the RGB color maximally different from every color in a given PIL image. + + Args: + image: The PIL image object. + sample_size: The number of colors to sample from the image. + n_clusters: The number of clusters to use for KMeans clustering. + + Returns: + A tuple representing the RGB color maximally different from all colors in the image. + """ + img = image.convert('RGB') + np_img = np.array(img).reshape(-1, 3) + + if len(np_img) > sample_size: + np.random.seed(42) # For reproducibility + np.random.shuffle(np_img) + sampled_colors = np_img[:sample_size] + else: + sampled_colors = np_img + + kmeans = KMeans(n_clusters=n_clusters, random_state=42) + kmeans.fit(sampled_colors) + cluster_centers = kmeans.cluster_centers_ + + all_colors = np.array([[r, g, b] for r in range(0, 256, 8) for g in range(0, 256, 8) for b in range(0, 256, 8)]) + distances = cdist(all_colors, cluster_centers, metric='euclidean') + sum_distances = np.sum(distances, axis=1) + max_dist_index = np.argmax(sum_distances) + + return tuple(all_colors[max_dist_index].astype(int)) + +def draw_concentric_circles(image: Image.Image, coords: Dict[str, int], colors: List[str], radii: List[int]) -> Image.Image: + """Draw concentric circles on the image at the specified coordinates.""" + draw = ImageDraw.Draw(image) + x, y = coords['x'], coords['y'] + + for color, radius in zip(colors, radii): + draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color) + + return image + +def draw_coordinates_and_arrows(image: Image.Image, all_coords: List[Dict[str, int]], arrow_color: str = "blue", dot_radius: int = 25) -> Image.Image: + """Draw all coordinates and arrows between successive pairs on the image. + + Args: + image: The PIL image object. + all_coords: List of dictionaries containing the coordinates. + arrow_color: The color of the arrows. + dot_radius: The radius of the red dot. + + Returns: + The image with drawn coordinates and arrows. + """ + draw = ImageDraw.Draw(image) + border_radius = dot_radius + 5 + arrow_head_size = 50 + + for i in range(len(all_coords) - 1): + x1, y1 = all_coords[i]['x'], all_coords[i]['y'] + x2, y2 = all_coords[i+1]['x'], all_coords[i+1]['y'] + + draw.ellipse((x1 - border_radius, y1 - border_radius, x1 + border_radius, y1 + border_radius), fill=arrow_color) + draw.ellipse((x1 - dot_radius, y1 - dot_radius, x1 + dot_radius, y1 + dot_radius), fill="red") + draw.line((x1, y1, x2, y2), fill=arrow_color, width=4) + + # Adjust arrowhead position to point to the exterior of the dot + angle = np.arctan2(y2 - y1, x2 - x1) + x2_adjusted = x2 - int(dot_radius * np.cos(angle)) + y2_adjusted = y2 - int(dot_radius * np.sin(angle)) + + # Draw arrowhead + draw.polygon([ + (x2_adjusted, y2_adjusted), + (x2_adjusted - arrow_head_size * np.cos(angle - np.pi / 6), y2_adjusted - arrow_head_size * np.sin(angle - np.pi / 6)), + (x2_adjusted - arrow_head_size * np.cos(angle + np.pi / 6), y2_adjusted - arrow_head_size * np.sin(angle + np.pi / 6)) + ], fill=arrow_color) + + if all_coords: + x, y = all_coords[-1]['x'], all_coords[-1]['y'] + draw.ellipse((x - border_radius, y - border_radius, x + border_radius, y + border_radius), fill=arrow_color) + draw.ellipse((x - dot_radius, y - dot_radius, x + dot_radius, y + dot_radius), fill="red") + + return image + + +def update_coords(coords: Dict[str, int], directions: Dict[str, str], width: int, height: int) -> Dict[str, int]: + """Update coordinates based on directions and magnitude.""" + for direction, magnitude in directions.items(): + step_size = { + 'large': 0.25, + 'medium': 0.125, + 'small': 0.0625 + }[magnitude] + + if direction == 'left': + coords['x'] -= int(width * step_size) + elif direction == 'right': + coords['x'] += int(width * step_size) + elif direction == 'up': + coords['y'] -= int(height * step_size) + elif direction == 'down': + coords['y'] += int(height * step_size) + + return coords + +def main(): + image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png") + image = Image.open(image_file_path).convert("RGB") + + width, height = image.size + coords = {'x': width // 2, 'y': height // 2} + all_coords = [] + exceptions = [] + placement_history = [] + target = "The center of Cell E5" + iteration = 1 + + color = maximally_different_color(image) + logger.info(f"{color=}") + + while True: + all_coords.append(dict(coords)) + + prompt = f"Attached are three images: the first ('raw') is an unadultered screenshot, the second ('history') shows previous cursor locations on the screenshot separated by arrows, and the third ('current') shows the current cursor location." + + if all_coords: + prompt += f" Cursor locations are indicated with red dots surrounded by a border. The history image has cursors at coordinates:" + for coord in all_coords: + prompt += f"\n {coord}" + prompt += f" The current image has a cursor at coordinates {all_coords[-1]}." + prompt += f" Identify the magnitude and direction to move the current cursor towards the target: {target}. Respond with a single Python dict of the form {{'target': '', 'placement': '', 'left' | 'right' | 'up' | 'down': 'large' | 'medium' | 'small'}}." + prompt += f" If the current cursor is already at the target, do not specify any directions (but still specify the placement)." + prompt += " Make sure to surround your code with triple backticks: ```" + if exceptions: + prompt += f"Last time you tried this, your response resulted in the following exception(s): {exceptions}." + logger.info(f"prompt=\n{prompt}") + history_image = draw_coordinates_and_arrows(image.copy(), all_coords, color) + history_image.show() + current_image = draw_coordinates_and_arrows(image.copy(), [all_coords[-1]], color) + #input() + response = DRIVER.prompt( + prompt=prompt, + system_prompt="You are an expert GUI interpreter. You are precise and discerning, and you strive for accuracy. You do not make the same mistake twice.", + images=[image, history_image, current_image], + ) + try: + directions = parse_code_snippet(response) + except Exception as exc: + exceptions.append(exc) + all_coords = all_coords[:-1] + continue + else: + exceptions = [] + + target = directions.pop("target") + placement = directions.pop("placement") + placement_history.append(placement) + logger.info(f"{target=} {placement=}") + coords = update_coords(coords, directions, width, height) + iteration += 1 + + if all_coords and all_coords[-1] == coords: + break + + if HISTORY_SIZE: + all_coords = all_coords[-HISTORY_SIZE:] + + logger.info(f"placement_history=\n{pformat(placement_history)}") + +if __name__ == "__main__": + main() + From a68cb2f9861aae6ee25ae0701f685d68e85452c1 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Fri, 26 Jul 2024 10:51:48 -0400 Subject: [PATCH 03/10] reduce step sizes --- experiments/cursor_joystick_history.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/experiments/cursor_joystick_history.py b/experiments/cursor_joystick_history.py index ca9142531..95f1066b7 100644 --- a/experiments/cursor_joystick_history.py +++ b/experiments/cursor_joystick_history.py @@ -109,8 +109,8 @@ def update_coords(coords: Dict[str, int], directions: Dict[str, str], width: int for direction, magnitude in directions.items(): step_size = { 'large': 0.25, - 'medium': 0.125, - 'small': 0.0625 + 'medium': 0.10, + 'small': 0.01 }[magnitude] if direction == 'left': @@ -133,9 +133,13 @@ def main(): all_coords = [] exceptions = [] placement_history = [] - target = "The center of Cell E5" + target = "The center of Cell G1" iteration = 1 + + # TODO: warn off screen + # TODO: zoom, other primitives? + color = maximally_different_color(image) logger.info(f"{color=}") From 48c12cbddb06edf8c19a5da3f14e5d2e14cb137d Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Sun, 28 Jul 2024 12:20:35 -0400 Subject: [PATCH 04/10] add cursor experiments --- experiments/{cursor.py => cursor/coords.py} | 0 experiments/cursor/direction.py | 276 ++++++++++++++++++ .../joystick.py} | 0 .../joystick_history.py} | 0 experiments/cursor/quadrant.py | 92 ++++++ .../{cursor_search.py => cursor/search.py} | 0 6 files changed, 368 insertions(+) rename experiments/{cursor.py => cursor/coords.py} (100%) create mode 100644 experiments/cursor/direction.py rename experiments/{cursor_joystick.py => cursor/joystick.py} (100%) rename experiments/{cursor_joystick_history.py => cursor/joystick_history.py} (100%) create mode 100644 experiments/cursor/quadrant.py rename experiments/{cursor_search.py => cursor/search.py} (100%) diff --git a/experiments/cursor.py b/experiments/cursor/coords.py similarity index 100% rename from experiments/cursor.py rename to experiments/cursor/coords.py diff --git a/experiments/cursor/direction.py b/experiments/cursor/direction.py new file mode 100644 index 000000000..3aa1bc477 --- /dev/null +++ b/experiments/cursor/direction.py @@ -0,0 +1,276 @@ +from pprint import pformat +from typing import Tuple, List, Dict +import json +import os + +from loguru import logger +from PIL import Image, ImageDraw +import matplotlib.pyplot as plt + +from openadapt.config import config +from openadapt.drivers import openai, anthropic, google +from openadapt.utils import parse_code_snippet + +DRIVER = anthropic +HISTORY_SIZE = 1 + +import numpy as np +from scipy.spatial.distance import cdist +from sklearn.cluster import KMeans + +def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]: + """Calculate the RGB color maximally different from every color in a given PIL image. + + Args: + image: The PIL image object. + sample_size: The number of colors to sample from the image. + n_clusters: The number of clusters to use for KMeans clustering. + + Returns: + A tuple representing the RGB color maximally different from all colors in the image. + """ + img = image.convert('RGB') + np_img = np.array(img).reshape(-1, 3) + + if len(np_img) > sample_size: + np.random.seed(42) # For reproducibility + np.random.shuffle(np_img) + sampled_colors = np_img[:sample_size] + else: + sampled_colors = np_img + + kmeans = KMeans(n_clusters=n_clusters, random_state=42) + kmeans.fit(sampled_colors) + cluster_centers = kmeans.cluster_centers_ + + all_colors = np.array([[r, g, b] for r in range(0, 256, 8) for g in range(0, 256, 8) for b in range(0, 256, 8)]) + distances = cdist(all_colors, cluster_centers, metric='euclidean') + sum_distances = np.sum(distances, axis=1) + max_dist_index = np.argmax(sum_distances) + + return tuple(all_colors[max_dist_index].astype(int)) + +def draw_coordinates_and_arrows( + image: Image.Image, + all_coords: List[Dict[str, int]], + inner_color: str, + border_color: str, + draw_x: bool = True, + bg_color: tuple = (0, 0, 0), + bg_transparency: float = 0.25, +) -> Image.Image: + """Draw all coordinates and arrows between successive pairs on the image. + + Args: + image: The PIL image object. + all_coords: List of dictionaries containing the coordinates. + border_color: The color of the arrows. + draw_x: If True, draw a big bold "X" on the last coordinate. + + Returns: + The image with drawn coordinates, arrows, and optionally a big bold "X" on the last coordinate. + """ + width, height = image.size + min_dimension = min(width, height) + + # Define sizes relative to image dimensions + dot_radius = int(min_dimension * 0.014) # Approximately 25 in a 1742 height image + border_radius = int(min_dimension * 0.017) # Slightly larger than dot_radius + arrow_head_size = int(min_dimension * 0.029) # Approximately 50 in a 1742 height image + line_width = max(1, int(min_dimension * 0.0023)) # Approximately 4 in a 1742 height image + x_line_width = max(1, int(min_dimension * 0.0034)) # Approximately 6 in a 1742 height image + + image = image.convert("RGBA") + bg_opacity = int(255 * bg_transparency) + overlay = Image.new("RGBA", image.size, bg_color + (bg_opacity,)) + draw = ImageDraw.Draw(overlay) + image = Image.alpha_composite(image, overlay) + image = image.convert("RGB") + + draw = ImageDraw.Draw(image) + + for i in range(len(all_coords) - 1): + x1, y1 = all_coords[i]['x'], all_coords[i]['y'] + x2, y2 = all_coords[i+1]['x'], all_coords[i+1]['y'] + + draw.ellipse((x1 - border_radius, y1 - border_radius, x1 + border_radius, y1 + border_radius), fill=border_color) + draw.ellipse((x1 - dot_radius, y1 - dot_radius, x1 + dot_radius, y1 + dot_radius), fill=inner_color) + draw.line((x1, y1, x2, y2), fill=border_color, width=line_width) + + # Adjust arrowhead position to point to the exterior of the dot + angle = np.arctan2(y2 - y1, x2 - x1) + x2_adjusted = x2 - int(dot_radius * np.cos(angle)) + y2_adjusted = y2 - int(dot_radius * np.sin(angle)) + + # Draw arrowhead + draw.polygon([ + (x2_adjusted, y2_adjusted), + (x2_adjusted - arrow_head_size * np.cos(angle - np.pi / 6), y2_adjusted - arrow_head_size * np.sin(angle - np.pi / 6)), + (x2_adjusted - arrow_head_size * np.cos(angle + np.pi / 6), y2_adjusted - arrow_head_size * np.sin(angle + np.pi / 6)) + ], fill=border_color) + + if all_coords: + x, y = all_coords[-1]['x'], all_coords[-1]['y'] + draw.ellipse((x - border_radius, y - border_radius, x + border_radius, y + border_radius), fill=border_color) + draw.ellipse((x - dot_radius, y - dot_radius, x + dot_radius, y + dot_radius), fill=inner_color) + + # Draw a big bold "X" on the last coordinate if draw_x is True + if draw_x: + x_size = border_radius * 2 # Size of the X + + # Draw the X + draw.line((x - x_size, y - x_size, x + x_size, y + x_size), fill="black", width=x_line_width) + draw.line((x - x_size, y + x_size, x + x_size, y - x_size), fill="black", width=x_line_width) + + return image + +import random +from typing import Dict, Tuple + +def update_coords(coords: Dict[str, int], direction: str, magnitude: str, width: int, height: int, previous_step_size: float) -> Tuple[Dict[str, int], float]: + """Update coordinates based on a single direction and relative magnitude, with added jitter.""" + if magnitude == 'more': + new_step_size = min(previous_step_size * 2, 0.25) # Cap at 0.25 + elif magnitude == 'less': + new_step_size = max(previous_step_size / 2, 0.01) # Floor at 0.01 + else: # 'same' + new_step_size = previous_step_size + + step = int(min(width, height) * new_step_size) + + # Add jitter + jitter_range = max(1, int(step * 0.1)) # 10% of step size, minimum 1 pixel + jitter_x = random.randint(-jitter_range, jitter_range) + jitter_y = random.randint(-jitter_range, jitter_range) + + direction_map = { + 'left': (-step, 0), + 'right': (step, 0), + 'up': (0, -step), + 'down': (0, step), + 'up-left': (-step, -step), + 'up-right': (step, -step), + 'down-right': (step, step), + 'down-left': (-step, step), + } + dx, dy = direction_map.get(direction, (0, 0)) + + # Apply movement with jitter + coords['x'] += dx + jitter_x + coords['y'] += dy + jitter_y + + # Ensure coordinates stay within image boundaries + coords['x'] = max(0, min(coords['x'], width - 1)) + coords['y'] = max(0, min(coords['y'], height - 1)) + + return coords, new_step_size + +def load_and_downsample_image(image_file_path): + # Open the image and convert to RGB + image = Image.open(image_file_path).convert("RGB") + + # Get the original dimensions + original_width, original_height = image.size + + # Calculate new dimensions (half of the original) + new_width = original_width // 2 + new_height = original_height // 2 + + # Resize the image to half its original size + downsampled_image = image.resize((new_width, new_height), Image.LANCZOS) + + return downsampled_image + +def main(): + image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png") + image = load_and_downsample_image(image_file_path) + + width, height = image.size + coords = {'x': width // 2, 'y': height // 2} + all_coords = [] + exceptions = [] + all_directions = [] + placement_history = [] + target = "Inside cell C8" + iteration = 1 + previous_step_size = 0.10 # Start with a medium step size + + border_color = maximally_different_color(image) + inner_color = (255, 0, 0) + logger.info(f"{inner_color=} {border_color=}") + + try: + while True: + all_coords.append(dict(coords)) + + prompt = f""" +Attached are two images: the first ('raw') is an unadultered +screenshot, the second ('history') shows previous cursor locations overlaid +on the (dimmed) screenshot, separated by arrows. Cursors are circles with color +{inner_color} surrounded by {border_color}. The latest cursor location is +indicated by an 'X'. Be careful not to get confused with background GUI elements and the overlaid cursors. +Your task is to identify the direction and magnitude to move the current cursor towards the +target, which is '{target}'. +Respond with a single Python dict of the form: + {{ + 'target': '', + 'placement': '', + 'plan': '', + 'direction': 'left' | 'right' | 'up' | 'down' | 'up-left' | 'up-right' | 'down-left' | 'down-right', + 'magnitude': 'more' | 'less' | 'same' + }} +The 'direction' specifies the direction we need to move the cursor to get it to the target. +The magnitude you specify should be relative to the previous movement, regardless of direction. +If the current cursor is already at the target, do not specify any direction or magnitude (but still specify the placement). +Make sure to surround your code with triple backticks: ``` +""" + if exceptions: + prompt += f"Last time you tried this, your response resulted in the following exception(s): {exceptions}." + logger.info(f"prompt=\n{prompt}") + history_image = draw_coordinates_and_arrows(image.copy(), all_coords[-(HISTORY_SIZE + 1):], inner_color, border_color) + history_image.show() + current_image = draw_coordinates_and_arrows(image.copy(), [all_coords[-1]], inner_color, border_color) + #input() + response = DRIVER.prompt( + prompt=prompt, + system_prompt="You are an expert GUI interpreter. You are precise and accurate.", + images=[image, history_image, current_image], + ) + try: + directions = parse_code_snippet(response) + except Exception as exc: + exceptions.append(exc) + all_coords = all_coords[:-1] + continue + else: + exceptions = [] + + all_directions.append(directions.copy()) + target = directions.pop("target") + placement = directions.pop("placement") + direction = directions.pop("direction", None) + magnitude = directions.pop("magnitude", None) + plan = directions.pop("plan", None) + placement_history.append(placement) + logger.info(f"{target=} {placement=} {plan=} {direction=} {magnitude=}") + + if direction and magnitude: + coords, previous_step_size = update_coords(coords, direction, magnitude, width, height, previous_step_size) + iteration += 1 + + if all_coords and all_coords[-1] == coords: + break + + #if HISTORY_SIZE: + # all_coords = all_coords[-HISTORY_SIZE:] + except Exception as exc: + logger.exception(exc) + pass + + full_history_image = draw_coordinates_and_arrows(image.copy(), all_coords, inner_color, border_color) + full_history_image.show() + logger.info(f"placement_history=\n{pformat(placement_history)}") + logger.info(f"all_directions=\n{pformat(all_directions)}") + +if __name__ == "__main__": + main() diff --git a/experiments/cursor_joystick.py b/experiments/cursor/joystick.py similarity index 100% rename from experiments/cursor_joystick.py rename to experiments/cursor/joystick.py diff --git a/experiments/cursor_joystick_history.py b/experiments/cursor/joystick_history.py similarity index 100% rename from experiments/cursor_joystick_history.py rename to experiments/cursor/joystick_history.py diff --git a/experiments/cursor/quadrant.py b/experiments/cursor/quadrant.py new file mode 100644 index 000000000..42fd1effc --- /dev/null +++ b/experiments/cursor/quadrant.py @@ -0,0 +1,92 @@ +# XXX this won't work because context is lost + +import os +from collections import deque + +from loguru import logger +from PIL import Image + +from openadapt.config import config +from openadapt.drivers import openai, anthropic, google +from openadapt.utils import parse_code_snippet + +DRIVER = openai # anthropic + +def prompt_model(driver, prompt, image): + return driver.prompt( + prompt=prompt, + system_prompt="You are an expert GUI interpreter. You are precise and discerning, and you strive for accuracy. You do not make the same mistake twice.", + images=[image] + ) + +def get_quadrant(driver, image, target_element): + prompt = f"The target element is {target_element}. In which quadrant of the image is the target element located: top-left, top-right, bottom-left, or bottom-right? You can also say 'stop' when the target element is in the center of the image. You may reason in natural language, but you should include exactly one code block containing a python dict to structure your final response. The dict should have a single key: 'instruction', whose value is either 'top-left', 'top-right', 'bottom-left', 'bottom-right', 'stop', or 'unknown'. If you don't see the target element, say 'unknown'. DO NOT MAKE ANY ASSUMPTIONS, IF YOU DON'T SEE THE ELEMENT THEN SAY UNKNOWN!!! IF THE TARGET ELEMENT IS IN THE CENTER OF THE IMAGE, SAY STOP!!!" + response = prompt_model(driver, prompt, image) + response_dict = parse_code_snippet(response) + quadrant = response_dict["instruction"] + return quadrant + +def crop_image(image, quadrant): + width, height = image.size + if quadrant == "top-left": + return image.crop((0, 0, width // 2, height // 2)) + elif quadrant == "top-right": + return image.crop((width // 2, 0, width, height // 2)) + elif quadrant == "bottom-left": + return image.crop((0, height // 2, width // 2, height)) + elif quadrant == "bottom-right": + return image.crop((width // 2, height // 2, width, height)) + else: + return image + +def locate_element(driver, image, target_element): + image_history = deque([(image, ["top-left", "top-right", "bottom-left", "bottom-right"])]) + iterations = 0 + max_iterations = 20 # Increased to allow for backtracking + + while iterations < max_iterations and image_history: + current_image, available_quadrants = image_history[-1] + current_image.show() # Display the current image + + quadrant = get_quadrant(driver, current_image, target_element) + logger.info(f"{quadrant=}") + input() + + if quadrant == "stop": + logger.info(f"Element located after {iterations} iterations.") + return current_image + + if quadrant == "unknown": + logger.info("Unknown response, discarding current image and trying next quadrant from previous image") + image_history.pop() # Discard the current image + if not image_history: + logger.warning("No more images in history") + break + continue # Go back to the start of the loop with the previous image + + if quadrant not in available_quadrants: + logger.warning(f"Unexpected quadrant {quadrant}, trying next available quadrant") + if not available_quadrants: + logger.info("No more quadrants to try, backtracking...") + image_history.pop() + continue + quadrant = available_quadrants[0] + + available_quadrants.remove(quadrant) + new_image = crop_image(current_image, quadrant) + image_history.append((new_image, ["top-left", "top-right", "bottom-left", "bottom-right"])) + iterations += 1 + + logger.warning("Max iterations reached or no more images to process") + return image_history[-1][0] if image_history else image # Return the last processed image or the original if all discarded + +def main(): + image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png") + image = Image.open(image_file_path) + target_element = "Cell A1" + + result_image = locate_element(DRIVER, image, target_element) + result_image.show() # Display the final result + +if __name__ == "__main__": + main() diff --git a/experiments/cursor_search.py b/experiments/cursor/search.py similarity index 100% rename from experiments/cursor_search.py rename to experiments/cursor/search.py From 57d8faffc7f15b88529011fc1a04154bb2d57db7 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Sun, 28 Jul 2024 15:21:31 -0400 Subject: [PATCH 05/10] add experiments/cursor/sample.py --- experiments/cursor/sample.py | 246 +++++++++++++++++++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 experiments/cursor/sample.py diff --git a/experiments/cursor/sample.py b/experiments/cursor/sample.py new file mode 100644 index 000000000..5578ce129 --- /dev/null +++ b/experiments/cursor/sample.py @@ -0,0 +1,246 @@ +import random +from typing import List, Dict, Tuple +import json +from PIL import Image, ImageDraw, ImageFont, ImageEnhance +from loguru import logger +import os +import numpy as np +from scipy.spatial.distance import cdist +from sklearn.cluster import KMeans + +from openadapt.config import config +from openadapt.drivers import anthropic, openai, google +from openadapt.utils import parse_code_snippet + +DRIVER = anthropic +NUM_CURSORS = 2**2 # This can now be easily changed +SPREAD_REDUCTION_FACTOR = 0.5 # How much to reduce spread each iteration +MAX_ITERATIONS = 5 # Maximum number of iterations + + +def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]: + """Calculate the RGB color maximally different from every color in a given PIL image.""" + img = image.convert('RGB') + np_img = np.array(img).reshape(-1, 3) + + if len(np_img) > sample_size: + np.random.seed(42) # For reproducibility + np.random.shuffle(np_img) + sampled_colors = np_img[:sample_size] + else: + sampled_colors = np_img + + kmeans = KMeans(n_clusters=n_clusters, random_state=42) + kmeans.fit(sampled_colors) + cluster_centers = kmeans.cluster_centers_ + + all_colors = np.array([[r, g, b] for r in range(0, 256, 8) for g in range(0, 256, 8) for b in range(0, 256, 8)]) + distances = cdist(all_colors, cluster_centers, metric='euclidean') + sum_distances = np.sum(distances, axis=1) + max_dist_index = np.argmax(sum_distances) + + return tuple(all_colors[max_dist_index].astype(int)) + +def generate_cursors(center: Dict[str, int], spread: float, width: int, height: int) -> List[Dict[str, int]]: + cursors = [] + + # Calculate grid dimensions + grid_size = int(np.sqrt(NUM_CURSORS)) + + # Calculate cell size based on spread + cell_width = (width * spread) / grid_size + cell_height = (height * spread) / grid_size + + # Calculate top-left corner of the grid + start_x = center['x'] - (cell_width * (grid_size - 1)) / 2 + start_y = center['y'] - (cell_height * (grid_size - 1)) / 2 + + for i in range(grid_size): + for j in range(grid_size): + x = int(start_x + i * cell_width) + y = int(start_y + j * cell_height) + + # Ensure cursors are within image bounds + x = max(0, min(width - 1, x)) + y = max(0, min(height - 1, y)) + + cursors.append({'x': x, 'y': y}) + + return cursors +def draw_labelled_cursors( + image: Image.Image, + cursors: List[Dict[str, int]], + inner_color: Tuple[int, int, int], + border_color: Tuple[int, int, int], + label_color: Tuple[int, int, int], + bg_color: tuple = (0, 0, 0), + bg_transparency: float = 0.25, + labels: List[str] = None, +) -> Image.Image: + + image = image.convert("RGBA") + bg_opacity = int(255 * bg_transparency) + overlay = Image.new("RGBA", image.size, bg_color + (bg_opacity,)) + draw = ImageDraw.Draw(overlay) + image = Image.alpha_composite(image, overlay) + image = image.convert("RGB") + + draw = ImageDraw.Draw(image) + font = ImageFont.truetype("Arial.ttf", 40) # Slightly smaller font to accommodate two-digit numbers + + width, height = image.size + min_dimension = min(width, height) + + for i, coords in enumerate(cursors): + x, y = coords['x'], coords['y'] + label = labels[i] if labels else str(i + 1) + + # Draw cursor + border_radius = int(.02 * min_dimension) + dot_radius = int(.015 * min_dimension) + draw.ellipse((x - border_radius, y - border_radius, x + border_radius, y + border_radius), fill=border_color) + draw.ellipse((x - dot_radius, y - dot_radius, x + dot_radius, y + dot_radius), fill=inner_color) + + # Draw label + label_w, label_h = draw.textsize(label, font=font) + draw.text((x - label_w / 2, y - label_h / 2), label, fill=label_color, font=font) + + return image + +def load_and_downsample_image(image_file_path): + # Open the image and convert to RGB + image = Image.open(image_file_path).convert("RGB") + + # Get the original dimensions + original_width, original_height = image.size + + # Calculate new dimensions (half of the original) + new_width = original_width // 2 + new_height = original_height // 2 + + # Resize the image to half its original size + downsampled_image = image.resize((new_width, new_height), Image.LANCZOS) + + return downsampled_image + +def increase_contrast(image: Image.Image, contrast_factor: float) -> Image.Image: + """ + Increases the contrast of a PIL image and returns the enhanced image. + + Args: + image (Image.Image): The input PIL image. + contrast_factor (float): Factor by which to increase the contrast. 1.0 means no change, + less than 1.0 decreases contrast, greater than 1.0 increases contrast. + + Returns: + Image.Image: The enhanced PIL image. + """ + # Create an ImageEnhance object for contrast enhancement + enhancer = ImageEnhance.Contrast(image) + + # Apply the contrast enhancement + enhanced_image = enhancer.enhance(contrast_factor) + + return enhanced_image + +def main(): + image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png") + #image = Image.open(image_file_path).convert("RGB") + image = load_and_downsample_image(image_file_path) + #image = increase_contrast(image, 2) + width, height = image.size + + inner_color = (255, 0, 0) + border_color = maximally_different_color(image) + label_color = (255, 255, 255) + + target = "Inside cell C8" # almost + target = "Inside cell G5" # almost + target = "Inside cell A1" # almost + target = "Cell B12" + target = "Cell E7" + target = "Save button" + target = "Cell C3" + center = {'x': width // 2, 'y': height // 2} + spread = 1.0 + iteration = 1 + + identified_locations = [] + exceptions = [] + + while iteration <= MAX_ITERATIONS: + cursors = generate_cursors(center, spread, width, height) + cursor_image = draw_labelled_cursors(image.copy(), cursors, inner_color, border_color, label_color) + cursor_image.show() + + prompt = f""" +Attached is 1. a raw screenshot, and 2. the same screenshot a) dimmed and b) with {NUM_CURSORS} cursors overlaid. +Each cursor is a circle with color {inner_color} surrounded by {border_color}, labelled with a number from 1 to {NUM_CURSORS} with color {label_color}. + +Your task is to identify the cursor closest to the target: '{target}'. + +Respond with a single Python dict of the form: + {{ + 'target': '', + 'cursors': '', + 'review': "", + 'analysis': '', + 'closest': '', + }} + +Make sure to surround your code with triple backticks: ``` +""" + if exceptions: + prompt += f""" +Previously when you responded to this prompt, this resulted in the following exceptions: +{{exceptions}} +""" + logger.info(f"prompt=\n{prompt}") + + response = DRIVER.prompt( + prompt=prompt, + system_prompt="You are an expert GUI interpreter. You are precise and accurate.", + images=[image, cursor_image], + ) + + try: + result = parse_code_snippet(response) + except Exception as exc: + logger.exception(exc) + exceptions.append(exc) + continue + else: + exceptions = [] + + logger.info(f"Iteration {iteration}: {result}") + + if 'closest' in result: + closest_number = int(result['closest']) + identified_location = cursors[closest_number - 1] # Adjust for 0-based indexing + identified_locations.append(identified_location) + center = identified_location + spread *= SPREAD_REDUCTION_FACTOR + else: + logger.error("Invalid response from model") + continue + + iteration += 1 + + # Create final image with all identified locations + final_image = image.copy() + labels = [str(i) for i in range(1, len(identified_locations) + 1)] + final_image_with_cursors = draw_labelled_cursors( + final_image, + identified_locations, + inner_color, + border_color, + label_color, + labels=labels + ) + final_image_with_cursors.show() + final_image_with_cursors.save("final_cursor_locations.png") + + logger.info(f"Final identified locations: {identified_locations}") + +if __name__ == "__main__": + main() From 9193ba7b143876867d6ecb1496f03af089f61c72 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Sun, 28 Jul 2024 22:59:16 -0400 Subject: [PATCH 06/10] working! --- experiments/cursor/sample.py | 152 +++++++++++++++++++++++------------ 1 file changed, 100 insertions(+), 52 deletions(-) diff --git a/experiments/cursor/sample.py b/experiments/cursor/sample.py index 5578ce129..0a93e849f 100644 --- a/experiments/cursor/sample.py +++ b/experiments/cursor/sample.py @@ -1,3 +1,4 @@ +from collections import Counter import random from typing import List, Dict, Tuple import json @@ -13,9 +14,13 @@ from openadapt.utils import parse_code_snippet DRIVER = anthropic -NUM_CURSORS = 2**2 # This can now be easily changed +NUM_CURSORS = 2**2 SPREAD_REDUCTION_FACTOR = 0.5 # How much to reduce spread each iteration -MAX_ITERATIONS = 5 # Maximum number of iterations +MAX_ITERATIONS = 4 # Maximum number of iterations +CONTRAST_FACTOR = 1 +RETRIES_PER_ITERATION = 3 +DOWNSAMPLE_FACTOR = 3 +CONSENSUS_THRESHOLD = 2 def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]: @@ -41,7 +46,19 @@ def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clu return tuple(all_colors[max_dist_index].astype(int)) -def generate_cursors(center: Dict[str, int], spread: float, width: int, height: int) -> List[Dict[str, int]]: +def generate_cursors(center: dict[str, int], spread: float, width: int, height: int, jitter: float = 0.1) -> list[dict[str, int]]: + """Generates cursors around a center point within a defined spread, with optional jitter. + + Args: + center (dict[str, int]): The central point from which cursors are generated. + spread (float): The spread factor determining the grid size. + width (int): The width of the image. + height (int): The height of the image. + jitter (float): The jitter factor to add randomness to cursor positions. + + Returns: + list[dict[str, int]]: A list of cursor positions. + """ cursors = [] # Calculate grid dimensions @@ -60,6 +77,10 @@ def generate_cursors(center: Dict[str, int], spread: float, width: int, height: x = int(start_x + i * cell_width) y = int(start_y + j * cell_height) + # Apply jitter + x += int((random.random() - 0.5) * cell_width * jitter) + y += int((random.random() - 0.5) * cell_height * jitter) + # Ensure cursors are within image bounds x = max(0, min(width - 1, x)) y = max(0, min(height - 1, y)) @@ -67,17 +88,30 @@ def generate_cursors(center: Dict[str, int], spread: float, width: int, height: cursors.append({'x': x, 'y': y}) return cursors + def draw_labelled_cursors( image: Image.Image, cursors: List[Dict[str, int]], inner_color: Tuple[int, int, int], - border_color: Tuple[int, int, int], label_color: Tuple[int, int, int], - bg_color: tuple = (0, 0, 0), + bg_color: Tuple[int, int, int] = (0, 0, 0), bg_transparency: float = 0.25, labels: List[str] = None, ) -> Image.Image: + """Draws labelled cursors on the image. + + Args: + image: The input image on which cursors are to be drawn. + cursors: A list of dictionaries containing cursor coordinates. + inner_color: The color of the inner part of the cursor. + label_color: The color of the label text. + bg_color: Background color for transparency overlay. + bg_transparency: Transparency level for the background overlay. + labels: List of labels to be drawn with the cursors. + Returns: + Image with labelled cursors. + """ image = image.convert("RGBA") bg_opacity = int(255 * bg_transparency) overlay = Image.new("RGBA", image.size, bg_color + (bg_opacity,)) @@ -91,23 +125,34 @@ def draw_labelled_cursors( width, height = image.size min_dimension = min(width, height) + # Calculate rectangle size based on the largest label + max_label = max(labels, key=len) if labels else str(len(cursors)) + max_label_bbox = draw.textbbox((0, 0), max_label, font=font) + max_label_w = max_label_bbox[2] - max_label_bbox[0] + max_label_h = max_label_bbox[3] - max_label_bbox[1] + rect_width = max_label_w + 20 + rect_height = max_label_h + 20 + for i, coords in enumerate(cursors): x, y = coords['x'], coords['y'] label = labels[i] if labels else str(i + 1) - # Draw cursor - border_radius = int(.02 * min_dimension) - dot_radius = int(.015 * min_dimension) - draw.ellipse((x - border_radius, y - border_radius, x + border_radius, y + border_radius), fill=border_color) - draw.ellipse((x - dot_radius, y - dot_radius, x + dot_radius, y + dot_radius), fill=inner_color) + # Draw rectangle + rect_x0 = x - rect_width // 2 + rect_y0 = y - rect_height // 2 + rect_x1 = x + rect_width // 2 + rect_y1 = y + rect_height // 2 + draw.rectangle((rect_x0, rect_y0, rect_x1, rect_y1), fill=inner_color) # Draw label - label_w, label_h = draw.textsize(label, font=font) + label_bbox = draw.textbbox((0, 0), label, font=font) + label_w = label_bbox[2] - label_bbox[0] + label_h = label_bbox[3] - label_bbox[1] draw.text((x - label_w / 2, y - label_h / 2), label, fill=label_color, font=font) return image -def load_and_downsample_image(image_file_path): +def load_and_downsample_image(image_file_path, downsample_factor: int): # Open the image and convert to RGB image = Image.open(image_file_path).convert("RGB") @@ -115,8 +160,8 @@ def load_and_downsample_image(image_file_path): original_width, original_height = image.size # Calculate new dimensions (half of the original) - new_width = original_width // 2 - new_height = original_height // 2 + new_width = original_width // downsample_factor + new_height = original_height // downsample_factor # Resize the image to half its original size downsampled_image = image.resize((new_width, new_height), Image.LANCZOS) @@ -135,6 +180,9 @@ def increase_contrast(image: Image.Image, contrast_factor: float) -> Image.Image Returns: Image.Image: The enhanced PIL image. """ + if contrast_factor == 1: + return image + # Create an ImageEnhance object for contrast enhancement enhancer = ImageEnhance.Contrast(image) @@ -145,13 +193,11 @@ def increase_contrast(image: Image.Image, contrast_factor: float) -> Image.Image def main(): image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png") - #image = Image.open(image_file_path).convert("RGB") - image = load_and_downsample_image(image_file_path) - #image = increase_contrast(image, 2) + image = load_and_downsample_image(image_file_path, DOWNSAMPLE_FACTOR) + image = increase_contrast(image, CONTRAST_FACTOR) width, height = image.size - inner_color = (255, 0, 0) - border_color = maximally_different_color(image) + inner_color = maximally_different_color(image) label_color = (255, 255, 255) target = "Inside cell C8" # almost @@ -161,6 +207,7 @@ def main(): target = "Cell E7" target = "Save button" target = "Cell C3" + target = "Cell G4" center = {'x': width // 2, 'y': height // 2} spread = 1.0 iteration = 1 @@ -169,13 +216,10 @@ def main(): exceptions = [] while iteration <= MAX_ITERATIONS: - cursors = generate_cursors(center, spread, width, height) - cursor_image = draw_labelled_cursors(image.copy(), cursors, inner_color, border_color, label_color) - cursor_image.show() prompt = f""" -Attached is 1. a raw screenshot, and 2. the same screenshot a) dimmed and b) with {NUM_CURSORS} cursors overlaid. -Each cursor is a circle with color {inner_color} surrounded by {border_color}, labelled with a number from 1 to {NUM_CURSORS} with color {label_color}. +Attached is a screenshot that has been dimmed and with {NUM_CURSORS} cursors overlaid. +Each cursor is a rectangle with color {inner_color}, labelled with a number from 1 to {NUM_CURSORS} with color {label_color}. Your task is to identify the cursor closest to the target: '{target}'. @@ -196,33 +240,38 @@ def main(): {{exceptions}} """ logger.info(f"prompt=\n{prompt}") - - response = DRIVER.prompt( - prompt=prompt, - system_prompt="You are an expert GUI interpreter. You are precise and accurate.", - images=[image, cursor_image], - ) - - try: - result = parse_code_snippet(response) - except Exception as exc: - logger.exception(exc) - exceptions.append(exc) - continue - else: - exceptions = [] - - logger.info(f"Iteration {iteration}: {result}") - - if 'closest' in result: - closest_number = int(result['closest']) - identified_location = cursors[closest_number - 1] # Adjust for 0-based indexing - identified_locations.append(identified_location) - center = identified_location - spread *= SPREAD_REDUCTION_FACTOR - else: - logger.error("Invalid response from model") - continue + + votes = [] + while True: + cursors = generate_cursors(center, spread, width, height) + cursor_image = draw_labelled_cursors(image.copy(), cursors, inner_color, label_color) + cursor_image.show() + response = DRIVER.prompt( + prompt=prompt, + system_prompt="You are an expert GUI interpreter. You are precise and accurate.", + images=[ + #image, + cursor_image, + ], + ) + + try: + result = parse_code_snippet(response) + if 'closest' in result: + votes.append(int(result['closest'])) + except Exception as exc: + logger.exception(exc) + exceptions.append(exc) + + most_common = Counter(votes).most_common(1)[0] + logger.info(f"{votes=} {most_common=}") + if most_common[1] >= CONSENSUS_THRESHOLD: + break + closest_number = most_common[0] + identified_location = cursors[closest_number - 1] # Adjust for 0-based indexing + identified_locations.append(identified_location) + center = identified_location + spread *= SPREAD_REDUCTION_FACTOR iteration += 1 @@ -233,7 +282,6 @@ def main(): final_image, identified_locations, inner_color, - border_color, label_color, labels=labels ) From d411928f34e5c976d93b846383a3ab7bd9ebee97 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Mon, 29 Jul 2024 10:33:00 -0400 Subject: [PATCH 07/10] overlap ratio --- experiments/cursor/sample.py | 90 ++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 24 deletions(-) diff --git a/experiments/cursor/sample.py b/experiments/cursor/sample.py index 0a93e849f..017301abf 100644 --- a/experiments/cursor/sample.py +++ b/experiments/cursor/sample.py @@ -13,14 +13,16 @@ from openadapt.drivers import anthropic, openai, google from openadapt.utils import parse_code_snippet -DRIVER = anthropic +DRIVER = google NUM_CURSORS = 2**2 -SPREAD_REDUCTION_FACTOR = 0.5 # How much to reduce spread each iteration -MAX_ITERATIONS = 4 # Maximum number of iterations +SPREAD_REDUCTION_FACTOR = 0.5 +MAX_ITERATIONS = None +MAX_OVERLAP_RATIO = 0.2 CONTRAST_FACTOR = 1 RETRIES_PER_ITERATION = 3 DOWNSAMPLE_FACTOR = 3 CONSENSUS_THRESHOLD = 2 +LABEL_SIZE_RATIO = 0.04 def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]: @@ -46,7 +48,7 @@ def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clu return tuple(all_colors[max_dist_index].astype(int)) -def generate_cursors(center: dict[str, int], spread: float, width: int, height: int, jitter: float = 0.1) -> list[dict[str, int]]: +def generate_cursors(center: dict[str, int], spread: float, width: int, height: int, jitter: float = 0.01) -> list[dict[str, int]]: """Generates cursors around a center point within a defined spread, with optional jitter. Args: @@ -97,8 +99,9 @@ def draw_labelled_cursors( bg_color: Tuple[int, int, int] = (0, 0, 0), bg_transparency: float = 0.25, labels: List[str] = None, -) -> Image.Image: - """Draws labelled cursors on the image. + padding: int = 10, +) -> Tuple[Image.Image, float]: + """Draws labelled cursors on the image and computes maximum label overlap ratio. Args: image: The input image on which cursors are to be drawn. @@ -108,9 +111,10 @@ def draw_labelled_cursors( bg_color: Background color for transparency overlay. bg_transparency: Transparency level for the background overlay. labels: List of labels to be drawn with the cursors. + padding: Padding around the labels. Returns: - Image with labelled cursors. + Tuple of Image with labelled cursors and the maximum overlap ratio. """ image = image.convert("RGBA") bg_opacity = int(255 * bg_transparency) @@ -120,19 +124,21 @@ def draw_labelled_cursors( image = image.convert("RGB") draw = ImageDraw.Draw(image) - font = ImageFont.truetype("Arial.ttf", 40) # Slightly smaller font to accommodate two-digit numbers + min_dimension = min(image.size) + font_size = int(min_dimension * LABEL_SIZE_RATIO) + font = ImageFont.truetype("Arial.ttf", font_size) width, height = image.size - min_dimension = min(width, height) - + # Calculate rectangle size based on the largest label max_label = max(labels, key=len) if labels else str(len(cursors)) max_label_bbox = draw.textbbox((0, 0), max_label, font=font) max_label_w = max_label_bbox[2] - max_label_bbox[0] max_label_h = max_label_bbox[3] - max_label_bbox[1] - rect_width = max_label_w + 20 - rect_height = max_label_h + 20 - + rect_width = max_label_w + padding + rect_height = max_label_h + padding + + rectangles = [] for i, coords in enumerate(cursors): x, y = coords['x'], coords['y'] label = labels[i] if labels else str(i + 1) @@ -149,8 +155,25 @@ def draw_labelled_cursors( label_w = label_bbox[2] - label_bbox[0] label_h = label_bbox[3] - label_bbox[1] draw.text((x - label_w / 2, y - label_h / 2), label, fill=label_color, font=font) - - return image + + rectangles.append((rect_x0, rect_y0, rect_x1, rect_y1)) + + # Compute maximum overlap ratio + max_overlap_ratio = 0.0 + for i, rect1 in enumerate(rectangles): + for j, rect2 in enumerate(rectangles): + if i != j: + overlap_width = max(0, min(rect1[2], rect2[2]) - max(rect1[0], rect2[0])) + overlap_height = max(0, min(rect1[3], rect2[3]) - max(rect1[1], rect2[1])) + overlap_area = overlap_width * overlap_height + rect1_area = (rect1[2] - rect1[0]) * (rect1[3] - rect1[1]) + rect2_area = (rect2[2] - rect2[0]) * (rect2[3] - rect2[1]) + ratio1 = overlap_area / rect1_area if rect1_area > 0 else 0 + ratio2 = overlap_area / rect2_area if rect2_area > 0 else 0 + max_overlap_ratio = max(max_overlap_ratio, ratio1, ratio2) + + return image, max_overlap_ratio + def load_and_downsample_image(image_file_path, downsample_factor: int): # Open the image and convert to RGB @@ -208,14 +231,19 @@ def main(): target = "Save button" target = "Cell C3" target = "Cell G4" + target = "Cell A1" center = {'x': width // 2, 'y': height // 2} spread = 1.0 iteration = 1 identified_locations = [] exceptions = [] + center_history = [] + spread_history = [] - while iteration <= MAX_ITERATIONS: + while True: + if MAX_ITERATIONS and iteration > MAX_ITERATIONS: + break prompt = f""" Attached is a screenshot that has been dimmed and with {NUM_CURSORS} cursors overlaid. @@ -226,12 +254,13 @@ def main(): Respond with a single Python dict of the form: {{ 'target': '', - 'cursors': '', - 'review': "", - 'analysis': '', + 'target_position': '', + 'cursor_positions': '', 'closest': '', }} +Don't make any assumptions about positions of anything. I need you to CAREFULLY ANALYZE THE IMAGE. + Make sure to surround your code with triple backticks: ``` """ if exceptions: @@ -242,9 +271,14 @@ def main(): logger.info(f"prompt=\n{prompt}") votes = [] + center_history.append(center) + spread_history.append(spread) + max_overlap_ratio = 0 while True: cursors = generate_cursors(center, spread, width, height) - cursor_image = draw_labelled_cursors(image.copy(), cursors, inner_color, label_color) + cursor_image, overlap_ratio = draw_labelled_cursors(image.copy(), cursors, inner_color, label_color) + logger.info(f"{overlap_ratio=}") + max_overlap_ratio = max(overlap_ratio, max_overlap_ratio) cursor_image.show() response = DRIVER.prompt( prompt=prompt, @@ -257,12 +291,17 @@ def main(): try: result = parse_code_snippet(response) - if 'closest' in result: - votes.append(int(result['closest'])) + closest = result['closest'] except Exception as exc: logger.exception(exc) exceptions.append(exc) + if closest is None: + # try again with previous level + center = center_history.pop() + spread = spread_history.pop() + continue + votes.append(int(closest)) most_common = Counter(votes).most_common(1)[0] logger.info(f"{votes=} {most_common=}") if most_common[1] >= CONSENSUS_THRESHOLD: @@ -273,17 +312,20 @@ def main(): center = identified_location spread *= SPREAD_REDUCTION_FACTOR + if max_overlap_ratio > MAX_OVERLAP_RATIO: + break + iteration += 1 # Create final image with all identified locations final_image = image.copy() labels = [str(i) for i in range(1, len(identified_locations) + 1)] - final_image_with_cursors = draw_labelled_cursors( + final_image_with_cursors, _ = draw_labelled_cursors( final_image, identified_locations, inner_color, label_color, - labels=labels + labels=labels, ) final_image_with_cursors.show() final_image_with_cursors.save("final_cursor_locations.png") From 170e48596bc98b66813cbfdfd46c4157f38b84be Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Tue, 30 Jul 2024 15:24:19 -0400 Subject: [PATCH 08/10] add experiments/cursor/grid.py --- experiments/cursor/grid.py | 170 +++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 experiments/cursor/grid.py diff --git a/experiments/cursor/grid.py b/experiments/cursor/grid.py new file mode 100644 index 000000000..af2fec42e --- /dev/null +++ b/experiments/cursor/grid.py @@ -0,0 +1,170 @@ +from typing import Tuple +import os + +from loguru import logger +from PIL import Image, ImageDraw, ImageFont, ImageEnhance + +from openadapt.config import config +from openadapt.drivers import anthropic, openai, google +from openadapt.utils import parse_code_snippet + +# Constants +DOWNSAMPLE_FACTOR = 2 +GRID_SIZE = 25 # Adjust based on desired grid size +DRIVER = openai + +def load_and_downsample_image(image_file_path, downsample_factor: int): + # Open the image and convert to RGB + image = Image.open(image_file_path).convert("RGB") + + # Get the original dimensions + original_width, original_height = image.size + + # Calculate new dimensions (half of the original) + new_width = original_width // downsample_factor + new_height = original_height // downsample_factor + + # Resize the image to half its original size + downsampled_image = image.resize((new_width, new_height), Image.LANCZOS) + + return downsampled_image + +def dim_image(image: Image.Image) -> Image.Image: + enhancer = ImageEnhance.Brightness(image) + return enhancer.enhance(0.5) # Dim the image by reducing brightness to 50% + +from PIL import Image, ImageDraw, ImageFont + +def add_grid_labels(image: Image.Image, grid_size: int) -> Image.Image: + width, height = image.size + grid_image = Image.new('RGB', (width + 50, height + 50), 'red') + grid_image.paste(image, (50, 50)) + draw = ImageDraw.Draw(grid_image) + + # Calculate the maximum font size that fits within the grid cell + max_font_size = min((height // grid_size) // 2, (width // grid_size) // 2) + font_size = max_font_size + font = ImageFont.truetype("Arial.ttf", font_size) + + # Adjust font size dynamically to fit within grid cells + while True: + fits = True + for i in range(grid_size): + row_text_size = draw.textbbox((0, 0), str(i + 1), font=font)[2:] + col_text_size = draw.textbbox((0, 0), str(i + 1), font=font)[2:] + if row_text_size[1] > height // grid_size or col_text_size[0] > width // grid_size: + fits = False + break + if fits: + break + font_size -= 1 + font = ImageFont.truetype("Arial.ttf", font_size) + if font_size <= 0: + raise ValueError("Cannot find a suitable font size that fits within the grid cells") + + cell_width = width / grid_size + cell_height = height / grid_size + + # Add labels for rows and columns with the determined font size + for i in range(grid_size): + draw.text((25, 50 + int(i * cell_height + cell_height / 2) - font_size // 2), str(i + 1), fill='white', font=font) + draw.text((50 + int(i * cell_width + cell_width / 2) - font_size // 2, 25), str(i + 1), fill='white', font=font) + + # Draw semi-transparent grid lines + overlay = Image.new('RGBA', grid_image.size, (255, 0, 0, 0)) + overlay_draw = ImageDraw.Draw(overlay) + for i in range(grid_size + 1): + # Horizontal lines + overlay_draw.line([(50, 50 + int(i * cell_height)), (50 + width, 50 + int(i * cell_height))], fill=(255, 255, 255, 128), width=1) + # Vertical lines + overlay_draw.line([(50 + int(i * cell_width), 50), (50 + int(i * cell_width), 50 + height)], fill=(255, 255, 255, 128), width=1) + + # Composite the overlay with the labeled image + grid_image = Image.alpha_composite(grid_image.convert('RGBA'), overlay) + + return grid_image.convert('RGB') + +def get_cell_coordinates(grid_size: int, target_cell: Tuple[int, int], image_size: Tuple[int, int]) -> Tuple[int, int]: + row, col = target_cell + cell_width = image_size[0] / grid_size + cell_height = image_size[1] / grid_size + x = int((col - 0.5) * cell_width) + y = int((row - 0.5) * cell_height) + return x, y + +def draw_target_coordinates(image: Image.Image, coordinates: Tuple[int, int]) -> Image.Image: + image = image.copy() + label_offset = 50 # The offset added to the top and left for labels + draw = ImageDraw.Draw(image) + adjusted_coordinates = (coordinates[0] + label_offset, coordinates[1] + label_offset) + draw.ellipse((adjusted_coordinates[0] - 5, adjusted_coordinates[1] - 5, adjusted_coordinates[0] + 5, adjusted_coordinates[1] + 5), fill='red', outline='red') + return image + +def main(target: str): + # Load and dim the image + image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png") + image = load_and_downsample_image(image_file_path, DOWNSAMPLE_FACTOR) + image = dim_image(image) + + # Add grid labels + grid_image = add_grid_labels(image, GRID_SIZE) + grid_image.show() + + row, column = None, None + grid_image_with_target = None + while True: + prompt = f""" +Attached is an image containing a screenshot over which a grid has been overlaid. +The grid labels are white (255, 255, 255) on a red (255, 0, 0) background. The grid lines +are semi-transparent. +Your task is to identify the grid row and column containing the target. +The target is: "{target}". +Respond in JSON with the following keys: + {{ + "target": "", + "analysis": "", + "row": "", + "column": "", + }} +Make sure not to confuse the grid labels with the content of the screenshot. +For example, if the screenshot contains a spreadsheet, don't get confused between +the spreadsheet cell labels and the grid labels. +Wrap your code in triple backticks: ``` +""" + if row and column: + prompt += f""" +Previously, you specified row {row}, column {column}. This cell has been marked +with a red dot. If you agree with your previous assessment, confirm by specifying +the same row and column again. Otherwise, please correct your previous assessment. +""" + # Prompt the model to identify the cell + response = DRIVER.prompt( + prompt=prompt, + system_prompt="You are an expert GUI interpreter. You are precise and accurate.", + images=[grid_image_with_target or grid_image], + ) + result = parse_code_snippet(response) + + prev_row, prev_column = row, column + row = int(result["row"]) + column = int(result["column"]) + logger.info(f"{row=} {column=}") + coordinates = get_cell_coordinates(GRID_SIZE, (row, column), image.size) + + # Draw the coordinates on the image + grid_image_with_target = draw_target_coordinates(grid_image, coordinates) + + # Show the final image + grid_image_with_target.show() + + if row == prev_row and column == prev_column: + break + +if __name__ == "__main__": + #main("save button") + #main("cell A1") + #main("cell containing 'Marketing'") + #main("font selector") + #main("zoom slider") + #main("paste button") + main("font size dropdown") From 048aa525485c875a380b56924193e841718b0f56 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Tue, 30 Jul 2024 17:02:20 -0400 Subject: [PATCH 09/10] disable cache; undim; DO_CORRECTIONS --- experiments/cursor/grid.py | 120 +++++++++++++++++++---------------- experiments/cursor/sample.py | 3 +- openadapt/cache.py | 7 +- 3 files changed, 71 insertions(+), 59 deletions(-) diff --git a/experiments/cursor/grid.py b/experiments/cursor/grid.py index af2fec42e..4296de078 100644 --- a/experiments/cursor/grid.py +++ b/experiments/cursor/grid.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import Tuple, List import os from loguru import logger @@ -12,41 +12,33 @@ DOWNSAMPLE_FACTOR = 2 GRID_SIZE = 25 # Adjust based on desired grid size DRIVER = openai +DO_CORRECTIONS = False -def load_and_downsample_image(image_file_path, downsample_factor: int): - # Open the image and convert to RGB +def load_and_downsample_image(image_file_path: str, downsample_factor: int) -> Image.Image: + """Load and downsample the image.""" image = Image.open(image_file_path).convert("RGB") - - # Get the original dimensions original_width, original_height = image.size - - # Calculate new dimensions (half of the original) new_width = original_width // downsample_factor new_height = original_height // downsample_factor - - # Resize the image to half its original size downsampled_image = image.resize((new_width, new_height), Image.LANCZOS) - return downsampled_image def dim_image(image: Image.Image) -> Image.Image: + """Dim the image by reducing brightness to 50%.""" enhancer = ImageEnhance.Brightness(image) - return enhancer.enhance(0.5) # Dim the image by reducing brightness to 50% - -from PIL import Image, ImageDraw, ImageFont + return enhancer.enhance(0.5) def add_grid_labels(image: Image.Image, grid_size: int) -> Image.Image: + """Add grid labels to the image.""" width, height = image.size grid_image = Image.new('RGB', (width + 50, height + 50), 'red') grid_image.paste(image, (50, 50)) draw = ImageDraw.Draw(grid_image) - # Calculate the maximum font size that fits within the grid cell max_font_size = min((height // grid_size) // 2, (width // grid_size) // 2) font_size = max_font_size font = ImageFont.truetype("Arial.ttf", font_size) - # Adjust font size dynamically to fit within grid cells while True: fits = True for i in range(grid_size): @@ -65,26 +57,21 @@ def add_grid_labels(image: Image.Image, grid_size: int) -> Image.Image: cell_width = width / grid_size cell_height = height / grid_size - # Add labels for rows and columns with the determined font size for i in range(grid_size): draw.text((25, 50 + int(i * cell_height + cell_height / 2) - font_size // 2), str(i + 1), fill='white', font=font) draw.text((50 + int(i * cell_width + cell_width / 2) - font_size // 2, 25), str(i + 1), fill='white', font=font) - # Draw semi-transparent grid lines overlay = Image.new('RGBA', grid_image.size, (255, 0, 0, 0)) overlay_draw = ImageDraw.Draw(overlay) for i in range(grid_size + 1): - # Horizontal lines overlay_draw.line([(50, 50 + int(i * cell_height)), (50 + width, 50 + int(i * cell_height))], fill=(255, 255, 255, 128), width=1) - # Vertical lines overlay_draw.line([(50 + int(i * cell_width), 50), (50 + int(i * cell_width), 50 + height)], fill=(255, 255, 255, 128), width=1) - # Composite the overlay with the labeled image grid_image = Image.alpha_composite(grid_image.convert('RGBA'), overlay) - return grid_image.convert('RGB') def get_cell_coordinates(grid_size: int, target_cell: Tuple[int, int], image_size: Tuple[int, int]) -> Tuple[int, int]: + """Get the coordinates of a cell in the grid.""" row, col = target_cell cell_width = image_size[0] / grid_size cell_height = image_size[1] / grid_size @@ -92,74 +79,97 @@ def get_cell_coordinates(grid_size: int, target_cell: Tuple[int, int], image_siz y = int((row - 0.5) * cell_height) return x, y -def draw_target_coordinates(image: Image.Image, coordinates: Tuple[int, int]) -> Image.Image: +def draw_target_coordinates(image: Image.Image, coordinates: List[Tuple[int, int]]) -> Image.Image: + """Draw the target coordinates on the image.""" image = image.copy() - label_offset = 50 # The offset added to the top and left for labels + label_offset = 50 draw = ImageDraw.Draw(image) - adjusted_coordinates = (coordinates[0] + label_offset, coordinates[1] + label_offset) - draw.ellipse((adjusted_coordinates[0] - 5, adjusted_coordinates[1] - 5, adjusted_coordinates[0] + 5, adjusted_coordinates[1] + 5), fill='red', outline='red') + for coordinate in coordinates: + adjusted_coordinates = (coordinate[0] + label_offset, coordinate[1] + label_offset) + draw.ellipse((adjusted_coordinates[0] - 5, adjusted_coordinates[1] - 5, adjusted_coordinates[0] + 5, adjusted_coordinates[1] + 5), fill='red', outline='red') + return image + +def undim_target_cells(image: Image.Image, coordinates: List[Tuple[int, int]], grid_size: int) -> Image.Image: + """Undim the target cells on the image.""" + image = image.copy() + width, height = image.size + cell_width = (width - 50) / grid_size + cell_height = (height - 50) / grid_size + label_offset = 50 + + for coordinate in coordinates: + row, col = coordinate + x1 = int((col - 1) * cell_width) + label_offset + y1 = int((row - 1) * cell_height) + label_offset + x2 = int(x1 + cell_width) + y2 = int(y1 + cell_height) + box = (x1, y1, x2, y2) + cropped_section = image.crop(box) + enhancer = ImageEnhance.Brightness(cropped_section) + brightened_section = enhancer.enhance(2.0) # Increase brightness to undim + image.paste(brightened_section, box) return image def main(target: str): - # Load and dim the image + """Main function to process the image and identify target cells.""" image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png") image = load_and_downsample_image(image_file_path, DOWNSAMPLE_FACTOR) image = dim_image(image) - - # Add grid labels grid_image = add_grid_labels(image, GRID_SIZE) grid_image.show() - row, column = None, None + coordinates = [] + all_coordinates = [] grid_image_with_target = None + while True: prompt = f""" Attached is an image containing a screenshot over which a grid has been overlaid. The grid labels are white (255, 255, 255) on a red (255, 0, 0) background. The grid lines are semi-transparent. -Your task is to identify the grid row and column containing the target. +Your task is to identify the coordinates of grid cells containing the target. The target is: "{target}". Respond in JSON with the following keys: {{ "target": "", - "analysis": "", - "row": "", - "column": "", + "descrpition": "", + "reasoning": "", + "coordinates": [(, ), (, ), ...], }} -Make sure not to confuse the grid labels with the content of the screenshot. -For example, if the screenshot contains a spreadsheet, don't get confused between -the spreadsheet cell labels and the grid labels. +You may specify one or more grid cell coordinates. +Make sure not to confuse the overlaid grid with any grid inside the screenshot!!! +For example, if the screenshot contains a spreadsheet, don't specify the spreadsheet +coordinates. You MUST specify the coordinates in the overlaid grid. Wrap your code in triple backticks: ``` """ - if row and column: + if DO_CORRECTIONS and coordinates: prompt += f""" -Previously, you specified row {row}, column {column}. This cell has been marked -with a red dot. If you agree with your previous assessment, confirm by specifying -the same row and column again. Otherwise, please correct your previous assessment. +Previously, someone else specified these cells: {coordinates}. These cells have been undimmed. +There may be an error. Please correct or confirm the previous assessment. +DON'T GUESS -- LOOK CAREFULLY AT THE IMAGE!!! My career depends on this. Lives are at stake. """ - # Prompt the model to identify the cell + + config.CACHE_ENABLED = False response = DRIVER.prompt( prompt=prompt, system_prompt="You are an expert GUI interpreter. You are precise and accurate.", - images=[grid_image_with_target or grid_image], + images=[grid_image_with_target or grid_image] if DO_CORRECTIONS else [grid_image], ) result = parse_code_snippet(response) + coordinates = sorted(result["coordinates"]) + logger.info(f"{coordinates=}") + coord_list = [get_cell_coordinates(GRID_SIZE, coord, image.size) for coord in coordinates] + #grid_image_with_target = draw_target_coordinates(grid_image, coord_list) + grid_image_with_target = undim_target_cells(grid_image, coordinates, GRID_SIZE) + #grid_image_with_target = draw_target_coordinates(grid_image_with_target, coord_list) - prev_row, prev_column = row, column - row = int(result["row"]) - column = int(result["column"]) - logger.info(f"{row=} {column=}") - coordinates = get_cell_coordinates(GRID_SIZE, (row, column), image.size) - - # Draw the coordinates on the image - grid_image_with_target = draw_target_coordinates(grid_image, coordinates) - - # Show the final image grid_image_with_target.show() - if row == prev_row and column == prev_column: + if coordinates in all_coordinates: break + all_coordinates.append(coordinates) + if __name__ == "__main__": #main("save button") #main("cell A1") @@ -167,4 +177,6 @@ def main(target: str): #main("font selector") #main("zoom slider") #main("paste button") - main("font size dropdown") + #main("font size dropdown") + #main("13-May") + main("Engineering") diff --git a/experiments/cursor/sample.py b/experiments/cursor/sample.py index 017301abf..8a7ff8a18 100644 --- a/experiments/cursor/sample.py +++ b/experiments/cursor/sample.py @@ -13,7 +13,7 @@ from openadapt.drivers import anthropic, openai, google from openadapt.utils import parse_code_snippet -DRIVER = google +DRIVER = openai#google NUM_CURSORS = 2**2 SPREAD_REDUCTION_FACTOR = 0.5 MAX_ITERATIONS = None @@ -232,6 +232,7 @@ def main(): target = "Cell C3" target = "Cell G4" target = "Cell A1" + target = "14-May" center = {'x': width // 2, 'y': height // 2} spread = 1.0 iteration = 1 diff --git a/openadapt/cache.py b/openadapt/cache.py index 1ddd4b20d..1c6787ff6 100644 --- a/openadapt/cache.py +++ b/openadapt/cache.py @@ -62,13 +62,12 @@ def cache( Returns: The decorator function. """ - cache_dir_path = default(dir_path, config.CACHE_DIR_PATH) - cache_enabled = default(enabled, config.CACHE_ENABLED) - cache_verbosity = default(verbosity, config.CACHE_VERBOSITY) - def decorator(fn: Callable) -> Callable: @wraps(fn) def wrapper(*args: Any, **kwargs: Any) -> Any: + cache_dir_path = default(dir_path, config.CACHE_DIR_PATH) + cache_enabled = default(enabled, config.CACHE_ENABLED) + cache_verbosity = default(verbosity, config.CACHE_VERBOSITY) logger.debug(f"{cache_enabled=}") if cache_enabled: memory = Memory(cache_dir_path, verbose=cache_verbosity) From efc663975cf8222b7d02a8fbb604a03681024507 Mon Sep 17 00:00:00 2001 From: Richard Abrich Date: Tue, 30 Jul 2024 17:40:36 -0400 Subject: [PATCH 10/10] highlight target cells in red; add labels on bottom/right --- experiments/cursor/grid.py | 92 ++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 33 deletions(-) diff --git a/experiments/cursor/grid.py b/experiments/cursor/grid.py index 4296de078..af642776d 100644 --- a/experiments/cursor/grid.py +++ b/experiments/cursor/grid.py @@ -12,7 +12,7 @@ DOWNSAMPLE_FACTOR = 2 GRID_SIZE = 25 # Adjust based on desired grid size DRIVER = openai -DO_CORRECTIONS = False +DO_CORRECTIONS = True def load_and_downsample_image(image_file_path: str, downsample_factor: int) -> Image.Image: """Load and downsample the image.""" @@ -29,9 +29,9 @@ def dim_image(image: Image.Image) -> Image.Image: return enhancer.enhance(0.5) def add_grid_labels(image: Image.Image, grid_size: int) -> Image.Image: - """Add grid labels to the image.""" + """Add grid labels to the image on all four sides.""" width, height = image.size - grid_image = Image.new('RGB', (width + 50, height + 50), 'red') + grid_image = Image.new('RGB', (width + 100, height + 100), 'red') grid_image.paste(image, (50, 50)) draw = ImageDraw.Draw(grid_image) @@ -42,9 +42,8 @@ def add_grid_labels(image: Image.Image, grid_size: int) -> Image.Image: while True: fits = True for i in range(grid_size): - row_text_size = draw.textbbox((0, 0), str(i + 1), font=font)[2:] - col_text_size = draw.textbbox((0, 0), str(i + 1), font=font)[2:] - if row_text_size[1] > height // grid_size or col_text_size[0] > width // grid_size: + text_size = draw.textbbox((0, 0), str(i + 1), font=font)[2:] + if text_size[0] > width // grid_size or text_size[1] > height // grid_size: fits = False break if fits: @@ -58,14 +57,22 @@ def add_grid_labels(image: Image.Image, grid_size: int) -> Image.Image: cell_height = height / grid_size for i in range(grid_size): + # Left side labels draw.text((25, 50 + int(i * cell_height + cell_height / 2) - font_size // 2), str(i + 1), fill='white', font=font) + # Top side labels draw.text((50 + int(i * cell_width + cell_width / 2) - font_size // 2, 25), str(i + 1), fill='white', font=font) + # Right side labels + draw.text((width + 75, 50 + int(i * cell_height + cell_height / 2) - font_size // 2), str(i + 1), fill='white', font=font) + # Bottom side labels + draw.text((50 + int(i * cell_width + cell_width / 2) - font_size // 2, height + 75), str(i + 1), fill='white', font=font) overlay = Image.new('RGBA', grid_image.size, (255, 0, 0, 0)) overlay_draw = ImageDraw.Draw(overlay) for i in range(grid_size + 1): - overlay_draw.line([(50, 50 + int(i * cell_height)), (50 + width, 50 + int(i * cell_height))], fill=(255, 255, 255, 128), width=1) - overlay_draw.line([(50 + int(i * cell_width), 50), (50 + int(i * cell_width), 50 + height)], fill=(255, 255, 255, 128), width=1) + # Horizontal lines + overlay_draw.line([(50, 50 + int(i * cell_height)), (50 + width, 50 + int(i * cell_height))], fill=(255, 255, 255, 128), width=3) + # Vertical lines + overlay_draw.line([(50 + int(i * cell_width), 50), (50 + int(i * cell_width), 50 + height)], fill=(255, 255, 255, 128), width=3) grid_image = Image.alpha_composite(grid_image.convert('RGBA'), overlay) return grid_image.convert('RGB') @@ -90,11 +97,11 @@ def draw_target_coordinates(image: Image.Image, coordinates: List[Tuple[int, int return image def undim_target_cells(image: Image.Image, coordinates: List[Tuple[int, int]], grid_size: int) -> Image.Image: - """Undim the target cells on the image.""" + """Undim the target cells on the image and highlight them in red.""" image = image.copy() width, height = image.size - cell_width = (width - 50) / grid_size - cell_height = (height - 50) / grid_size + cell_width = (width - 100) / grid_size + cell_height = (height - 100) / grid_size label_offset = 50 for coordinate in coordinates: @@ -107,7 +114,15 @@ def undim_target_cells(image: Image.Image, coordinates: List[Tuple[int, int]], g cropped_section = image.crop(box) enhancer = ImageEnhance.Brightness(cropped_section) brightened_section = enhancer.enhance(2.0) # Increase brightness to undim - image.paste(brightened_section, box) + + # Create a red overlay + red_overlay = Image.new('RGBA', brightened_section.size, (255, 0, 0, 128)) # Semi-transparent red + + # Combine the brightened section with the red overlay + highlighted_section = Image.alpha_composite(brightened_section.convert('RGBA'), red_overlay) + + image.paste(highlighted_section, box) + return image def main(target: str): @@ -123,33 +138,42 @@ def main(target: str): grid_image_with_target = None while True: + if DO_CORRECTIONS and coordinates: + correction_prompt = f""" +- Previously, someone specified these cells: {coordinates}. These cells have been highlighted in red. +- There may be an error. Please correct or confirm the previous assessment. +- In your analysis, include the position of the target relative to these previously specified cells. +- In your reasoning, describe how many cells to move in each direction to reach the target.""" + else: + correction_prompt = "\n" + prompt = f""" Attached is an image containing a screenshot over which a grid has been overlaid. -The grid labels are white (255, 255, 255) on a red (255, 0, 0) background. The grid lines -are semi-transparent. +The grid labels are white (255, 255, 255) on a red (255, 0, 0) background. +The grid lines are semi-transparent. Your task is to identify the coordinates of grid cells containing the target. The target is: "{target}". + Respond in JSON with the following keys: - {{ - "target": "", - "descrpition": "", - "reasoning": "", - "coordinates": [(, ), (, ), ...], - }} -You may specify one or more grid cell coordinates. -Make sure not to confuse the overlaid grid with any grid inside the screenshot!!! -For example, if the screenshot contains a spreadsheet, don't specify the spreadsheet -coordinates. You MUST specify the coordinates in the overlaid grid. +{{ + "target": "", + "analysis": "", + "reasoning": "", + "coordinates": [(, ), (, ), ...], +}} + +**Important Instructions**: +- Focus on the overlaid grid for coordinate determination. +- Do not confuse the overlaid grid with any internal grids in the screenshot, such as those in spreadsheets. +- Verify the grid coordinates carefully.{correction_prompt} +- LOOK CAREFULLY AT THE IMAGE to avoid mistakes. +- You must specify all cells containing any part of the target, so that the target is COMPLETELY COVERED. + Wrap your code in triple backticks: ``` -""" - if DO_CORRECTIONS and coordinates: - prompt += f""" -Previously, someone else specified these cells: {coordinates}. These cells have been undimmed. -There may be an error. Please correct or confirm the previous assessment. -DON'T GUESS -- LOOK CAREFULLY AT THE IMAGE!!! My career depends on this. Lives are at stake. """ - config.CACHE_ENABLED = False + if not DO_CORRECTIONS: + config.CACHE_ENABLED = False response = DRIVER.prompt( prompt=prompt, system_prompt="You are an expert GUI interpreter. You are precise and accurate.", @@ -172,11 +196,13 @@ def main(target: str): if __name__ == "__main__": #main("save button") - #main("cell A1") + main("Spreadsheet cell A1") #main("cell containing 'Marketing'") #main("font selector") #main("zoom slider") #main("paste button") #main("font size dropdown") #main("13-May") - main("Engineering") + #main("Engineering") + #main("Sales") + #main("Spreadsheet area")