From eec13add7d490ff544ef9f7c12df829c7fbfebe4 Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Thu, 25 Jul 2024 20:17:14 -0400
Subject: [PATCH 01/10] wip

---
 experiments/cursor.py          |  93 ++++++++++++++++++++
 experiments/cursor_joystick.py | 155 +++++++++++++++++++++++++++++++++
 experiments/cursor_search.py   | 149 +++++++++++++++++++++++++++++++
 openadapt/drivers/openai.py    |   2 +
 openadapt/replay.py            |   4 +-
 openadapt/utils.py             |  36 +++++---
 6 files changed, 428 insertions(+), 11 deletions(-)
 create mode 100644 experiments/cursor.py
 create mode 100644 experiments/cursor_joystick.py
 create mode 100644 experiments/cursor_search.py
diff --git a/experiments/cursor.py b/experiments/cursor.py
new file mode 100644
index 000000000..30b01e7d4
--- /dev/null
+++ b/experiments/cursor.py
@@ -0,0 +1,93 @@
+from pprint import pformat
+from typing import Tuple, List, Dict
+import json
+import os
+
+from PIL import Image, ImageDraw
+import matplotlib.pyplot as plt
+
+from openadapt.config import config
+from openadapt.drivers import openai, anthropic, google
+from openadapt.utils import parse_code_snippet
+
+DRIVER = openai
+HISTORY_SIZE = 5
+
+def draw_concentric_circles(image: Image.Image, coords: Dict[str, int], colors: List[str], radii: List[int]) -> Image.Image:
+    """Paint filled discs centred on ``coords`` onto ``image`` in place and return it;
+    discs are drawn in list order (later on top) and unpaired colors/radii are ignored."""
+    pen = ImageDraw.Draw(image)
+    cx, cy = coords['x'], coords['y']
+    for fill, r in zip(colors, radii):
+        pen.ellipse((cx - r, cy - r, cx + r, cy + r), fill=fill)
+
+    return image
+
+def display_images(images: List[Image.Image]) -> None:
+    """Show ``images`` side by side in one non-blocking matplotlib window; do nothing if empty."""
+    if not images:
+        return  # plt.subplots() cannot build a grid with zero columns
+    _, axs = plt.subplots(1, len(images), figsize=(15, 5), squeeze=False)  # always a 2-D array
+    for ax, img in zip(axs[0], images):
+        ax.imshow(img)
+        ax.axis('off')
+    plt.show(block=False)
+
+def main():
+    """Ask DRIVER for the pixel coordinates of a target until it repeats the same answer."""
+    image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png")
+    image = Image.open(image_file_path).convert("RGB")
+    images = []
+    all_coords = []
+    exceptions = []
+    target = "Cell B3"
+
+    while True:
+        prompt = f"The attached image size is {image.size}."
+
+        if all_coords:
+            prompt += f" The images have red circles at coordinates:"
+            for coord in all_coords:
+                prompt += f"\n  {coord}"
+            prompt += "\n, in the order they are attached."
+        prompt += f" Locate the pixel coordinates of the target: {target}. Respond with a Python dict only: {{ 'x': int, 'y': int, 'direction': '<natural language description of your intended direction to move the last circle>' }}."
+        if all_coords:
+            prompt += "To move the circle to the right, increase the 'x' coordinate, and decrease it to move to the left. To move the circle down, increase the 'y' coordinate, and decrease it to move up."
+        if exceptions:
+            prompt += f"Last time you tried this, your response resulted in the following exception(s): {exceptions}."
+        print(prompt)
+        response = DRIVER.prompt(
+            prompt=prompt,
+            system_prompt="You are an expert GUI interpreter. You are precise and discerning, and you strive for accuracy. You do not make the same mistake twice.",
+            images=images or [image],
+            detail="high",
+        )
+        try:
+            parsed = parse_code_snippet(response)
+            # Keep only the position: the free-text 'direction' differs between replies,
+            # so comparing whole dicts would never detect a repeated answer.
+            coords = {'x': parsed['x'], 'y': parsed['y']}
+        except Exception as exc:
+            exceptions.append(exc)
+            continue
+        else:
+            exceptions = []
+        last_coords = all_coords[-1] if all_coords else None
+        print(f"{coords=} {last_coords=}")
+        if last_coords == coords:
+            break
+        all_coords.append(coords)
+        image_with_dot = draw_concentric_circles(image.copy(), coords, ["red", "yellow"], [25, 15, 5])
+        image_with_dot.show()
+        images.append(image_with_dot)
+
+        if HISTORY_SIZE:
+            all_coords = all_coords[-HISTORY_SIZE:]
+            images = images[-HISTORY_SIZE:]
+
+    #display_images(images)
+    plt.show()
+
+if __name__ == "__main__":
+    main()
+
diff --git a/experiments/cursor_joystick.py b/experiments/cursor_joystick.py
new file mode 100644
index 000000000..f59a14956
--- /dev/null
+++ b/experiments/cursor_joystick.py
@@ -0,0 +1,155 @@
+from pprint import pformat
+from typing import Tuple, List, Dict
+import json
+import os
+
+from loguru import logger
+from PIL import Image, ImageDraw
+import matplotlib.pyplot as plt
+
+from openadapt.config import config
+from openadapt.drivers import openai, anthropic, google
+from openadapt.utils import parse_code_snippet
+
+DRIVER = openai
+HISTORY_SIZE = 1
+
+import numpy as np
+from scipy.spatial.distance import cdist
+from sklearn.cluster import KMeans
+
+def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]:
+    """Pick an RGB color that is far from every dominant color of a PIL image.
+
+    Args:
+        image: The PIL image object.
+        sample_size: The maximum number of pixels to sample from the image.
+        n_clusters: The maximum number of KMeans clusters summarizing the sampled colors.
+
+    Returns:
+        The color on a coarse RGB grid (step 8) whose nearest cluster center is farthest away.
+    """
+    pixels = np.array(image.convert('RGB')).reshape(-1, 3)
+
+    if len(pixels) > sample_size:
+        # A local generator keeps sampling reproducible without reseeding NumPy's global RNG.
+        rng = np.random.default_rng(42)
+        sampled_colors = pixels[rng.choice(len(pixels), size=sample_size, replace=False)]
+    else:
+        sampled_colors = pixels
+
+    # KMeans raises ValueError when asked for more clusters than there are samples.
+    kmeans = KMeans(n_clusters=min(n_clusters, len(sampled_colors)), random_state=42)
+    cluster_centers = kmeans.fit(sampled_colors).cluster_centers_
+
+    all_colors = np.array([[r, g, b] for r in range(0, 256, 8) for g in range(0, 256, 8) for b in range(0, 256, 8)])
+    distances = cdist(all_colors, cluster_centers, metric='euclidean')
+    # Rank candidates by distance to their nearest center; ranking by the summed distance
+    # can select a color that coincides with one of the image's own colors.
+    best = all_colors[np.argmax(distances.min(axis=1))]
+
+    return tuple(int(channel) for channel in best)
+
+def draw_concentric_circles(image: Image.Image, coords: Dict[str, int], colors: List[str], radii: List[int]) -> Image.Image:
+    """Paint filled discs centred on ``coords`` onto ``image`` in place and return it;
+    discs are drawn in list order (later on top) and unpaired colors/radii are ignored."""
+    pen = ImageDraw.Draw(image)
+    cx, cy = coords['x'], coords['y']
+    for fill, r in zip(colors, radii):
+        pen.ellipse((cx - r, cy - r, cx + r, cy + r), fill=fill)
+
+    return image
+
+def display_images(images: List[Image.Image]) -> None:
+    """Show ``images`` side by side in one non-blocking matplotlib window; do nothing if empty."""
+    if not images:
+        return  # plt.subplots() cannot build a grid with zero columns
+    _, axs = plt.subplots(1, len(images), figsize=(15, 5), squeeze=False)  # always a 2-D array
+    for ax, img in zip(axs[0], images):
+        ax.imshow(img)
+        ax.axis('off')
+    plt.show(block=False)
+
+def update_coords(coords: Dict[str, int], directions: Dict[str, str], width: int, height: int) -> Dict[str, int]:
+    """Shift ``coords`` in place by a fraction of the image size per requested direction.
+
+    Unrecognized directions are skipped; an unrecognized magnitude raises ``KeyError``.
+    """
+    fractions = {'large': 0.5, 'medium': 0.125, 'small': 0.0625}
+    # direction -> (axis key, sign, image extent along that axis)
+    moves = {
+        'left': ('x', -1, width),
+        'right': ('x', 1, width),
+        'up': ('y', -1, height),
+        'down': ('y', 1, height),
+    }
+    for direction, magnitude in directions.items():
+        fraction = fractions[magnitude]
+        if direction in moves:
+            axis, sign, extent = moves[direction]
+            coords[axis] += sign * int(extent * fraction)
+    return coords
+
+def main():
+    """Steer a marker toward the target with coarse left/right/up/down moves chosen by DRIVER."""
+    image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png")
+    image = Image.open(image_file_path).convert("RGB")
+
+    width, height = image.size
+    coords = {'x': width // 2, 'y': height // 2}
+    images = []
+    all_coords = []
+    exceptions = []
+    target = "Cell B3"
+
+    color = maximally_different_color(image)
+    logger.info(f"{color=}")
+
+    while True:
+        all_coords.append(dict(coords))
+        image_with_dot = draw_concentric_circles(image.copy(), coords, [color, "red"], [25, 15, 5])
+        image_with_dot.show()
+        images.append(image_with_dot)
+
+        prompt = f"The attached image size is {image.size}."
+
+        if all_coords:
+            prompt += f" The attached images have red circles at coordinates:"
+            for coord in all_coords:
+                prompt += f"\n  {coord}"
+            prompt += "\n, in the order they are attached."
+        prompt += f" Identify the magnitude and direction to move the last red dot towards the target: {target}. Respond with a single Python dict of the form {{'left' | 'right' | 'up' | 'down': 'large' | 'medium' | 'small'}}."
+        prompt += f" If the red dot is on the target, respond with an empty dict."
+        if exceptions:
+            prompt += f"Last time you tried this, your response resulted in the following exception(s): {exceptions}."
+        logger.info(f"prompt=\n{prompt}")
+        response = DRIVER.prompt(
+            prompt=prompt,
+            system_prompt="You are an expert GUI interpreter. You are precise and discerning, and you strive for accuracy. You do not make the same mistake twice.",
+            images=images or [image],
+            detail="high",
+        )
+        try:
+            directions = parse_code_snippet(response)
+            # Move a copy so an invalid reply cannot crash the loop or leave coords half-updated.
+            new_coords = update_coords(dict(coords), directions, width, height)
+        except Exception as exc:
+            exceptions.append(exc)
+            # Drop this attempt's entry and image together so the two lists stay aligned.
+            all_coords = all_coords[:-1]
+            images = images[:-1]
+            continue
+        exceptions = []
+
+        if new_coords == coords:
+            break
+        coords = new_coords
+        if HISTORY_SIZE:
+            all_coords = all_coords[-HISTORY_SIZE:]
+            images = images[-HISTORY_SIZE:]
+
+    plt.show()
+
+if __name__ == "__main__":
+    main()
+
diff --git a/experiments/cursor_search.py b/experiments/cursor_search.py
new file mode 100644
index 000000000..e570822f0
--- /dev/null
+++ b/experiments/cursor_search.py
@@ -0,0 +1,149 @@
+from pprint import pformat
+from typing import Tuple, List, Dict
+import json
+import os
+
+from loguru import logger
+from PIL import Image, ImageDraw
+import matplotlib.pyplot as plt
+
+from openadapt.config import config
+from openadapt.drivers import openai, anthropic, google
+from openadapt.utils import parse_code_snippet
+
+DRIVER = openai
+HISTORY_SIZE = 1
+
+import numpy as np
+from scipy.spatial.distance import cdist
+from sklearn.cluster import KMeans
+
+
+def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]:
+    """Pick an RGB color that is far from every dominant color of a PIL image.
+
+    Args:
+        image: The PIL image object.
+        sample_size: The maximum number of pixels to sample from the image.
+        n_clusters: The maximum number of KMeans clusters summarizing the sampled colors.
+
+    Returns:
+        The color on a coarse RGB grid (step 8) whose nearest cluster center is farthest away.
+    """
+    pixels = np.array(image.convert('RGB')).reshape(-1, 3)
+
+    if len(pixels) > sample_size:
+        # A local generator keeps sampling reproducible without reseeding NumPy's global RNG.
+        rng = np.random.default_rng(42)
+        sampled_colors = pixels[rng.choice(len(pixels), size=sample_size, replace=False)]
+    else:
+        sampled_colors = pixels
+
+    # KMeans raises ValueError when asked for more clusters than there are samples.
+    kmeans = KMeans(n_clusters=min(n_clusters, len(sampled_colors)), random_state=42)
+    cluster_centers = kmeans.fit(sampled_colors).cluster_centers_
+
+    all_colors = np.array([[r, g, b] for r in range(0, 256, 8) for g in range(0, 256, 8) for b in range(0, 256, 8)])
+    distances = cdist(all_colors, cluster_centers, metric='euclidean')
+    # Rank candidates by distance to their nearest center; ranking by the summed distance
+    # can select a color that coincides with one of the image's own colors.
+    best = all_colors[np.argmax(distances.min(axis=1))]
+
+    return tuple(int(channel) for channel in best)
+
+def draw_concentric_circles(image: Image.Image, coords: Dict[str, int], colors: List[str], radii: List[int]) -> Image.Image:
+    """Paint filled discs centred on ``coords`` onto ``image`` in place and return it;
+    discs are drawn in list order (later on top) and unpaired colors/radii are ignored."""
+    pen = ImageDraw.Draw(image)
+    cx, cy = coords['x'], coords['y']
+    for fill, r in zip(colors, radii):
+        pen.ellipse((cx - r, cy - r, cx + r, cy + r), fill=fill)
+
+    return image
+
+def display_images(images: List[Image.Image]) -> None:
+    """Show ``images`` side by side in one non-blocking matplotlib window; do nothing if empty."""
+    if not images:
+        return  # plt.subplots() cannot build a grid with zero columns
+    _, axs = plt.subplots(1, len(images), figsize=(15, 5), squeeze=False)  # always a 2-D array
+    for ax, img in zip(axs[0], images):
+        ax.imshow(img)
+        ax.axis('off')
+    plt.show(block=False)
+
+def update_coords(coords: Dict[str, int], directions: Dict[str, str], width: int, height: int, iteration: int) -> Dict[str, int]:
+    """Move ``coords`` in place by ``size // 2**iteration`` along each requested axis and return it."""
+    divisor = 2 ** iteration
+    horizontal, vertical = directions.get('x'), directions.get('y')
+    if horizontal == 'left':
+        coords['x'] -= width // divisor
+    elif horizontal == 'right':
+        coords['x'] += width // divisor
+    if vertical == 'up':
+        coords['y'] -= height // divisor
+    elif vertical == 'down':
+        coords['y'] += height // divisor
+    return coords
+
+def main():
+    """Home in on the target by halving the step each round, with directions chosen by DRIVER."""
+    image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png")
+    image = Image.open(image_file_path).convert("RGB")
+
+    width, height = image.size
+    coords = {'x': width // 2, 'y': height // 2}
+    images = []
+    all_coords = []
+    exceptions = []
+    target = "Cell B3"
+    iteration = 1
+
+    color = maximally_different_color(image)
+    logger.info(f"{color=}")
+
+    while True:
+        all_coords.append(dict(coords))
+        image_with_dot = draw_concentric_circles(image.copy(), coords, ["red", color], [25, 15, 5])
+        image_with_dot.show()
+        images.append(image_with_dot)
+
+        prompt = f"The attached image size is {image.size}."
+        if all_coords:
+            prompt += f" The attached images have red circles at coordinates:"
+            for coord in all_coords:
+                prompt += f"\n  {coord}"
+            prompt += "\n, in the order they are attached."
+        prompt += f" Identify the direction to move the last red dot towards the target: {target}. Respond with a single Python dict only: {{ 'x': 'left' | 'right', 'y': 'up' | 'down' }}."
+        prompt += f" If the red dot is on the target, respond with an empty dict."
+        if exceptions:
+            prompt += f"Last time you tried this, your response resulted in the following exception(s): {exceptions}."
+        logger.info(f"prompt=\n{prompt}")
+        response = DRIVER.prompt(
+            prompt=prompt,
+            system_prompt="You are an expert GUI interpreter. You are precise and discerning, and you strive for accuracy. You do not make the same mistake twice.",
+            images=images or [image],
+            detail="high",
+        )
+        try:
+            directions = parse_code_snippet(response)
+            # Move a copy so an invalid reply cannot crash the loop or leave coords half-updated.
+            new_coords = update_coords(dict(coords), directions, width, height, iteration)
+        except Exception as exc:
+            exceptions.append(exc)
+            # Drop this attempt's entry and image together so the two lists stay aligned.
+            all_coords = all_coords[:-1]
+            images = images[:-1]
+            continue
+        exceptions = []
+        iteration += 1
+        if new_coords == coords:
+            break
+        coords = new_coords
+        if HISTORY_SIZE:
+            all_coords = all_coords[-HISTORY_SIZE:]
+            images = images[-HISTORY_SIZE:]
+
+    plt.show()
+
+if __name__ == "__main__":
+    main()
diff --git a/openadapt/drivers/openai.py b/openadapt/drivers/openai.py
index f81e23747..b6c780174 100644
--- a/openadapt/drivers/openai.py
+++ b/openadapt/drivers/openai.py
@@ -181,6 +181,7 @@ def prompt(
     prompt: str,
     system_prompt: str | None = None,
     images: list[Image.Image] | None = None,
+    model: str = MODEL_NAME,
     max_tokens: int | None = None,
     detail: str = "high",
 ) -> str:
@@ -205,6 +206,7 @@ def prompt(
         prompt,
         system_prompt,
         images,
+        model=model,
         max_tokens=max_tokens,
         detail=detail,
     )
diff --git a/openadapt/replay.py b/openadapt/replay.py
index 5b597f3e2..106a64511 100644
--- a/openadapt/replay.py
+++ b/openadapt/replay.py
@@ -21,6 +21,8 @@
 from openadapt.db import crud
 from openadapt.error_reporting import configure_error_reporting
 from openadapt.models import Recording
+from openadapt.strategies import BaseReplayStrategy
+
 
 LOG_LEVEL = "INFO"
 
@@ -70,7 +72,7 @@ def replay(
 
     logger.info(f"{strategy_name=}")
 
-    strategy_class_by_name = utils.get_strategy_class_by_name()
+    strategy_class_by_name = utils.get_subclass_by_name(BaseReplayStrategy)
     if strategy_name not in strategy_class_by_name:
         strategy_names = [
             name
diff --git a/openadapt/utils.py b/openadapt/utils.py
index dc2795f15..3417180e8 100644
--- a/openadapt/utils.py
+++ b/openadapt/utils.py
@@ -6,7 +6,7 @@
 from functools import wraps
 from io import BytesIO
 from logging import StreamHandler
-from typing import Any, Callable
+from typing import Any, Callable, Type
 import ast
 import base64
 import importlib.metadata
@@ -425,16 +425,33 @@ def take_screenshot() -> Image.Image:
     return image
 
 
-def get_strategy_class_by_name() -> dict:
-    """Get a dictionary of strategy classes by their names.
+def get_subclass_by_name(base_class: Type) -> dict:
+    """Get a dictionary of subclasses by their names, recursively.
+
+    Args:
+        base_class (Type): The base class to find subclasses for.
 
     Returns:
-        dict: A dictionary of strategy classes.
+        dict: A dictionary where keys are subclass names and values are the subclass types.
     """
-    from openadapt.strategies import BaseReplayStrategy
+    def get_all_subclasses(cls: Type) -> list[Type]:
+        """
+        Get all subclasses of a given class, recursively.
 
-    strategy_classes = BaseReplayStrategy.__subclasses__()
-    class_by_name = {cls.__name__: cls for cls in strategy_classes}
+        Args:
+            cls (Type): The class to find subclasses for.
+
+        Returns:
+            list[Type]: A list of all subclasses of the given class.
+        """
+        subclasses = cls.__subclasses__()
+        all_subclasses = subclasses.copy()
+        for subclass in subclasses:
+            all_subclasses.extend(get_all_subclasses(subclass))
+        return all_subclasses
+
+    subclasses = get_all_subclasses(base_class)
+    class_by_name = {cls.__name__: cls for cls in subclasses}
     logger.debug(f"{class_by_name=}")
     return class_by_name
 
@@ -606,14 +623,13 @@ def parse_code_snippet(snippet: str) -> dict:
         rval = ast.literal_eval(code_content)
     except Exception as exc:
         logger.exception(exc)
-        import ipdb
-
-        ipdb.set_trace()
+        #import ipdb; ipdb.set_trace()
         # TODO: handle this
         raise
     return rval
 
 
+# TODO: support multiple blocks
 def extract_code_block(text: str) -> str:
     """Extract the text enclosed by the outermost backticks.
 

From 11824e445494f9b31b6977b1237b97053a60ff7f Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Thu, 25 Jul 2024 20:20:35 -0400
Subject: [PATCH 02/10] experiments/cursor_joystick_history.py getting there

---
 experiments/cursor_joystick_history.py | 193 +++++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 experiments/cursor_joystick_history.py

diff --git a/experiments/cursor_joystick_history.py b/experiments/cursor_joystick_history.py
new file mode 100644
index 000000000..ca9142531
--- /dev/null
+++ b/experiments/cursor_joystick_history.py
@@ -0,0 +1,193 @@
+from pprint import pformat
+from typing import Tuple, List, Dict
+import json
+import os
+
+from loguru import logger
+from PIL import Image, ImageDraw
+import matplotlib.pyplot as plt
+
+from openadapt.config import config
+from openadapt.drivers import openai, anthropic, google
+from openadapt.utils import parse_code_snippet
+
+DRIVER = openai#anthropic
+HISTORY_SIZE = 10
+
+import numpy as np
+from scipy.spatial.distance import cdist
+from sklearn.cluster import KMeans
+
+def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]:
+    """Pick an RGB color that is far from every dominant color of a PIL image.
+
+    Args:
+        image: The PIL image object.
+        sample_size: The maximum number of pixels to sample from the image.
+        n_clusters: The maximum number of KMeans clusters summarizing the sampled colors.
+
+    Returns:
+        The color on a coarse RGB grid (step 8) whose nearest cluster center is farthest away.
+    """
+    pixels = np.array(image.convert('RGB')).reshape(-1, 3)
+
+    if len(pixels) > sample_size:
+        # A local generator keeps sampling reproducible without reseeding NumPy's global RNG.
+        rng = np.random.default_rng(42)
+        sampled_colors = pixels[rng.choice(len(pixels), size=sample_size, replace=False)]
+    else:
+        sampled_colors = pixels
+
+    # KMeans raises ValueError when asked for more clusters than there are samples.
+    kmeans = KMeans(n_clusters=min(n_clusters, len(sampled_colors)), random_state=42)
+    cluster_centers = kmeans.fit(sampled_colors).cluster_centers_
+
+    all_colors = np.array([[r, g, b] for r in range(0, 256, 8) for g in range(0, 256, 8) for b in range(0, 256, 8)])
+    distances = cdist(all_colors, cluster_centers, metric='euclidean')
+    # Rank candidates by distance to their nearest center; ranking by the summed distance
+    # can select a color that coincides with one of the image's own colors.
+    best = all_colors[np.argmax(distances.min(axis=1))]
+
+    return tuple(int(channel) for channel in best)
+
+def draw_concentric_circles(image: Image.Image, coords: Dict[str, int], colors: List[str], radii: List[int]) -> Image.Image:
+    """Paint filled discs centred on ``coords`` onto ``image`` in place and return it;
+    discs are drawn in list order (later on top) and unpaired colors/radii are ignored."""
+    pen = ImageDraw.Draw(image)
+    cx, cy = coords['x'], coords['y']
+    for fill, r in zip(colors, radii):
+        pen.ellipse((cx - r, cy - r, cx + r, cy + r), fill=fill)
+
+    return image
+
+def draw_coordinates_and_arrows(image: Image.Image, all_coords: List[Dict[str, int]], arrow_color: str = "blue", dot_radius: int = 25) -> Image.Image:
+    """Render the cursor trail onto ``image`` in place and return it.
+
+    Every coordinate becomes a red dot with an ``arrow_color`` border; each successive
+    pair is joined by a line ending in an arrowhead at the rim of the destination dot.
+
+    Args:
+        image: The PIL image to draw on.
+        all_coords: Cursor positions, oldest first, as dicts with 'x' and 'y' keys.
+        arrow_color: Fill used for the lines, arrowheads and dot borders.
+        dot_radius: Radius of each red dot; the border extends 5 pixels beyond it.
+
+    Returns:
+        The same image object that was passed in.
+    """
+    canvas = ImageDraw.Draw(image)
+    head_length = 50
+    half_spread = np.pi / 6  # angle between the shaft and each side of the arrowhead
+
+    def paint_dot(cx: int, cy: int) -> None:
+        # Border disc first, then the smaller red disc on top of it.
+        for radius, fill in ((dot_radius + 5, arrow_color), (dot_radius, "red")):
+            canvas.ellipse((cx - radius, cy - radius, cx + radius, cy + radius), fill=fill)
+
+    for start, end in zip(all_coords, all_coords[1:]):
+        paint_dot(start['x'], start['y'])
+        canvas.line((start['x'], start['y'], end['x'], end['y']), fill=arrow_color, width=4)
+
+        # Pull the tip back by one dot radius so it touches the edge of the next dot.
+        heading = np.arctan2(end['y'] - start['y'], end['x'] - start['x'])
+        tip_x = end['x'] - int(dot_radius * np.cos(heading))
+        tip_y = end['y'] - int(dot_radius * np.sin(heading))
+        wings = [
+            (tip_x - head_length * np.cos(heading + offset), tip_y - head_length * np.sin(heading + offset))
+            for offset in (-half_spread, half_spread)
+        ]
+        canvas.polygon([(tip_x, tip_y), *wings], fill=arrow_color)
+
+    if all_coords:
+        paint_dot(all_coords[-1]['x'], all_coords[-1]['y'])
+
+    return image
+
+
+def update_coords(coords: Dict[str, int], directions: Dict[str, str], width: int, height: int) -> Dict[str, int]:
+    """Shift ``coords`` in place by a fraction of the image size per (direction, magnitude) pair and return it."""
+    for direction, magnitude in directions.items():
+        step_size = {
+            'large': 0.25,
+            'medium': 0.125,
+            'small': 0.0625
+        }[magnitude]
+        
+        if direction == 'left':
+            coords['x'] -= int(width * step_size)
+        elif direction == 'right':
+            coords['x'] += int(width * step_size)
+        elif direction == 'up':
+            coords['y'] -= int(height * step_size)
+        elif direction == 'down':
+            coords['y'] += int(height * step_size)
+
+    return coords
+
+def main():
+    """Steer a cursor toward the target, showing DRIVER the raw, history and current images each round."""
+    image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png")
+    image = Image.open(image_file_path).convert("RGB")
+    width, height = image.size
+    coords = {'x': width // 2, 'y': height // 2}
+    all_coords = []
+    exceptions = []
+    placement_history = []
+    target = "The center of Cell E5"
+    iteration = 1
+
+    color = maximally_different_color(image)
+    logger.info(f"{color=}")
+    
+    while True:
+        all_coords.append(dict(coords))
+
+        prompt = f"Attached are three images: the first ('raw') is an unadultered screenshot, the second ('history') shows previous cursor locations on the screenshot separated by arrows, and the third ('current') shows the current cursor location."
+
+        if all_coords:
+            prompt += f" Cursor locations are indicated with red dots surrounded by a border. The history image has cursors at coordinates:"
+            for coord in all_coords:
+                prompt += f"\n  {coord}"
+            prompt += f" The current image has a cursor at coordinates {all_coords[-1]}."
+        prompt += f" Identify the magnitude and direction to move the current cursor towards the target: {target}. Respond with a single Python dict of the form {{'target': '<specified target>', 'placement': '<natural language description of the current location of the cursor>', 'left' | 'right' | 'up' | 'down': 'large' | 'medium' | 'small'}}."
+        prompt += f" If the current cursor is already at the target, do not specify any directions (but still specify the placement)."
+        prompt += " Make sure to surround your code with triple backticks: ```"
+        if exceptions:
+            prompt += f"Last time you tried this, your response resulted in the following exception(s): {exceptions}."
+        logger.info(f"prompt=\n{prompt}")
+        history_image = draw_coordinates_and_arrows(image.copy(), all_coords, color)
+        history_image.show()
+        current_image = draw_coordinates_and_arrows(image.copy(), [all_coords[-1]], color)
+        response = DRIVER.prompt(
+            prompt=prompt,
+            system_prompt="You are an expert GUI interpreter. You are precise and discerning, and you strive for accuracy. You do not make the same mistake twice.",
+            images=[image, history_image, current_image],
+        )
+        try:
+            directions = parse_code_snippet(response)
+            # Validate the whole reply here so a missing key or bad magnitude is retried, not fatal.
+            reported_target = directions.pop("target")
+            placement = directions.pop("placement")
+            new_coords = update_coords(dict(coords), directions, width, height)
+        except Exception as exc:
+            exceptions.append(exc)
+            all_coords = all_coords[:-1]
+            continue
+        exceptions = []
+
+        placement_history.append(placement)
+        # Log the model's echo of the target without overwriting the real one used in the prompt.
+        logger.info(f"target={reported_target!r} {placement=}")
+        iteration += 1
+        if new_coords == coords:
+            break
+        coords = new_coords
+
+        if HISTORY_SIZE:
+            all_coords = all_coords[-HISTORY_SIZE:]
+
+    logger.info(f"placement_history=\n{pformat(placement_history)}")
+
+if __name__ == "__main__":
+    main()
+

From a68cb2f9861aae6ee25ae0701f685d68e85452c1 Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Fri, 26 Jul 2024 10:51:48 -0400
Subject: [PATCH 03/10] reduce step sizes

---
 experiments/cursor_joystick_history.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/experiments/cursor_joystick_history.py b/experiments/cursor_joystick_history.py
index ca9142531..95f1066b7 100644
--- a/experiments/cursor_joystick_history.py
+++ b/experiments/cursor_joystick_history.py
@@ -109,8 +109,8 @@ def update_coords(coords: Dict[str, int], directions: Dict[str, str], width: int
     for direction, magnitude in directions.items():
         step_size = {
             'large': 0.25,
-            'medium': 0.125,
-            'small': 0.0625
+            'medium': 0.10,
+            'small': 0.01
         }[magnitude]
         
         if direction == 'left':
@@ -133,9 +133,13 @@ def main():
     all_coords = []
     exceptions = []
     placement_history = []
-    target = "The center of Cell E5"
+    target = "The center of Cell G1"
     iteration = 1
 
+
+    # TODO: warn off screen
+    # TODO: zoom, other primitives?
+
     color = maximally_different_color(image)
     logger.info(f"{color=}")
     

From 48c12cbddb06edf8c19a5da3f14e5d2e14cb137d Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Sun, 28 Jul 2024 12:20:35 -0400
Subject: [PATCH 04/10] add cursor experiments

---
 experiments/{cursor.py => cursor/coords.py}   |   0
 experiments/cursor/direction.py               | 276 ++++++++++++++++++
 .../joystick.py}                              |   0
 .../joystick_history.py}                      |   0
 experiments/cursor/quadrant.py                |  92 ++++++
 .../{cursor_search.py => cursor/search.py}    |   0
 6 files changed, 368 insertions(+)
 rename experiments/{cursor.py => cursor/coords.py} (100%)
 create mode 100644 experiments/cursor/direction.py
 rename experiments/{cursor_joystick.py => cursor/joystick.py} (100%)
 rename experiments/{cursor_joystick_history.py => cursor/joystick_history.py} (100%)
 create mode 100644 experiments/cursor/quadrant.py
 rename experiments/{cursor_search.py => cursor/search.py} (100%)

diff --git a/experiments/cursor.py b/experiments/cursor/coords.py
similarity index 100%
rename from experiments/cursor.py
rename to experiments/cursor/coords.py
diff --git a/experiments/cursor/direction.py b/experiments/cursor/direction.py
new file mode 100644
index 000000000..3aa1bc477
--- /dev/null
+++ b/experiments/cursor/direction.py
@@ -0,0 +1,276 @@
+from pprint import pformat
+from typing import Tuple, List, Dict
+import json
+import os
+
+from loguru import logger
+from PIL import Image, ImageDraw
+import matplotlib.pyplot as plt
+
+from openadapt.config import config
+from openadapt.drivers import openai, anthropic, google
+from openadapt.utils import parse_code_snippet
+
+DRIVER = anthropic
+HISTORY_SIZE = 1
+
+import numpy as np
+from scipy.spatial.distance import cdist
+from sklearn.cluster import KMeans
+
+def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]:
+    """Pick the RGB color whose summed distance to the image's dominant colors is largest.
+
+    Args:
+        image: The PIL image object.
+        sample_size: The number of pixels sampled (deterministically) when the image has more than this many.
+        n_clusters: The number of KMeans clusters; clamped to the number of sampled pixels.
+
+    Returns:
+        An (r, g, b) tuple of plain Python ints taken from a grid with step 8 per channel.
+    """
+    img = image.convert('RGB')
+    np_img = np.array(img).reshape(-1, 3)
+    
+    if len(np_img) > sample_size:
+        rng = np.random.default_rng(42)  # Local generator: reproducible without reseeding the global RNG
+        rng.shuffle(np_img)
+        sampled_colors = np_img[:sample_size]
+    else:
+        sampled_colors = np_img
+
+    kmeans = KMeans(n_clusters=min(n_clusters, len(sampled_colors)), random_state=42)  # KMeans needs n_samples >= n_clusters
+    kmeans.fit(sampled_colors)
+    cluster_centers = kmeans.cluster_centers_
+
+    all_colors = np.array([[r, g, b] for r in range(0, 256, 8) for g in range(0, 256, 8) for b in range(0, 256, 8)])
+    distances = cdist(all_colors, cluster_centers, metric='euclidean')
+    sum_distances = np.sum(distances, axis=1)
+    max_dist_index = np.argmax(sum_distances)
+
+    return tuple(all_colors[max_dist_index].tolist())  # tolist() yields Python ints, so f-strings print (r, g, b)
+
+def draw_coordinates_and_arrows(
+    image: Image.Image,
+    all_coords: List[Dict[str, int]],
+    inner_color: str,
+    border_color: str,
+    draw_x: bool = True,
+    bg_color: tuple = (0, 0, 0),
+    bg_transparency: float = 0.25,
+) -> Image.Image:
+    """Dim the image, then draw every cursor dot with arrows linking successive dots.
+
+    Args:
+        image: The PIL image object; it is converted internally, so a new RGB image is returned.
+        all_coords: List of dictionaries with 'x' and 'y' pixel coordinates, in the order they were visited.
+        inner_color: Fill of each dot. border_color: Fill of the dot rim, the arrow lines and the arrowheads.
+        draw_x: If True, draw a big bold black "X" on the last coordinate.
+        bg_color, bg_transparency: RGB tuple and opacity fraction (scaled to 0-255) of the dimming overlay.
+    Returns:
+        The image with drawn coordinates, arrows, and optionally a big bold "X" on the last coordinate.
+    """
+    width, height = image.size
+    min_dimension = min(width, height)
+    
+    # Define sizes relative to the shorter image side so markers scale with the screenshot
+    dot_radius = int(min_dimension * 0.014)  # Approximately 25 in a 1742 height image
+    border_radius = int(min_dimension * 0.017)  # Slightly larger than dot_radius
+    arrow_head_size = int(min_dimension * 0.029)  # Approximately 50 in a 1742 height image
+    line_width = max(1, int(min_dimension * 0.0023))  # Approximately 4 in a 1742 height image
+    x_line_width = max(1, int(min_dimension * 0.0034))  # Approximately 6 in a 1742 height image
+
+    image = image.convert("RGBA")
+    bg_opacity = int(255 * bg_transparency)  # alpha channel value of the overlay
+    overlay = Image.new("RGBA", image.size, bg_color + (bg_opacity,))
+    draw = ImageDraw.Draw(overlay)  # NOTE(review): never drawn on; rebound to the composited image below
+    image = Image.alpha_composite(image, overlay)
+    image = image.convert("RGB")
+
+    draw = ImageDraw.Draw(image)
+    
+    for i in range(len(all_coords) - 1):
+        x1, y1 = all_coords[i]['x'], all_coords[i]['y']
+        x2, y2 = all_coords[i+1]['x'], all_coords[i+1]['y']
+        
+        draw.ellipse((x1 - border_radius, y1 - border_radius, x1 + border_radius, y1 + border_radius), fill=border_color)
+        draw.ellipse((x1 - dot_radius, y1 - dot_radius, x1 + dot_radius, y1 + dot_radius), fill=inner_color)
+        draw.line((x1, y1, x2, y2), fill=border_color, width=line_width)
+        
+        # Pull the arrow tip back by dot_radius along the segment so it lands on the next dot's edge
+        angle = np.arctan2(y2 - y1, x2 - x1)
+        x2_adjusted = x2 - int(dot_radius * np.cos(angle))
+        y2_adjusted = y2 - int(dot_radius * np.sin(angle))
+        
+        # Draw arrowhead: a triangle whose two back corners sit 30 degrees (pi / 6) either side of the segment
+        draw.polygon([
+            (x2_adjusted, y2_adjusted), 
+            (x2_adjusted - arrow_head_size * np.cos(angle - np.pi / 6), y2_adjusted - arrow_head_size * np.sin(angle - np.pi / 6)), 
+            (x2_adjusted - arrow_head_size * np.cos(angle + np.pi / 6), y2_adjusted - arrow_head_size * np.sin(angle + np.pi / 6))
+        ], fill=border_color)
+    
+    if all_coords:
+        x, y = all_coords[-1]['x'], all_coords[-1]['y']
+        draw.ellipse((x - border_radius, y - border_radius, x + border_radius, y + border_radius), fill=border_color)
+        draw.ellipse((x - dot_radius, y - dot_radius, x + dot_radius, y + dot_radius), fill=inner_color)
+        
+        # Draw a big bold "X" on the last coordinate if draw_x is True
+        if draw_x:
+            x_size = border_radius * 2  # Half-extent of the X, i.e. twice the rim radius
+            
+            # Draw the X as two crossing diagonals
+            draw.line((x - x_size, y - x_size, x + x_size, y + x_size), fill="black", width=x_line_width)
+            draw.line((x - x_size, y + x_size, x + x_size, y - x_size), fill="black", width=x_line_width)
+
+    return image
+
+import random
+from typing import Dict, Tuple
+
+def update_coords(coords: Dict[str, int], direction: str, magnitude: str, width: int, height: int, previous_step_size: float) -> Tuple[Dict[str, int], float]:
+    """Move coords one jittered step in `direction`, rescaling the previous step by `magnitude`; return (coords, new step fraction)."""
+    if magnitude == 'more':
+        new_step_size = min(previous_step_size * 2, 0.25)  # Double, capped at 0.25
+    elif magnitude == 'less':
+        new_step_size = max(previous_step_size / 2, 0.01)  # Halve, floored at 0.01
+    else:  # 'same' (any other value is also treated as no change)
+        new_step_size = previous_step_size
+
+    step = int(min(width, height) * new_step_size)  # Step fraction is relative to the shorter image side
+
+    # Add jitter, drawn independently per axis
+    jitter_range = max(1, int(step * 0.1))  # 10% of step size, minimum 1 pixel
+    jitter_x = random.randint(-jitter_range, jitter_range)
+    jitter_y = random.randint(-jitter_range, jitter_range)
+
+    direction_map = {
+        'left': (-step, 0),
+        'right': (step, 0),
+        'up': (0, -step),
+        'down': (0, step),
+        'up-left': (-step, -step),
+        'up-right': (step, -step),
+        'down-right': (step, step),
+        'down-left': (-step, step),
+    }
+    dx, dy = direction_map.get(direction, (0, 0))  # Unrecognized directions move by jitter only
+    
+    # Apply movement with jitter (this mutates the dict passed in by the caller)
+    coords['x'] += dx + jitter_x
+    coords['y'] += dy + jitter_y
+
+    # Ensure coordinates stay within image boundaries
+    coords['x'] = max(0, min(coords['x'], width - 1))
+    coords['y'] = max(0, min(coords['y'], height - 1))
+
+    return coords, new_step_size
+
+def load_and_downsample_image(image_file_path) -> Image.Image:
+    # Load the image at image_file_path as RGB and return a copy shrunk to half size
+    image = Image.open(image_file_path).convert("RGB")
+    
+    # Get the original dimensions
+    original_width, original_height = image.size
+    
+    # Calculate new dimensions (half of the original, rounded down)
+    new_width = original_width // 2
+    new_height = original_height // 2
+    
+    # Resize the image to half its original size using the LANCZOS resampling filter
+    downsampled_image = image.resize((new_width, new_height), Image.LANCZOS)
+    
+    return downsampled_image
+
+def main():
+    """Repeatedly ask the model which way to nudge a cursor toward `target` on the Excel test screenshot."""
+    image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png")
+    image = load_and_downsample_image(image_file_path)
+
+    width, height = image.size
+    coords = {'x': width // 2, 'y': height // 2}
+    all_coords = []
+    exceptions = []
+    all_directions = []
+    placement_history = []
+    target = "Inside cell C8"
+    iteration = 1
+    previous_step_size = 0.10  # Start with a medium step size
+
+    border_color = maximally_different_color(image)
+    inner_color = (255, 0, 0)
+    logger.info(f"{inner_color=} {border_color=}")
+    
+    try:
+        while True:
+            all_coords.append(dict(coords))
+
+            prompt = f"""
+Attached are three images: the first ('raw') is an unadulterated
+screenshot, the second ('history') shows previous cursor locations overlaid
+on the (dimmed) screenshot, separated by arrows, and the third ('current') shows only the latest cursor. Cursors are circles with color 
+{inner_color} surrounded by {border_color}. The latest cursor location is 
+indicated by an 'X'. Be careful not to get confused with background GUI elements and the overlaid cursors.
+Your task is to identify the direction and magnitude to move the current cursor towards the 
+target, which is '{target}'.
+Respond with a single Python dict of the form:
+    {{
+        'target': '<specified target>',
+        'placement': '<step by step reasoning for identifying the current location of the cursor>',
+        'plan': '<natural language description of your plan for moving the current placement to the target>',
+        'direction': 'left' | 'right' | 'up' | 'down' | 'up-left' | 'up-right' | 'down-left' | 'down-right',
+        'magnitude': 'more' | 'less' | 'same'
+    }}
+The 'direction' specifies the direction we need to move the cursor to get it to the target.
+The magnitude you specify should be relative to the previous movement, regardless of direction.
+If the current cursor is already at the target, do not specify any direction or magnitude (but still specify the placement).
+Make sure to surround your code with triple backticks: ```
+"""
+            if exceptions:
+                prompt += f"Last time you tried this, your response resulted in the following exception(s): {exceptions}."
+            logger.info(f"prompt=\n{prompt}")
+            history_image = draw_coordinates_and_arrows(image.copy(), all_coords[-(HISTORY_SIZE + 1):], inner_color, border_color)
+            history_image.show()
+            current_image = draw_coordinates_and_arrows(image.copy(), [all_coords[-1]], inner_color, border_color)
+            response = DRIVER.prompt(
+                prompt=prompt,
+                system_prompt="You are an expert GUI interpreter. You are precise and accurate.",
+                images=[image, history_image, current_image],
+            )
+            try:
+                directions = parse_code_snippet(response)
+            except Exception as exc:
+                exceptions.append(exc)
+                all_coords = all_coords[:-1]  # Drop the point just recorded; the retry appends it again
+                continue
+            else:
+                exceptions = []
+            
+            all_directions.append(directions.copy())
+            directions.pop("target", None)  # Echo only: keep our own target so a paraphrased reply cannot change the goal
+            placement = directions.pop("placement", None)  # Tolerate a missing key instead of aborting the run
+            direction = directions.pop("direction", None)
+            magnitude = directions.pop("magnitude", None)
+            plan = directions.pop("plan", None)
+            placement_history.append(placement)
+            logger.info(f"{target=} {placement=} {plan=} {direction=} {magnitude=}")
+            
+            if direction and magnitude:
+                coords, previous_step_size = update_coords(coords, direction, magnitude, width, height, previous_step_size)
+            iteration += 1
+            
+            if all_coords and all_coords[-1] == coords:  # No movement this round: treat the cursor as on target
+                break
+            
+            #if HISTORY_SIZE:
+            #    all_coords = all_coords[-HISTORY_SIZE:]
+    except Exception as exc:
+        # Best-effort experiment: log the failure and still render whatever history was collected
+        logger.exception(exc)
+
+    full_history_image = draw_coordinates_and_arrows(image.copy(), all_coords, inner_color, border_color)
+    full_history_image.show()
+    logger.info(f"placement_history=\n{pformat(placement_history)}")
+    logger.info(f"all_directions=\n{pformat(all_directions)}")
+
+if __name__ == "__main__":
+    main()
diff --git a/experiments/cursor_joystick.py b/experiments/cursor/joystick.py
similarity index 100%
rename from experiments/cursor_joystick.py
rename to experiments/cursor/joystick.py
diff --git a/experiments/cursor_joystick_history.py b/experiments/cursor/joystick_history.py
similarity index 100%
rename from experiments/cursor_joystick_history.py
rename to experiments/cursor/joystick_history.py
diff --git a/experiments/cursor/quadrant.py b/experiments/cursor/quadrant.py
new file mode 100644
index 000000000..42fd1effc
--- /dev/null
+++ b/experiments/cursor/quadrant.py
@@ -0,0 +1,92 @@
+# XXX this won't work because context is lost
+
+import os
+from collections import deque
+
+from loguru import logger
+from PIL import Image
+
+from openadapt.config import config
+from openadapt.drivers import openai, anthropic, google
+from openadapt.utils import parse_code_snippet
+
+DRIVER = openai  # anthropic
+
+def prompt_model(driver, prompt, image):  # Forward one prompt and one image to driver.prompt with a fixed system prompt
+    return driver.prompt(
+        prompt=prompt,
+        system_prompt="You are an expert GUI interpreter. You are precise and discerning, and you strive for accuracy. You do not make the same mistake twice.",
+        images=[image]  # driver.prompt takes a list of images; exactly one is sent here
+    )
+
+def get_quadrant(driver, image, target_element):  # Ask the model where target_element sits in image; return its 'instruction' value
+    prompt = f"The target element is {target_element}. In which quadrant of the image is the target element located: top-left, top-right, bottom-left, or bottom-right? You can also say 'stop' when the target element is in the center of the image. You may reason in natural language, but you should include exactly one code block containing a python dict to structure your final response. The dict should have a single key: 'instruction', whose value is either 'top-left', 'top-right', 'bottom-left', 'bottom-right', 'stop', or 'unknown'. If you don't see the target element, say 'unknown'. DO NOT MAKE ANY ASSUMPTIONS, IF YOU DON'T SEE THE ELEMENT THEN SAY UNKNOWN!!! IF THE TARGET ELEMENT IS IN THE CENTER OF THE IMAGE, SAY STOP!!!"
+    response = prompt_model(driver, prompt, image)
+    response_dict = parse_code_snippet(response)  # presumably extracts the dict from the fenced code block; verify in openadapt.utils
+    quadrant = response_dict["instruction"]  # raises KeyError if the reply lacks the 'instruction' key
+    return quadrant
+
+def crop_image(image, quadrant):  # Return the named quadrant of image as a new cropped image
+    width, height = image.size
+    if quadrant == "top-left":  # crop boxes below are (left, upper, right, lower), split at the integer midpoints
+        return image.crop((0, 0, width // 2, height // 2))
+    elif quadrant == "top-right":
+        return image.crop((width // 2, 0, width, height // 2))
+    elif quadrant == "bottom-left":
+        return image.crop((0, height // 2, width // 2, height))
+    elif quadrant == "bottom-right":
+        return image.crop((width // 2, height // 2, width, height))
+    else:  # any other value (e.g. 'stop' or 'unknown') leaves the image uncropped
+        return image
+
+def locate_element(driver, image, target_element):  # Zoom into image quadrant by quadrant, backtracking on 'unknown'
+    image_history = deque([(image, ["top-left", "top-right", "bottom-left", "bottom-right"])])  # stack of (image, untried quadrants)
+    iterations = 0  # counts successful descents only; 'unknown' and backtracking steps are not counted
+    max_iterations = 20  # Increased to allow for backtracking
+
+    while iterations < max_iterations and image_history:
+        current_image, available_quadrants = image_history[-1]
+        current_image.show()  # Display the current image
+
+        quadrant = get_quadrant(driver, current_image, target_element)
+        logger.info(f"{quadrant=}")
+        input()  # Blocks until Enter is pressed, so each step can be inspected manually
+        
+        if quadrant == "stop":
+            logger.info(f"Element located after {iterations} iterations.")
+            return current_image
+
+        if quadrant == "unknown":
+            logger.info("Unknown response, discarding current image and trying next quadrant from previous image")
+            image_history.pop()  # Discard the current image
+            if not image_history:
+                logger.warning("No more images in history")
+                break
+            continue  # Go back to the start of the loop with the previous image
+
+        if quadrant not in available_quadrants:
+            logger.warning(f"Unexpected quadrant {quadrant}, trying next available quadrant")
+            if not available_quadrants:
+                logger.info("No more quadrants to try, backtracking...")
+                image_history.pop()
+                continue
+            quadrant = available_quadrants[0]  # Fall back to the first quadrant not yet tried at this level
+
+        available_quadrants.remove(quadrant)  # Mutates the list stored in image_history, so this quadrant is not retried
+        new_image = crop_image(current_image, quadrant)
+        image_history.append((new_image, ["top-left", "top-right", "bottom-left", "bottom-right"]))
+        iterations += 1
+
+    logger.warning("Max iterations reached or no more images to process")
+    return image_history[-1][0] if image_history else image  # Return the last processed image or the original if all discarded
+
+def main():  # Run the quadrant search for "Cell A1" on the Excel test screenshot and show the result
+    image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png")
+    image = Image.open(image_file_path)
+    target_element = "Cell A1"
+    
+    result_image = locate_element(DRIVER, image, target_element)  # DRIVER is the module-level model driver
+    result_image.show()  # Display the final result
+
+if __name__ == "__main__":
+    main()
diff --git a/experiments/cursor_search.py b/experiments/cursor/search.py
similarity index 100%
rename from experiments/cursor_search.py
rename to experiments/cursor/search.py

From 57d8faffc7f15b88529011fc1a04154bb2d57db7 Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Sun, 28 Jul 2024 15:21:31 -0400
Subject: [PATCH 05/10] add experiments/cursor/sample.py

---
 experiments/cursor/sample.py | 246 +++++++++++++++++++++++++++++++++++
 1 file changed, 246 insertions(+)
 create mode 100644 experiments/cursor/sample.py

diff --git a/experiments/cursor/sample.py b/experiments/cursor/sample.py
new file mode 100644
index 000000000..5578ce129
--- /dev/null
+++ b/experiments/cursor/sample.py
@@ -0,0 +1,246 @@
+import random
+from typing import List, Dict, Tuple
+import json
+from PIL import Image, ImageDraw, ImageFont, ImageEnhance
+from loguru import logger
+import os
+import numpy as np
+from scipy.spatial.distance import cdist
+from sklearn.cluster import KMeans
+
+from openadapt.config import config
+from openadapt.drivers import anthropic, openai, google
+from openadapt.utils import parse_code_snippet
+
+DRIVER = anthropic
+NUM_CURSORS = 2**2 # This can now be easily changed
+SPREAD_REDUCTION_FACTOR = 0.5  # How much to reduce spread each iteration
+MAX_ITERATIONS = 5  # Maximum number of iterations
+
+
+def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]:
+    """Return the RGB grid color (step 8 per channel) whose summed distance to the image's KMeans color centers is largest."""
+    img = image.convert('RGB')
+    np_img = np.array(img).reshape(-1, 3)  # One row per pixel: (r, g, b)
+    
+    if len(np_img) > sample_size:
+        np.random.seed(42)  # For reproducibility (note: this reseeds NumPy's global RNG)
+        np.random.shuffle(np_img)
+        sampled_colors = np_img[:sample_size]
+    else:
+        sampled_colors = np_img
+
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+    kmeans.fit(sampled_colors)
+    cluster_centers = kmeans.cluster_centers_  # The dominant colors of the sampled pixels
+
+    all_colors = np.array([[r, g, b] for r in range(0, 256, 8) for g in range(0, 256, 8) for b in range(0, 256, 8)])
+    distances = cdist(all_colors, cluster_centers, metric='euclidean')  # One row per candidate, one column per center
+    sum_distances = np.sum(distances, axis=1)
+    max_dist_index = np.argmax(sum_distances)
+
+    return tuple(all_colors[max_dist_index].astype(int))
+
+def generate_cursors(center: Dict[str, int], spread: float, width: int, height: int) -> List[Dict[str, int]]:
+    cursors = []
+    
+    # Calculate grid dimensions
+    grid_size = int(np.sqrt(NUM_CURSORS))  # Truncates, so grid_size**2 cursors are produced (all NUM_CURSORS only for perfect squares)
+    
+    # Calculate the spacing between neighboring cursors: the spread fraction of each image side, split into grid_size cells
+    cell_width = (width * spread) / grid_size
+    cell_height = (height * spread) / grid_size
+    
+    # Calculate the position of the first (top-left) cursor so that the grid of cursors is centered on `center`
+    start_x = center['x'] - (cell_width * (grid_size - 1)) / 2
+    start_y = center['y'] - (cell_height * (grid_size - 1)) / 2
+    
+    for i in range(grid_size):  # i steps along x and j along y, so cursors are emitted column by column
+        for j in range(grid_size):
+            x = int(start_x + i * cell_width)
+            y = int(start_y + j * cell_height)
+            
+            # Ensure cursors are within image bounds
+            x = max(0, min(width - 1, x))
+            y = max(0, min(height - 1, y))
+            
+            cursors.append({'x': x, 'y': y})
+    
+    return cursors
+def draw_labelled_cursors(
+    image: Image.Image,
+    cursors: List[Dict[str, int]],
+    inner_color: Tuple[int, int, int],
+    border_color: Tuple[int, int, int],
+    label_color: Tuple[int, int, int],
+    bg_color: tuple = (0, 0, 0),
+    bg_transparency: float = 0.25,
+    labels: List[str] = None,
+) -> Image.Image:
+
+    image = image.convert("RGBA")
+    bg_opacity = int(255 * bg_transparency)
+    overlay = Image.new("RGBA", image.size, bg_color + (bg_opacity,))
+    draw = ImageDraw.Draw(overlay)  # NOTE(review): never drawn on; rebound to the composited image below
+    image = Image.alpha_composite(image, overlay)  # Dims the screenshot so the cursors stand out
+    image = image.convert("RGB")
+
+    draw = ImageDraw.Draw(image)
+    font = ImageFont.truetype("Arial.ttf", 40)  # Slightly smaller font to accommodate two-digit numbers
+    # NOTE(review): draw.textsize (used below) was removed in Pillow 10; draw.textbbox is the replacement.
+    width, height = image.size
+    min_dimension = min(width, height)
+    
+    for i, coords in enumerate(cursors):
+        x, y = coords['x'], coords['y']
+        label = labels[i] if labels else str(i + 1)
+        
+        # Draw cursor
+        border_radius = int(.02 * min_dimension)
+        dot_radius = int(.015 * min_dimension)
+        draw.ellipse((x - border_radius, y - border_radius, x + border_radius, y + border_radius), fill=border_color)
+        draw.ellipse((x - dot_radius, y - dot_radius, x + dot_radius, y + dot_radius), fill=inner_color)
+        
+        # Draw label
+        label_w, label_h = draw.textsize(label, font=font)
+        draw.text((x - label_w / 2, y - label_h / 2), label, fill=label_color, font=font)
+    
+    return image
+
+def load_and_downsample_image(image_file_path):
+    # Open the image and convert to RGB
+    image = Image.open(image_file_path).convert("RGB")
+    
+    # Get the original dimensions in pixels
+    original_width, original_height = image.size
+    
+    # Calculate new dimensions (half of the original)
+    new_width = original_width // 2
+    new_height = original_height // 2
+    
+    # Resize the image to half its original size
+    downsampled_image = image.resize((new_width, new_height), Image.LANCZOS)
+    
+    return downsampled_image  # A new RGB image; each side is half the original, rounded down
+
+def increase_contrast(image: Image.Image, contrast_factor: float) -> Image.Image:
+    """
+    Scales the contrast of a PIL image by contrast_factor and returns the result as a new image.
+
+    Args:
+        image (Image.Image): The input PIL image.
+        contrast_factor (float): Factor by which to scale the contrast. 1.0 means no change,
+                                 less than 1.0 decreases contrast, greater than 1.0 increases contrast.
+
+    Returns:
+        Image.Image: The enhanced PIL image.
+    """
+    # Create an ImageEnhance object for contrast enhancement
+    enhancer = ImageEnhance.Contrast(image)
+    
+    # Apply the contrast enhancement with the requested factor
+    enhanced_image = enhancer.enhance(contrast_factor)
+    
+    return enhanced_image
+
+def main():
+    image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png")
+    #image = Image.open(image_file_path).convert("RGB")
+    image = load_and_downsample_image(image_file_path)
+    #image = increase_contrast(image, 2)
+    width, height = image.size
+
+    inner_color = (255, 0, 0)
+    border_color = maximally_different_color(image)
+    label_color = (255, 255, 255)
+    
+    target = "Inside cell C8"  # almost
+    target = "Inside cell G5"  # almost
+    target = "Inside cell A1"  # almost
+    target = "Cell B12"
+    target = "Cell E7"
+    target = "Save button"
+    target = "Cell C3"
+    center = {'x': width // 2, 'y': height // 2}
+    spread = 1.0
+    iteration = 1
+
+    identified_locations = []
+    exceptions = []
+
+    while iteration <= MAX_ITERATIONS:
+        cursors = generate_cursors(center, spread, width, height)
+        cursor_image = draw_labelled_cursors(image.copy(), cursors, inner_color, border_color, label_color)
+        cursor_image.show()
+
+        prompt = f"""
+Attached is 1. a raw screenshot, and 2. the same screenshot a) dimmed and b) with {NUM_CURSORS} cursors overlaid.
+Each cursor is a circle with color {inner_color} surrounded by {border_color}, labelled with a number from 1 to {NUM_CURSORS} with color {label_color}.
+
+Your task is to identify the cursor closest to the target: '{target}'.
+
+Respond with a single Python dict of the form:
+    {{
+        'target': '<target>',
+        'cursors': '<natural language description of the cursor positions relative to the other elements in the image>',
+        'review': "<look at the image CAREFULLY. you made a mistake. that's ok, but you need to correct it.>",
+        'analysis': '<look at the image again! are you sure?? i will pay you money if you fix your mistake.>',
+        'closest': '<number of cursor closest to target>',
+    }}
+
+Make sure to surround your code with triple backticks: ```
+"""
+        if exceptions:  # Feed the previous failure(s) back so the model can correct its reply
+            prompt += f"""
+Previously when you responded to this prompt, this resulted in the following exceptions:
+{exceptions}
+"""
+        logger.info(f"prompt=\n{prompt}")
+        
+        response = DRIVER.prompt(
+            prompt=prompt,
+            system_prompt="You are an expert GUI interpreter. You are precise and accurate.",
+            images=[image, cursor_image],
+        )
+
+        try:
+            result = parse_code_snippet(response)
+            closest_number = int(result['closest'])
+            if not 1 <= closest_number <= len(cursors):
+                raise ValueError(f"'closest' must be between 1 and {len(cursors)}, got {closest_number}")
+        except Exception as exc:
+            # Any malformed reply (unparseable, or missing/non-numeric/out-of-range 'closest') is retried with feedback
+            logger.exception(exc)
+            exceptions.append(exc)
+            continue
+        else:
+            exceptions = []
+
+        logger.info(f"Iteration {iteration}: {result}")
+
+        # Recenter on the chosen cursor and tighten the grid for the next round
+        identified_location = cursors[closest_number - 1]  # Adjust for 0-based indexing
+        identified_locations.append(identified_location)
+        center = identified_location
+        spread *= SPREAD_REDUCTION_FACTOR
+
+        iteration += 1
+
+    # Create final image with all identified locations
+    final_image = image.copy()
+    labels = [str(i) for i in range(1, len(identified_locations) + 1)]
+    final_image_with_cursors = draw_labelled_cursors(
+        final_image, 
+        identified_locations, 
+        inner_color, 
+        border_color, 
+        label_color, 
+        labels=labels
+    )
+    final_image_with_cursors.show()
+    final_image_with_cursors.save("final_cursor_locations.png")
+
+    logger.info(f"Final identified locations: {identified_locations}")
+
+if __name__ == "__main__":
+    main()

From 9193ba7b143876867d6ecb1496f03af089f61c72 Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Sun, 28 Jul 2024 22:59:16 -0400
Subject: [PATCH 06/10] working!

---
 experiments/cursor/sample.py | 152 +++++++++++++++++++++++------------
 1 file changed, 100 insertions(+), 52 deletions(-)

diff --git a/experiments/cursor/sample.py b/experiments/cursor/sample.py
index 5578ce129..0a93e849f 100644
--- a/experiments/cursor/sample.py
+++ b/experiments/cursor/sample.py
@@ -1,3 +1,4 @@
+from collections import Counter
 import random
 from typing import List, Dict, Tuple
 import json
@@ -13,9 +14,13 @@
 from openadapt.utils import parse_code_snippet
 
 DRIVER = anthropic
-NUM_CURSORS = 2**2 # This can now be easily changed
+NUM_CURSORS = 2**2
 SPREAD_REDUCTION_FACTOR = 0.5  # How much to reduce spread each iteration
-MAX_ITERATIONS = 5  # Maximum number of iterations
+MAX_ITERATIONS = 4  # Maximum number of iterations
+CONTRAST_FACTOR = 1
+RETRIES_PER_ITERATION = 3
+DOWNSAMPLE_FACTOR = 3
+CONSENSUS_THRESHOLD = 2
 
 
 def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]:
@@ -41,7 +46,19 @@ def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clu
 
     return tuple(all_colors[max_dist_index].astype(int))
 
-def generate_cursors(center: Dict[str, int], spread: float, width: int, height: int) -> List[Dict[str, int]]:
+def generate_cursors(center: dict[str, int], spread: float, width: int, height: int, jitter: float = 0.1) -> list[dict[str, int]]:
+    """Generates cursors around a center point within a defined spread, with optional jitter.
+
+    Args:
+        center (dict[str, int]): The central point from which cursors are generated.
+        spread (float): The spread factor determining the grid size.
+        width (int): The width of the image.
+        height (int): The height of the image.
+        jitter (float): The jitter factor to add randomness to cursor positions.
+
+    Returns:
+        list[dict[str, int]]: A list of cursor positions.
+    """
     cursors = []
     
     # Calculate grid dimensions
@@ -60,6 +77,10 @@ def generate_cursors(center: Dict[str, int], spread: float, width: int, height:
             x = int(start_x + i * cell_width)
             y = int(start_y + j * cell_height)
             
+            # Apply jitter
+            x += int((random.random() - 0.5) * cell_width * jitter)
+            y += int((random.random() - 0.5) * cell_height * jitter)
+            
             # Ensure cursors are within image bounds
             x = max(0, min(width - 1, x))
             y = max(0, min(height - 1, y))
@@ -67,17 +88,30 @@ def generate_cursors(center: Dict[str, int], spread: float, width: int, height:
             cursors.append({'x': x, 'y': y})
     
     return cursors
+
 def draw_labelled_cursors(
     image: Image.Image,
     cursors: List[Dict[str, int]],
     inner_color: Tuple[int, int, int],
-    border_color: Tuple[int, int, int],
     label_color: Tuple[int, int, int],
-    bg_color: tuple = (0, 0, 0),
+    bg_color: Tuple[int, int, int] = (0, 0, 0),
     bg_transparency: float = 0.25,
     labels: List[str] = None,
 ) -> Image.Image:
+    """Draws labelled cursors on the image.
+    
+    Args:
+        image: The input image on which cursors are to be drawn.
+        cursors: A list of dictionaries containing cursor coordinates.
+        inner_color: The color of the inner part of the cursor.
+        label_color: The color of the label text.
+        bg_color: Background color for transparency overlay.
+        bg_transparency: Transparency level for the background overlay.
+        labels: List of labels to be drawn with the cursors.
 
+    Returns:
+        Image with labelled cursors.
+    """
     image = image.convert("RGBA")
     bg_opacity = int(255 * bg_transparency)
     overlay = Image.new("RGBA", image.size, bg_color + (bg_opacity,))
@@ -91,23 +125,34 @@ def draw_labelled_cursors(
     width, height = image.size
     min_dimension = min(width, height)
     
+    # Calculate rectangle size based on the largest label
+    max_label = max(labels, key=len) if labels else str(len(cursors))
+    max_label_bbox = draw.textbbox((0, 0), max_label, font=font)
+    max_label_w = max_label_bbox[2] - max_label_bbox[0]
+    max_label_h = max_label_bbox[3] - max_label_bbox[1]
+    rect_width = max_label_w + 20
+    rect_height = max_label_h + 20
+    
     for i, coords in enumerate(cursors):
         x, y = coords['x'], coords['y']
         label = labels[i] if labels else str(i + 1)
         
-        # Draw cursor
-        border_radius = int(.02 * min_dimension)
-        dot_radius = int(.015 * min_dimension)
-        draw.ellipse((x - border_radius, y - border_radius, x + border_radius, y + border_radius), fill=border_color)
-        draw.ellipse((x - dot_radius, y - dot_radius, x + dot_radius, y + dot_radius), fill=inner_color)
+        # Draw rectangle
+        rect_x0 = x - rect_width // 2
+        rect_y0 = y - rect_height // 2
+        rect_x1 = x + rect_width // 2
+        rect_y1 = y + rect_height // 2
+        draw.rectangle((rect_x0, rect_y0, rect_x1, rect_y1), fill=inner_color)
         
         # Draw label
-        label_w, label_h = draw.textsize(label, font=font)
+        label_bbox = draw.textbbox((0, 0), label, font=font)
+        label_w = label_bbox[2] - label_bbox[0]
+        label_h = label_bbox[3] - label_bbox[1]
         draw.text((x - label_w / 2, y - label_h / 2), label, fill=label_color, font=font)
     
     return image
 
-def load_and_downsample_image(image_file_path):
+def load_and_downsample_image(image_file_path, downsample_factor: int):
     # Open the image and convert to RGB
     image = Image.open(image_file_path).convert("RGB")
     
@@ -115,8 +160,8 @@ def load_and_downsample_image(image_file_path):
     original_width, original_height = image.size
     
     # Calculate new dimensions (half of the original)
-    new_width = original_width // 2
-    new_height = original_height // 2
+    new_width = original_width // downsample_factor
+    new_height = original_height // downsample_factor
     
     # Resize the image to half its original size
     downsampled_image = image.resize((new_width, new_height), Image.LANCZOS)
@@ -135,6 +180,9 @@ def increase_contrast(image: Image.Image, contrast_factor: float) -> Image.Image
     Returns:
         Image.Image: The enhanced PIL image.
     """
+    if contrast_factor == 1:
+        return image
+
     # Create an ImageEnhance object for contrast enhancement
     enhancer = ImageEnhance.Contrast(image)
     
@@ -145,13 +193,11 @@ def increase_contrast(image: Image.Image, contrast_factor: float) -> Image.Image
 
 def main():
     image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png")
-    #image = Image.open(image_file_path).convert("RGB")
-    image = load_and_downsample_image(image_file_path)
-    #image = increase_contrast(image, 2)
+    image = load_and_downsample_image(image_file_path, DOWNSAMPLE_FACTOR)
+    image = increase_contrast(image, CONTRAST_FACTOR)
     width, height = image.size
 
-    inner_color = (255, 0, 0)
-    border_color = maximally_different_color(image)
+    inner_color = maximally_different_color(image)
     label_color = (255, 255, 255)
     
     target = "Inside cell C8"  # almost
@@ -161,6 +207,7 @@ def main():
     target = "Cell E7"
     target = "Save button"
     target = "Cell C3"
+    target = "Cell G4"
     center = {'x': width // 2, 'y': height // 2}
     spread = 1.0
     iteration = 1
@@ -169,13 +216,10 @@ def main():
     exceptions = []
 
     while iteration <= MAX_ITERATIONS:
-        cursors = generate_cursors(center, spread, width, height)
-        cursor_image = draw_labelled_cursors(image.copy(), cursors, inner_color, border_color, label_color)
-        cursor_image.show()
 
         prompt = f"""
-Attached is 1. a raw screenshot, and 2. the same screenshot a) dimmed and b) with {NUM_CURSORS} cursors overlaid.
-Each cursor is a circle with color {inner_color} surrounded by {border_color}, labelled with a number from 1 to {NUM_CURSORS} with color {label_color}.
+Attached is a screenshot that has been dimmed and with {NUM_CURSORS} cursors overlaid.
+Each cursor is a rectangle with color {inner_color}, labelled with a number from 1 to {NUM_CURSORS} with color {label_color}.
 
 Your task is to identify the cursor closest to the target: '{target}'.
 
@@ -196,33 +240,38 @@ def main():
 {{exceptions}}
 """
         logger.info(f"prompt=\n{prompt}")
-        
-        response = DRIVER.prompt(
-            prompt=prompt,
-            system_prompt="You are an expert GUI interpreter. You are precise and accurate.",
-            images=[image, cursor_image],
-        )
-
-        try:
-            result = parse_code_snippet(response)
-        except Exception as exc:
-            logger.exception(exc)
-            exceptions.append(exc)
-            continue
-        else:
-            exceptions = []
-
-        logger.info(f"Iteration {iteration}: {result}")
-
-        if 'closest' in result:
-            closest_number = int(result['closest'])
-            identified_location = cursors[closest_number - 1]  # Adjust for 0-based indexing
-            identified_locations.append(identified_location)
-            center = identified_location
-            spread *= SPREAD_REDUCTION_FACTOR
-        else:
-            logger.error("Invalid response from model")
-            continue
+
+        votes = []
+        while True:
+            cursors = generate_cursors(center, spread, width, height)
+            cursor_image = draw_labelled_cursors(image.copy(), cursors, inner_color, label_color)
+            cursor_image.show()
+            response = DRIVER.prompt(
+                prompt=prompt,
+                system_prompt="You are an expert GUI interpreter. You are precise and accurate.",
+                images=[
+                    #image,
+                    cursor_image,
+                ],
+            )
+
+            try:
+                result = parse_code_snippet(response)
+                if 'closest' in result:
+                    votes.append(int(result['closest']))
+            except Exception as exc:
+                logger.exception(exc)
+                exceptions.append(exc)
+
+            most_common = Counter(votes).most_common(1)[0]
+            logger.info(f"{votes=} {most_common=}")
+            if most_common[1] >= CONSENSUS_THRESHOLD:
+                break
+        closest_number = most_common[0]
+        identified_location = cursors[closest_number - 1]  # Adjust for 0-based indexing
+        identified_locations.append(identified_location)
+        center = identified_location
+        spread *= SPREAD_REDUCTION_FACTOR
 
         iteration += 1
 
@@ -233,7 +282,6 @@ def main():
         final_image, 
         identified_locations, 
         inner_color, 
-        border_color, 
         label_color, 
         labels=labels
     )

From d411928f34e5c976d93b846383a3ab7bd9ebee97 Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Mon, 29 Jul 2024 10:33:00 -0400
Subject: [PATCH 07/10] overlap ratio

---
 experiments/cursor/sample.py | 90 ++++++++++++++++++++++++++----------
 1 file changed, 66 insertions(+), 24 deletions(-)

diff --git a/experiments/cursor/sample.py b/experiments/cursor/sample.py
index 0a93e849f..017301abf 100644
--- a/experiments/cursor/sample.py
+++ b/experiments/cursor/sample.py
@@ -13,14 +13,16 @@
 from openadapt.drivers import anthropic, openai, google
 from openadapt.utils import parse_code_snippet
 
-DRIVER = anthropic
+DRIVER = google
 NUM_CURSORS = 2**2
-SPREAD_REDUCTION_FACTOR = 0.5  # How much to reduce spread each iteration
-MAX_ITERATIONS = 4  # Maximum number of iterations
+SPREAD_REDUCTION_FACTOR = 0.5
+MAX_ITERATIONS = None
+MAX_OVERLAP_RATIO = 0.2
 CONTRAST_FACTOR = 1
 RETRIES_PER_ITERATION = 3
 DOWNSAMPLE_FACTOR = 3
 CONSENSUS_THRESHOLD = 2
+LABEL_SIZE_RATIO = 0.04
 
 
 def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clusters: int = 10) -> tuple[int, int, int]:
@@ -46,7 +48,7 @@ def maximally_different_color(image: Image.Image, sample_size: int = 1000, n_clu
 
     return tuple(all_colors[max_dist_index].astype(int))
 
-def generate_cursors(center: dict[str, int], spread: float, width: int, height: int, jitter: float = 0.1) -> list[dict[str, int]]:
+def generate_cursors(center: dict[str, int], spread: float, width: int, height: int, jitter: float = 0.01) -> list[dict[str, int]]:
     """Generates cursors around a center point within a defined spread, with optional jitter.
 
     Args:
@@ -97,8 +99,9 @@ def draw_labelled_cursors(
     bg_color: Tuple[int, int, int] = (0, 0, 0),
     bg_transparency: float = 0.25,
     labels: List[str] = None,
-) -> Image.Image:
-    """Draws labelled cursors on the image.
+    padding: int = 10,
+) -> Tuple[Image.Image, float]:
+    """Draws labelled cursors on the image and computes maximum label overlap ratio.
     
     Args:
         image: The input image on which cursors are to be drawn.
@@ -108,9 +111,10 @@ def draw_labelled_cursors(
         bg_color: Background color for transparency overlay.
         bg_transparency: Transparency level for the background overlay.
         labels: List of labels to be drawn with the cursors.
+        padding: Padding around the labels.
 
     Returns:
-        Image with labelled cursors.
+        Tuple of Image with labelled cursors and the maximum overlap ratio.
     """
     image = image.convert("RGBA")
     bg_opacity = int(255 * bg_transparency)
@@ -120,19 +124,21 @@ def draw_labelled_cursors(
     image = image.convert("RGB")
 
     draw = ImageDraw.Draw(image)
-    font = ImageFont.truetype("Arial.ttf", 40)  # Slightly smaller font to accommodate two-digit numbers
+    min_dimension = min(image.size)
+    font_size = int(min_dimension * LABEL_SIZE_RATIO)
+    font = ImageFont.truetype("Arial.ttf", font_size)
 
     width, height = image.size
-    min_dimension = min(width, height)
-    
+
     # Calculate rectangle size based on the largest label
     max_label = max(labels, key=len) if labels else str(len(cursors))
     max_label_bbox = draw.textbbox((0, 0), max_label, font=font)
     max_label_w = max_label_bbox[2] - max_label_bbox[0]
     max_label_h = max_label_bbox[3] - max_label_bbox[1]
-    rect_width = max_label_w + 20
-    rect_height = max_label_h + 20
-    
+    rect_width = max_label_w + padding
+    rect_height = max_label_h + padding
+
+    rectangles = []
     for i, coords in enumerate(cursors):
         x, y = coords['x'], coords['y']
         label = labels[i] if labels else str(i + 1)
@@ -149,8 +155,25 @@ def draw_labelled_cursors(
         label_w = label_bbox[2] - label_bbox[0]
         label_h = label_bbox[3] - label_bbox[1]
         draw.text((x - label_w / 2, y - label_h / 2), label, fill=label_color, font=font)
-    
-    return image
+
+        rectangles.append((rect_x0, rect_y0, rect_x1, rect_y1))
+
+    # Compute maximum overlap ratio
+    max_overlap_ratio = 0.0
+    for i, rect1 in enumerate(rectangles):
+        for j, rect2 in enumerate(rectangles):
+            if i != j:
+                overlap_width = max(0, min(rect1[2], rect2[2]) - max(rect1[0], rect2[0]))
+                overlap_height = max(0, min(rect1[3], rect2[3]) - max(rect1[1], rect2[1]))
+                overlap_area = overlap_width * overlap_height
+                rect1_area = (rect1[2] - rect1[0]) * (rect1[3] - rect1[1])
+                rect2_area = (rect2[2] - rect2[0]) * (rect2[3] - rect2[1])
+                ratio1 = overlap_area / rect1_area if rect1_area > 0 else 0
+                ratio2 = overlap_area / rect2_area if rect2_area > 0 else 0
+                max_overlap_ratio = max(max_overlap_ratio, ratio1, ratio2)
+
+    return image, max_overlap_ratio
+
 
 def load_and_downsample_image(image_file_path, downsample_factor: int):
     # Open the image and convert to RGB
@@ -208,14 +231,19 @@ def main():
     target = "Save button"
     target = "Cell C3"
     target = "Cell G4"
+    target = "Cell A1"
     center = {'x': width // 2, 'y': height // 2}
     spread = 1.0
     iteration = 1
 
     identified_locations = []
     exceptions = []
+    center_history = []
+    spread_history = []
 
-    while iteration <= MAX_ITERATIONS:
+    while True:
+        if MAX_ITERATIONS and iteration > MAX_ITERATIONS:
+            break
 
         prompt = f"""
 Attached is a screenshot that has been dimmed and with {NUM_CURSORS} cursors overlaid.
@@ -226,12 +254,13 @@ def main():
 Respond with a single Python dict of the form:
     {{
         'target': '<target>',
-        'cursors': '<natural language description of the cursor positions relative to the other elements in the image>',
-        'review': "<look at the image CAREFULLY. you made a mistake. that's ok, but you need to correct it.>",
-        'analysis': '<look at the image again! are you sure?? i will pay you money if you fix your mistake.>',
+        'target_position': '<natural language description of the target position>',
+        'cursor_positions': '<natural language description of the cursor positions relative to the other elements in the image>',
         'closest': '<number of cursor closest to target>',
     }}
 
+Don't make any assumptions about positions of anything. I need you to CAREFULLY ANALYZE THE IMAGE.
+
 Make sure to surround your code with triple backticks: ```
 """
         if exceptions:
@@ -242,9 +271,14 @@ def main():
         logger.info(f"prompt=\n{prompt}")
 
         votes = []
+        center_history.append(center)
+        spread_history.append(spread)
+        max_overlap_ratio = 0
         while True:
             cursors = generate_cursors(center, spread, width, height)
-            cursor_image = draw_labelled_cursors(image.copy(), cursors, inner_color, label_color)
+            cursor_image, overlap_ratio = draw_labelled_cursors(image.copy(), cursors, inner_color, label_color)
+            logger.info(f"{overlap_ratio=}")
+            max_overlap_ratio = max(overlap_ratio, max_overlap_ratio)
             cursor_image.show()
             response = DRIVER.prompt(
                 prompt=prompt,
@@ -257,12 +291,17 @@ def main():
 
             try:
                 result = parse_code_snippet(response)
-                if 'closest' in result:
-                    votes.append(int(result['closest']))
+                closest = result['closest']
             except Exception as exc:
                 logger.exception(exc)
                 exceptions.append(exc)
 
+            if closest is None:
+                # try again with previous level
+                center = center_history.pop()
+                spread = spread_history.pop()
+                continue
+            votes.append(int(closest))
             most_common = Counter(votes).most_common(1)[0]
             logger.info(f"{votes=} {most_common=}")
             if most_common[1] >= CONSENSUS_THRESHOLD:
@@ -273,17 +312,20 @@ def main():
         center = identified_location
         spread *= SPREAD_REDUCTION_FACTOR
 
+        if max_overlap_ratio > MAX_OVERLAP_RATIO:
+            break
+
         iteration += 1
 
     # Create final image with all identified locations
     final_image = image.copy()
     labels = [str(i) for i in range(1, len(identified_locations) + 1)]
-    final_image_with_cursors = draw_labelled_cursors(
+    final_image_with_cursors, _ = draw_labelled_cursors(
         final_image, 
         identified_locations, 
         inner_color, 
         label_color, 
-        labels=labels
+        labels=labels,
     )
     final_image_with_cursors.show()
     final_image_with_cursors.save("final_cursor_locations.png")

From 170e48596bc98b66813cbfdfd46c4157f38b84be Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Tue, 30 Jul 2024 15:24:19 -0400
Subject: [PATCH 08/10] add experiments/cursor/grid.py

---
 experiments/cursor/grid.py | 170 +++++++++++++++++++++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100644 experiments/cursor/grid.py

diff --git a/experiments/cursor/grid.py b/experiments/cursor/grid.py
new file mode 100644
index 000000000..af2fec42e
--- /dev/null
+++ b/experiments/cursor/grid.py
@@ -0,0 +1,170 @@
+from typing import Tuple
+import os
+
+from loguru import logger
+from PIL import Image, ImageDraw, ImageFont, ImageEnhance
+
+from openadapt.config import config
+from openadapt.drivers import anthropic, openai, google
+from openadapt.utils import parse_code_snippet
+
+# Constants
+DOWNSAMPLE_FACTOR = 2
+GRID_SIZE = 25  # Adjust based on desired grid size
+DRIVER = openai
+
+def load_and_downsample_image(image_file_path, downsample_factor: int):
+    # Open the image and convert to RGB
+    image = Image.open(image_file_path).convert("RGB")
+    
+    # Get the original dimensions
+    original_width, original_height = image.size
+    
+    # Calculate new dimensions (half of the original)
+    new_width = original_width // downsample_factor
+    new_height = original_height // downsample_factor
+    
+    # Resize the image to half its original size
+    downsampled_image = image.resize((new_width, new_height), Image.LANCZOS)
+    
+    return downsampled_image
+
+def dim_image(image: Image.Image) -> Image.Image:
+    enhancer = ImageEnhance.Brightness(image)
+    return enhancer.enhance(0.5)  # Dim the image by reducing brightness to 50%
+
+from PIL import Image, ImageDraw, ImageFont
+
+def add_grid_labels(image: Image.Image, grid_size: int) -> Image.Image:
+    width, height = image.size
+    grid_image = Image.new('RGB', (width + 50, height + 50), 'red')
+    grid_image.paste(image, (50, 50))
+    draw = ImageDraw.Draw(grid_image)
+
+    # Calculate the maximum font size that fits within the grid cell
+    max_font_size = min((height // grid_size) // 2, (width // grid_size) // 2)
+    font_size = max_font_size
+    font = ImageFont.truetype("Arial.ttf", font_size)
+
+    # Adjust font size dynamically to fit within grid cells
+    while True:
+        fits = True
+        for i in range(grid_size):
+            row_text_size = draw.textbbox((0, 0), str(i + 1), font=font)[2:]
+            col_text_size = draw.textbbox((0, 0), str(i + 1), font=font)[2:]
+            if row_text_size[1] > height // grid_size or col_text_size[0] > width // grid_size:
+                fits = False
+                break
+        if fits:
+            break
+        font_size -= 1
+        font = ImageFont.truetype("Arial.ttf", font_size)
+        if font_size <= 0:
+            raise ValueError("Cannot find a suitable font size that fits within the grid cells")
+
+    cell_width = width / grid_size
+    cell_height = height / grid_size
+
+    # Add labels for rows and columns with the determined font size
+    for i in range(grid_size):
+        draw.text((25, 50 + int(i * cell_height + cell_height / 2) - font_size // 2), str(i + 1), fill='white', font=font)
+        draw.text((50 + int(i * cell_width + cell_width / 2) - font_size // 2, 25), str(i + 1), fill='white', font=font)
+
+    # Draw semi-transparent grid lines
+    overlay = Image.new('RGBA', grid_image.size, (255, 0, 0, 0))
+    overlay_draw = ImageDraw.Draw(overlay)
+    for i in range(grid_size + 1):
+        # Horizontal lines
+        overlay_draw.line([(50, 50 + int(i * cell_height)), (50 + width, 50 + int(i * cell_height))], fill=(255, 255, 255, 128), width=1)
+        # Vertical lines
+        overlay_draw.line([(50 + int(i * cell_width), 50), (50 + int(i * cell_width), 50 + height)], fill=(255, 255, 255, 128), width=1)
+
+    # Composite the overlay with the labeled image
+    grid_image = Image.alpha_composite(grid_image.convert('RGBA'), overlay)
+
+    return grid_image.convert('RGB')
+
+def get_cell_coordinates(grid_size: int, target_cell: Tuple[int, int], image_size: Tuple[int, int]) -> Tuple[int, int]:
+    row, col = target_cell
+    cell_width = image_size[0] / grid_size
+    cell_height = image_size[1] / grid_size
+    x = int((col - 0.5) * cell_width)
+    y = int((row - 0.5) * cell_height)
+    return x, y
+
+def draw_target_coordinates(image: Image.Image, coordinates: Tuple[int, int]) -> Image.Image:
+    image = image.copy()
+    label_offset = 50  # The offset added to the top and left for labels
+    draw = ImageDraw.Draw(image)
+    adjusted_coordinates = (coordinates[0] + label_offset, coordinates[1] + label_offset)
+    draw.ellipse((adjusted_coordinates[0] - 5, adjusted_coordinates[1] - 5, adjusted_coordinates[0] + 5, adjusted_coordinates[1] + 5), fill='red', outline='red')
+    return image
+
+def main(target: str):
+    # Load and dim the image
+    image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png")
+    image = load_and_downsample_image(image_file_path, DOWNSAMPLE_FACTOR)
+    image = dim_image(image)
+
+    # Add grid labels
+    grid_image = add_grid_labels(image, GRID_SIZE)
+    grid_image.show()
+
+    row, column = None, None
+    grid_image_with_target = None
+    while True:
+        prompt = f"""
+Attached is an image containing a screenshot over which a grid has been overlaid.
+The grid labels are white (255, 255, 255) on a red (255, 0, 0) background. The grid lines
+are semi-transparent.
+Your task is to identify the grid row and column containing the target.
+The target is: "{target}".
+Respond in JSON with the following keys:
+    {{
+        "target": "<the target you are looking for>",
+        "analysis": "<natural language description of the location of the target, adjacent elements, and anything else that is relevant to identifying the correct row and column>",
+        "row": "<grid row number>",
+        "column": "<column row number>",
+    }}
+Make sure not to confuse the grid labels with the content of the screenshot.
+For example, if the screenshot contains a spreadsheet, don't get confused between
+the spreadsheet cell labels and the grid labels.
+Wrap your code in triple backticks: ```
+"""
+        if row and column:
+            prompt += f"""
+Previously, you specified row {row}, column {column}. This cell has been marked
+with a red dot. If you agree with your previous assessment, confirm by specifying
+the same row and column again. Otherwise, please correct your previous assessment.
+"""
+        # Prompt the model to identify the cell
+        response = DRIVER.prompt(
+            prompt=prompt,
+            system_prompt="You are an expert GUI interpreter. You are precise and accurate.",
+            images=[grid_image_with_target or grid_image],
+        )
+        result = parse_code_snippet(response)
+
+        prev_row, prev_column = row, column
+        row = int(result["row"])
+        column = int(result["column"])
+        logger.info(f"{row=} {column=}")
+        coordinates = get_cell_coordinates(GRID_SIZE, (row, column), image.size)
+
+        # Draw the coordinates on the image
+        grid_image_with_target = draw_target_coordinates(grid_image, coordinates)
+
+        # Show the final image
+        grid_image_with_target.show()
+
+        if row == prev_row and column == prev_column:
+            break
+
+if __name__ == "__main__":
+    #main("save button")
+    #main("cell A1")
+    #main("cell containing 'Marketing'")
+    #main("font selector")
+    #main("zoom slider")
+    #main("paste button")
+    main("font size dropdown")

From 048aa525485c875a380b56924193e841718b0f56 Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Tue, 30 Jul 2024 17:02:20 -0400
Subject: [PATCH 09/10] disable cache; undim; DO_CORRECTIONS

---
 experiments/cursor/grid.py   | 120 +++++++++++++++++++----------------
 experiments/cursor/sample.py |   3 +-
 openadapt/cache.py           |   7 +-
 3 files changed, 71 insertions(+), 59 deletions(-)

diff --git a/experiments/cursor/grid.py b/experiments/cursor/grid.py
index af2fec42e..4296de078 100644
--- a/experiments/cursor/grid.py
+++ b/experiments/cursor/grid.py
@@ -1,4 +1,4 @@
-from typing import Tuple
+from typing import Tuple, List
 import os
 
 from loguru import logger
@@ -12,41 +12,33 @@
 DOWNSAMPLE_FACTOR = 2
 GRID_SIZE = 25  # Adjust based on desired grid size
 DRIVER = openai
+DO_CORRECTIONS = False
 
-def load_and_downsample_image(image_file_path, downsample_factor: int):
-    # Open the image and convert to RGB
+def load_and_downsample_image(image_file_path: str, downsample_factor: int) -> Image.Image:
+    """Load and downsample the image."""
     image = Image.open(image_file_path).convert("RGB")
-    
-    # Get the original dimensions
     original_width, original_height = image.size
-    
-    # Calculate new dimensions (half of the original)
     new_width = original_width // downsample_factor
     new_height = original_height // downsample_factor
-    
-    # Resize the image to half its original size
     downsampled_image = image.resize((new_width, new_height), Image.LANCZOS)
-    
     return downsampled_image
 
 def dim_image(image: Image.Image) -> Image.Image:
+    """Dim the image by reducing brightness to 50%."""
     enhancer = ImageEnhance.Brightness(image)
-    return enhancer.enhance(0.5)  # Dim the image by reducing brightness to 50%
-
-from PIL import Image, ImageDraw, ImageFont
+    return enhancer.enhance(0.5)
 
 def add_grid_labels(image: Image.Image, grid_size: int) -> Image.Image:
+    """Add grid labels to the image."""
     width, height = image.size
     grid_image = Image.new('RGB', (width + 50, height + 50), 'red')
     grid_image.paste(image, (50, 50))
     draw = ImageDraw.Draw(grid_image)
 
-    # Calculate the maximum font size that fits within the grid cell
     max_font_size = min((height // grid_size) // 2, (width // grid_size) // 2)
     font_size = max_font_size
     font = ImageFont.truetype("Arial.ttf", font_size)
 
-    # Adjust font size dynamically to fit within grid cells
     while True:
         fits = True
         for i in range(grid_size):
@@ -65,26 +57,21 @@ def add_grid_labels(image: Image.Image, grid_size: int) -> Image.Image:
     cell_width = width / grid_size
     cell_height = height / grid_size
 
-    # Add labels for rows and columns with the determined font size
     for i in range(grid_size):
         draw.text((25, 50 + int(i * cell_height + cell_height / 2) - font_size // 2), str(i + 1), fill='white', font=font)
         draw.text((50 + int(i * cell_width + cell_width / 2) - font_size // 2, 25), str(i + 1), fill='white', font=font)
 
-    # Draw semi-transparent grid lines
     overlay = Image.new('RGBA', grid_image.size, (255, 0, 0, 0))
     overlay_draw = ImageDraw.Draw(overlay)
     for i in range(grid_size + 1):
-        # Horizontal lines
         overlay_draw.line([(50, 50 + int(i * cell_height)), (50 + width, 50 + int(i * cell_height))], fill=(255, 255, 255, 128), width=1)
-        # Vertical lines
         overlay_draw.line([(50 + int(i * cell_width), 50), (50 + int(i * cell_width), 50 + height)], fill=(255, 255, 255, 128), width=1)
 
-    # Composite the overlay with the labeled image
     grid_image = Image.alpha_composite(grid_image.convert('RGBA'), overlay)
-
     return grid_image.convert('RGB')
 
 def get_cell_coordinates(grid_size: int, target_cell: Tuple[int, int], image_size: Tuple[int, int]) -> Tuple[int, int]:
+    """Get the coordinates of a cell in the grid."""
     row, col = target_cell
     cell_width = image_size[0] / grid_size
     cell_height = image_size[1] / grid_size
@@ -92,74 +79,97 @@ def get_cell_coordinates(grid_size: int, target_cell: Tuple[int, int], image_siz
     y = int((row - 0.5) * cell_height)
     return x, y
 
-def draw_target_coordinates(image: Image.Image, coordinates: Tuple[int, int]) -> Image.Image:
+def draw_target_coordinates(image: Image.Image, coordinates: List[Tuple[int, int]]) -> Image.Image:
+    """Draw the target coordinates on the image."""
     image = image.copy()
-    label_offset = 50  # The offset added to the top and left for labels
+    label_offset = 50
     draw = ImageDraw.Draw(image)
-    adjusted_coordinates = (coordinates[0] + label_offset, coordinates[1] + label_offset)
-    draw.ellipse((adjusted_coordinates[0] - 5, adjusted_coordinates[1] - 5, adjusted_coordinates[0] + 5, adjusted_coordinates[1] + 5), fill='red', outline='red')
+    for coordinate in coordinates:
+        adjusted_coordinates = (coordinate[0] + label_offset, coordinate[1] + label_offset)
+        draw.ellipse((adjusted_coordinates[0] - 5, adjusted_coordinates[1] - 5, adjusted_coordinates[0] + 5, adjusted_coordinates[1] + 5), fill='red', outline='red')
+    return image
+
+def undim_target_cells(image: Image.Image, coordinates: List[Tuple[int, int]], grid_size: int) -> Image.Image:
+    """Undim the target cells on the image."""
+    image = image.copy()
+    width, height = image.size
+    cell_width = (width - 50) / grid_size
+    cell_height = (height - 50) / grid_size
+    label_offset = 50
+
+    for coordinate in coordinates:
+        row, col = coordinate
+        x1 = int((col - 1) * cell_width) + label_offset
+        y1 = int((row - 1) * cell_height) + label_offset
+        x2 = int(x1 + cell_width)
+        y2 = int(y1 + cell_height)
+        box = (x1, y1, x2, y2)
+        cropped_section = image.crop(box)
+        enhancer = ImageEnhance.Brightness(cropped_section)
+        brightened_section = enhancer.enhance(2.0)  # Increase brightness to undim
+        image.paste(brightened_section, box)
     return image
 
 def main(target: str):
-    # Load and dim the image
+    """Main function to process the image and identify target cells."""
     image_file_path = os.path.join(config.ROOT_DIR_PATH, "../tests/assets/excel.png")
     image = load_and_downsample_image(image_file_path, DOWNSAMPLE_FACTOR)
     image = dim_image(image)
-
-    # Add grid labels
     grid_image = add_grid_labels(image, GRID_SIZE)
     grid_image.show()
 
-    row, column = None, None
+    coordinates = []
+    all_coordinates = []
     grid_image_with_target = None
+
     while True:
         prompt = f"""
 Attached is an image containing a screenshot over which a grid has been overlaid.
 The grid labels are white (255, 255, 255) on a red (255, 0, 0) background. The grid lines
 are semi-transparent.
-Your task is to identify the grid row and column containing the target.
+Your task is to identify the coordinates of grid cells containing the target.
 The target is: "{target}".
 Respond in JSON with the following keys:
     {{
         "target": "<the target you are looking for>",
-        "analysis": "<natural language description of the location of the target, adjacent elements, and anything else that is relevant to identifying the correct row and column>",
-        "row": "<grid row number>",
-        "column": "<column row number>",
+        "descrpition": "<natural language description of the location of the target, adjacent elements, and anything else that is relevant to identifying the correct row and column>",
+        "reasoning": "<natural language step by step reasoning of how you are determining the grid coordinates of the target>",
+        "coordinates": [(<grid row>, <grid col>), (<grid row>, <grid col>), ...],
     }}
-Make sure not to confuse the grid labels with the content of the screenshot.
-For example, if the screenshot contains a spreadsheet, don't get confused between
-the spreadsheet cell labels and the grid labels.
+You may specify one or more grid cell coordinates.
+Make sure not to confuse the overlaid grid with any grid inside the screenshot!!!
+For example, if the screenshot contains a spreadsheet, don't specify the spreadsheet
+coordinates. You MUST specify the coordinates in the overlaid grid.
 Wrap your code in triple backticks: ```
 """
-        if row and column:
+        if DO_CORRECTIONS and coordinates:
             prompt += f"""
-Previously, you specified row {row}, column {column}. This cell has been marked
-with a red dot. If you agree with your previous assessment, confirm by specifying
-the same row and column again. Otherwise, please correct your previous assessment.
+Previously, someone else specified these cells: {coordinates}. These cells have been undimmed.
+There may be an error. Please correct or confirm the previous assessment.
+DON'T GUESS -- LOOK CAREFULLY AT THE IMAGE!!! My career depends on this. Lives are at stake.
 """
-        # Prompt the model to identify the cell
+
+        config.CACHE_ENABLED = False
         response = DRIVER.prompt(
             prompt=prompt,
             system_prompt="You are an expert GUI interpreter. You are precise and accurate.",
-            images=[grid_image_with_target or grid_image],
+            images=[grid_image_with_target or grid_image] if DO_CORRECTIONS else [grid_image],
         )
         result = parse_code_snippet(response)
+        coordinates = sorted(result["coordinates"])
+        logger.info(f"{coordinates=}")
+        coord_list = [get_cell_coordinates(GRID_SIZE, coord, image.size) for coord in coordinates]
+        #grid_image_with_target = draw_target_coordinates(grid_image, coord_list)
+        grid_image_with_target = undim_target_cells(grid_image, coordinates, GRID_SIZE)
+        #grid_image_with_target = draw_target_coordinates(grid_image_with_target, coord_list)
 
-        prev_row, prev_column = row, column
-        row = int(result["row"])
-        column = int(result["column"])
-        logger.info(f"{row=} {column=}")
-        coordinates = get_cell_coordinates(GRID_SIZE, (row, column), image.size)
-
-        # Draw the coordinates on the image
-        grid_image_with_target = draw_target_coordinates(grid_image, coordinates)
-
-        # Show the final image
         grid_image_with_target.show()
 
-        if row == prev_row and column == prev_column:
+        if coordinates in all_coordinates:
             break
 
+        all_coordinates.append(coordinates)
+
 if __name__ == "__main__":
     #main("save button")
     #main("cell A1")
@@ -167,4 +177,6 @@ def main(target: str):
     #main("font selector")
     #main("zoom slider")
     #main("paste button")
-    main("font size dropdown")
+    #main("font size dropdown")
+    #main("13-May")
+    main("Engineering")
diff --git a/experiments/cursor/sample.py b/experiments/cursor/sample.py
index 017301abf..8a7ff8a18 100644
--- a/experiments/cursor/sample.py
+++ b/experiments/cursor/sample.py
@@ -13,7 +13,7 @@
 from openadapt.drivers import anthropic, openai, google
 from openadapt.utils import parse_code_snippet
 
-DRIVER = google
+DRIVER = openai  # alternative: google
 NUM_CURSORS = 2**2
 SPREAD_REDUCTION_FACTOR = 0.5
 MAX_ITERATIONS = None
@@ -232,6 +232,7 @@ def main():
     target = "Cell C3"
     target = "Cell G4"
     target = "Cell A1"
+    target = "14-May"
     center = {'x': width // 2, 'y': height // 2}
     spread = 1.0
     iteration = 1
diff --git a/openadapt/cache.py b/openadapt/cache.py
index 1ddd4b20d..1c6787ff6 100644
--- a/openadapt/cache.py
+++ b/openadapt/cache.py
@@ -62,13 +62,12 @@ def cache(
     Returns:
         The decorator function.
     """
-    cache_dir_path = default(dir_path, config.CACHE_DIR_PATH)
-    cache_enabled = default(enabled, config.CACHE_ENABLED)
-    cache_verbosity = default(verbosity, config.CACHE_VERBOSITY)
-
     def decorator(fn: Callable) -> Callable:
         @wraps(fn)
         def wrapper(*args: Any, **kwargs: Any) -> Any:
+            cache_dir_path = default(dir_path, config.CACHE_DIR_PATH)  # read config on every call so runtime changes to it take effect
+            cache_enabled = default(enabled, config.CACHE_ENABLED)
+            cache_verbosity = default(verbosity, config.CACHE_VERBOSITY)
             logger.debug(f"{cache_enabled=}")
             if cache_enabled:
                 memory = Memory(cache_dir_path, verbose=cache_verbosity)

From efc663975cf8222b7d02a8fbb604a03681024507 Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Tue, 30 Jul 2024 17:40:36 -0400
Subject: [PATCH 10/10] highlight target cells in red; add labels on
 bottom/right

---
 experiments/cursor/grid.py | 92 ++++++++++++++++++++++++--------------
 1 file changed, 59 insertions(+), 33 deletions(-)

diff --git a/experiments/cursor/grid.py b/experiments/cursor/grid.py
index 4296de078..af642776d 100644
--- a/experiments/cursor/grid.py
+++ b/experiments/cursor/grid.py
@@ -12,7 +12,7 @@
 DOWNSAMPLE_FACTOR = 2
 GRID_SIZE = 25  # Adjust based on desired grid size
 DRIVER = openai
-DO_CORRECTIONS = False
+DO_CORRECTIONS = True
 
 def load_and_downsample_image(image_file_path: str, downsample_factor: int) -> Image.Image:
     """Load and downsample the image."""
@@ -29,9 +29,9 @@ def dim_image(image: Image.Image) -> Image.Image:
     return enhancer.enhance(0.5)
 
 def add_grid_labels(image: Image.Image, grid_size: int) -> Image.Image:
-    """Add grid labels to the image."""
+    """Place the image on a red canvas (50px margin per side), label rows/columns 1..grid_size on all four sides, and overlay grid lines."""
     width, height = image.size
-    grid_image = Image.new('RGB', (width + 50, height + 50), 'red')
+    grid_image = Image.new('RGB', (width + 100, height + 100), 'red')
     grid_image.paste(image, (50, 50))
     draw = ImageDraw.Draw(grid_image)
 
@@ -42,9 +42,8 @@ def add_grid_labels(image: Image.Image, grid_size: int) -> Image.Image:
     while True:
         fits = True
         for i in range(grid_size):
-            row_text_size = draw.textbbox((0, 0), str(i + 1), font=font)[2:]
-            col_text_size = draw.textbbox((0, 0), str(i + 1), font=font)[2:]
-            if row_text_size[1] > height // grid_size or col_text_size[0] > width // grid_size:
+            text_size = draw.textbbox((0, 0), str(i + 1), font=font)[2:]
+            if text_size[0] > width // grid_size or text_size[1] > height // grid_size:
                 fits = False
                 break
         if fits:
@@ -58,14 +57,22 @@ def add_grid_labels(image: Image.Image, grid_size: int) -> Image.Image:
     cell_height = height / grid_size
 
     for i in range(grid_size):
+        # Left side labels
         draw.text((25, 50 + int(i * cell_height + cell_height / 2) - font_size // 2), str(i + 1), fill='white', font=font)
+        # Top side labels
         draw.text((50 + int(i * cell_width + cell_width / 2) - font_size // 2, 25), str(i + 1), fill='white', font=font)
+        # Right side labels
+        draw.text((width + 75, 50 + int(i * cell_height + cell_height / 2) - font_size // 2), str(i + 1), fill='white', font=font)
+        # Bottom side labels
+        draw.text((50 + int(i * cell_width + cell_width / 2) - font_size // 2, height + 75), str(i + 1), fill='white', font=font)
 
     overlay = Image.new('RGBA', grid_image.size, (255, 0, 0, 0))
     overlay_draw = ImageDraw.Draw(overlay)
     for i in range(grid_size + 1):
-        overlay_draw.line([(50, 50 + int(i * cell_height)), (50 + width, 50 + int(i * cell_height))], fill=(255, 255, 255, 128), width=1)
-        overlay_draw.line([(50 + int(i * cell_width), 50), (50 + int(i * cell_width), 50 + height)], fill=(255, 255, 255, 128), width=1)
+        # Horizontal lines
+        overlay_draw.line([(50, 50 + int(i * cell_height)), (50 + width, 50 + int(i * cell_height))], fill=(255, 255, 255, 128), width=3)
+        # Vertical lines
+        overlay_draw.line([(50 + int(i * cell_width), 50), (50 + int(i * cell_width), 50 + height)], fill=(255, 255, 255, 128), width=3)
 
     grid_image = Image.alpha_composite(grid_image.convert('RGBA'), overlay)
     return grid_image.convert('RGB')
@@ -90,11 +97,11 @@ def draw_target_coordinates(image: Image.Image, coordinates: List[Tuple[int, int
     return image
 
 def undim_target_cells(image: Image.Image, coordinates: List[Tuple[int, int]], grid_size: int) -> Image.Image:
-    """Undim the target cells on the image."""
+    """Return a copy of the image with the target cells brightened (undimmed) and tinted semi-transparent red."""
     image = image.copy()
     width, height = image.size
-    cell_width = (width - 50) / grid_size
-    cell_height = (height - 50) / grid_size
+    cell_width = (width - 100) / grid_size
+    cell_height = (height - 100) / grid_size
     label_offset = 50
 
     for coordinate in coordinates:
@@ -107,7 +114,15 @@ def undim_target_cells(image: Image.Image, coordinates: List[Tuple[int, int]], g
         cropped_section = image.crop(box)
         enhancer = ImageEnhance.Brightness(cropped_section)
         brightened_section = enhancer.enhance(2.0)  # Increase brightness to undim
-        image.paste(brightened_section, box)
+
+        # Create a red overlay
+        red_overlay = Image.new('RGBA', brightened_section.size, (255, 0, 0, 128))  # Semi-transparent red
+
+        # Combine the brightened section with the red overlay
+        highlighted_section = Image.alpha_composite(brightened_section.convert('RGBA'), red_overlay)
+
+        image.paste(highlighted_section, box)
+
     return image
 
 def main(target: str):
@@ -123,33 +138,42 @@ def main(target: str):
     grid_image_with_target = None
 
     while True:
+        if DO_CORRECTIONS and coordinates:
+            correction_prompt = f"""
+- Previously, someone specified these cells: {coordinates}. These cells have been highlighted in red.
+- There may be an error. Please correct or confirm the previous assessment.
+- In your analysis, include the position of the target relative to these previously specified cells.
+- In your reasoning, describe how many cells to move in each direction to reach the target."""
+        else:
+            correction_prompt = "\n"
+
         prompt = f"""
 Attached is an image containing a screenshot over which a grid has been overlaid.
-The grid labels are white (255, 255, 255) on a red (255, 0, 0) background. The grid lines
-are semi-transparent.
+The grid labels are white (255, 255, 255) on a red (255, 0, 0) background.
+The grid lines are semi-transparent.
 Your task is to identify the coordinates of grid cells containing the target.
 The target is: "{target}".
+
 Respond in JSON with the following keys:
-    {{
-        "target": "<the target you are looking for>",
-        "descrpition": "<natural language description of the location of the target, adjacent elements, and anything else that is relevant to identifying the correct row and column>",
-        "reasoning": "<natural language step by step reasoning of how you are determining the grid coordinates of the target>",
-        "coordinates": [(<grid row>, <grid col>), (<grid row>, <grid col>), ...],
-    }}
-You may specify one or more grid cell coordinates.
-Make sure not to confuse the overlaid grid with any grid inside the screenshot!!!
-For example, if the screenshot contains a spreadsheet, don't specify the spreadsheet
-coordinates. You MUST specify the coordinates in the overlaid grid.
+{{
+    "target": "<the target you are looking for>",
+    "analysis": "<natural language description of the location of the target, adjacent elements, and anything else that is relevant to identifying the correct row and column>",
+    "reasoning": "<natural language step by step reasoning of how you are determining the grid coordinates of the target>",
+    "coordinates": [(<grid row>, <grid col>), (<grid row>, <grid col>), ...],
+}}
+
+**Important Instructions**:
+- Focus on the overlaid grid for coordinate determination.
+- Do not confuse the overlaid grid with any internal grids in the screenshot, such as those in spreadsheets.
+- Verify the grid coordinates carefully.{correction_prompt}
+- LOOK CAREFULLY AT THE IMAGE to avoid mistakes.
+- You must specify all cells containing any part of the target, so that the target is COMPLETELY COVERED.
+
 Wrap your code in triple backticks: ```
-"""
-        if DO_CORRECTIONS and coordinates:
-            prompt += f"""
-Previously, someone else specified these cells: {coordinates}. These cells have been undimmed.
-There may be an error. Please correct or confirm the previous assessment.
-DON'T GUESS -- LOOK CAREFULLY AT THE IMAGE!!! My career depends on this. Lives are at stake.
 """
 
-        config.CACHE_ENABLED = False
+        if not DO_CORRECTIONS:  # prompt is identical every iteration here; presumably a cached response would just be replayed
+            config.CACHE_ENABLED = False
         response = DRIVER.prompt(
             prompt=prompt,
             system_prompt="You are an expert GUI interpreter. You are precise and accurate.",
@@ -172,11 +196,13 @@ def main(target: str):
 
 if __name__ == "__main__":
     #main("save button")
-    #main("cell A1")
+    main("Spreadsheet cell A1")
     #main("cell containing 'Marketing'")
     #main("font selector")
     #main("zoom slider")
     #main("paste button")
     #main("font size dropdown")
     #main("13-May")
-    main("Engineering")
+    #main("Engineering")
+    #main("Sales")
+    #main("Spreadsheet area")