def embed_description(
    image: Image.Image,
    description: str,
    x: int = 0,
    y: int = 0,
) -> Image.Image:
    """Embed a description into an image at the specified location.

    The description is word-wrapped so that each rendered line fits within
    the image width. Every line is drawn in white on a semi-transparent black
    background. The image is modified in place and also returned.

    Args:
        image (Image.Image): The image to annotate.
        description (str): The text to embed.
        x (int, optional): The x-coordinate on which each line is horizontally
            centered, before the scaling factor is applied. Defaults to 0.
        y (int, optional): The y-coordinate of the top of the first line,
            before the scaling factor is applied. Defaults to 0.

    Returns:
        Image.Image: The annotated image (the same object that was passed in).
    """
    font_size = 30
    font = get_font("Arial.ttf", font_size)
    pad_x, pad_y = 15, 5

    # The alpha channel of a fill color is only blended into the picture when
    # an RGB image is drawn on in "RGBA" mode; any other image mode has to be
    # drawn on in its own mode, so fall back to the default there.
    draw_mode = "RGBA" if image.mode == "RGB" else None
    draw = ImageDraw.Draw(image, draw_mode)

    def text_width(text: str) -> int:
        """Return the rendered width of ``text`` in pixels."""
        left, _, right, _ = draw.textbbox((0, 0), text, font=font)
        return right - left

    # Wrap on rendered pixel width rather than on character count, leaving
    # room for the horizontal padding of the background box.
    max_width = max(1, image.width - 2 * pad_x)
    lines = []
    current_line = []
    for word in description.split():
        candidate = current_line + [word]
        if current_line and text_width(" ".join(candidate)) > max_width:
            lines.append(" ".join(current_line))
            current_line = [word]
        else:
            # A single word wider than the image still gets its own line.
            current_line = candidate
    if current_line:
        lines.append(" ".join(current_line))

    # Adjust coordinates for scaling factor
    x = int(x * scaling_factor)
    y = int(y * scaling_factor)

    # Use one line height for every row so that rows are evenly spaced and
    # their background boxes do not overlap, regardless of which glyphs a
    # particular line happens to contain.
    _, _, _, reference_bottom = draw.textbbox((0, 0), "Ag", font=font)
    line_height = reference_bottom + 2 * pad_y

    for i, line in enumerate(lines):
        width = text_width(line)
        # Center on x, but keep the text inside the image horizontally.
        text_x = max(0, min(x - width // 2, image.width - width))
        text_y = y + i * line_height

        # Draw background
        box_top = text_y - pad_y
        background_box = (
            text_x - pad_x,
            box_top,
            text_x + width + pad_x,
            # Rectangle bounds are inclusive; stop one pixel short so that
            # consecutive rows do not blend twice on a shared pixel row.
            box_top + line_height - 1,
        )
        draw.rectangle(background_box, fill=(0, 0, 0, 128))

        # Draw text
        draw.text((text_x, text_y), line, fill=(255, 255, 255), font=font)

    return image
+ # Convert image to PIL.Image for compatibility + image = Image.fromarray(np.array(image)) if action.mouse_x is not None and action.mouse_y is not None: # Use the mouse coordinates for mouse events annotated_image = embed_description( image, description, - x=int(action.mouse_x) * 2, - y=int(action.mouse_y) * 2, ) else: # Center the text for other events @@ -105,8 +97,7 @@ def main() -> None: logger.info(f"{action=}") logger.info(f"{description=}") - cv2.imshow("Annotated Image", annotated_image) - cv2.waitKey(0) + annotated_image.show() # Opens the annotated image using the default viewer descriptions.append(description) logger.info(f"descriptions=\n{pformat(descriptions)}") diff --git a/openadapt/models.py b/openadapt/models.py index b2e9a4224..03b60329e 100644 --- a/openadapt/models.py +++ b/openadapt/models.py @@ -573,6 +573,7 @@ def prompt_for_description(self, return_image: bool = False) -> str: darken_outside=0.7, display_text=False, marker_fill_transparency=0, + dim_outside_window=False, ) if self.text: diff --git a/openadapt/plotting.py b/openadapt/plotting.py index b0bc0b932..1cffb8261 100644 --- a/openadapt/plotting.py +++ b/openadapt/plotting.py @@ -228,6 +228,7 @@ def display_event( diff: bool = False, darken_outside: float | None = None, display_text: bool = True, + dim_outside_window: bool = True, ) -> Image.Image: """Display an action event on the image. @@ -247,6 +248,7 @@ def display_event( the ellipse for mouse events. Range 0-1, where 1 is completely black. Defaults to None (no darkening). display_text (bool): Whether to display action text. Defaults to True. + dim_outside_window (bool): Whether to dim outside the WindowEvent area. Returns: PIL.Image.Image: The image with the action event displayed on it. 
def get_scaling_factor() -> int:
    """Determine the display scaling factor of the main screen.

    On macOS the value is read from the ``backingScaleFactor`` of AppKit's
    ``NSScreen.mainScreen()``. On every other platform, or when the value
    cannot be determined on macOS, 1 is returned.

    Returns:
        int: The scaling factor; always at least 1.
    """
    default_scale = 1  # Default for Windows/Linux
    if sys.platform != "darwin":
        return default_scale

    try:
        from AppKit import NSScreen
    except ImportError as exc:
        # Callers may invoke this at import time; do not crash them just
        # because the optional macOS bindings are missing.
        logger.warning(f"AppKit unavailable, using {default_scale=}: {exc}")
        return default_scale

    main_screen = NSScreen.mainScreen()
    if main_screen is None:
        # No screen object to query (e.g. no display attached).
        logger.warning(f"No main screen found, using {default_scale=}")
        return default_scale

    backing_scale = main_screen.backingScaleFactor()
    logger.info(f"Backing Scale Factor: {backing_scale}")
    # Never report less than 1: a zero factor would collapse all scaled
    # coordinates to the origin.
    return max(default_scale, int(backing_scale))
a/openadapt/visualize.py b/openadapt/visualize.py index ff47712ae..8722d0883 100644 --- a/openadapt/visualize.py +++ b/openadapt/visualize.py @@ -159,6 +159,7 @@ def main( recording_id: int = None, diff_video: bool = False, cleanup: bool = True, + browser: str = None, ) -> bool: """Visualize a recording. @@ -167,6 +168,7 @@ def main( recording_id (int, optional): The ID of the recording to visualize. diff_video (bool): Whether to diff Screenshots against video frames. cleanup (bool): Whether to remove the HTML file after it is displayed. + browser (str, optional): Command to open the browser executable. Returns: bool: True if visualization was successful, None otherwise. @@ -442,11 +444,13 @@ def main( os.makedirs(RECORDING_DIR_PATH, exist_ok=True) output_file(fname_out, title=title) - result = show( # noqa: F841 - layout( - rows, - ) - ) + result = show(layout(rows)) # noqa: F841 + + if browser: + import subprocess + + logger.info(f"Opening browser with command: {browser}") + subprocess.run([browser, f"file://{fname_out}"], check=True) def _cleanup() -> None: os.remove(fname_out)