Skip to content

Commit

Permalink
Continually refresh OCR and speak new text as it appears (nvaccess#15331
Browse files Browse the repository at this point in the history
)

Replaces nvaccess#11270, adding configuration.
Fixes nvaccess#2797.

Summary of the issue:
Some videos include text which is graphical only with no accompanying verbalization, thus making it inaccessible to screen reader users. OCR is very useful for this purpose. However, having to repeatedly and manually dismiss and recognize is tedious and inefficient. It would be useful if NVDA could do this automatically, reporting new text as it appears.

This could be useful for other scenarios as well such as virtual machines and games.

Description of how this pull request fixes the issue:
When the user performs content recognition (e.g. NVDA+r for Windows 10 OCR), we now periodically (every 1.5 seconds) recognize the same area of the screen again. The result document is updated with the new recognition result, keeping the previous cursor position if possible.

The LiveText NVDAObject is used to report any new text that has been added. LiveText honours the report dynamic content changes setting, so turning this off will prevent new text from being spoken.

Because the result document is updated, this means braille is updated as well, allowing braille users to see changes as they occur.

While Windows 10 OCR is local and relatively fast, other recognizers might not be; e.g. they might be resource intensive or use an internet service. Recognizers can thus specify whether they want to support auto refresh using the allowAutoRefresh attribute, which defaults to False.
  • Loading branch information
LeonarddeR authored Sep 1, 2023
1 parent 104de7c commit ce32058
Show file tree
Hide file tree
Showing 10 changed files with 221 additions and 66 deletions.
2 changes: 2 additions & 0 deletions source/config/configSpec.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,8 @@
[uwpOcr]
language = string(default="")
autoRefresh = boolean(default=false)
autoRefreshInterval = integer(default=1500, min=100)
[upgrade]
newLaptopKeyboardLayout = boolean(default=false)
Expand Down
88 changes: 53 additions & 35 deletions source/contentRecog/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#contentRecog/__init__.py
#A part of NonVisual Desktop Access (NVDA)
#Copyright (C) 2017 NV Access Limited
#This file is covered by the GNU General Public License.
#See the file COPYING for more details.
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2017-2023 NV Access Limited, James Teh, Leonard de Ruijter
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.

"""Framework for recognition of content; OCR, image recognition, etc.
When authors don't provide sufficient information for a screen reader user to determine the content of something,
Expand All @@ -14,11 +13,17 @@
"""

from collections import namedtuple
import ctypes
from typing import Callable, Dict, List, Union
import garbageHandler
from baseObject import AutoPropertyObject
import cursorManager
import textInfos.offsets
from abc import ABCMeta, abstractmethod
from locationHelper import RectLTWH
from NVDAObjects import NVDAObject

onRecognizeResultCallbackT = Callable[[Union["RecognitionResult", Exception]], None]


class BaseContentRecogTextInfo(cursorManager._ReviewCursorManagerTextInfo):
Expand All @@ -27,24 +32,31 @@ class BaseContentRecogTextInfo(cursorManager._ReviewCursorManagerTextInfo):
"""


class ContentRecognizer(garbageHandler.TrackedObject, metaclass=ABCMeta):
class ContentRecognizer(AutoPropertyObject):
"""Implementation of a content recognizer.
"""

def getResizeFactor(self, width, height):
allowAutoRefresh: bool = False
"""
Whether to allow automatic, periodic refresh when using this recognizer.
This allows the user to see live changes as they occur. However, if a
recognizer uses an internet service or is very resource intensive, this
may be undesirable.
"""
autoRefreshInterval: int = 1500
"""How often (in ms) to perform recognition."""

def getResizeFactor(self, width: int, height: int) -> Union[int, float]:
"""Return the factor by which an image must be resized
before it is passed to this recognizer.
@param width: The width of the image in pixels.
@type width: int
@param height: The height of the image in pixels.
@type height: int
@return: The resize factor, C{1} for no resizing.
@rtype: int or float
"""
return 1

@abstractmethod
def recognize(self, pixels, imageInfo, onResult):
def recognize(self, pixels: ctypes.Array, imageInfo: "RecogImageInfo", onResult: onRecognizeResultCallbackT):
"""Asynchronously recognize content from an image.
This method should not block.
Only one recognition can be performed at a time.
Expand All @@ -56,9 +68,8 @@ def recognize(self, pixels, imageInfo, onResult):
However, the alpha channel should be ignored.
@type pixels: Two dimensional array (y then x) of L{winGDI.RGBQUAD}
@param imageInfo: Information about the image for recognition.
@type imageInfo: L{RecogImageInfo}
@param onResult: A callable which takes a L{RecognitionResult} (or an exception on failure) as its only argument.
@type onResult: callable
@param onResult: A callable which takes a L{RecognitionResult} (or an exception on failure)
as its only argument.
"""
raise NotImplementedError

Expand All @@ -73,17 +84,16 @@ def validateCaptureBounds(self, location: RectLTWH) -> bool:
"""
return True

def validateObject(self, nav):
def validateObject(self, nav: NVDAObject) -> bool:
"""Validation to be performed on the navigator object before content recognition
@param nav: The navigator object to be validated
@type nav: L{NVDAObjects.NVDAObject}
@return: C{True} or C{False}, depending on whether the navigator object is valid or not.
C{True} for no validation.
@rtype: bool
"""
return True

class RecogImageInfo(object):

class RecogImageInfo:
"""Encapsulates information about a recognized image and
provides functionality to convert coordinates.
An image captured for recognition can begin at any point on the screen.
Expand All @@ -97,18 +107,20 @@ class RecogImageInfo(object):
This is done using the L{convertXToScreen} and L{convertYToScreen} methods.
"""

def __init__(self, screenLeft, screenTop, screenWidth, screenHeight, resizeFactor):
def __init__(
self,
screenLeft: int,
screenTop: int,
screenWidth: int,
screenHeight: int,
resizeFactor: Union[int, float]
):
"""
@param screenLeft: The x screen coordinate of the upper-left corner of the image.
@type screenLeft: int
@param screenTop: The y screen coordinate of the upper-left corner of the image.
@type screenTop: int
@param screenWidth: The width of the image on the screen.
@type screenWidth: int
@param screenHeight: The height of the image on the screen.
@type screenHeight: int
@param resizeFactor: The factor by which the image must be resized for recognition.
@type resizeFactor: int or float
@raise ValueError: If the supplied screen coordinates indicate that
the image is not visible; e.g. width or height of 0.
"""
Expand All @@ -125,7 +137,14 @@ def __init__(self, screenLeft, screenTop, screenWidth, screenHeight, resizeFacto
self.recogHeight = int(screenHeight * resizeFactor)

@classmethod
def createFromRecognizer(cls, screenLeft, screenTop, screenWidth, screenHeight, recognizer):
def createFromRecognizer(
cls,
screenLeft: int,
screenTop: int,
screenWidth: int,
screenHeight: int,
recognizer: ContentRecognizer
):
"""Convenience method to construct an instance using a L{ContentRecognizer}.
The resize factor is obtained by calling L{ContentRecognizer.getResizeFactor}.
"""
Expand Down Expand Up @@ -172,18 +191,20 @@ def makeTextInfo(self, obj, position) -> BaseContentRecogTextInfo:
"""
raise NotImplementedError


# Used internally by LinesWordsResult.
# (Lwr is short for LinesWordsResult.)
LwrWord = namedtuple("LwrWord", ("offset", "left", "top", "width", "height"))


class LinesWordsResult(RecognitionResult):
"""A L{RecognizerResult} which can create TextInfos based on a simple lines/words data structure.
The data structure is a list of lines, wherein each line is a list of words,
wherein each word is a dict containing the keys x, y, width, height and text.
Several OCR engines produce output in a format which can be easily converted to this.
"""

def __init__(self, data, imageInfo):
def __init__(self, data: List[List[Dict[str, Union[str, int]]]], imageInfo: RecogImageInfo):
"""Constructor.
@param data: The lines/words data structure. For example:
[
Expand All @@ -196,11 +217,9 @@ def __init__(self, data, imageInfo):
{"x": 117, "y": 105, "width": 11, "height": 9, "text": "Word4"}
]
]
@type data: list of lists of dicts
@param imageInfo: Information about the recognized image.
This is used to convert coordinates in the recognized image
to screen coordinates.
@type imageInfo: L{RecogImageInfo}
"""
self.data = data
self.imageInfo = imageInfo
Expand All @@ -223,11 +242,13 @@ def _parseData(self):
# Separate with a space.
self._textList.append(" ")
self.textLen += 1
self.words.append(LwrWord(self.textLen,
self.words.append(LwrWord(
self.textLen,
self.imageInfo.convertXToScreen(word["x"]),
self.imageInfo.convertYToScreen(word["y"]),
self.imageInfo.convertWidthToScreen(word["width"]),
self.imageInfo.convertHeightToScreen(word["height"])))
self.imageInfo.convertHeightToScreen(word["height"]))
)
text = word["text"]
self._textList.append(text)
self.textLen += len(text)
Expand All @@ -249,7 +270,7 @@ class LwrTextInfo(BaseContentRecogTextInfo, textInfos.offsets.OffsetsTextInfo):

def __init__(self, obj, position, result):
self.result = result
super(LwrTextInfo, self).__init__(obj, position)
super().__init__(obj, position)

def copy(self):
return self.__class__(self.obj, self.bookmark, self.result)
Expand Down Expand Up @@ -315,7 +336,7 @@ class SimpleResultTextInfo(BaseContentRecogTextInfo, textInfos.offsets.OffsetsTe

def __init__(self, obj, position, result):
self.result = result
super(SimpleResultTextInfo, self).__init__(obj, position)
super().__init__(obj, position)

def copy(self):
return self.__class__(self.obj, self.bookmark, self.result)
Expand All @@ -325,6 +346,3 @@ def _getStoryText(self):

def _getStoryLength(self):
return len(self.result.text)

def _getStoryText(self):
return self.result.text
Loading

0 comments on commit ce32058

Please sign in to comment.