From 5705f246a2b5328289e80f86c26fbef8c1da5af5 Mon Sep 17 00:00:00 2001 From: Florian Strzelecki Date: Fri, 31 Dec 2021 19:55:32 +0100 Subject: [PATCH] tools.identifiers: documentation of casemapping --- docs/source/api.rst | 1 - sopel/tools/identifiers.py | 143 ++++++++++++++++++++++++++++++------- 2 files changed, 119 insertions(+), 25 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 349123007e..737085c21a 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -17,7 +17,6 @@ sopel.tools.identifiers .. automodule:: sopel.tools.identifiers :members: - :private-members: Identifier._lower, Identifier._lower_swapped sopel.tools.web diff --git a/sopel/tools/identifiers.py b/sopel/tools/identifiers.py index 6cf2db67e4..bbefce9a02 100644 --- a/sopel/tools/identifiers.py +++ b/sopel/tools/identifiers.py @@ -1,4 +1,37 @@ -"""Identifier tools to represent IRC names (nick or channel).""" +"""Identifier tools to represent IRC names (nick or channel). + +Nick and channel are defined by their names, which are "identifiers": their +names are used to differentiate users from each other, channels from each +other. To ensure that two channels or two users are the same, their +identifiers must be processed to be compared properly. 
This process depends on +which RFC and how that RFC is implemented by the server: IRC being an old +protocol, different RFCs have different versions of that process: + +* :rfc:`1459#section-2.2`: ASCII characters, and ``[]\\`` are mapped to ``{}|`` +* :rfc:`2812#section-2.2`: same as in the previous RFC, adding ``~`` mapped to + ``^`` + +Then when ISUPPORT was added, the `CASEMAPPING parameter`__ was defined so the +server can say which process to apply: + +* ``ascii``: only ``[A-Z]`` must be mapped to ``[a-z]`` (implemented by + :func:`ascii_lower`) +* ``rfc1459``: follow :rfc:`2812`; because of how it was implemented in most + servers (implemented by :func:`rfc1459_lower`) +* A strict version of :rfc:`1459` also exists but it is not recommended + (implemented by :func:`rfc1459_strict_lower`) + +As a result, the :class:`Identifier` class requires a casemapping function, +which should be provided by the :class:`bot`. + +.. seealso:: + + The bot's :class:`make_identifier` method + should be used to instantiate an :class:`Identifier` to honor the + ``CASEMAPPING`` parameter. + +.. __: https://modern.ircdocs.horse/index.html#casemapping-parameter +""" from __future__ import generator_stop import string @@ -18,16 +51,29 @@ def ascii_lower(text: str) -> str: - """Lower ``text`` according to the ASCII CASEMAPPING""" + """Lower ``text`` according to the ``ascii`` value of ``CASEMAPPING``. + + In that version, only ``[A-Z]`` are to be mapped to their lowercase + equivalent (``[a-z]``). Non-ASCII characters are kept unmodified. + """ return text.translate(ASCII_TABLE) def rfc1459_lower(text: str) -> str: - """Lower ``text`` according to :rfc:`1459` (with ``~`` mapped to ``^``). + """Lower ``text`` according to :rfc:`2812`. - Similar to :func:`rfc1459_strict_lower`, but also maps ``~`` to - ``^`` as defined for the ``rfc1459`` value of the - `CASEMAPPING parameter`__. 
+ Similar to :func:`rfc1459_strict_lower`, but also maps ``~`` to ``^``, as + per :rfc:`2812#section-2.2`: + + Because of IRC's Scandinavian origin, the characters ``{}|^`` are + considered to be the lower case equivalents of the characters + ``[]\\~``, respectively. + + .. note:: + + This is an implementation of the `CASEMAPPING parameter`__ for the + value ``rfc1459``, which doesn't use :rfc:`1459` but its updated version + :rfc:`2812`. .. __: https://modern.ircdocs.horse/index.html#casemapping-parameter """ @@ -37,12 +83,11 @@ def rfc1459_lower(text: str) -> str: def rfc1459_strict_lower(text: str) -> str: """Lower ``text`` according to :rfc:`1459` (strict version). - As per `section 2.2`__: + As per :rfc:`1459#section-2.2`: Because of IRC's scandanavian origin, the characters ``{}|`` are considered to be the lower case equivalents of the characters ``[]\\``. - .. __: https://datatracker.ietf.org/doc/html/rfc1459#section-2.2 """ return text.translate(RFC1459_STRICT_TABLE) @@ -53,14 +98,37 @@ def rfc1459_strict_lower(text: str) -> str: class Identifier(str): """A ``str`` subclass which acts appropriately for IRC identifiers. + :param str identifier: IRC identifier + :param casemapping: a casemapping function (optional keyword argument) + :type casemapping: Callable[[:class:`str`], :class:`str`] + When used as normal ``str`` objects, case will be preserved. However, when comparing two Identifier objects, or comparing an Identifier object with a ``str`` object, the comparison will be case insensitive. - This case insensitivity includes the case convention conventions regarding - ``[]``, ``{}``, ``|``, ``\\``, ``^`` and ``~`` described in RFC 2812. + + This case insensitivity uses the provided ``casemapping`` function, + following the rules for the `CASEMAPPING parameter`__ from ISUPPORT. By + default, it uses :func:`rfc1459_lower`, following :rfc:`2812#section-2.2`. + + .. 
note:: + + To instantiate an ``Identifier`` with the appropriate ``casemapping`` + function, it is best to rely on + :meth:`bot.make_identifier`. + + .. versionchanged:: 8.0 + + The ``casemapping`` parameter has been added. + + .. __: https://modern.ircdocs.horse/index.html#casemapping-parameter """ - def __new__(cls, *args, **kwargs) -> 'Identifier': - return str.__new__(cls, *args) + def __new__( + cls, + identifier: str, + *, + casemapping: Casemapping = rfc1459_lower, + ) -> 'Identifier': + return str.__new__(cls, identifier) def __init__( self, @@ -73,22 +141,41 @@ def __init__( """Casemapping function to lower the identifier.""" self._lowered = self.casemapping(identifier) - def lower(self): - """Get the RFC 2812-compliant lowercase version of this identifier. + def lower(self) -> str: + """Get the IRC-compliant lowercase version of this identifier. - :return: RFC 2812-compliant lowercase version of the - :py:class:`Identifier` instance - :rtype: str + :return: IRC-compliant lowercase version used for case-insensitive + comparisons + + The behavior of this method depends on the identifier's casemapping + function, which should be selected based on the ``CASEMAPPING`` + parameter from ``ISUPPORT``. + + .. versionchanged:: 8.0 + + Now uses the :attr:`casemapping` function to lower the identifier. """ return self.casemapping(self) @staticmethod def _lower(identifier: str): - """Convert an identifier to lowercase per RFC 2812. + """Convert an identifier to lowercase per :rfc:`2812`. :param str identifier: the identifier (nickname or channel) to convert :return: RFC 2812-compliant lowercase version of ``identifier`` :rtype: str + + :meta public: + + .. versionchanged:: 8.0 + + Previously, this would lower all non-ASCII characters. It now uses + a strict implementation of the ``CASEMAPPING`` parameter. This is + now equivalent to calling :func:`rfc1459_lower`. 
+ + If the ``identifier`` is an instance of :class:`Identifier`, this + will call that identifier's :meth:`lower` method instead. + """ if isinstance(identifier, Identifier): return identifier.lower() @@ -98,19 +185,27 @@ def _lower(identifier: str): def _lower_swapped(identifier: str): """Backward-compatible version of :meth:`_lower`. - :param str identifier: the identifier (nickname or channel) to convert + :param identifier: the identifier (nickname or channel) to convert :return: RFC 2812-non-compliant lowercase version of ``identifier`` :rtype: str - This is what the old :meth:`_lower` function did before Sopel 7.0. It maps - ``{}``, ``[]``, ``|``, ``\\``, ``^``, and ``~`` incorrectly. + This is what the old :meth:`_lower` function did before Sopel 7.0. It + maps ``{}``, ``[]``, ``|``, ``\\``, ``^``, and ``~`` incorrectly. + + You shouldn't use this unless you need to migrate stored values from + the previous, incorrect "lowercase" representation to the correct one. + + :meta public: + + .. versionadded:: 7.0 - You shouldn't use this unless you need to migrate stored values from the - previous, incorrect "lowercase" representation to the correct one. + This method was added to ensure migration of improperly lowercased + data: it reverts the data back to the previous lowercase rules. """ # The tilde replacement isn't needed for identifiers, but is for # channels, which may be useful at some point in the future. - low = identifier.lower().replace('{', '[').replace('}', ']') + # Always convert to str, to prevent using custom casemapping + low = str(identifier).lower().replace('{', '[').replace('}', ']') low = low.replace('|', '\\').replace('^', '~') return low