From f7b080638044a4b01ade4dce11e482f3baad0cb0 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Sun, 21 Apr 2024 19:10:10 -0400 Subject: [PATCH] feat(bluesky): Tweaks the method to parse hashtags --- bc/channel/utils/connectors/bluesky_api/client.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bc/channel/utils/connectors/bluesky_api/client.py b/bc/channel/utils/connectors/bluesky_api/client.py index b7195d8c..d8d3d2ef 100644 --- a/bc/channel/utils/connectors/bluesky_api/client.py +++ b/bc/channel/utils/connectors/bluesky_api/client.py @@ -106,13 +106,14 @@ def _parse_tags(self, text: str) -> list[RegexMatch]: # reference: https://github.com/bluesky-social/atproto/blob/fbc7e75c402e0c268e7e411353968985eeb4bb06/packages/api/src/rich-text/util.ts#L10 # given that our needs of a hashtag is very simple, we can do away with # only parsing alphanumeric characters - tag_regex = r"(?:^|\s)#(?P<tag>[0-9]*[a-zA-Z][a-zA-Z0-9]*)" - for m in re.finditer(tag_regex, text): + tag_regex = rb"(?:^|\s)#(?P<tag>[0-9]*[a-zA-Z][a-zA-Z0-9]*)" + text_bytes = text.encode("UTF-8") + for m in re.finditer(tag_regex, text_bytes): spans.append( RegexMatch( start=m.start("tag") - 1, end=m.end("tag"), - text=m.group("tag"), + text=m.group("tag").decode("UTF-8"), ) ) return spans