From 8d2d1240825cbfa32548ecb8afc978cea533ec23 Mon Sep 17 00:00:00 2001 From: PAN <46820719+pandalee99@users.noreply.github.com> Date: Mon, 23 Dec 2024 10:29:46 +0800 Subject: [PATCH] feat(python): Hardcoding metastring into passable parameters (#1987) ## What does this PR do? In the original implementation, MetaStringEncoder and MetaStringDecoder hard-coded the two special characters (special char1/2), which was not the best choice. This PR turns them into constructor parameters (also stored on MetaString), so callers can choose which special characters are used. ## Related issues Close #1983 ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark --- python/pyfury/meta/metastring.py | 108 +++++++++++++++++++++---- python/pyfury/tests/test_metastring.py | 51 +++++++----- 2 files changed, 122 insertions(+), 37 deletions(-) diff --git a/python/pyfury/meta/metastring.py b/python/pyfury/meta/metastring.py index 63232b5611..4ff065105b 100644 --- a/python/pyfury/meta/metastring.py +++ b/python/pyfury/meta/metastring.py @@ -48,12 +48,20 @@ class Encoding(Enum): class MetaString: def __init__( - self, original: str, encoding: Encoding, encoded_data: bytes, length: int + self, + original: str, + encoding: Encoding, + encoded_data: bytes, + length: int, + special_char1: str = ".", + special_char2: str = "|", ): self.original = original self.encoding = encoding self.encoded_data = encoded_data self.length = length + self.special_char1 = special_char1 + self.special_char2 = special_char2 if self.encoding != Encoding.UTF_8: self.strip_last_char = (encoded_data[0] & 0x80) != 0 else: @@ -65,6 +73,17 @@ class MetaStringDecoder: Decodes MetaString objects back into their original plain text form. """ + def __init__(self, special_char1: str, special_char2: str): + """ + Creates a MetaStringDecoder with specified special characters used for decoding.
+ + Args: + special_char1 (str): The first special character used for encoding. + special_char2 (str): The second special character used for encoding. + """ + self.special_char1 = special_char1 + self.special_char2 = special_char2 + def decode(self, encoded_data: bytes, encoding: Encoding) -> str: """ Decodes the encoded data using the specified encoding. @@ -203,9 +222,9 @@ def _decode_lower_upper_digit_special_char(self, char_value: int) -> str: elif 52 <= char_value <= 61: return chr(ord("0") + (char_value - 52)) elif char_value == 62: - return "." + return self.special_char1 # Use special_char1 for the encoding elif char_value == 63: - return "_" + return self.special_char2 # Use special_char2 for the encoding else: raise ValueError( f"Invalid character value for LOWER_UPPER_DIGIT_SPECIAL: {char_value}" @@ -250,9 +269,16 @@ def _decode_rep_all_to_lower_special(self, data: bytes) -> str: class MetaStringEncoder: - """ - Encodes plain text strings into MetaString objects with specified encoding mechanisms. - """ + def __init__(self, special_char1: str, special_char2: str): + """ + Creates a MetaStringEncoder with specified special characters used for encoding. + + Args: + special_char1 (str): The first special character used in custom encoding. + special_char2 (str): The second special character used in custom encoding. + """ + self.special_char1 = special_char1 + self.special_char2 = special_char2 def encode(self, input_string: str) -> MetaString: """ @@ -270,7 +296,14 @@ def encode(self, input_string: str) -> MetaString: ), "Long meta string than _METASTRING_NUM_CHARS_LIMIT is not allowed." 
if not input_string: - return MetaString(input_string, Encoding.UTF_8, bytes(), 0) + return MetaString( + input_string, + Encoding.UTF_8, + bytes(), + 0, + self.special_char1, + self.special_char2, + ) encoding = self.compute_encoding(input_string) return self.encode_with_encoding(input_string, encoding) @@ -292,29 +325,67 @@ def encode_with_encoding(self, input_string: str, encoding: Encoding) -> MetaStr ), "Long meta string than _METASTRING_NUM_CHARS_LIMIT is not allowed." if not input_string: - return MetaString(input_string, Encoding.UTF_8, bytes(), 0) + return MetaString( + input_string, + Encoding.UTF_8, + bytes(), + 0, + self.special_char1, + self.special_char2, + ) length = len(input_string) if encoding == Encoding.LOWER_SPECIAL: encoded_data = self._encode_lower_special(input_string) - return MetaString(input_string, encoding, encoded_data, length * 5) + return MetaString( + input_string, + encoding, + encoded_data, + length * 5, + self.special_char1, + self.special_char2, + ) elif encoding == Encoding.LOWER_UPPER_DIGIT_SPECIAL: encoded_data = self._encode_lower_upper_digit_special(input_string) - return MetaString(input_string, encoding, encoded_data, length * 6) + return MetaString( + input_string, + encoding, + encoded_data, + length * 6, + self.special_char1, + self.special_char2, + ) elif encoding == Encoding.FIRST_TO_LOWER_SPECIAL: encoded_data = self._encode_first_to_lower_special(input_string) - return MetaString(input_string, encoding, encoded_data, length * 5) + return MetaString( + input_string, + encoding, + encoded_data, + length * 5, + self.special_char1, + self.special_char2, + ) elif encoding == Encoding.ALL_TO_LOWER_SPECIAL: chars = list(input_string) upper_count = sum(1 for c in chars if c.isupper()) encoded_data = self._encode_all_to_lower_special(chars) return MetaString( - input_string, encoding, encoded_data, (upper_count + length) * 5 + input_string, + encoding, + encoded_data, + (upper_count + length) * 5, + self.special_char1, + 
self.special_char2, ) else: encoded_data = bytes(input_string, "utf-8") return MetaString( - input_string, Encoding.UTF_8, encoded_data, len(encoded_data) * 8 + input_string, + Encoding.UTF_8, + encoded_data, + len(encoded_data) * 8, + self.special_char1, + self.special_char2, ) def compute_encoding(self, input_string: str) -> Encoding: @@ -363,7 +434,12 @@ def _compute_statistics(self, chars: List[str]) -> Statistics: upper_count = 0 for c in chars: if can_lower_upper_digit_special_encoded: - if not (c.islower() or c.isupper() or c.isdigit() or c in {".", "_"}): + if not ( + c.islower() + or c.isupper() + or c.isdigit() + or c in {self.special_char1, self.special_char2} + ): can_lower_upper_digit_special_encoded = False if can_lower_special_encoded: if not (c.islower() or c in {".", "_", "$", "|"}): @@ -500,9 +576,9 @@ def _char_to_value(self, c: str, bits_per_char: int) -> int: return 26 + (ord(c) - ord("A")) elif "0" <= c <= "9": return 52 + (ord(c) - ord("0")) - elif c == ".": + elif c == self.special_char1: return 62 - elif c == "_": + elif c == self.special_char2: return 63 else: raise ValueError( diff --git a/python/pyfury/tests/test_metastring.py b/python/pyfury/tests/test_metastring.py index 7dd98ff73e..95596edf21 100644 --- a/python/pyfury/tests/test_metastring.py +++ b/python/pyfury/tests/test_metastring.py @@ -24,8 +24,10 @@ def test_encode_metastring_lower_special(): - encoder = MetaStringEncoder() - decoder = MetaStringDecoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + + # Test for encoding and decoding encoded = encoder._encode_lower_special("abc_def") assert len(encoded) == 5 assert len(encoder.encode("org.apache.fury.benchmark.data").encoded_data) == 19 @@ -41,10 +43,12 @@ def test_encode_metastring_lower_special(): def test_encode_metastring_lower_upper_digit_special(): - encoder = MetaStringEncoder() + encoder = MetaStringEncoder(special_char1=".", 
special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + + # Test for encoding and decoding encoded = encoder._encode_lower_upper_digit_special("ExampleInput123") assert len(encoded) == 12 - decoder = MetaStringDecoder() decoded = decoder.decode(encoded, Encoding.LOWER_UPPER_DIGIT_SPECIAL) assert decoded == "ExampleInput123" @@ -73,8 +77,9 @@ def create_string(length): def test_metastring(): + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") - encoder = MetaStringEncoder() for i in range(1, 128): try: string = create_string(i) @@ -82,7 +87,6 @@ def test_metastring(): assert metastring.encoding != Encoding.UTF_8 assert metastring.original == string - decoder = MetaStringDecoder() new_string = decoder.decode(metastring.encoded_data, metastring.encoding) assert new_string == string except Exception as e: @@ -90,8 +94,9 @@ def test_metastring(): def test_encode_empty_string(): - encoder = MetaStringEncoder() - decoder = MetaStringDecoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + for encoding in [ Encoding.LOWER_SPECIAL, Encoding.LOWER_UPPER_DIGIT_SPECIAL, @@ -106,7 +111,7 @@ def test_encode_empty_string(): def test_encode_characters_outside_of_lower_special(): - encoder = MetaStringEncoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") test_string = "abcdefABCDEF1234!@#" metastring = encoder.encode(test_string) @@ -114,8 +119,9 @@ def test_encode_characters_outside_of_lower_special(): def test_all_to_upper_special_encoding(): - encoder = MetaStringEncoder() - decoder = MetaStringDecoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + test_string = "ABC_DEF" metastring = encoder.encode(test_string) assert metastring.encoding == 
Encoding.LOWER_UPPER_DIGIT_SPECIAL @@ -124,8 +130,9 @@ def test_all_to_upper_special_encoding(): def test_first_to_lower_special_encoding(): - encoder = MetaStringEncoder() - decoder = MetaStringDecoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + test_string = "Aabcdef" metastring = encoder.encode(test_string) assert metastring.encoding == Encoding.FIRST_TO_LOWER_SPECIAL @@ -134,8 +141,9 @@ def test_first_to_lower_special_encoding(): def test_utf8_encoding(): - encoder = MetaStringEncoder() - decoder = MetaStringDecoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + test_string = "你好,世界" # Non-Latin characters metastring = encoder.encode(test_string) assert metastring.encoding == Encoding.UTF_8 @@ -144,7 +152,7 @@ def test_utf8_encoding(): def test_strip_last_char(): - encoder = MetaStringEncoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") test_string = "abc" # encoded as 1|00000|00, 001|00010, exactly two bytes encoded_metastring = encoder.encode(test_string) @@ -156,8 +164,9 @@ def test_strip_last_char(): def test_empty_string(): - encoder = MetaStringEncoder() - decoder = MetaStringDecoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + metastring = encoder.encode("") assert metastring.encoded_data == bytes() @@ -166,7 +175,7 @@ def test_empty_string(): def test_ascii_encoding(): - encoder = MetaStringEncoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") test_string = "asciiOnly" encoded_metastring = encoder.encode(test_string) @@ -175,7 +184,7 @@ def test_ascii_encoding(): def test_non_ascii_encoding(): - encoder = MetaStringEncoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") test_string = "こんにちは" # Non-ASCII 
string encoded_metastring = encoder.encode(test_string) @@ -183,7 +192,7 @@ def test_non_ascii_encoding(): def test_non_ascii_encoding_and_non_utf8(): - encoder = MetaStringEncoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") non_ascii_string = "こんにちは" # Non-ASCII string