// =============================================================================
// File: go/fury/meta/meta_string.go (new file, package meta)
// =============================================================================

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// Encoding is the flag that identifies which algorithm was used to pack a
// meta string into bytes.
type Encoding uint8

// Supported encoding flags.
const (
	// UTF_8 stores the raw bytes of the string unchanged.
	UTF_8 Encoding = 0x00
	// LOWER_SPECIAL packs 'a'-'z', '.', '_', '$' and '|' into 5 bits per char.
	LOWER_SPECIAL Encoding = 0x01
	// LOWER_UPPER_DIGIT_SPECIAL packs letters, digits and the two configured
	// special characters into 6 bits per char.
	LOWER_UPPER_DIGIT_SPECIAL Encoding = 0x02
	// FIRST_TO_LOWER_SPECIAL lowers the leading uppercase letter and then
	// applies LOWER_SPECIAL.
	FIRST_TO_LOWER_SPECIAL Encoding = 0x03
	// ALL_TO_LOWER_SPECIAL rewrites every uppercase letter as '|' followed by
	// its lowercase form and then applies LOWER_SPECIAL.
	ALL_TO_LOWER_SPECIAL Encoding = 0x04
)

// MetaString bundles an input string with its encoded representation and the
// parameters that were used to produce it.
type MetaString struct {
	inputString  string
	encoding     Encoding // encoding flag
	specialChar1 byte
	specialChar2 byte
	encodedBytes []byte // serialized data
}

// GetInputString returns the original, unencoded string.
func (ms *MetaString) GetInputString() string { return ms.inputString }

// GetEncoding returns the encoding flag used for the encoded bytes.
func (ms *MetaString) GetEncoding() Encoding { return ms.encoding }

// GetSpecialChar1 returns the first special character of the encoder.
func (ms *MetaString) GetSpecialChar1() byte { return ms.specialChar1 }

// GetSpecialChar2 returns the second special character of the encoder.
func (ms *MetaString) GetSpecialChar2() byte { return ms.specialChar2 }

// GetEncodedBytes returns the encoded bytes (nil for an empty input string).
func (ms *MetaString) GetEncodedBytes() []byte { return ms.encodedBytes }

// StripLastChar reports whether the highest bit of the first encoded byte is
// set, i.e. whether the trailing padding is wide enough to look like one extra
// character that a decoder must drop. It is always false for UTF_8 and for
// empty encoded data.
func (ms *MetaString) StripLastChar() bool {
	// len() also covers an empty non-nil slice, which would otherwise panic on
	// the index below.
	if ms.encoding == UTF_8 || len(ms.encodedBytes) == 0 {
		return false
	}
	return ms.encodedBytes[0]&0x80 != 0
}

// =============================================================================
// File: go/fury/meta/meta_string_decoder.go (new file, package meta)
// Imports: "fmt"
// =============================================================================

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// Decoder turns encoded meta string bytes back into a string. The two special
// characters must match the ones the Encoder was created with.
type Decoder struct {
	specialChar1 byte
	specialChar2 byte
}

// NewDecoder returns a Decoder that maps the LOWER_UPPER_DIGIT_SPECIAL values
// 62 and 63 to specialCh1 and specialCh2 respectively.
func NewDecoder(specialCh1 byte, specialCh2 byte) *Decoder {
	return &Decoder{
		specialChar1: specialCh1,
		specialChar2: specialCh2,
	}
}

// Decode converts data, which was produced with the given encoding, back into
// the original string. Empty or nil data decodes to the empty string. An error
// is returned for an unknown encoding flag or for malformed data.
func (d *Decoder) Decode(data []byte, encoding Encoding) (result string, err error) {
	// The bit-packed encodings read a flag from data[0], so empty input
	// (nil or zero-length) is handled once here.
	if len(data) == 0 {
		return "", nil
	}
	var chars []byte
	switch encoding {
	case LOWER_SPECIAL, LOWER_UPPER_DIGIT_SPECIAL:
		chars, err = d.decodeGeneric(data, encoding)
	case FIRST_TO_LOWER_SPECIAL:
		chars, err = d.decodeGeneric(data, LOWER_SPECIAL)
		// Restore the capital only when there is a lowercase letter to restore;
		// malformed data may decode to zero characters or start with a symbol.
		if err == nil && len(chars) > 0 && chars[0] >= 'a' && chars[0] <= 'z' {
			chars[0] = chars[0] - 'a' + 'A'
		}
	case ALL_TO_LOWER_SPECIAL:
		chars, err = d.decodeRepAllToLowerSpecial(data, LOWER_SPECIAL)
	case UTF_8:
		chars = data
	default:
		err = fmt.Errorf("unexpected encoding flag: %v", encoding)
	}
	if err != nil {
		return "", err
	}
	return string(chars), nil
}

// decodeGeneric unpacks fixed-width character codes from data. algorithm must
// be LOWER_SPECIAL (5 bits per char) or LOWER_UPPER_DIGIT_SPECIAL (6 bits).
//
// Layout: bit 0 (the most significant bit of data[0]) is the "strip last char"
// flag; the character codes follow, most significant bit first. For example
// "abc" in LOWER_SPECIAL is 0|00000|00 001|00010, i.e. the bytes 0x00 0x22.
func (d *Decoder) decodeGeneric(data []byte, algorithm Encoding) ([]byte, error) {
	if len(data) == 0 {
		return []byte{}, nil
	}
	bitsPerChar := 5
	if algorithm == LOWER_UPPER_DIGIT_SPECIAL {
		bitsPerChar = 6
	}
	// Usable payload: everything but the flag bit, minus one character's worth
	// of padding when the flag says so.
	totBits := len(data)*8 - 1
	if data[0]&0x80 != 0 {
		totBits -= bitsPerChar
	}
	chars := make([]byte, totBits/bitsPerChar)
	bit := 1 // absolute bit index; bit 0 is the strip flag
	for i := range chars {
		var val byte
		for j := 0; j < bitsPerChar; j++ {
			val <<= 1
			if data[bit/8]&(byte(0x80)>>uint(bit%8)) != 0 {
				val |= 1
			}
			bit++
		}
		ch, err := d.decodeChar(val, algorithm)
		if err != nil {
			return nil, err
		}
		chars[i] = ch
	}
	return chars, nil
}

// decodeRepAllToLowerSpecial decodes data with decodeGeneric and then undoes
// the ALL_TO_LOWER_SPECIAL escaping: every '|' followed by a lowercase letter
// becomes the corresponding uppercase letter. A '|' that is not followed by a
// lowercase letter is reported as an error.
func (d *Decoder) decodeRepAllToLowerSpecial(data []byte, algorithm Encoding) ([]byte, error) {
	lowered, err := d.decodeGeneric(data, algorithm)
	if err != nil {
		return nil, err
	}
	chars := make([]byte, 0, len(lowered))
	for i := 0; i < len(lowered); i++ {
		c := lowered[i]
		if c != '|' {
			chars = append(chars, c)
			continue
		}
		i++ // consume the escaped letter
		if i >= len(lowered) || lowered[i] < 'a' || lowered[i] > 'z' {
			return nil, fmt.Errorf("invalid ALL_TO_LOWER_SPECIAL data: '|' at position %d is not followed by a lowercase letter", i-1)
		}
		chars = append(chars, lowered[i]-'a'+'A')
	}
	return chars, nil
}

// decodeChar maps one character code to its byte for the two bit-packed
// alphabets.
func (d *Decoder) decodeChar(val byte, encoding Encoding) (byte, error) {
	switch encoding {
	case LOWER_SPECIAL:
		return d.decodeLowerSpecialChar(val)
	case LOWER_UPPER_DIGIT_SPECIAL:
		return d.decodeLowerUpperDigitSpecialChar(val)
	}
	return 0, fmt.Errorf("illegal encoding flag: %v", encoding)
}

// decodeLowerSpecialChar maps a 5-bit LOWER_SPECIAL code to its character:
// 0-25 are 'a'-'z', 26 is '.', 27 is '_', 28 is '$' and 29 is '|'.
func (d *Decoder) decodeLowerSpecialChar(charValue byte) (byte, error) {
	switch {
	case charValue <= 25:
		return 'a' + charValue, nil
	case charValue == 26:
		return '.', nil
	case charValue == 27:
		return '_', nil
	case charValue == 28:
		return '$', nil
	case charValue == 29:
		return '|', nil
	}
	return 0, fmt.Errorf("invalid character value for LOWER_SPECIAL: %v", charValue)
}

// decodeLowerUpperDigitSpecialChar maps a 6-bit LOWER_UPPER_DIGIT_SPECIAL code
// to its character: 0-25 are 'a'-'z', 26-51 are 'A'-'Z', 52-61 are '0'-'9',
// 62 and 63 are the decoder's two special characters.
func (d *Decoder) decodeLowerUpperDigitSpecialChar(charValue byte) (byte, error) {
	switch {
	case charValue <= 25:
		return 'a' + charValue, nil
	case charValue <= 51:
		return 'A' + (charValue - 26), nil
	case charValue <= 61:
		return '0' + (charValue - 52), nil
	case charValue == 62:
		return d.specialChar1, nil
	case charValue == 63:
		return d.specialChar2, nil
	}
	return 0, fmt.Errorf("invalid character value for LOWER_UPPER_DIGIT_SPECIAL: %v", charValue)
}

// =============================================================================
// File: go/fury/meta/meta_string_encoder.go (new file, package meta)
// Imports: "errors", "fmt"
// =============================================================================

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// Encoder packs strings into the compact meta string representation. The two
// special characters are the extra symbols accepted by the
// LOWER_UPPER_DIGIT_SPECIAL alphabet.
type Encoder struct {
	specialChar1 byte
	specialChar2 byte
}

// NewEncoder returns an Encoder whose LOWER_UPPER_DIGIT_SPECIAL alphabet uses
// specialCh1 and specialCh2 for the codes 62 and 63.
func NewEncoder(specialCh1 byte, specialCh2 byte) *Encoder {
	return &Encoder{
		specialChar1: specialCh1,
		specialChar2: specialCh2,
	}
}

// Encode encodes input with the encoding selected by ComputeEncoding.
func (e *Encoder) Encode(input string) (MetaString, error) {
	encoding := e.ComputeEncoding(input)
	return e.EncodeWithEncoding(input, encoding)
}

// EncodeWithEncoding encodes input with the given encoding.
//
// An empty input yields a MetaString with nil encoded bytes. Inputs longer
// than 32767 bytes, inputs that contain characters outside the alphabet of the
// requested encoding, and unknown encoding flags are rejected with an error;
// on error the returned MetaString is the zero value.
func (e *Encoder) EncodeWithEncoding(input string, encoding Encoding) (MetaString, error) {
	const maxLength = 32767
	if len(input) > maxLength {
		return MetaString{}, errors.New("meta string longer than 32767 bytes is not allowed")
	}
	// The bit-packed formats prepend a flag bit, so the empty string is kept
	// as "no bytes at all" instead of a lone flag byte.
	var encodedBytes []byte
	if len(input) > 0 {
		var err error
		switch encoding {
		case LOWER_SPECIAL:
			encodedBytes, err = e.EncodeLowerSpecial(input)
		case LOWER_UPPER_DIGIT_SPECIAL:
			encodedBytes, err = e.EncodeLowerUpperDigitSpecial(input)
		case FIRST_TO_LOWER_SPECIAL:
			encodedBytes, err = e.EncodeFirstToLowerSpecial(input)
		case ALL_TO_LOWER_SPECIAL:
			encodedBytes, err = e.EncodeAllToLowerSpecial(input)
		case UTF_8:
			// UTF-8 is stored as-is.
			encodedBytes = []byte(input)
		default:
			err = fmt.Errorf("unexpected encoding flag: %v", encoding)
		}
		if err != nil {
			return MetaString{}, err
		}
	}
	return MetaString{
		inputString:  input,
		encoding:     encoding,
		specialChar1: e.specialChar1,
		specialChar2: e.specialChar2,
		encodedBytes: encodedBytes,
	}, nil
}

// EncodeLowerSpecial packs input with the 5-bit LOWER_SPECIAL alphabet.
func (e *Encoder) EncodeLowerSpecial(input string) ([]byte, error) {
	return e.EncodeGeneric([]byte(input), 5)
}

// EncodeLowerUpperDigitSpecial packs input with the 6-bit
// LOWER_UPPER_DIGIT_SPECIAL alphabet.
func (e *Encoder) EncodeLowerUpperDigitSpecial(input string) ([]byte, error) {
	return e.EncodeGeneric([]byte(input), 6)
}

// EncodeFirstToLowerSpecial lowers the leading uppercase ASCII letter of input
// and packs the result with the LOWER_SPECIAL alphabet. It returns an error
// when input is empty or does not start with 'A'-'Z'.
func (e *Encoder) EncodeFirstToLowerSpecial(input string) ([]byte, error) {
	if len(input) == 0 || input[0] < 'A' || input[0] > 'Z' {
		return nil, errors.New("input for FIRST_TO_LOWER_SPECIAL encoding must start with an uppercase ASCII letter")
	}
	chars := []byte(input)
	chars[0] += 'a' - 'A'
	return e.EncodeGeneric(chars, 5)
}

// EncodeAllToLowerSpecial rewrites every uppercase ASCII letter of input as
// '|' followed by its lowercase form and packs the result with the
// LOWER_SPECIAL alphabet.
func (e *Encoder) EncodeAllToLowerSpecial(input string) ([]byte, error) {
	chars := make([]byte, 0, len(input)+countUppers(input))
	for i := 0; i < len(input); i++ {
		c := input[i]
		if c >= 'A' && c <= 'Z' {
			chars = append(chars, '|', c-'A'+'a')
		} else {
			chars = append(chars, c)
		}
	}
	return e.EncodeGeneric(chars, 5)
}

// EncodeGeneric packs chars into a bit stream using bitsPerChar bits per
// character: 5 selects the LOWER_SPECIAL alphabet, 6 selects
// LOWER_UPPER_DIGIT_SPECIAL; any other width is an error.
//
// Layout: bit 0 (the most significant bit of the first byte) is the "strip last
// char" flag; character codes follow, most significant bit first. For example
// "abc" with 5 bits is 0|00000|00 001|00010, i.e. the bytes 0x00 0x22. The flag
// is set when the zero padding in the final byte is wide enough to be mistaken
// for one more character.
func (e *Encoder) EncodeGeneric(chars []byte, bitsPerChar int) (result []byte, err error) {
	var toValue func(byte) (byte, error)
	switch bitsPerChar {
	case 5:
		toValue = e.charToValueLowerSpecial
	case 6:
		toValue = e.charToValueLowerUpperDigitSpecial
	default:
		return nil, fmt.Errorf("unsupported bits per char: %d", bitsPerChar)
	}
	totBits := len(chars)*bitsPerChar + 1
	result = make([]byte, (totBits+7)/8)
	currentBit := 1 // bit 0 is reserved for the strip flag
	for _, c := range chars {
		value, convErr := toValue(c)
		if convErr != nil {
			return nil, convErr
		}
		for i := bitsPerChar - 1; i >= 0; i-- {
			if value&(1<<uint(i)) != 0 {
				result[currentBit/8] |= 0x80 >> uint(currentBit%8)
			}
			currentBit++
		}
	}
	if totBits+bitsPerChar <= len(result)*8 {
		result[0] |= 0x80
	}
	return result, nil
}

// ComputeEncoding picks the most compact encoding that can represent input.
//
// The FIRST_TO_LOWER_SPECIAL and ALL_TO_LOWER_SPECIAL variants are only chosen
// when every character still fits the LOWER_SPECIAL alphabet after lowering;
// ALL_TO_LOWER_SPECIAL is additionally skipped when input contains a literal
// '|', because that character is its escape marker.
func (e *Encoder) ComputeEncoding(input string) Encoding {
	stats := e.computeStringStatistics(input)
	if stats.canLowerSpecialEncoded {
		return LOWER_SPECIAL
	}
	if !stats.canLowerUpperDigitSpecialEncoded {
		return UTF_8
	}
	// From here on the string holds only letters, digits and the two
	// configured special characters.
	if stats.digitCount != 0 || !stats.canLowerSpecialIgnoringCase {
		return LOWER_UPPER_DIGIT_SPECIAL
	}
	if stats.upperCount == 1 && input[0] >= 'A' && input[0] <= 'Z' {
		return FIRST_TO_LOWER_SPECIAL
	}
	// Escaping costs one extra 5-bit char per uppercase letter; use it only
	// when that still beats 6 bits per char.
	if !stats.hasEscapeChar && (len(input)+stats.upperCount)*5 < len(input)*6 {
		return ALL_TO_LOWER_SPECIAL
	}
	return LOWER_UPPER_DIGIT_SPECIAL
}

// stringStatistics summarizes the character classes found in a string.
type stringStatistics struct {
	digitCount                       int
	upperCount                       int
	canLowerSpecialEncoded           bool
	canLowerUpperDigitSpecialEncoded bool
	// canLowerSpecialIgnoringCase is true when every character is a letter of
	// either case or one of '.', '_', '$', '|'.
	canLowerSpecialIgnoringCase bool
	// hasEscapeChar is true when the string contains a literal '|'.
	hasEscapeChar bool
}

// computeStringStatistics scans the bytes of input once and records which
// alphabets can represent it.
func (e *Encoder) computeStringStatistics(input string) *stringStatistics {
	stats := &stringStatistics{
		canLowerSpecialEncoded:           true,
		canLowerUpperDigitSpecialEncoded: true,
		canLowerSpecialIgnoringCase:      true,
	}
	for i := 0; i < len(input); i++ {
		c := input[i]
		isLower := c >= 'a' && c <= 'z'
		isUpper := c >= 'A' && c <= 'Z'
		isDigit := c >= '0' && c <= '9'
		isLowerSpecialSymbol := c == '.' || c == '_' || c == '$' || c == '|'

		if !(isLower || isUpper || isDigit || c == e.specialChar1 || c == e.specialChar2) {
			stats.canLowerUpperDigitSpecialEncoded = false
		}
		if !(isLower || isLowerSpecialSymbol) {
			stats.canLowerSpecialEncoded = false
		}
		if !(isLower || isUpper || isLowerSpecialSymbol) {
			stats.canLowerSpecialIgnoringCase = false
		}
		if c == '|' {
			stats.hasEscapeChar = true
		}
		if isDigit {
			stats.digitCount++
		}
		if isUpper {
			stats.upperCount++
		}
	}
	return stats
}

// countUppers returns the number of uppercase ASCII letters in str.
func countUppers(str string) int {
	cnt := 0
	for i := 0; i < len(str); i++ {
		if str[i] >= 'A' && str[i] <= 'Z' {
			cnt++
		}
	}
	return cnt
}

// charToValueLowerSpecial maps a character to its 5-bit LOWER_SPECIAL code:
// 'a'-'z' are 0-25, '.' is 26, '_' is 27, '$' is 28 and '|' is 29.
func (e *Encoder) charToValueLowerSpecial(c byte) (byte, error) {
	switch {
	case c >= 'a' && c <= 'z':
		return c - 'a', nil
	case c == '.':
		return 26, nil
	case c == '_':
		return 27, nil
	case c == '$':
		return 28, nil
	case c == '|':
		return 29, nil
	}
	return 0, fmt.Errorf("unsupported character for LOWER_SPECIAL encoding: %v", c)
}

// charToValueLowerUpperDigitSpecial maps a character to its 6-bit
// LOWER_UPPER_DIGIT_SPECIAL code: 'a'-'z' are 0-25, 'A'-'Z' are 26-51,
// '0'-'9' are 52-61, the encoder's special characters are 62 and 63.
func (e *Encoder) charToValueLowerUpperDigitSpecial(c byte) (byte, error) {
	switch {
	case c >= 'a' && c <= 'z':
		return c - 'a', nil
	case c >= 'A' && c <= 'Z':
		return 26 + (c - 'A'), nil
	case c >= '0' && c <= '9':
		return 52 + (c - '0'), nil
	case c == e.specialChar1:
		return 62, nil
	case c == e.specialChar2:
		return 63, nil
	}
	return 0, fmt.Errorf("unsupported character for LOWER_UPPER_DIGIT_SPECIAL encoding: %v", c)
}

// =============================================================================
// File: go/fury/meta/meta_string_test.go (new file, package meta)
// =============================================================================

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
+ +package meta + +import ( + "github.com/stretchr/testify/require" + "testing" +) + +func TestEncodeAndDecodeMetaString(t *testing.T) { + var data MetaString + var dst string + var err error + + str2bits := map[string]int{ + // "abc_def" should be encoded as 0|00000|00, 001|00010|, 11011|000, 11|00100|0, 0101 + "abc_def": 5, + "org.apache.fury.benchmark.data": 5, + "HelloWorld__123.2024": 6, + "MediaContent": 5, + "Apple_banana": 5, + "你好,世界": 0, // not used + } + str2encoding := map[string]Encoding{ + "abc_def": LOWER_SPECIAL, + "org.apache.fury.benchmark.data": LOWER_SPECIAL, + "MediaContent": ALL_TO_LOWER_SPECIAL, + "HelloWorld__123.2024": LOWER_UPPER_DIGIT_SPECIAL, + "Apple_banana": FIRST_TO_LOWER_SPECIAL, + "你好,世界": UTF_8, + } + encoder := NewEncoder('.', '_') + decoder := NewDecoder('.', '_') + + for src, bitsPerChar := range str2bits { + data, err = encoder.Encode(src) + require.Equal(t, nil, err) + require.Equal(t, str2encoding[src], data.GetEncoding()) + require.Equal(t, calcTotalBytes(src, bitsPerChar, data.GetEncoding()), len(data.GetEncodedBytes())) + dst, err = decoder.Decode(data.GetEncodedBytes(), data.GetEncoding()) + require.Equal(t, nil, err) + require.Equal(t, src, dst) + } + + // error situation + dst, err = decoder.Decode([]byte{0xFF, 0x31}, LOWER_SPECIAL) + require.NotEqual(t, nil, err) + + // empty string + data, err = encoder.Encode("") + require.Equal(t, nil, err) + require.Equal(t, 0, len(data.GetEncodedBytes())) + dst, err = decoder.Decode(data.GetEncodedBytes(), data.GetEncoding()) + require.Equal(t, nil, err) + require.Equal(t, "", dst) +} + +func calcTotalBytes(src string, bitsPerChar int, encoding Encoding) int { + if encoding == UTF_8 { + return len(src) + } + ret := len(src)*bitsPerChar + 1 + if encoding == ALL_TO_LOWER_SPECIAL { + ret += countUppers(src) * bitsPerChar + } + return (ret + 7) / 8 +}