Skip to content

Commit

Permalink
feat!: added null support
Browse files Browse the repository at this point in the history
  • Loading branch information
vicantwin committed Oct 10, 2024
1 parent 5bb55be commit a9abd29
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 26 deletions.
52 changes: 33 additions & 19 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use unicode_segmentation::UnicodeSegmentation;
#[serde(tag = "type", try_from = "PrecompiledDeserializer")]
pub struct Precompiled {
#[serde(serialize_with = "as_base64", deserialize_with = "from_base64")]
precompiled_charsmap: Vec<u8>,
precompiled_charsmap: Option<Vec<u8>>,
#[serde(skip)]
normalized: String,
#[serde(skip)]
Expand All @@ -38,31 +38,40 @@ pub struct Precompiled {
#[serde(tag = "type")]
struct PrecompiledDeserializer {
#[serde(deserialize_with = "from_base64")]
precompiled_charsmap: Vec<u8>,
precompiled_charsmap: Option<Vec<u8>>,
}

/// Serde `serialize_with` helper for the optional charsmap bytes.
///
/// `Some(k)` is written as a string containing the base64 encoding of
/// `k.as_ref()`; `None` is written through `serialize_none()`.
// NOTE(review): this listing is a rendered diff with the +/- markers stripped:
// the first signature and the one-line `serialize_str` body are the removed
// lines, each directly followed by its replacement.
fn as_base64<T, S>(key: &T, serializer: S) -> Result<S::Ok, S::Error>
fn as_base64<T, S>(key: &Option<T>, serializer: S) -> Result<S::Ok, S::Error>
where
T: AsRef<[u8]>,
S: Serializer,
{
serializer.serialize_str(&base64::encode(key.as_ref()))
match key {
Some(k) => serializer.serialize_str(&base64::encode(k.as_ref())),
None => serializer.serialize_none(),
}
}

fn from_base64<'de, D>(deserializer: D) -> Result<Vec<u8>, D::Error>
fn from_base64<'de, D>(deserializer: D) -> Result<Option<Vec<u8>>, D::Error>
where
D: Deserializer<'de>,
{
let s: &str = Deserialize::deserialize(deserializer)?;
let precompiled_charsmap = base64::decode(s).map_err(|err| Error::custom(err.to_string()))?;
Ok(precompiled_charsmap)
let opt_s: Option<&str> = Option::deserialize(deserializer)?;
match opt_s {
Some(s) => {
let precompiled_charsmap =
base64::decode(s).map_err(|err| Error::custom(err.to_string()))?;
Ok(Some(precompiled_charsmap))
}
None => Ok(None), // Handle the null case
}
}

// Converts the deserialized intermediate form into a `Precompiled` by handing
// its optional charsmap bytes to `Self::from`; any `PrecompiledError` from that
// call is returned unchanged.
// NOTE(review): rendered diff — the two consecutive `Self::from(...)` lines are
// the removed call and its replacement (`as_deref()` turns the
// `Option<Vec<u8>>` into `Option<&[u8]>`).
impl TryFrom<PrecompiledDeserializer> for Precompiled {
type Error = PrecompiledError;

fn try_from(t: PrecompiledDeserializer) -> Result<Self, Self::Error> {
Self::from(&t.precompiled_charsmap)
Self::from(t.precompiled_charsmap.as_deref())
}
}

Expand Down Expand Up @@ -158,16 +167,21 @@ impl std::fmt::Display for PrecompiledError {
// Empty impl: `PrecompiledError` relies on the default method implementations
// of `std::error::Error`.
impl std::error::Error for PrecompiledError {}

impl Precompiled {
/// Builds a `Precompiled` from an optional serialized charsmap.
///
/// With `Some(charsmap)` the bytes are split by `parse` into a normalized blob
/// and a trie blob; the normalized blob is converted to a `String` and the trie
/// blob to a `DoubleArray`, and a copy of the input bytes is stored in
/// `precompiled_charsmap`. With `None`, `Precompiled::default()` is returned.
///
/// # Errors
///
/// * `PrecompiledError::ParseError` if `parse` fails.
/// * `PrecompiledError::NormalizedInvalidUtf8` if the normalized blob is not
///   valid UTF-8.
// NOTE(review): rendered diff — the removed version (taking `&[u8]`) is listed
// first, immediately followed by the added version (taking `Option<&[u8]>`);
// the closing `};`, `Ok(precompiled)` and `}` are shared context lines.
pub fn from(precompiled_charsmap: &[u8]) -> Result<Precompiled, PrecompiledError> {
let (normalized_blob, trie_blob) =
parse(precompiled_charsmap).map_err(|_| PrecompiledError::ParseError)?;
let normalized = String::from_utf8(normalized_blob.to_vec())
.map_err(|_| PrecompiledError::NormalizedInvalidUtf8)?;
let trie = DoubleArray::from(trie_blob);
let precompiled = Precompiled {
precompiled_charsmap: precompiled_charsmap.to_vec(),
normalized,
trie,
pub fn from(precompiled_charsmap: Option<&[u8]>) -> Result<Precompiled, PrecompiledError> {
let precompiled = match precompiled_charsmap {
Some(charsmap) => {
// The underlying parse error is discarded and replaced by `ParseError`.
let (normalized_blob, trie_blob) =
parse(charsmap).map_err(|_| PrecompiledError::ParseError)?;
let normalized = String::from_utf8(normalized_blob.to_vec())
.map_err(|_| PrecompiledError::NormalizedInvalidUtf8)?;
let trie = DoubleArray::from(trie_blob);
Precompiled {
// Keep an owned copy of the raw bytes alongside the parsed pieces.
precompiled_charsmap: Some(charsmap.to_vec()),
normalized,
trie,
}
}
// No charsmap supplied: fall back to the default value.
None => Precompiled::default(),
};
Ok(precompiled)
}
Expand Down
27 changes: 20 additions & 7 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::io::Read;

#[test]
fn test_load_precompiled_map() {
let precompiled = Precompiled::from(&nmt_nfkc()).unwrap();
let precompiled = Precompiled::from(nmt_nfkc().as_deref()).unwrap();
let results = precompiled.trie.common_prefix_search("\u{fb01}".as_bytes());
assert_eq!(results, vec![2130]);
// Check the null termination
Expand All @@ -24,23 +24,23 @@ fn test_load_precompiled_map() {

// Loads the charsmap returned by `nmt_nfkc()` and checks `normalize_string` on
// a Thai sample against a fixed expected string whose combining marks are
// spelled as `\u{..}` escapes.
// NOTE(review): rendered diff — the two consecutive `let precompiled` lines are
// the removed call and its replacement.
#[test]
fn test_precompiled_failure_mode() {
let precompiled = Precompiled::from(&nmt_nfkc()).unwrap();
let precompiled = Precompiled::from(nmt_nfkc().as_deref()).unwrap();
let original = "เขาไม่ได้พูดสักคำ".to_string();
let normalized = "เขาไม\u{e48}ได\u{e49}\u{e39}ดส\u{e31}กค\u{e4d}า".to_string();
assert_eq!(precompiled.normalize_string(&original), normalized);
}

// Loads the charsmap returned by `nmt_nfkc()` and checks `normalize_string` on
// a Devanagari sample against a fixed expected string whose combining marks
// are spelled as `\u{..}` escapes.
// NOTE(review): rendered diff — the two consecutive `let precompiled` lines are
// the removed call and its replacement.
#[test]
fn test_precompiled_hindi() {
let precompiled = Precompiled::from(&nmt_nfkc()).unwrap();
let precompiled = Precompiled::from(nmt_nfkc().as_deref()).unwrap();
let original = "ड़ी दुख".to_string();
let normalized = "ड\u{93c}ी द\u{941}ख".to_string();
assert_eq!(precompiled.normalize_string(&original), normalized);
}

#[test]
fn test_precompiled_multi_char_replace_bug() {
let precompiled = Precompiled::from(&nmt_nfkc()).unwrap();
let precompiled = Precompiled::from(nmt_nfkc().as_deref()).unwrap();
// آپ
let original_bytes = vec![0xd8, 0xa7, 0xd9, 0x93];
let results = precompiled.trie.common_prefix_search(&original_bytes);
Expand All @@ -55,7 +55,7 @@ fn test_precompiled_multi_char_replace_bug() {

#[test]
fn test_serialization() {
let precompiled = Precompiled::from(&nmt_nfkc()).unwrap();
let precompiled = Precompiled::from(nmt_nfkc().as_deref()).unwrap();

let string = &serde_json::to_string(&precompiled).unwrap();
let reconstructed: Precompiled = serde_json::from_str(string).unwrap();
Expand All @@ -69,10 +69,23 @@ fn test_serialization() {
let _reconstructed2: Precompiled = serde_json::from_str(&string).unwrap();
}

fn nmt_nfkc() -> Vec<u8> {
// Round-trips `Precompiled::default()` through JSON: the serialized form must
// be exactly the tagged object with `precompiled_charsmap` set to `null`, and
// deserializing that text must give back a value equal to the original.
#[test]
fn test_null_serialization() {
let precompiled = Precompiled::default();

let string = &serde_json::to_string(&precompiled).unwrap();
let reconstructed: Precompiled = serde_json::from_str(string).unwrap();
assert_eq!(
string,
"{\"type\":\"Precompiled\",\"precompiled_charsmap\":null}"
);
assert_eq!(reconstructed, precompiled);
}

// Test fixture loader: reads the whole of `nmt_nfkc.bin` (path relative to the
// current working directory) into memory and returns it wrapped in `Some`.
// Panics if the file cannot be opened or read.
// NOTE(review): rendered diff — the bare `buffer` line is the removed return
// expression and `Some(buffer)` is its replacement.
fn nmt_nfkc() -> Option<Vec<u8>> {
let mut buffer = Vec::new();
let mut file = File::open("nmt_nfkc.bin").unwrap();
file.read_to_end(&mut buffer).unwrap();

buffer
Some(buffer)
}

0 comments on commit a9abd29

Please sign in to comment.