diff --git a/columnar/src/columnar/writer/serializer.rs b/columnar/src/columnar/writer/serializer.rs index d3f8b04661..394e61cd96 100644 --- a/columnar/src/columnar/writer/serializer.rs +++ b/columnar/src/columnar/writer/serializer.rs @@ -18,7 +18,12 @@ pub struct ColumnarSerializer { /// code. fn prepare_key(key: &[u8], column_type: ColumnType, buffer: &mut Vec) { buffer.clear(); - buffer.extend_from_slice(key); + // Convert 0 bytes to '0' string, as 0 bytes are reserved for the end of the path. + if key.contains(&0u8) { + buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b })); + } else { + buffer.extend_from_slice(key); + } buffer.push(0u8); buffer.push(column_type.to_code()); } @@ -102,7 +107,7 @@ mod tests { let mut buffer: Vec = b"somegarbage".to_vec(); prepare_key(b"root\0child", ColumnType::Str, &mut buffer); assert_eq!(buffer.len(), 12); - assert_eq!(&buffer[..10], b"root\0child"); + assert_eq!(&buffer[..10], b"root0child"); assert_eq!(buffer[10], 0u8); assert_eq!(buffer[11], ColumnType::Str.to_code()); } diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 5b43a03301..eb2ae0b13e 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -144,6 +144,115 @@ mod tests_mmap { assert_eq!(num_docs, 256); } } + #[test] + fn test_json_field_null_byte() { + // Test when field name contains a zero byte, which has special meaning in tantivy. + // As a workaround, we convert the zero byte to the ASCII character '0'. + // https://github.com/quickwit-oss/tantivy/issues/2340 + // https://github.com/quickwit-oss/tantivy/issues/2193 + let field_name_in = "\u{0000}"; + let field_name_out = "0"; + test_json_field_name(field_name_in, field_name_out); + } + #[test] + fn test_json_field_1byte() { + // Test when field name contains a 1 byte, which has special meaning in tantivy. + let field_name_in = "\u{0001}"; + let field_name_out = "\u{0001}"; + test_json_field_name(field_name_in, field_name_out); + + // Test when field name contains a 1 byte, which has special meaning in tantivy. + let field_name_in = "\u{0001}"; + let field_name_out = "."; + test_json_field_name(field_name_in, field_name_out); + } + fn test_json_field_name(field_name_in: &str, field_name_out: &str) { + let mut schema_builder = Schema::builder(); + + let options = JsonObjectOptions::from(TEXT | FAST).set_expand_dots_enabled(); + let field = schema_builder.add_json_field("json", options); + let index = Index::create_in_ram(schema_builder.build()); + let mut index_writer = index.writer_for_tests().unwrap(); + index_writer + .add_document(doc!(field=>json!({format!("{field_name_in}"): "test1"}))) + .unwrap(); + index_writer + .add_document(doc!(field=>json!({format!("a{field_name_in}"): "test2"}))) + .unwrap(); + index_writer + .add_document(doc!(field=>json!({format!("a{field_name_in}a"): "test3"}))) + .unwrap(); + index_writer + .add_document( + doc!(field=>json!({format!("a{field_name_in}a{field_name_in}"): "test4"})), + ) + .unwrap(); + index_writer + .add_document( + doc!(field=>json!({format!("a{field_name_in}.ab{field_name_in}"): "test5"})), + ) + .unwrap(); + index_writer + .add_document( + doc!(field=>json!({format!("a{field_name_in}"): json!({format!("a{field_name_in}"): "test6"}) })), + ) + .unwrap(); + index_writer + .add_document(doc!(field=>json!({format!("{field_name_in}a" ): "test7"}))) + .unwrap(); + + index_writer.commit().unwrap(); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let parse_query = QueryParser::for_index(&index, Vec::new()); + let test_query = |field_name: &str| { + let query = parse_query.parse_query(field_name).unwrap(); + let num_docs = searcher.search(&query, &Count).unwrap(); + assert_eq!(num_docs, 1); + }; + test_query(format!("json.{field_name_out}:test1").as_str()); + test_query(format!("json.a{field_name_out}:test2").as_str()); + test_query(format!("json.a{field_name_out}a:test3").as_str()); + test_query(format!("json.a{field_name_out}a{field_name_out}:test4").as_str()); + test_query(format!("json.a{field_name_out}.ab{field_name_out}:test5").as_str()); + test_query(format!("json.a{field_name_out}.a{field_name_out}:test6").as_str()); + test_query(format!("json.{field_name_out}a:test7").as_str()); + + let test_agg = |field_name: &str, expected: &str| { + let agg_req_str = json!( + { + "termagg": { + "terms": { + "field": field_name, + } + } + }); + + let agg_req: Aggregations = serde_json::from_value(agg_req_str).unwrap(); + let collector = AggregationCollector::from_aggs(agg_req, Default::default()); + let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap(); + let res = serde_json::to_value(agg_res).unwrap(); + assert_eq!(res["termagg"]["buckets"][0]["doc_count"], 1); + assert_eq!(res["termagg"]["buckets"][0]["key"], expected); + }; + + test_agg(format!("json.{field_name_out}").as_str(), "test1"); + test_agg(format!("json.a{field_name_out}").as_str(), "test2"); + test_agg(format!("json.a{field_name_out}a").as_str(), "test3"); + test_agg( + format!("json.a{field_name_out}a{field_name_out}").as_str(), + "test4", + ); + test_agg( + format!("json.a{field_name_out}.ab{field_name_out}").as_str(), + "test5", + ); + test_agg( + format!("json.a{field_name_out}.a{field_name_out}").as_str(), + "test6", + ); + test_agg(format!("json.{field_name_out}a").as_str(), "test7"); + } #[test] fn test_json_field_expand_dots_enabled_dot_escape_not_required() { diff --git a/src/postings/json_postings_writer.rs b/src/postings/json_postings_writer.rs index 9f0d8eb06f..dfadca7ed7 100644 --- a/src/postings/json_postings_writer.rs +++ b/src/postings/json_postings_writer.rs @@ -67,10 +67,18 @@ impl PostingsWriter for JsonPostingsWriter { ) -> io::Result<()> { let mut term_buffer = Term::with_capacity(48); let mut buffer_lender = BufferLender::default(); + term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0)); + let mut prev_term_id = u32::MAX; + let mut term_path_len = 0; // this will be set in the first iteration for (_field, path_id, term, addr) in term_addrs { - term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0)); - term_buffer.append_bytes(ordered_id_to_path[path_id.path_id() as usize].as_bytes()); - term_buffer.append_bytes(&[JSON_END_OF_PATH]); + if prev_term_id != path_id.path_id() { + term_buffer.truncate_value_bytes(0); + term_buffer.append_path(ordered_id_to_path[path_id.path_id() as usize].as_bytes()); + term_buffer.append_bytes(&[JSON_END_OF_PATH]); + term_path_len = term_buffer.len_bytes(); + prev_term_id = path_id.path_id(); + } + term_buffer.truncate_value_bytes(term_path_len); term_buffer.append_bytes(term); if let Some(json_value) = term_buffer.value().as_json_value_bytes() { let typ = json_value.typ(); diff --git a/src/schema/term.rs b/src/schema/term.rs index db707e2948..2bfac53e82 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -218,6 +218,23 @@ impl Term { &mut self.0[len_before..] } + /// Appends json path bytes to the Term. + /// If the path contains 0 bytes, they are replaced by a "0" string. + /// The 0 byte is used to mark the end of the path. + /// + /// This function returns the segment that has just been added. + #[inline] + pub fn append_path(&mut self, bytes: &[u8]) -> &mut [u8] { + let len_before = self.0.len(); + if bytes.contains(&0u8) { + self.0 + .extend(bytes.iter().map(|&b| if b == 0 { b'0' } else { b })); + } else { + self.0.extend_from_slice(bytes); + } + &mut self.0[len_before..] + } + /// Appends a JSON_PATH_SEGMENT_SEP to the term. /// Only used for JSON type. #[inline]