From 9d06390a077ed782d7cda49d0104d2e00ddd6083 Mon Sep 17 00:00:00 2001
From: zhuwenxing
Date: Fri, 1 Nov 2024 10:06:40 +0800
Subject: [PATCH] test: fix tokenizer and monkey patch faker function

Signed-off-by: zhuwenxing
---
 tests/python_client/common/common_func.py    | 76 ++++++++++++++++++-
 .../testcases/test_full_text_search.py       | 14 +++-
 tests/python_client/testcases/test_query.py  | 15 ++--
 tests/python_client/testcases/test_search.py |  5 ++
 4 files changed, 100 insertions(+), 10 deletions(-)

diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py
index d0770c62a7010..80dc98f15fe85 100644
--- a/tests/python_client/common/common_func.py
+++ b/tests/python_client/common/common_func.py
@@ -80,6 +80,72 @@ def prepare_param_info(self, host, port, handler, replica_num, user, password, s
 param_info = ParamInfo()
 
 
+en_vocabularies_distribution = {
+    "hello": 0.01,
+    "milvus": 0.01,
+    "vector": 0.01,
+    "database": 0.01
+}
+
+zh_vocabularies_distribution = {
+    "你好": 0.01,
+    "向量": 0.01,
+    "数据": 0.01,
+    "库": 0.01
+}
+
+
+def patch_faker_text(fake_instance, vocabularies_distribution):
+    """
+    Monkey patch the text() method of a Faker instance to include custom vocabulary.
+    Each word in vocabularies_distribution has an independent chance to be inserted.
+
+    Args:
+        fake_instance: Faker instance to patch
+        vocabularies_distribution: Dictionary where:
+            - key: word to insert
+            - value: probability (0-1) of inserting this word into each sentence
+
+    Example:
+        vocabularies_distribution = {
+            "hello": 0.1,   # 10% chance to insert "hello" in each sentence
+            "milvus": 0.1,  # 10% chance to insert "milvus" in each sentence
+        }
+    """
+    original_text = fake_instance.text
+
+    def new_text(nb_sentences=100, *args, **kwargs):
+        sentences = []
+        # Split original text into sentences
+        original_sentences = original_text(nb_sentences).split('.')
+        original_sentences = [s.strip() for s in original_sentences if s.strip()]
+
+        for base_sentence in original_sentences:
+            words = base_sentence.split()
+
+            # Independently decide whether to insert each word
+            for word, probability in vocabularies_distribution.items():
+                if random.random() < probability:
+                    # Choose random position to insert the word
+                    insert_pos = random.randint(0, len(words))
+                    words.insert(insert_pos, word)
+
+            # Reconstruct the sentence
+            base_sentence = ' '.join(words)
+
+            # Ensure proper capitalization
+            base_sentence = base_sentence[0].upper() + base_sentence[1:]
+            sentences.append(base_sentence)
+
+        return '. '.join(sentences) + '.'
+
+    # Replace the original text method with our custom one
+    fake_instance.text = new_text
+
+
 def get_bm25_ground_truth(corpus, queries, top_k=100, language="en"):
     """
@@ -147,6 +213,14 @@ def blank_space_split(text):
     )
     return tokenizer
 
+def manual_check_text_match(df, word, col):
+    id_list = []
+    for i in range(len(df)):
+        row = df.iloc[i]
+        # log.info(f"word :{word}, row: {row[col]}")
+        if word in row[col]:
+            id_list.append(row["id"])
+    return id_list
 
 def analyze_documents(texts, language="en"):
@@ -188,8 +262,8 @@ def check_token_overlap(text_a, text_b, language="en"):
 
 def split_dataframes(df, fields, language="en"):
     df_copy = df.copy()
-    tokenizer = custom_tokenizer(language)
     for col in fields:
+        tokenizer = custom_tokenizer(language)
         texts = df[col].to_list()
         tokenized = tokenizer.tokenize(texts, return_as="tuple")
         new_texts = []
diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py
index 4cd8a7c8a037e..c2b149ca95840 100644
--- a/tests/python_client/testcases/test_full_text_search.py
+++ b/tests/python_client/testcases/test_full_text_search.py
@@ -15,6 +15,11 @@
 Faker.seed(19530)
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
+
+# patch faker to generate text with specific distribution
+cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
+cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
+
 pd.set_option("expand_frame_repr", False)
 
 prefix = "full_text_search_collection"
@@ -2214,6 +2219,7 @@ def test_full_text_search_default(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
+        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2429,9 +2435,10 @@ def test_full_text_search_with_jieba_tokenizer(
         collection_w.create_index("text", {"index_type": "INVERTED"})
         collection_w.load()
         limit = 100
-        search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
+        token = random.choice(tokens)
+        search_data = [fake.text().lower() + " " + token for _ in range(nq)]
         if expr == "text_match":
-            filter = f"TextMatch(text, '{tokens[0]}')"
+            filter = f"TextMatch(text, '{token}')"
             res, _ = collection_w.query(
                 expr=filter,
             )
@@ -2488,7 +2495,7 @@ def test_full_text_search_with_jieba_tokenizer(
             result_text = r.text
             # verify search result satisfies the filter
             if expr == "text_match":
-                assert tokens[0] in result_text
+                assert token in result_text
             if expr == "id_range":
                 assert _id < data_size // 2
             # verify search result has overlap with search text
@@ -2497,7 +2504,6 @@ def test_full_text_search_with_jieba_tokenizer(
             assert len(
                 overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
 
-
     @pytest.mark.tags(CaseLabel.L1)
     @pytest.mark.parametrize("nq", [2])
     @pytest.mark.parametrize("empty_percent", [0])
diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py
index b7c2553caaaa5..1929c6c969e42 100644
--- a/tests/python_client/testcases/test_query.py
+++ b/tests/python_client/testcases/test_query.py
@@ -29,6 +29,11 @@
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
 fake_de = Faker("de_DE")
+
+# patch faker to generate text with specific distribution
+cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
+cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
+
 pd.set_option("expand_frame_repr", False)
 
 
@@ -4787,6 +4792,7 @@ def test_query_text_match_with_combined_expression_for_single_field(self):
             wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
 
         df_new = cf.split_dataframes(df, fields=text_fields)
+        log.info(f"df \n{df}")
         log.info(f"new df \n{df_new}")
         for field in text_fields:
             expr_list = []
@@ -4796,16 +4802,15 @@ def test_query_text_match_with_combined_expression_for_single_field(self):
                 tmp = f"TextMatch({field}, '{word}')"
                 log.info(f"tmp expr {tmp}")
                 expr_list.append(tmp)
-                manual_result = df_new[
-                    df_new.apply(lambda row: word in row[field], axis=1)
-                ]
-                tmp_res = set(manual_result["id"].tolist())
-                log.info(f"manual check result for {tmp} {len(manual_result)}")
+                tmp_res = cf.manual_check_text_match(df_new, word, field)
+                log.info(f"manual check result for {tmp} {len(tmp_res)}")
                 pd_tmp_res_list.append(tmp_res)
+            log.info(f"manual res {len(pd_tmp_res_list)}, {pd_tmp_res_list}")
             final_res = set(pd_tmp_res_list[0])
             for i in range(1, len(pd_tmp_res_list)):
                 final_res = final_res.intersection(set(pd_tmp_res_list[i]))
             log.info(f"intersection res {len(final_res)}")
+            log.info(f"final res {final_res}")
             and_expr = " and ".join(expr_list)
             log.info(f"expr: {and_expr}")
             res, _ = collection_w.query(expr=and_expr, output_fields=text_fields)
diff --git a/tests/python_client/testcases/test_search.py b/tests/python_client/testcases/test_search.py
index e320c91c1e3d4..7f91f933b4985 100644
--- a/tests/python_client/testcases/test_search.py
+++ b/tests/python_client/testcases/test_search.py
@@ -29,6 +29,11 @@
 Faker.seed(19530)
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
+
+# patch faker to generate text with specific distribution
+cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
+cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
+
 pd.set_option("expand_frame_repr", False)
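
Below is a minimal usage sketch of the patch_faker_text helper added above; it is illustrative only and not part of the patch. It assumes Faker is installed and that common_func is importable as cf (the alias the patched test modules already use); the corpus size and the word being counted are arbitrary choices for the example.

# Sketch: sanity-check that the monkey-patched text() injects the custom vocabulary.
import random

from faker import Faker
from common import common_func as cf  # assumed import path, mirroring the test modules

Faker.seed(19530)
random.seed(19530)
fake_en = Faker("en_US")

# Replace fake_en.text with the vocabulary-aware version defined in common_func.py
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)

# Each configured word has a 1% chance per generated sentence, so across many
# documents the custom words should appear in a small but non-zero fraction.
corpus = [fake_en.text().lower() for _ in range(200)]
hits = sum("milvus" in doc for doc in corpus)
print(f"documents containing 'milvus': {hits}/{len(corpus)}")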