From 9d06390a077ed782d7cda49d0104d2e00ddd6083 Mon Sep 17 00:00:00 2001
From: zhuwenxing
Date: Fri, 1 Nov 2024 10:06:40 +0800
Subject: [PATCH] test: fix tokenizer and monkey patch faker function

Signed-off-by: zhuwenxing
---
 tests/python_client/common/common_func.py    | 76 ++++++++++++++++++-
 .../testcases/test_full_text_search.py       | 14 +++-
 tests/python_client/testcases/test_query.py  | 15 ++--
 tests/python_client/testcases/test_search.py |  5 ++
 4 files changed, 100 insertions(+), 10 deletions(-)

diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py
index d0770c62a7010..80dc98f15fe85 100644
--- a/tests/python_client/common/common_func.py
+++ b/tests/python_client/common/common_func.py
@@ -80,6 +80,72 @@ def prepare_param_info(self, host, port, handler, replica_num, user, password, s
 param_info = ParamInfo()
 
 
+en_vocabularies_distribution = {
+    "hello": 0.01,
+    "milvus": 0.01,
+    "vector": 0.01,
+    "database": 0.01
+}
+
+zh_vocabularies_distribution = {
+    "你好": 0.01,
+    "向量": 0.01,
+    "数据": 0.01,
+    "库": 0.01
+}
+
+
+def patch_faker_text(fake_instance, vocabularies_distribution):
+    """
+    Monkey patch the text() method of a Faker instance to include custom vocabulary.
+    Each word in vocabularies_distribution has an independent chance to be inserted.
+
+    Args:
+        fake_instance: Faker instance to patch
+        vocabularies_distribution: Dictionary where:
+            - key: word to insert
+            - value: probability (0-1) of inserting this word into each sentence
+
+    Example:
+        vocabularies_distribution = {
+            "hello": 0.1,   # 10% chance to insert "hello" in each sentence
+            "milvus": 0.1,  # 10% chance to insert "milvus" in each sentence
+        }
+    """
+    original_text = fake_instance.text
+
+    def new_text(nb_sentences=100, *args, **kwargs):
+        sentences = []
+        # Split original text into sentences
+        original_sentences = original_text(nb_sentences).split('.')
+        original_sentences = [s.strip() for s in original_sentences if s.strip()]
+
+        for base_sentence in original_sentences:
+            words = base_sentence.split()
+
+            # Independently decide whether to insert each word
+            for word, probability in vocabularies_distribution.items():
+                if random.random() < probability:
+                    # Choose random position to insert the word
+                    insert_pos = random.randint(0, len(words))
+                    words.insert(insert_pos, word)
+
+            # Reconstruct the sentence
+            base_sentence = ' '.join(words)
+
+            # Ensure proper capitalization
+            base_sentence = base_sentence[0].upper() + base_sentence[1:]
+            sentences.append(base_sentence)
+
+        return '. '.join(sentences) + '.'
+
+    # Replace the original text method with our custom one
+    fake_instance.text = new_text
+
+
 def get_bm25_ground_truth(corpus, queries, top_k=100, language="en"):
     """
@@ -147,6 +213,14 @@ def blank_space_split(text):
     )
     return tokenizer
 
+def manual_check_text_match(df, word, col):
+    id_list = []
+    for i in range(len(df)):
+        row = df.iloc[i]
+        # log.info(f"word :{word}, row: {row[col]}")
+        if word in row[col]:
+            id_list.append(row["id"])
+    return id_list
 
 def analyze_documents(texts, language="en"):
@@ -188,8 +262,8 @@ def check_token_overlap(text_a, text_b, language="en"):
 
 def split_dataframes(df, fields, language="en"):
     df_copy = df.copy()
-    tokenizer = custom_tokenizer(language)
     for col in fields:
+        tokenizer = custom_tokenizer(language)
         texts = df[col].to_list()
         tokenized = tokenizer.tokenize(texts, return_as="tuple")
         new_texts = []
diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py
index 4cd8a7c8a037e..c2b149ca95840 100644
--- a/tests/python_client/testcases/test_full_text_search.py
+++ b/tests/python_client/testcases/test_full_text_search.py
@@ -15,6 +15,11 @@
 Faker.seed(19530)
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
+
+# patch faker to generate text with specific distribution
+cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
+cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
+
 pd.set_option("expand_frame_repr", False)
 
 prefix = "full_text_search_collection"
@@ -2214,6 +2219,7 @@ def test_full_text_search_default(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
+        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2429,9 +2435,10 @@ def test_full_text_search_with_jieba_tokenizer(
         collection_w.create_index("text", {"index_type": "INVERTED"})
         collection_w.load()
         limit = 100
-        search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
+        token = random.choice(tokens)
+        search_data = [fake.text().lower() + " " + token for _ in range(nq)]
         if expr == "text_match":
-            filter = f"TextMatch(text, '{tokens[0]}')"
+            filter = f"TextMatch(text, '{token}')"
             res, _ = collection_w.query(
                 expr=filter,
             )
@@ -2488,7 +2495,7 @@ def test_full_text_search_with_jieba_tokenizer(
             result_text = r.text
             # verify search result satisfies the filter
             if expr == "text_match":
-                assert tokens[0] in result_text
+                assert token in result_text
             if expr == "id_range":
                 assert _id < data_size // 2
             # verify search result has overlap with search text
@@ -2497,7 +2504,6 @@ def test_full_text_search_with_jieba_tokenizer(
             assert len(
                 overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
 
-
     @pytest.mark.tags(CaseLabel.L1)
     @pytest.mark.parametrize("nq", [2])
     @pytest.mark.parametrize("empty_percent", [0])
diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py
index b7c2553caaaa5..1929c6c969e42 100644
--- a/tests/python_client/testcases/test_query.py
+++ b/tests/python_client/testcases/test_query.py
@@ -29,6 +29,11 @@
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
 fake_de = Faker("de_DE")
+
+# patch faker to generate text with specific distribution
+cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
+cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
+
 pd.set_option("expand_frame_repr", False)
 
 
@@ -4787,6 +4792,7 @@ def test_query_text_match_with_combined_expression_for_single_field(self):
             wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
 
         df_new = cf.split_dataframes(df, fields=text_fields)
+        log.info(f"df \n{df}")
         log.info(f"new df \n{df_new}")
         for field in text_fields:
             expr_list = []
@@ -4796,16 +4802,15 @@ def test_query_text_match_with_combined_expression_for_single_field(self):
                 tmp = f"TextMatch({field}, '{word}')"
                 log.info(f"tmp expr {tmp}")
                 expr_list.append(tmp)
-                manual_result = df_new[
-                    df_new.apply(lambda row: word in row[field], axis=1)
-                ]
-                tmp_res = set(manual_result["id"].tolist())
-                log.info(f"manual check result for {tmp} {len(manual_result)}")
+                tmp_res = cf.manual_check_text_match(df_new, word, field)
+                log.info(f"manual check result for {tmp} {len(tmp_res)}")
                 pd_tmp_res_list.append(tmp_res)
+            log.info(f"manual res {len(pd_tmp_res_list)}, {pd_tmp_res_list}")
             final_res = set(pd_tmp_res_list[0])
             for i in range(1, len(pd_tmp_res_list)):
                 final_res = final_res.intersection(set(pd_tmp_res_list[i]))
             log.info(f"intersection res {len(final_res)}")
+            log.info(f"final res {final_res}")
             and_expr = " and ".join(expr_list)
             log.info(f"expr: {and_expr}")
             res, _ = collection_w.query(expr=and_expr, output_fields=text_fields)
diff --git a/tests/python_client/testcases/test_search.py b/tests/python_client/testcases/test_search.py
index e320c91c1e3d4..7f91f933b4985 100644
--- a/tests/python_client/testcases/test_search.py
+++ b/tests/python_client/testcases/test_search.py
@@ -29,6 +29,11 @@
 Faker.seed(19530)
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
+
+# patch faker to generate text with specific distribution
+cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
+cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
+
 pd.set_option("expand_frame_repr", False)
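
Below is a minimal usage sketch of the patch_faker_text helper added above; it is illustrative only and not part of the patch. It assumes Faker is installed and that common_func is importable as cf (the alias the patched test modules already use); the corpus size and the word being counted are arbitrary choices for the example.

# Sketch: sanity-check that the monkey-patched text() injects the custom vocabulary.
import random

from faker import Faker
from common import common_func as cf  # assumed import path, mirroring the test modules

Faker.seed(19530)
random.seed(19530)
fake_en = Faker("en_US")

# Replace fake_en.text with the vocabulary-aware version defined in common_func.py
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)

# Each configured word has a 1% chance per generated sentence, so across many
# documents the custom words should appear in a small but non-zero fraction.
corpus = [fake_en.text().lower() for _ in range(200)]
hits = sum("milvus" in doc for doc in corpus)
print(f"documents containing 'milvus': {hits}/{len(corpus)}")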