test: fix tokenizer and monkey patch faker function (#37119)
/kind improvement

---------

Signed-off-by: zhuwenxing <[email protected]>
zhuwenxing authored Nov 5, 2024
1 parent 266ed5b commit 0fc6c63
Showing 7 changed files with 460 additions and 59 deletions.
20 changes: 7 additions & 13 deletions tests/python_client/chaos/checker.py
@@ -414,7 +414,7 @@ def __init__(self, collection_name=None, partition_name=None, shards_num=2, dim=
self.insert_data(nb=constants.ENTITIES_FOR_SEARCH, partition_name=self.p_name)
log.info(f"insert data for collection {c_name} cost {time.perf_counter() - t0}s")

self.initial_entities = self.c_wrap.num_entities # do as a flush
self.initial_entities = self.c_wrap.collection.num_entities
self.scale = 100000 # timestamp scale to make time.time() as int64

def insert_data(self, nb=constants.DELTA_PER_INS, partition_name=None):
@@ -759,8 +759,7 @@ class InsertFlushChecker(Checker):
def __init__(self, collection_name=None, flush=False, shards_num=2, schema=None):
super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
self._flush = flush
self.initial_entities = self.c_wrap.num_entities

self.initial_entities = self.c_wrap.collection.num_entities
def keep_running(self):
while True:
t0 = time.time()
@@ -803,17 +802,12 @@ def __init__(self, collection_name=None, shards_num=2, schema=None):
if collection_name is None:
collection_name = cf.gen_unique_str("FlushChecker_")
super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
self.initial_entities = self.c_wrap.num_entities
self.initial_entities = self.c_wrap.collection.num_entities

@trace()
def flush(self):
num_entities = self.c_wrap.num_entities
if num_entities >= (self.initial_entities + constants.DELTA_PER_INS):
result = True
self.initial_entities += constants.DELTA_PER_INS
else:
result = False
return num_entities, result
res, result = self.c_wrap.flush()
return res, result

@exception_handler()
def run_task(self):
@@ -839,7 +833,7 @@ def __init__(self, collection_name=None, flush=False, shards_num=2, schema=None)
collection_name = cf.gen_unique_str("InsertChecker_")
super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
self._flush = flush
self.initial_entities = self.c_wrap.num_entities
self.initial_entities = self.c_wrap.collection.num_entities
self.inserted_data = []
self.scale = 1 * 10 ** 6
self.start_time_stamp = int(time.time() * self.scale) # us
@@ -917,7 +911,7 @@ def __init__(self, collection_name=None, flush=False, shards_num=2, schema=None)
collection_name = cf.gen_unique_str("InsertChecker_")
super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
self._flush = flush
self.initial_entities = self.c_wrap.num_entities
self.initial_entities = self.c_wrap.collection.num_entities
self.inserted_data = []
self.scale = 1 * 10 ** 6
self.start_time_stamp = int(time.time() * self.scale) # us
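For context on the checker.py hunks above: the checkers previously read self.c_wrap.num_entities and relied on it to flush as a side effect (note the removed "# do as a flush" comment), and FlushChecker.flush() inferred a flush by comparing entity counts. The new code reads self.c_wrap.collection.num_entities directly and calls the wrapper's flush() explicitly. A minimal sketch of that explicit pattern against plain pymilvus follows; the host, port, and collection name are illustrative assumptions, not values from the commit.

# Sketch only, not part of the commit: flush explicitly, then read the entity count.
from pymilvus import Collection, connections

connections.connect("default", host="localhost", port="19530")  # assumed local Milvus

collection = Collection("FlushChecker_demo")  # assumes this collection already exists

collection.flush()                          # explicit flush, no reliance on side effects
initial_entities = collection.num_entities  # plain read of the flushed entity count
print(f"entities after flush: {initial_entities}")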
76 changes: 75 additions & 1 deletion tests/python_client/common/common_func.py
@@ -80,6 +80,72 @@ def prepare_param_info(self, host, port, handler, replica_num, user, password, s

param_info = ParamInfo()

en_vocabularies_distribution = {
"hello": 0.01,
"milvus": 0.01,
"vector": 0.01,
"database": 0.01
}

zh_vocabularies_distribution = {
"你好": 0.01,
"向量": 0.01,
"数据": 0.01,
"库": 0.01
}

def patch_faker_text(fake_instance, vocabularies_distribution):
"""
Monkey patch the text() method of a Faker instance to include custom vocabulary.
Each word in vocabularies_distribution has an independent chance to be inserted.
Args:
fake_instance: Faker instance to patch
vocabularies_distribution: Dictionary where:
- key: word to insert
- value: probability (0-1) of inserting this word into each sentence
Example:
vocabularies_distribution = {
"hello": 0.1, # 10% chance to insert "hello" in each sentence
"milvus": 0.1, # 10% chance to insert "milvus" in each sentence
}
"""
original_text = fake_instance.text

def new_text(nb_sentences=100, *args, **kwargs):
sentences = []
# Split original text into sentences
original_sentences = original_text(nb_sentences).split('.')
original_sentences = [s.strip() for s in original_sentences if s.strip()]

for base_sentence in original_sentences:
words = base_sentence.split()

# Independently decide whether to insert each word
for word, probability in vocabularies_distribution.items():
if random.random() < probability:
# Choose random position to insert the word
insert_pos = random.randint(0, len(words))
words.insert(insert_pos, word)

# Reconstruct the sentence
base_sentence = ' '.join(words)

# Ensure proper capitalization
base_sentence = base_sentence[0].upper() + base_sentence[1:]
sentences.append(base_sentence)

return '. '.join(sentences) + '.'

# Replace the original text method with our custom one
fake_instance.text = new_text


def get_bm25_ground_truth(corpus, queries, top_k=100, language="en"):
"""
@@ -147,6 +213,14 @@ def blank_space_split(text):
)
return tokenizer

def manual_check_text_match(df, word, col):
id_list = []
for i in range(len(df)):
row = df.iloc[i]
# log.info(f"word :{word}, row: {row[col]}")
if word in row[col]:
id_list.append(row["id"])
return id_list

def analyze_documents(texts, language="en"):

@@ -188,8 +262,8 @@ def check_token_overlap(text_a, text_b, language="en"):

def split_dataframes(df, fields, language="en"):
df_copy = df.copy()
tokenizer = custom_tokenizer(language)
for col in fields:
tokenizer = custom_tokenizer(language)
texts = df[col].to_list()
tokenized = tokenizer.tokenize(texts, return_as="tuple")
new_texts = []
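Below is a short usage sketch, not part of the commit, for the two helpers added to common_func.py above: patch_faker_text and manual_check_text_match. The import path and the DataFrame contents are assumptions for illustration.

# Usage sketch (not part of the commit) for patch_faker_text and manual_check_text_match.
import pandas as pd
from faker import Faker

from common import common_func as cf  # assumed import path, matching the test suite layout

Faker.seed(19530)
fake_en = Faker("en_US")

# Give "hello", "milvus", "vector" and "database" a 1% chance each of being
# inserted into every generated sentence.
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)

df = pd.DataFrame({"id": list(range(10)),
                   "text": [fake_en.text().lower() for _ in range(10)]})

# Ground-truth ids for a text-match style check: rows whose "text" contains "milvus".
matched_ids = cf.manual_check_text_match(df, "milvus", "text")
print(matched_ids)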
14 changes: 10 additions & 4 deletions tests/python_client/testcases/test_full_text_search.py
@@ -15,6 +15,11 @@
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)

pd.set_option("expand_frame_repr", False)

prefix = "full_text_search_collection"
@@ -2214,6 +2219,7 @@ def test_full_text_search_default(
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2429,9 +2435,10 @@ def test_full_text_search_with_jieba_tokenizer(
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
limit = 100
search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
token = random.choice(tokens)
search_data = [fake.text().lower() + " " + token for _ in range(nq)]
if expr == "text_match":
filter = f"text_match(text, '{tokens[0]}')"
filter = f"text_match(text, '{token}')"
res, _ = collection_w.query(
expr=filter,
)
@@ -2488,7 +2495,7 @@ def test_full_text_search_with_jieba_tokenizer(
result_text = r.text
# verify search result satisfies the filter
if expr == "text_match":
assert tokens[0] in result_text
assert token in result_text
if expr == "id_range":
assert _id < data_size // 2
# verify search result has overlap with search text
@@ -2497,7 +2504,6 @@
assert len(
overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"


@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nq", [2])
@pytest.mark.parametrize("empty_percent", [0])
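On the test_full_text_search.py hunks above: the jieba-tokenizer case previously appended a different random token to each query string while the text_match filter and the final assertion both used tokens[0], so the token being checked was not guaranteed to match the ones in the queries. The diff draws one token and reuses it for the query strings, the filter, and the assertion. A standalone sketch of that pattern follows; the token list, placeholder text, and nq value are illustrative, not taken from the test.

# Standalone sketch (not from the commit) of the single-token pattern used by the test.
import random

tokens = ["你好", "向量", "数据", "库"]  # illustrative tokens; the test derives them from its corpus
fake_text = "some generated text"        # stands in for fake.text().lower()
nq = 2

token = random.choice(tokens)                              # choose the token once...
search_data = [f"{fake_text} {token}" for _ in range(nq)]  # ...use it in every query string
filter_expr = f"text_match(text, '{token}')"               # ...and in the text_match filter (the test names it filter)

# After querying/searching with filter_expr, every returned text should contain
# the same token, so the check can safely be: assert token in result_text
print(search_data)
print(filter_expr)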