test: fix tokenizer and monkey patch faker function (#37119)
/kind improvement

---------

Signed-off-by: zhuwenxing <[email protected]>
zhuwenxing authored Nov 5, 2024
1 parent 266ed5b commit 0fc6c63
Showing 7 changed files with 460 additions and 59 deletions.
20 changes: 7 additions & 13 deletions tests/python_client/chaos/checker.py
@@ -414,7 +414,7 @@ def __init__(self, collection_name=None, partition_name=None, shards_num=2, dim=
self.insert_data(nb=constants.ENTITIES_FOR_SEARCH, partition_name=self.p_name)
log.info(f"insert data for collection {c_name} cost {time.perf_counter() - t0}s")

self.initial_entities = self.c_wrap.num_entities # do as a flush
self.initial_entities = self.c_wrap.collection.num_entities
self.scale = 100000 # timestamp scale to make time.time() as int64

def insert_data(self, nb=constants.DELTA_PER_INS, partition_name=None):
@@ -759,8 +759,7 @@ class InsertFlushChecker(Checker):
def __init__(self, collection_name=None, flush=False, shards_num=2, schema=None):
super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
self._flush = flush
self.initial_entities = self.c_wrap.num_entities

self.initial_entities = self.c_wrap.collection.num_entities
def keep_running(self):
while True:
t0 = time.time()
@@ -803,17 +802,12 @@ def __init__(self, collection_name=None, shards_num=2, schema=None):
if collection_name is None:
collection_name = cf.gen_unique_str("FlushChecker_")
super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
self.initial_entities = self.c_wrap.num_entities
self.initial_entities = self.c_wrap.collection.num_entities

@trace()
def flush(self):
num_entities = self.c_wrap.num_entities
if num_entities >= (self.initial_entities + constants.DELTA_PER_INS):
result = True
self.initial_entities += constants.DELTA_PER_INS
else:
result = False
return num_entities, result
res, result = self.c_wrap.flush()
return res, result

@exception_handler()
def run_task(self):
@@ -839,7 +833,7 @@ def __init__(self, collection_name=None, flush=False, shards_num=2, schema=None)
collection_name = cf.gen_unique_str("InsertChecker_")
super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
self._flush = flush
self.initial_entities = self.c_wrap.num_entities
self.initial_entities = self.c_wrap.collection.num_entities
self.inserted_data = []
self.scale = 1 * 10 ** 6
self.start_time_stamp = int(time.time() * self.scale) # us
@@ -917,7 +911,7 @@ def __init__(self, collection_name=None, flush=False, shards_num=2, schema=None)
collection_name = cf.gen_unique_str("InsertChecker_")
super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
self._flush = flush
self.initial_entities = self.c_wrap.num_entities
self.initial_entities = self.c_wrap.collection.num_entities
self.inserted_data = []
self.scale = 1 * 10 ** 6
self.start_time_stamp = int(time.time() * self.scale) # us
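For context on the checker.py hunks above: the checkers previously read self.c_wrap.num_entities and relied on it to flush as a side effect (note the removed "# do as a flush" comment), and FlushChecker.flush() inferred a flush by comparing entity counts. The new code reads self.c_wrap.collection.num_entities directly and calls the wrapper's flush() explicitly. A minimal sketch of that explicit pattern against plain pymilvus follows; the host, port, and collection name are illustrative assumptions, not values from the commit.

# Sketch only, not part of the commit: flush explicitly, then read the entity count.
from pymilvus import Collection, connections

connections.connect("default", host="localhost", port="19530")  # assumed local Milvus

collection = Collection("FlushChecker_demo")  # assumes this collection already exists

collection.flush()                          # explicit flush, no reliance on side effects
initial_entities = collection.num_entities  # plain read of the flushed entity count
print(f"entities after flush: {initial_entities}")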
76 changes: 75 additions & 1 deletion tests/python_client/common/common_func.py
@@ -80,6 +80,72 @@ def prepare_param_info(self, host, port, handler, replica_num, user, password, s

param_info = ParamInfo()

en_vocabularies_distribution = {
"hello": 0.01,
"milvus": 0.01,
"vector": 0.01,
"database": 0.01
}

zh_vocabularies_distribution = {
"你好": 0.01,
"向量": 0.01,
"数据": 0.01,
"库": 0.01
}

def patch_faker_text(fake_instance, vocabularies_distribution):
"""
Monkey patch the text() method of a Faker instance to include custom vocabulary.
Each word in vocabularies_distribution has an independent chance to be inserted.
Args:
fake_instance: Faker instance to patch
vocabularies_distribution: Dictionary where:
- key: word to insert
- value: probability (0-1) of inserting this word into each sentence
Example:
vocabularies_distribution = {
"hello": 0.1, # 10% chance to insert "hello" in each sentence
"milvus": 0.1, # 10% chance to insert "milvus" in each sentence
}
"""
original_text = fake_instance.text

def new_text(nb_sentences=100, *args, **kwargs):
sentences = []
# Split original text into sentences
original_sentences = original_text(nb_sentences).split('.')
original_sentences = [s.strip() for s in original_sentences if s.strip()]

for base_sentence in original_sentences:
words = base_sentence.split()

# Independently decide whether to insert each word
for word, probability in vocabularies_distribution.items():
if random.random() < probability:
# Choose random position to insert the word
insert_pos = random.randint(0, len(words))
words.insert(insert_pos, word)

# Reconstruct the sentence
base_sentence = ' '.join(words)

# Ensure proper capitalization
base_sentence = base_sentence[0].upper() + base_sentence[1:]
sentences.append(base_sentence)

return '. '.join(sentences) + '.'

# Replace the original text method with our custom one
fake_instance.text = new_text


def get_bm25_ground_truth(corpus, queries, top_k=100, language="en"):
"""
@@ -147,6 +213,14 @@ def blank_space_split(text):
)
return tokenizer

def manual_check_text_match(df, word, col):
id_list = []
for i in range(len(df)):
row = df.iloc[i]
# log.info(f"word :{word}, row: {row[col]}")
if word in row[col]:
id_list.append(row["id"])
return id_list

def analyze_documents(texts, language="en"):

@@ -188,8 +262,8 @@ def check_token_overlap(text_a, text_b, language="en"):

def split_dataframes(df, fields, language="en"):
df_copy = df.copy()
tokenizer = custom_tokenizer(language)
for col in fields:
tokenizer = custom_tokenizer(language)
texts = df[col].to_list()
tokenized = tokenizer.tokenize(texts, return_as="tuple")
new_texts = []
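Below is a short usage sketch, not part of the commit, for the two helpers added to common_func.py above: patch_faker_text and manual_check_text_match. The import path and the DataFrame contents are assumptions for illustration.

# Usage sketch (not part of the commit) for patch_faker_text and manual_check_text_match.
import pandas as pd
from faker import Faker

from common import common_func as cf  # assumed import path, matching the test suite layout

Faker.seed(19530)
fake_en = Faker("en_US")

# Give "hello", "milvus", "vector" and "database" a 1% chance each of being
# inserted into every generated sentence.
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)

df = pd.DataFrame({"id": list(range(10)),
                   "text": [fake_en.text().lower() for _ in range(10)]})

# Ground-truth ids for a text-match style check: rows whose "text" contains "milvus".
matched_ids = cf.manual_check_text_match(df, "milvus", "text")
print(matched_ids)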
14 changes: 10 additions & 4 deletions tests/python_client/testcases/test_full_text_search.py
@@ -15,6 +15,11 @@
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)

pd.set_option("expand_frame_repr", False)

prefix = "full_text_search_collection"
@@ -2214,6 +2219,7 @@ def test_full_text_search_default(
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2429,9 +2435,10 @@ def test_full_text_search_with_jieba_tokenizer(
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
limit = 100
search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
token = random.choice(tokens)
search_data = [fake.text().lower() + " " + token for _ in range(nq)]
if expr == "text_match":
filter = f"text_match(text, '{tokens[0]}')"
filter = f"text_match(text, '{token}')"
res, _ = collection_w.query(
expr=filter,
)
@@ -2488,7 +2495,7 @@ def test_full_text_search_with_jieba_tokenizer(
result_text = r.text
# verify search result satisfies the filter
if expr == "text_match":
assert tokens[0] in result_text
assert token in result_text
if expr == "id_range":
assert _id < data_size // 2
# verify search result has overlap with search text
@@ -2497,7 +2504,6 @@
assert len(
overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"


@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nq", [2])
@pytest.mark.parametrize("empty_percent", [0])
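On the test_full_text_search.py hunks above: the jieba-tokenizer case previously appended a different random token to each query string while the text_match filter and the final assertion both used tokens[0], so the token being checked was not guaranteed to match the ones in the queries. The diff draws one token and reuses it for the query strings, the filter, and the assertion. A standalone sketch of that pattern follows; the token list, placeholder text, and nq value are illustrative, not taken from the test.

# Standalone sketch (not from the commit) of the single-token pattern used by the test.
import random

tokens = ["你好", "向量", "数据", "库"]  # illustrative tokens; the test derives them from its corpus
fake_text = "some generated text"        # stands in for fake.text().lower()
nq = 2

token = random.choice(tokens)                              # choose the token once...
search_data = [f"{fake_text} {token}" for _ in range(nq)]  # ...use it in every query string
filter_expr = f"text_match(text, '{token}')"               # ...and in the text_match filter (the test names it filter)

# After querying/searching with filter_expr, every returned text should contain
# the same token, so the check can safely be: assert token in result_text
print(search_data)
print(filter_expr)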