test: fix tokenizer and monkey patch faker function
Signed-off-by: zhuwenxing <[email protected]>
zhuwenxing committed Nov 1, 2024
1 parent 0ac8b16 commit 9d06390
Showing 4 changed files with 100 additions and 10 deletions.
76 changes: 75 additions & 1 deletion tests/python_client/common/common_func.py
@@ -80,6 +80,72 @@ def prepare_param_info(self, host, port, handler, replica_num, user, password, s

param_info = ParamInfo()

en_vocabularies_distribution = {
"hello": 0.01,
"milvus": 0.01,
"vector": 0.01,
"database": 0.01
}

zh_vocabularies_distribution = {
"你好": 0.01,
"向量": 0.01,
"数据": 0.01,
"库": 0.01
}

def patch_faker_text(fake_instance, vocabularies_distribution):
"""
Monkey patch the text() method of a Faker instance to include custom vocabulary.
Each word in vocabularies_distribution has an independent chance to be inserted.
Args:
fake_instance: Faker instance to patch
vocabularies_distribution: Dictionary where:
- key: word to insert
- value: probability (0-1) of inserting this word into each sentence
Example:
vocabularies_distribution = {
"hello": 0.1, # 10% chance to insert "hello" in each sentence
"milvus": 0.1, # 10% chance to insert "milvus" in each sentence
}
"""
original_text = fake_instance.text

def new_text(nb_sentences=100, *args, **kwargs):
sentences = []
# Split original text into sentences
original_sentences = original_text(nb_sentences).split('.')
original_sentences = [s.strip() for s in original_sentences if s.strip()]

for base_sentence in original_sentences:
words = base_sentence.split()

# Independently decide whether to insert each word
for word, probability in vocabularies_distribution.items():
if random.random() < probability:
# Choose random position to insert the word
insert_pos = random.randint(0, len(words))
words.insert(insert_pos, word)

# Reconstruct the sentence
base_sentence = ' '.join(words)

# Ensure proper capitalization
base_sentence = base_sentence[0].upper() + base_sentence[1:]
sentences.append(base_sentence)

return '. '.join(sentences) + '.'

# Replace the original text method with our custom one
fake_instance.text = new_text


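For context, a minimal usage sketch of the patch above. The `cf` import path is an assumption based on the repo layout (the test files below use the same alias); the seed value comes from those files, and the final print is illustrative only:

from faker import Faker
from common import common_func as cf

Faker.seed(19530)
fake_en = Faker("en_US")
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)

# After patching, each sentence produced by fake_en.text() has an independent
# 1% chance of containing each custom token ("hello", "milvus", "vector", "database").
sample = fake_en.text()
print(sample)
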
def get_bm25_ground_truth(corpus, queries, top_k=100, language="en"):
"""
@@ -147,6 +213,14 @@ def blank_space_split(text):
)
return tokenizer

def manual_check_text_match(df, word, col):
id_list = []
for i in range(len(df)):
row = df.iloc[i]
# log.info(f"word :{word}, row: {row[col]}")
if word in row[col]:
id_list.append(row["id"])
return id_list

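A quick illustration of manual_check_text_match on a toy DataFrame (data invented for the example): the helper does per-row substring matching on the given column and collects the matching ids.

import pandas as pd

toy_df = pd.DataFrame({
    "id": [0, 1, 2],
    "text": ["hello milvus", "vector database", "hello world"],
})
manual_check_text_match(toy_df, "hello", "text")   # returns [0, 2]
manual_check_text_match(toy_df, "vector", "text")  # returns [1]
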
def analyze_documents(texts, language="en"):

@@ -188,8 +262,8 @@ def check_token_overlap(text_a, text_b, language="en"):

def split_dataframes(df, fields, language="en"):
df_copy = df.copy()
tokenizer = custom_tokenizer(language)
for col in fields:
tokenizer = custom_tokenizer(language)
texts = df[col].to_list()
tokenized = tokenizer.tokenize(texts, return_as="tuple")
new_texts = []
14 changes: 10 additions & 4 deletions tests/python_client/testcases/test_full_text_search.py
@@ -15,6 +15,11 @@
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)

pd.set_option("expand_frame_repr", False)

prefix = "full_text_search_collection"
@@ -2214,6 +2219,7 @@ def test_full_text_search_default(
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2429,9 +2435,10 @@ def test_full_text_search_with_jieba_tokenizer(
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
limit = 100
search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
token = random.choice(tokens)
search_data = [fake.text().lower() + " " + token for _ in range(nq)]
if expr == "text_match":
filter = f"TextMatch(text, '{tokens[0]}')"
filter = f"TextMatch(text, '{token}')"
res, _ = collection_w.query(
expr=filter,
)
@@ -2488,7 +2495,7 @@ def test_full_text_search_with_jieba_tokenizer(
result_text = r.text
# verify search result satisfies the filter
if expr == "text_match":
assert tokens[0] in result_text
assert token in result_text
if expr == "id_range":
assert _id < data_size // 2
# verify search result has overlap with search text
@@ -2497,7 +2504,6 @@
assert len(
overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
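The overlap and word_freq values in the assertion above presumably come from cf.check_token_overlap, which is defined in common_func.py but not shown in this diff; a rough sketch of the idea, with a hypothetical helper name and return shape:

from collections import Counter

def token_overlap_sketch(tokens_a, tokens_b):
    # Frequencies of each token in the query text and the result text,
    # plus the set of tokens the two texts share.
    word_freq_a, word_freq_b = Counter(tokens_a), Counter(tokens_b)
    overlap = set(word_freq_a) & set(word_freq_b)
    return overlap, word_freq_a, word_freq_b
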


@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nq", [2])
@pytest.mark.parametrize("empty_percent", [0])
15 changes: 10 additions & 5 deletions tests/python_client/testcases/test_query.py
@@ -29,6 +29,11 @@
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
fake_de = Faker("de_DE")

# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)

pd.set_option("expand_frame_repr", False)


@@ -4787,6 +4792,7 @@ def test_query_text_match_with_combined_expression_for_single_field(self):
wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)

df_new = cf.split_dataframes(df, fields=text_fields)
log.info(f"df \n{df}")
log.info(f"new df \n{df_new}")
for field in text_fields:
expr_list = []
@@ -4796,16 +4802,15 @@
tmp = f"TextMatch({field}, '{word}')"
log.info(f"tmp expr {tmp}")
expr_list.append(tmp)
manual_result = df_new[
df_new.apply(lambda row: word in row[field], axis=1)
]
tmp_res = set(manual_result["id"].tolist())
log.info(f"manual check result for {tmp} {len(manual_result)}")
tmp_res = cf.manual_check_text_match(df_new, word, field)
log.info(f"manual check result for {tmp} {len(tmp_res)}")
pd_tmp_res_list.append(tmp_res)
log.info(f"manual res {len(pd_tmp_res_list)}, {pd_tmp_res_list}")
final_res = set(pd_tmp_res_list[0])
for i in range(1, len(pd_tmp_res_list)):
final_res = final_res.intersection(set(pd_tmp_res_list[i]))
log.info(f"intersection res {len(final_res)}")
log.info(f"final res {final_res}")
and_expr = " and ".join(expr_list)
log.info(f"expr: {and_expr}")
res, _ = collection_w.query(expr=and_expr, output_fields=text_fields)
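For the combined "and" expression, the manual ground truth is the intersection of the per-word id lists returned by cf.manual_check_text_match; a small standalone sketch with made-up ids:

# made-up per-word match results for three TextMatch sub-expressions
pd_tmp_res_list = [[1, 2, 3], [2, 3, 4], [2, 5]]
final_res = set(pd_tmp_res_list[0])
for ids in pd_tmp_res_list[1:]:
    final_res = final_res.intersection(set(ids))
print(final_res)  # {2}: only ids that match every word satisfy the AND-ed expression
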
5 changes: 5 additions & 0 deletions tests/python_client/testcases/test_search.py
@@ -29,6 +29,11 @@
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)

pd.set_option("expand_frame_repr", False)


