test: fix tokenizer and monkey patch faker function #37119

Merged · 6 commits · Nov 5, 2024
20 changes: 7 additions & 13 deletions tests/python_client/chaos/checker.py
@@ -414,7 +414,7 @@ def __init__(self, collection_name=None, partition_name=None, shards_num=2, dim=
self.insert_data(nb=constants.ENTITIES_FOR_SEARCH, partition_name=self.p_name)
log.info(f"insert data for collection {c_name} cost {time.perf_counter() - t0}s")

- self.initial_entities = self.c_wrap.num_entities # do as a flush
+ self.initial_entities = self.c_wrap.collection.num_entities
self.scale = 100000 # timestamp scale to make time.time() as int64

def insert_data(self, nb=constants.DELTA_PER_INS, partition_name=None):
@@ -759,8 +759,7 @@ class InsertFlushChecker(Checker):
def __init__(self, collection_name=None, flush=False, shards_num=2, schema=None):
super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
self._flush = flush
- self.initial_entities = self.c_wrap.num_entities
-
+ self.initial_entities = self.c_wrap.collection.num_entities
def keep_running(self):
while True:
t0 = time.time()
@@ -803,17 +802,12 @@ def __init__(self, collection_name=None, shards_num=2, schema=None):
if collection_name is None:
collection_name = cf.gen_unique_str("FlushChecker_")
super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
- self.initial_entities = self.c_wrap.num_entities
+ self.initial_entities = self.c_wrap.collection.num_entities

@trace()
def flush(self):
- num_entities = self.c_wrap.num_entities
- if num_entities >= (self.initial_entities + constants.DELTA_PER_INS):
-     result = True
-     self.initial_entities += constants.DELTA_PER_INS
- else:
-     result = False
- return num_entities, result
+ res, result = self.c_wrap.flush()
+ return res, result

@exception_handler()
def run_task(self):
@@ -839,7 +833,7 @@ def __init__(self, collection_name=None, flush=False, shards_num=2, schema=None)
collection_name = cf.gen_unique_str("InsertChecker_")
super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
self._flush = flush
- self.initial_entities = self.c_wrap.num_entities
+ self.initial_entities = self.c_wrap.collection.num_entities
self.inserted_data = []
self.scale = 1 * 10 ** 6
self.start_time_stamp = int(time.time() * self.scale) # us
@@ -917,7 +911,7 @@ def __init__(self, collection_name=None, flush=False, shards_num=2, schema=None)
collection_name = cf.gen_unique_str("InsertChecker_")
super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
self._flush = flush
- self.initial_entities = self.c_wrap.num_entities
+ self.initial_entities = self.c_wrap.collection.num_entities
self.inserted_data = []
self.scale = 1 * 10 ** 6
self.start_time_stamp = int(time.time() * self.scale) # us
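For context on the checker changes above: the checkers now read entity counts from the underlying pymilvus Collection object (self.c_wrap.collection.num_entities), and FlushChecker verifies flushes by calling the wrapper's flush() directly instead of watching num_entities grow. A minimal standalone sketch of that pattern with plain pymilvus, assuming a Milvus instance at localhost:19530 and an already-created collection (the collection name below is illustrative, not from this PR):

from pymilvus import Collection, connections

connections.connect(host="localhost", port="19530")  # assumed local deployment
collection = Collection("chaos_checker_demo")         # hypothetical, pre-created collection

# Read the count directly from the Collection object, as the checkers
# now do via self.c_wrap.collection.num_entities.
initial_entities = collection.num_entities

# ... insert a batch of entities here ...

# Verify durability with an explicit flush() call instead of inferring
# success from num_entities growth.
collection.flush()
assert collection.num_entities >= initial_entities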
76 changes: 75 additions & 1 deletion tests/python_client/common/common_func.py
@@ -80,6 +80,72 @@ def prepare_param_info(self, host, port, handler, replica_num, user, password, s

param_info = ParamInfo()

en_vocabularies_distribution = {
"hello": 0.01,
"milvus": 0.01,
"vector": 0.01,
"database": 0.01
}

zh_vocabularies_distribution = {
"你好": 0.01,
"向量": 0.01,
"数据": 0.01,
"库": 0.01
}

def patch_faker_text(fake_instance, vocabularies_distribution):
"""
Monkey patch the text() method of a Faker instance to include custom vocabulary.
Each word in vocabularies_distribution has an independent chance to be inserted.

Args:
fake_instance: Faker instance to patch
vocabularies_distribution: Dictionary where:
- key: word to insert
- value: probability (0-1) of inserting this word into each sentence

Example:
vocabularies_distribution = {
"hello": 0.1, # 10% chance to insert "hello" in each sentence
"milvus": 0.1, # 10% chance to insert "milvus" in each sentence
}
"""
original_text = fake_instance.text

def new_text(nb_sentences=100, *args, **kwargs):
sentences = []
# Split original text into sentences
original_sentences = original_text(nb_sentences).split('.')
original_sentences = [s.strip() for s in original_sentences if s.strip()]

for base_sentence in original_sentences:
words = base_sentence.split()

# Independently decide whether to insert each word
for word, probability in vocabularies_distribution.items():
if random.random() < probability:
# Choose random position to insert the word
insert_pos = random.randint(0, len(words))
words.insert(insert_pos, word)

# Reconstruct the sentence
base_sentence = ' '.join(words)

# Ensure proper capitalization
base_sentence = base_sentence[0].upper() + base_sentence[1:]
sentences.append(base_sentence)

return '. '.join(sentences) + '.'



# Replace the original text method with our custom one
fake_instance.text = new_text





def get_bm25_ground_truth(corpus, queries, top_k=100, language="en"):
"""
@@ -147,6 +213,14 @@ def blank_space_split(text):
)
return tokenizer

def manual_check_text_match(df, word, col):
id_list = []
for i in range(len(df)):
row = df.iloc[i]
# log.info(f"word :{word}, row: {row[col]}")
if word in row[col]:
id_list.append(row["id"])
return id_list

def analyze_documents(texts, language="en"):

@@ -188,8 +262,8 @@ def check_token_overlap(text_a, text_b, language="en"):

def split_dataframes(df, fields, language="en"):
df_copy = df.copy()
- tokenizer = custom_tokenizer(language)
for col in fields:
+ tokenizer = custom_tokenizer(language)
texts = df[col].to_list()
tokenized = tokenizer.tokenize(texts, return_as="tuple")
new_texts = []
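A combined usage sketch for the two helpers added above, patch_faker_text and manual_check_text_match. This is a minimal sketch: the import path matches the one used elsewhere in these tests, while the DataFrame layout and the word counts are illustrative only.

import pandas as pd
from faker import Faker
from common import common_func as cf  # helper module shown in the diff above

fake_en = Faker("en_US")
Faker.seed(19530)
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)

# Each generated sentence now has a small, independent chance of containing
# one of the custom words ("hello", "milvus", "vector", "database").
df = pd.DataFrame({
    "id": list(range(1000)),
    "text": [fake_en.text() for _ in range(1000)],
})

# Brute-force oracle: ids of rows whose text contains the word, to compare
# against the ids returned by a text_match(text, 'milvus') filter in Milvus.
expected_ids = cf.manual_check_text_match(df, "milvus", "text")
print(f"documents containing 'milvus': {len(expected_ids)}")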
14 changes: 10 additions & 4 deletions tests/python_client/testcases/test_full_text_search.py
@@ -15,6 +15,11 @@
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

+ # patch faker to generate text with specific distribution
+ cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
+ cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)

pd.set_option("expand_frame_repr", False)

prefix = "full_text_search_collection"
@@ -2214,6 +2219,7 @@ def test_full_text_search_default(
if i + batch_size < len(df)
else data[i: len(df)]
)
+ collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2429,9 +2435,10 @@ def test_full_text_search_with_jieba_tokenizer(
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
limit = 100
- search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
+ token = random.choice(tokens)
+ search_data = [fake.text().lower() + " " + token for _ in range(nq)]
if expr == "text_match":
- filter = f"text_match(text, '{tokens[0]}')"
+ filter = f"text_match(text, '{token}')"
res, _ = collection_w.query(
expr=filter,
)
@@ -2488,7 +2495,7 @@ def test_full_text_search_with_jieba_tokenizer(
result_text = r.text
# verify search result satisfies the filter
if expr == "text_match":
- assert tokens[0] in result_text
+ assert token in result_text
if expr == "id_range":
assert _id < data_size // 2
# verify search result has overlap with search text
assert len(
overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"


@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nq", [2])
@pytest.mark.parametrize("empty_percent", [0])
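The jieba test change above makes the token usage consistent: one randomly chosen token is appended to every search string, used in the text_match filter, and checked in the final assertion, instead of mixing a per-query random.choice(tokens) with a hard-coded tokens[0]. A condensed sketch of the fixed pattern, with illustrative identifiers simplified from the test:

import random
from faker import Faker

fake_zh = Faker("zh_CN")            # as in the test module above
tokens = ["你好", "向量", "数据"]     # words from zh_vocabularies_distribution (illustrative token pool)
token = random.choice(tokens)       # choose the token once ...
nq = 2
search_data = [fake_zh.text().lower() + " " + token for _ in range(nq)]
filter = f"text_match(text, '{token}')"  # ... and reuse it in the filter
# Every hit returned under this filter contains token in its text field,
# so the later `assert token in result_text` checks the word that was actually filtered on.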