Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
techofer committed Dec 28, 2024
1 parent 9f3e89b commit 5092033
Showing 1 changed file with 22 additions and 13 deletions.
35 changes: 22 additions & 13 deletions nbs/benchmark/covid_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,9 +155,12 @@ def rewrite_docs(docs,span_label,new_version):
return pd.DataFrame(new_tuples,columns=['P','D','V'])

file_paths = []
def main(input_dir,data_dir,logic_file, start=0, end=10):
def main(input_dir,data_dir,logic_file, start=0, end=10, cache=None):
global file_paths
sess = Session()
if VERSION in ["SPANNERFLOW", "SPANNERFLOW_PYTHON_IE"] and cache:
sess.engine.spannerflow_engine.set_cache(cache)

sess.register('py_rgx', rgx, [str, Span], span_arity)
sess.register('py_rgx_split', rgx_split, [str, Span], [Span,Span])
sess.register('py_rgx_is_match', rgx_is_match, [str, Span], [bool])
Expand Down Expand Up @@ -192,9 +195,7 @@ def main(input_dir,data_dir,logic_file, start=0, end=10):
sess.import_var('section_delimeter_pattern',section_delimeter_pattern)

# bring in data
file_paths = [Path(p) for p in glob(str(input_dir/'*.txt'))]
file_paths.sort()
file_paths = file_paths[start:end]
file_paths = [Path(f"{input_dir}/sample{i}.txt") for i in range(start, end)]
raw_docs = pd.DataFrame([
[p.name,p.read_text(),'raw_text'] for p in file_paths
],columns=['Path','Doc','Version']
Expand Down Expand Up @@ -246,15 +247,23 @@ def main(input_dir,data_dir,logic_file, start=0, end=10):
classification = paths.merge(doc_tags,on='P',how='outer')
classification['T']=classification['T'].fillna('UNK')

return classification

k = 10
total_docs = 0
for i in range(0, 101-k, k):
res = main(input_dir,data_dir,slog_file, start=i, end=i+k)
total_docs += len(res)
print(res)
if VERSION in ["SPANNERFLOW", "SPANNERFLOW_PYTHON_IE"]:
cache = sess.engine.spannerflow_engine.get_cache()
return cache, classification

# Benchmark driver: run `main` over the corpus in fixed-size batches, feeding
# the cache returned by one batch into the next, and report per-batch timing.
k = 1000       # total number of documents to process
steps = 100    # documents handled per call to main()
# 1-based batch counter. Deliberately not named `round`: a module-level
# `round` would shadow the builtin for everything else in this module.
round_num = 1
cache = {}     # empty (falsy) on the first batch, so main() skips set_cache
last_round_end_time = start_time
results_path = f'covid_data/results/{start_time}-{VERSION}.csv'
for i in range(1, k+1, steps):
    cache, res = main(input_dir,data_dir,slog_file, start=i, end=i+steps, cache=cache)
    # All batches are appended to one CSV. Only the first batch writes the
    # column header; otherwise every append would embed another header row
    # in the middle of the data.
    res.to_csv(results_path, index=False, mode='a', header=(round_num == 1))
    current_time = time.time()
    print(f"Time taken for round {round_num}: {current_time-last_round_end_time:.2f} seconds")
    round_num += 1
    last_round_end_time = current_time

end_time = time.time()
print(f"Number of Documents: {total_docs}")
print(f"Number of Documents: {k}")
print(f"Time taken: {end_time-start_time:.2f} seconds")

0 comments on commit 5092033

Please sign in to comment.