
Commit

WIP
techofer committed Dec 23, 2024
1 parent 8aa7ec4 commit 10a9d06
Showing 10,042 changed files with 292 additions and 1,351 deletions.
1,065 changes: 0 additions & 1,065 deletions nbs/benchmark/001_covid.ipynb

This file was deleted.

213 changes: 213 additions & 0 deletions nbs/benchmark/covid_bench.py
@@ -0,0 +1,213 @@
import time
from glob import glob
import pandas as pd
from pandas import DataFrame
from pathlib import Path
from spannerlib import get_magic_session, Session, Span
sess = get_magic_session()

import spacy
nlp = spacy.load("en_core_web_sm")

# configurations
slog_file = Path('covid_bench_logic.pl')
input_dir = Path('covid_data/sample_inputs')
data_dir = Path('covid_data/rules_data')

start_time = time.time()

def split_sentence(text):
    """
    Splits a text into individual sentences using spaCy's sentence detection.
    Yields:
        Span: individual sentences extracted from the input text.
    """
    doc = nlp(str(text))
    start = 0
    for sentence in doc.sents:
        end = start + len(sentence.text)
        # note that we yield a Span object, so we can keep track of the locations of the sentences
        yield Span(text, start, end)
        start = end + 1
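# A minimal illustration of the generator above (hypothetical input; exact sentence
# boundaries depend on the spaCy model):
#   list(split_sentence("No cough today. Fever resolved."))
#   -> [Span over "No cough today.", Span over "Fever resolved."]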

class LemmaFromList():
    def __init__(self, lemma_list):
        self.lemma_list = lemma_list

    def __call__(self, text):
        doc = nlp(str(text))
        for word in doc:
            start = word.idx
            end = start + len(word.text)
            if word.lemma_ in self.lemma_list:
                yield (Span(text, start, end), word.lemma_)
            elif word.like_num:
                yield (Span(text, start, end), 'like_num')
            else:
                pass

lemma_list = (data_dir/'lemma_words.txt').read_text().split()
lemmatizer = LemmaFromList(lemma_list)

class PosFromList():
    def __init__(self, pos_list):
        self.pos_list = pos_list

    def __call__(self, text):
        doc = nlp(str(text))
        for word in doc:
            start = word.idx
            end = start + len(word.text)
            if word.pos_ in self.pos_list:
                yield (Span(text, start, end), word.pos_)
pos_annotator = PosFromList(["NOUN", "PROPN", "PRON", "ADJ"])
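# A hypothetical call, to illustrate the (Span, label) pairs the annotators yield
# (actual output depends on the spaCy model and on the word lists loaded above):
#   list(pos_annotator("persistent cough"))
#   -> [(Span over "cough", 'NOUN'), ...] if spaCy tags "cough" as a noun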

def agg_mention(group):
    """
    Aggregates the attribute tags attached to a single covid mention span into one label.
    """
    if 'IGNORE' in group.values:
        return 'IGNORE'
    elif 'negated' in group.values and 'no_negated' not in group.values:
        return 'negated'
    elif 'future' in group.values and 'no_future' not in group.values:
        return 'negated'
    elif 'other experiencer' in group.values or 'not relevant' in group.values:
        return 'negated'
    elif 'positive' in group.values and 'uncertain' not in group.values and 'no_positive' not in group.values:
        return 'positive'
    else:
        return 'uncertain'
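# For example (following the branches above), a mention tagged both 'positive' and
# 'negated' aggregates to 'negated', since the 'negated' branch is checked first:
#   agg_mention(pd.Series(['positive', 'negated']))  # -> 'negated'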

def AggDocumentTags(group):
    """
    Classifies a document as 'POS', 'UNK', or 'NEG' based on COVID-19 attributes.
    """
    if 'positive' in group.values:
        return 'POS'
    elif 'uncertain' in group.values:
        return 'UNK'
    elif 'negated' in group.values:
        return 'NEG'
    else:
        return 'UNK'
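# Example of the document-level aggregation: a single 'positive' mention outweighs
# any number of negated ones:
#   AggDocumentTags(pd.Series(['negated', 'negated', 'positive']))  # -> 'POS'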

def rewrite(text, span_label_pairs):
    """Rewrites a string given a dataframe of spans and the strings to rewrite them to.
    Assumes that the spans belong to the text.
    Args:
        text (str-like): string to rewrite
        span_label_pairs (pd.DataFrame): dataframe with two columns, the first holding the spans
            in the doc to rewrite, the second holding what to rewrite each span to
    Returns:
        The rewritten string
    """
    if isinstance(text, Span):
        text = text.as_str()
    span_label_pairs = sorted(
        list(span_label_pairs.itertuples(index=False, name=None)),
        key=lambda x: x[0].start)

    rewritten_text = ''
    current_pos = 0
    for span, label in span_label_pairs:
        rewritten_text += text[current_pos:span.start] + label
        current_pos = span.end

    rewritten_text += text[current_pos:]

    return rewritten_text
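# A small worked example (hypothetical data): rewriting the span covering "covid"
# in "covid test positive" to the tag 'CONCEPT' splices the replacement into the text:
#   text = "covid test positive"
#   pairs = pd.DataFrame([(Span(text, 0, 5), 'CONCEPT')])
#   rewrite(text, pairs)  # -> 'CONCEPT test positive'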


def rewrite_docs(docs, span_label, new_version):
    """Given a dataframe of documents of the form (path, doc, version) and a dataframe of spans to rewrite
    of the form (path, doc, from_span, to_tag), rewrites the documents and returns a new dataframe of the form
    (path, doc, new_version).
    """
    new_tuples = []
    span_label.columns = ['P', 'D', 'W', 'L']
    for path, doc, _ in docs.itertuples(index=False, name=None):
        span_label_per_doc = span_label[span_label['P'] == path][['W', 'L']]
        new_text = rewrite(doc, span_label_per_doc)
        new_tuples.append((path, new_text, new_version))
    return pd.DataFrame(new_tuples, columns=['P', 'D', 'V'])

file_paths = []
def main(input_dir, data_dir, logic_file, start=0, end=10):
    global file_paths
    sess = Session()
    # define callback functions
    sess.register('split_sentence', split_sentence, [(str, Span)], [Span])
    sess.register('pos', pos_annotator, [(Span, str)], [Span, str])
    sess.register('lemma', lemmatizer, [(Span, str)], [Span, str])
    sess.register_agg('agg_mention', agg_mention, [str], [str])
    sess.register_agg('agg_doc_tags', AggDocumentTags, [str], [str])

    # bring in code as data
    sess.import_rel("ConceptTagRules", data_dir/"concept_tags_rules.csv", delim=",")
    sess.import_rel("TargetTagRules", data_dir/"target_rules.csv", delim=",")
    sess.import_rel("SectionTags", data_dir/"section_tags.csv", delim=",")
    sess.import_rel("PositiveSectionTags", data_dir/"positive_section_tags.csv", delim=",")
    sess.import_rel("SentenceContextRules", data_dir/'sentence_context_rules.csv', delim="#")
    sess.import_rel("PostprocessPatternRules", data_dir/'postprocess_pattern_rules.csv', delim="#")
    sess.import_rel("PostprocessRulesWithAttributes", data_dir/'postprocess_attributes_rules.csv', delim="#")
    sess.import_rel("NextSentencePostprocessPatternRules", data_dir/'postprocess_pattern_next_sentence_rules.csv', delim=',')

    # we will programmatically build a regex that matches all the section patterns
    section_tags = pd.read_csv(data_dir/'section_tags.csv', names=['literal', 'tag'])
    section_delimeter_pattern = section_tags['literal'].str.cat(sep='|')
    sess.import_var('section_delimeter_pattern', section_delimeter_pattern)
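    # hypothetical illustration: if section_tags.csv listed the literals
    # "PAST MEDICAL HISTORY:" and "ASSESSMENT:", the pattern built above would be
    # "PAST MEDICAL HISTORY:|ASSESSMENT:", i.e. a regex alternation over all section headers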

    # bring in data
    file_paths = [Path(p) for p in glob(str(input_dir/'*.txt'))]
    file_paths.sort()
    file_paths = file_paths[start:end]
    raw_docs = pd.DataFrame([
        [p.name, p.read_text(), 'raw_text'] for p in file_paths
    ], columns=['Path', 'Doc', 'Version'])
    sess.import_rel('Docs', raw_docs)

    # load the logic; note that since we did not define the data relations in the logic file,
    # we need to load the logic after the data has been loaded
    sess.export(logic_file.read_text())

    ## Rewriting the documents
    lemma_tags = sess.export('?Lemmas(P,D,W,L)')
    lemma_docs = rewrite_docs(raw_docs, lemma_tags, 'lemma')
    sess.import_rel('Docs', lemma_docs)

    lemma_concept_matches = sess.export('?LemmaConceptMatches(Path,Doc,Span,Label)')
    lemma_concepts = rewrite_docs(lemma_docs, lemma_concept_matches, 'lemma_concept')
    sess.import_rel('Docs', lemma_concepts)

    pos_concept_matches = sess.export('?PosConceptMatches(P,D,W,L)')
    pos_concept_docs = rewrite_docs(lemma_concepts, pos_concept_matches, 'pos_concept')
    sess.import_rel('Docs', pos_concept_docs)

    target_matches = sess.export('?TargetMatches(P,D,W,L)')
    target_rule_docs = rewrite_docs(pos_concept_docs, target_matches, 'target_concept')
    sess.import_rel('Docs', target_rule_docs)

    ## computing the tags based on the target concept documents
    doc_tags = sess.export('?DocumentTags(P,T)')

    # handling files with no mentions
    paths = pd.DataFrame([p.name for p in file_paths], columns=['P'])
    classification = paths.merge(doc_tags, on='P', how='outer')
    classification['T'] = classification['T'].fillna('UNK')

    return classification


for i in range(0, 1000, 20):
    res = main(input_dir, data_dir, slog_file, start=i, end=i+20)
    print(res)

end_time = time.time()
# note: file_paths is reassigned inside main, so this reflects only the last processed batch
print(f"Number of Documents: {len(file_paths)}")
print(f"Time taken: {end_time-start_time:.2f} seconds")
79 changes: 79 additions & 0 deletions nbs/benchmark/covid_bench_logic.pl
@@ -0,0 +1,79 @@
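# Lemmas holds, for each raw document, the span of every word the lemma callback reports:
# a lemma that appears in lemma_words.txt, or 'like_num' for number-like tokens.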
Lemmas(P,D,Word,Lem)<-Docs(P,D,"raw_text"),lemma(D)->(Word,Lem).

LemmaConceptMatches(Path,Doc,Span,Label) <- \
Docs(Path,Doc,"lemma"),\
ConceptTagRules(Pattern, Label, "lemma"),\
rgx(Pattern,Doc) -> (Span).

# here we get the spans of all POS
Pos(P,D,Word,Lem)<-Docs(P,D,"lemma_concept"),pos(D)->(Word,Lem).

# here we look for concept rule matches where the matched word is also tagged via POS
PosConceptMatches(Path,Doc,Span,Label) <- \
Docs(Path,Doc,"lemma_concept"),\
ConceptTagRules(Pattern, Label, "pos"),\
rgx(Pattern,Doc) -> (Span),\
Pos(Path,Doc,Span,POSLabel).

TargetMatches(Path,Doc, Span, Label) <- \
Docs(Path,Doc,"pos_concept"),\
TargetTagRules(Pattern, Label), rgx(Pattern,Doc) -> (Span).

# we get section spans and their content using our regex pattern and the rgx_split ie function
Sections(P,D,Sec,Content)<-Docs(P,D,"target_concept"),\
rgx_split($section_delimeter_pattern,D)->(SecSpan,Content),\
as_str(SecSpan)->(Sec).

PositiveSections(P,D,Sec,Content)<-Sections(P,D,Sec,Content),SectionTags(Sec,Tag),PositiveSectionTags(Tag).

Sents(P,S)<-Docs(P,D,"target_concept"),split_sentence(D)->(S).

SentPairs(P,S1,S2)<-Sents(P,S1),Sents(P,S2),expr_eval("{0}.end +1 == {1}.start",S1,S2)->(True).
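# the expr_eval check above keeps a pair only when S2 starts right after S1 ends;
# e.g. if S1 covers characters [0,15) and S2 covers [16,31), then S1.end + 1 == S2.start
# holds (matching how split_sentence advances start = end + 1), so (S1,S2) is a consecutive pair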

# first we get the covid mentions and their surrounding sentences, using the span_contained ie function
CovidMentions(Path, Span) <- Docs(Path,D,"target_concept"), rgx("COVID-19",D) -> (Span).
CovidMentionSents(P,Mention,Sent)<-CovidMentions(P,Mention),Sents(P,Sent),span_contained(Mention,Sent)->(True).

# note that for ease of debugging, we extended our head to track which rule a fact was derived from
# a tag is positive if it is contained in a positive section
CovidTags(Path,Mention,'positive','section')<-\
PositiveSections(Path,D,Title,Section),\
CovidMentions(Path,Mention),\
span_contained(Mention,Section)->(True).

# Context rules tags
CovidTags(Path,Mention,Tag,'sentence context')<-\
CovidMentionSents(Path,Mention,Sent),\
SentenceContextRules(Pattern,Tag,DisambiguationPattern),\
rgx(Pattern,Sent)->(ContextSpan),\
span_contained(Mention,ContextSpan)->(True),\
rgx_is_match(DisambiguationPattern,Sent)->(False).

# post processing based on pattern
CovidTags(Path,Mention,Tag,'post pattern')<-\
CovidMentionSents(Path,Mention,Sent),\
PostprocessPatternRules(Pattern,Tag),\
rgx(Pattern,Sent)->(ContextSpan),\
span_contained(Mention,ContextSpan)->(True).

# post processing based on pattern and existing attributes
# notice the recursive call to CovidTags
CovidTags(Path,Mention,Tag,"post attribute change")<-\
CovidTags(Path,Mention,OldTag,Derivation),\
PostprocessRulesWithAttributes(Pattern,OldTag,Tag),\
CovidMentionSents(Path,Mention,Sent),\
rgx(Pattern,Sent)->(ContextSpan),\
span_contained(Mention,ContextSpan)->(True).

# post processing based on pattern in the next sentence
CovidTags(Path,Mention,Tag,"next sentence")<-\
CovidMentionSents(Path,Mention,Sent),\
SentPairs(Path,Sent,NextSent),\
PostprocessPatternRules(Pattern,Tag),\
rgx(Pattern,NextSent)->(ContextSpan).

AggregatedCovidTags(Path,Mention,agg_mention(Tag))<-\
CovidTags(Path,Mention,Tag,Derivation).

DocumentTags(Path,agg_doc_tags(Tag))<-\
AggregatedCovidTags(Path,Mention,Tag).
2 changes: 0 additions & 2 deletions nbs/benchmark/covid_data/covid_logic.pl

This file was deleted.

25 files renamed without changes.