-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10,042 changed files
with
292 additions
and
1,351 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
import time | ||
from glob import glob | ||
import pandas as pd | ||
from pandas import DataFrame | ||
from pathlib import Path | ||
from spannerlib import get_magic_session,Session,Span | ||
sess = get_magic_session() | ||
|
||
import spacy | ||
nlp = spacy.load("en_core_web_sm") | ||
|
||
# configurations
slog_file = Path('covid_bench_logic.pl')      # spannerlib logic program to load
input_dir = Path('covid_data/sample_inputs')  # *.txt documents to classify
data_dir = Path('covid_data/rules_data')      # csv rule tables consumed by the logic

# wall-clock start of the whole benchmark run (reported at the bottom of the script)
start_time = time.time()
|
||
def split_sentence(text):
    """
    Splits a text into individual sentences using spaCy's sentence detection.

    Args:
        text (str or Span): text to segment; converted with str() before parsing.

    Yields:
        Span: one Span per sentence, carrying character offsets into *text*
        so downstream rules can track sentence locations.
    """
    doc = nlp(str(text))
    for sentence in doc.sents:
        # Use spaCy's own character offsets instead of accumulating
        # `start += len(sentence.text) + 1` by hand: the manual bookkeeping
        # assumed exactly one separator character between sentences, so
        # offsets drifted whenever sentences were separated by multiple
        # spaces or newlines. start_char/end_char are always exact.
        yield Span(text, sentence.start_char, sentence.end_char)
|
||
class LemmaFromList():
    """Callable IE function: yields (Span, lemma) for each token whose lemma
    belongs to a fixed vocabulary; number-like tokens are tagged 'like_num'."""

    def __init__(self, lemma_list):
        # vocabulary of lemmas to keep
        self.lemma_list = lemma_list

    def __call__(self, text):
        doc = nlp(str(text))
        for token in doc:
            begin = token.idx
            token_span = Span(text, begin, begin + len(token.text))
            if token.lemma_ in self.lemma_list:
                yield (token_span, token.lemma_)
            elif token.like_num:
                yield (token_span, 'like_num')
            # all other tokens are dropped
|
||
# lemma vocabulary: whitespace-separated words, one file for the whole run
lemma_list = (data_dir/'lemma_words.txt').read_text().split()
lemmatizer = LemmaFromList(lemma_list)
|
||
class PosFromList():
    """Callable IE function: yields (Span, pos-tag) for every token whose
    part-of-speech tag is in the configured allow-list."""

    def __init__(self, pos_list):
        # POS tags (spaCy coarse tags) to report
        self.pos_list = pos_list

    def __call__(self, text):
        doc = nlp(str(text))
        for token in doc:
            if token.pos_ in self.pos_list:
                begin = token.idx
                yield (Span(text, begin, begin + len(token.text)), token.pos_)
|
||
# annotate only noun-like and adjective tokens for the POS concept rules
pos_annotator = PosFromList(["NOUN", "PROPN", "PRON", "ADJ"])
|
||
def agg_mention(group):
    """
    Aggregates attribute groups of covid spans into one attribute per mention.

    Args:
        group (pd.Series): all attribute tags derived for a single mention.

    Returns:
        str: 'IGNORE', 'negated', 'positive' or 'uncertain', by priority.
    """
    # Build the membership set once: the original tested `in group.values`
    # (a linear ndarray scan) up to eight times per group.
    vals = set(group.values)
    if 'IGNORE' in vals:
        return 'IGNORE'
    elif 'negated' in vals and 'no_negated' not in vals:
        return 'negated'
    elif 'future' in vals and 'no_future' not in vals:
        # future mentions are treated as negated for classification purposes
        return 'negated'
    elif 'other experiencer' in vals or 'not relevant' in vals:
        return 'negated'
    elif 'positive' in vals and 'uncertain' not in vals and 'no_positive' not in vals:
        return 'positive'
    else:
        return 'uncertain'
|
||
def AggDocumentTags(group):
    """
    Classifies a document as 'POS', 'UNK', or 'NEG' based on COVID-19 attributes.

    Args:
        group (pd.Series): aggregated per-mention attributes of one document.

    Returns:
        str: the highest-priority document tag; 'UNK' when nothing matches.
    """
    tags = group.values
    # priority order: a positive mention wins, then uncertain, then negated
    for mention_tag, document_tag in (('positive', 'POS'),
                                      ('uncertain', 'UNK'),
                                      ('negated', 'NEG')):
        if mention_tag in tags:
            return document_tag
    # no recognized mention attribute at all
    return 'UNK'
|
||
def rewrite(text, span_label_pairs):
    """Rewrites a string, substituting each given span with its label.

    Assumes all spans belong to *text*.

    Args:
        text (str like): string to rewrite.
        span_label_pairs (pd.DataFrame): two columns — spans to rewrite,
            and the replacement string for each span.

    Returns:
        str: the rewritten string.
    """
    if isinstance(text, Span):
        text = text.as_str()

    # order replacements left-to-right by span start
    pairs = list(span_label_pairs.itertuples(index=False, name=None))
    pairs.sort(key=lambda pair: pair[0].start)

    # assemble the result piecewise: untouched prefix, then the label,
    # skipping over the replaced span each time
    pieces = []
    cursor = 0
    for span, label in pairs:
        pieces.append(text[cursor:span.start])
        pieces.append(label)
        cursor = span.end
    pieces.append(text[cursor:])
    return ''.join(pieces)
|
||
|
||
def rewrite_docs(docs, span_label, new_version):
    """Given a dataframe of documents of the form (path,doc,version) and a
    dataframe of spans to rewrite of the form (path,doc,from_span,to_tag),
    rewrites the documents and returns a new dataframe of the form
    (path,doc,new_version).

    Args:
        docs (pd.DataFrame): (path, doc, version) rows.
        span_label (pd.DataFrame): four columns; third is the span to
            rewrite and fourth is the replacement label.
        new_version (str): version string stamped on every output row.

    Returns:
        pd.DataFrame: columns ['P','D','V'] with the rewritten documents.
    """
    # Rename columns on a copy: the original assigned `span_label.columns = ...`,
    # mutating the caller's DataFrame as a hidden side effect.
    span_label = span_label.set_axis(['P', 'D', 'W', 'L'], axis=1)
    new_tuples = []
    for path, doc, _ in docs.itertuples(index=False, name=None):
        span_label_per_doc = span_label.loc[span_label['P'] == path, ['W', 'L']]
        new_tuples.append((path, rewrite(doc, span_label_per_doc), new_version))
    return pd.DataFrame(new_tuples, columns=['P', 'D', 'V'])
|
||
file_paths = []  # paths handled by the most recent main() call; read by the report below

def main(input_dir, data_dir, logic_file, start=0, end=10):
    """
    Run the covid document-classification pipeline over one slice of the corpus.

    Args:
        input_dir (Path): directory holding the *.txt documents to classify.
        data_dir (Path): directory holding the csv rule tables.
        logic_file (Path): spannerlib logic program (loaded after the data).
        start (int): first index (inclusive) into the sorted file list.
        end (int): last index (exclusive) into the sorted file list.

    Returns:
        pd.DataFrame: columns (P, T) mapping each document name to its tag,
        with documents lacking any covid mention defaulted to 'UNK'.
    """
    global file_paths
    sess = Session()
    # define callback functions
    sess.register('split_sentence',split_sentence,[(str,Span)],[Span])
    sess.register('pos',pos_annotator,[(Span,str)],[Span,str])
    sess.register('lemma',lemmatizer,[(Span,str)],[Span,str])
    sess.register_agg('agg_mention',agg_mention,[str],[str])
    sess.register_agg('agg_doc_tags',AggDocumentTags,[str],[str])

    # bring in code as data: each csv is a relation of rule patterns
    sess.import_rel("ConceptTagRules",data_dir/"concept_tags_rules.csv" , delim=",")
    sess.import_rel("TargetTagRules",data_dir/"target_rules.csv",delim=",")
    sess.import_rel("SectionTags",data_dir/"section_tags.csv",delim=",")
    sess.import_rel("PositiveSectionTags",data_dir/"positive_section_tags.csv",delim=",")
    sess.import_rel("SentenceContextRules",data_dir/'sentence_context_rules.csv',delim="#")
    sess.import_rel("PostprocessPatternRules",data_dir/'postprocess_pattern_rules.csv',delim="#")
    sess.import_rel("PostprocessRulesWithAttributes",data_dir/'postprocess_attributes_rules.csv',delim="#")
    sess.import_rel("NextSentencePostprocessPatternRules",data_dir/'postprocess_pattern_next_sentence_rules.csv',delim=',')

    # we will programatically build a regex that matches all the section patterns
    section_tags = pd.read_csv(data_dir/'section_tags.csv',names=['literal','tag'])
    section_delimeter_pattern = section_tags['literal'].str.cat(sep='|')
    sess.import_var('section_delimeter_pattern',section_delimeter_pattern)

    # bring in data: sort for determinism, then take the requested slice
    file_paths = [Path(p) for p in glob(str(input_dir/'*.txt'))]
    file_paths.sort()
    file_paths = file_paths[start:end]
    raw_docs = pd.DataFrame([
        [p.name,p.read_text(),'raw_text'] for p in file_paths
        ],columns=['Path','Doc','Version']
    )
    sess.import_rel('Docs',raw_docs)

    # load logic, note that since we did not define the data relations in the logic file,
    # we need to load the logic after the data has been loaded
    # NOTE(review): sess.export is used both to load rules and to run queries below —
    # presumably intended spannerlib usage; confirm against the spannerlib API.
    sess.export(logic_file.read_text())

    ## Rewriting the documents: each pass queries span/label pairs and then
    ## materializes a new Docs version consumed by the next stage of rules.
    lemma_tags = sess.export('?Lemmas(P,D,W,L)')
    lemma_docs = rewrite_docs(raw_docs,lemma_tags,'lemma')
    sess.import_rel('Docs',lemma_docs)

    lemma_concept_matches = sess.export('?LemmaConceptMatches(Path,Doc,Span,Label)')
    lemma_concepts = rewrite_docs(lemma_docs,lemma_concept_matches,'lemma_concept')
    sess.import_rel('Docs',lemma_concepts)

    pos_concept_matches = sess.export('?PosConceptMatches(P,D,W,L)')
    pos_concept_docs = rewrite_docs(lemma_concepts,pos_concept_matches,'pos_concept')
    sess.import_rel('Docs',pos_concept_docs)

    target_matches = sess.export('?TargetMatches(P,D,W,L)')
    target_rule_docs = rewrite_docs(pos_concept_docs,target_matches,'target_concept')
    sess.import_rel('Docs',target_rule_docs)

    ## computing the tags based on the target concept documents
    doc_tags = sess.export('?DocumentTags(P,T)')

    # handling files with no mentions: outer-merge so unmatched paths survive,
    # then default their tag to 'UNK'
    paths = pd.DataFrame([p.name for p in file_paths],columns=['P'])
    classification = paths.merge(doc_tags,on='P',how='outer')
    classification['T']=classification['T'].fillna('UNK')
    # (removed a leftover bare `classification` expression — notebook residue
    # with no effect in a script)
    return classification
|
||
|
||
# Drive the pipeline in chunks of 20 documents; the whole session is rebuilt
# per chunk inside main().
for i in range(0, 1000, 20):
    res = main(input_dir,data_dir,slog_file, start=i, end=i+20)
    print(res)

end_time = time.time()
# NOTE(review): file_paths holds only the LAST chunk's paths (at most 20), so
# this reports the final batch size, not the total document count — confirm intent.
print(f"Number of Documents: {len(file_paths)}")
print(f"Time taken: {end_time-start_time:.2f} seconds")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
# Lemma annotations of every raw document: (path, doc, word-span, lemma).
Lemmas(P,D,Word,Lem)<-Docs(P,D,"raw_text"),lemma(D)->(Word,Lem).

# Concept-rule matches over the lemma-rewritten documents.
LemmaConceptMatches(Path,Doc,Span,Label) <- \
Docs(Path,Doc,"lemma"),\
ConceptTagRules(Pattern, Label, "lemma"),\
rgx(Pattern,Doc) -> (Span).

# here we get the spans of all POS
Pos(P,D,Word,Lem)<-Docs(P,D,"lemma_concept"),pos(D)->(Word,Lem).

# here we look for concept rule matches where the matched word is also tagged via POS
PosConceptMatches(Path,Doc,Span,Label) <- \
Docs(Path,Doc,"lemma_concept"),\
ConceptTagRules(Pattern, Label, "pos"),\
rgx(Pattern,Doc) -> (Span),\
Pos(Path,Doc,Span,POSLabel).

# target-rule matches over the pos_concept documents
TargetMatches(Path,Doc, Span, Label) <- \
Docs(Path,Doc,"pos_concept"),\
TargetTagRules(Pattern, Label), rgx(Pattern,Doc) -> (Span).

# we get section spans and their content using our regex pattern and the rgx_split ie function
Sections(P,D,Sec,Content)<-Docs(P,D,"target_concept"),\
rgx_split($section_delimeter_pattern,D)->(SecSpan,Content),\
as_str(SecSpan)->(Sec).

# sections whose tag appears in the PositiveSectionTags allow-list
PositiveSections(P,D,Sec,Content)<-Sections(P,D,Sec,Content),SectionTags(Sec,Tag),PositiveSectionTags(Tag).

# sentence segmentation of the final (target_concept) documents
Sents(P,S)<-Docs(P,D,"target_concept"),split_sentence(D)->(S).

# consecutive sentence pairs
# NOTE(review): assumes exactly one separator character between sentences — confirm
# this matches how split_sentence assigns offsets.
SentPairs(P,S1,S2)<-Sents(P,S1),Sents(P,S2),expr_eval("{0}.end +1 == {1}.start",S1,S2)->(True).

# first we get the covid mentions and their surrounding sentences, using the span_contained ie function
CovidMentions(Path, Span) <- Docs(Path,D,"target_concept"), rgx("COVID-19",D) -> (Span).
CovidMentionSents(P,Mention,Sent)<-CovidMentions(P,Mention),Sents(P,Sent),span_contained(Mention,Sent)->(True).

# note that for ease of debugging, we extended our head to track which rule a fact was derived from
# a tag is positive if it is contained in a positive section
CovidTags(Path,Mention,'positive','section')<-\
PositiveSections(Path,D,Title,Section),\
CovidMentions(Path,Mention),\
span_contained(Mention,Section)->(True).

# Context rules tags: a context pattern must cover the mention, and the
# rule's disambiguation pattern must NOT match the sentence.
CovidTags(Path,Mention,Tag,'sentence context')<-\
CovidMentionSents(Path,Mention,Sent),\
SentenceContextRules(Pattern,Tag,DisambiguationPattern),\
rgx(Pattern,Sent)->(ContextSpan),\
span_contained(Mention,ContextSpan)->(True),\
rgx_is_match(DisambiguationPattern,Sent)->(False).

# post processing based on pattern
CovidTags(Path,Mention,Tag,'post pattern')<-\
CovidMentionSents(Path,Mention,Sent),\
PostprocessPatternRules(Pattern,Tag),\
rgx(Pattern,Sent)->(ContextSpan),\
span_contained(Mention,ContextSpan)->(True).

# post processing based on pattern and existing attributes
# notice the recursive call to CovidTags
CovidTags(Path,Mention,Tag,"post attribute change")<-\
CovidTags(Path,Mention,OldTag,Derivation),\
PostprocessRulesWithAttributes(Pattern,OldTag,Tag),\
CovidMentionSents(Path,Mention,Sent),\
rgx(Pattern,Sent)->(ContextSpan),\
span_contained(Mention,ContextSpan)->(True).

# post processing based on pattern in the next sentence
# NOTE(review): this rule reuses PostprocessPatternRules, yet the driver script
# imports a dedicated NextSentencePostprocessPatternRules relation that is never
# referenced anywhere — confirm whether this rule should use that table instead.
# NOTE(review): ContextSpan is matched in NextSent but never related back to the
# mention (no span_contained check), so any pattern hit in the following sentence
# tags the mention — confirm this is intended.
CovidTags(Path,Mention,Tag,"next sentence")<-\
CovidMentionSents(Path,Mention,Sent),\
SentPairs(Path,Sent,NextSent),\
PostprocessPatternRules(Pattern,Tag),\
rgx(Pattern,NextSent)->(ContextSpan).

# collapse all per-mention tags into a single attribute per mention
AggregatedCovidTags(Path,Mention,agg_mention(Tag))<-\
CovidTags(Path,Mention,Tag,Derivation).

# collapse per-mention attributes into one document-level tag
DocumentTags(Path,agg_doc_tags(Tag))<-\
AggregatedCovidTags(Path,Mention,Tag).
This file was deleted.
Oops, something went wrong.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Oops, something went wrong.