Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
techofer committed Dec 16, 2024
1 parent 4e1b307 commit f983818
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 57 deletions.
80 changes: 30 additions & 50 deletions nbs/benchmark/001_covid.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,10 @@
"sess = get_magic_session()\n",
"\n",
"\n",
"sess.register('rgx', rgx, [str, Span], span_arity)\n",
"sess.register('rgx_split', rgx_split, [str, Span], [Span,Span])\n",
"sess.register('rgx_is_match', rgx_is_match, [str, Span], [bool])\n",
"sess.register('span_contained', span_contained, [Span, Span], [bool]) \n",
"sess.register('py_rgx', rgx, [str, Span], span_arity)\n",
"sess.register('py_rgx_split', rgx_split, [str, Span], [Span,Span])\n",
"sess.register('py_rgx_is_match', rgx_is_match, [str, Span], [bool])\n",
"sess.register('py_span_contained', span_contained, [Span, Span], [bool]) \n",
"\n",
"import spacy\n",
"nlp = spacy.load(\"en_core_web_sm\")\n"
Expand Down Expand Up @@ -269,8 +269,7 @@
"if VERSION in [\"SPANNERFLOW\", \"SPANNERFLOW_PYTHON_IE\"]:\n",
" sess.import_rel('Docs',raw_docs, scheme=[str, Span, str])\n",
"else:\n",
" sess.import_rel('Docs',raw_docs)\n",
"raw_docs"
" sess.import_rel('Docs',raw_docs)"
]
},
{
Expand All @@ -280,8 +279,7 @@
"outputs": [],
"source": [
"%%spannerlog\n",
"Lemmas(P,D,Word,Lem)<-Docs(P,D,\"raw_text\"),lemma(D)->(Word,Lem).\n",
"?Lemmas(A,B,C,D)"
"Lemmas(P,D,Word,Lem)<-Docs(P,D,\"raw_text\"),lemma(D)->(Word,Lem)."
]
},
{
Expand Down Expand Up @@ -311,8 +309,7 @@
" Docs(Path,Doc,\"lemma\"),\n",
" ConceptTagRules(Pattern, Label, \"lemma\"),\n",
" # TODO CHANGE: on different version\n",
" rgx(Pattern,Doc) -> (Span).\n",
"?LemmaConceptMatches(A,B,C,D)"
" py_rgx(Pattern,Doc) -> (Span)."
]
},
{
Expand All @@ -338,8 +335,7 @@
"source": [
"%%spannerlog\n",
"# here we get the spans of all POS\n",
"Pos(P,D,Word,Lem)<-Docs(P,D,\"lemma_concept\"),pos(D)->(Word,Lem).\n",
"?Pos(A,B,C,D)"
"Pos(P,D,Word,Lem)<-Docs(P,D,\"lemma_concept\"),pos(D)->(Word,Lem)."
]
},
{
Expand All @@ -354,9 +350,8 @@
" Docs(Path,Doc,\"lemma_concept\"),\n",
" ConceptTagRules(Pattern, Label, \"pos\"),\n",
" # TODO CHANGE: on different version\n",
" rgx(Pattern,Doc) -> (Span),\n",
" Pos(Path,Doc,Span,POSLabel).\n",
"?PosConceptMatches(A,B,C,D)"
" py_rgx(Pattern,Doc) -> (Span),\n",
" Pos(Path,Doc,Span,POSLabel)."
]
},
{
Expand Down Expand Up @@ -384,8 +379,7 @@
"TargetMatches(Path,Doc, Span, Label) <- \n",
" Docs(Path,Doc,\"pos_concept\"),\n",
" # TODO CHANGE: on different version\n",
" TargetTagRules(Pattern, Label), rgx(Pattern,Doc) -> (Span).\n",
"?TargetMatches(A,B,C,D)"
" TargetTagRules(Pattern, Label), py_rgx(Pattern,Doc) -> (Span)."
]
},
{
Expand Down Expand Up @@ -433,14 +427,12 @@
"outputs": [],
"source": [
"%%spannerlog\n",
"# we get section spans and their content using our regex pattern and the rgx_split ie function\n",
"# we get section spans and their content using our regex pattern and the py_rgx_split ie function\n",
"Sections(P,D,Sec,Content)<-Docs(P,D,\"target_concept\"),\n",
" rgx_split($section_delimeter_pattern,D)->(SecSpan,Content),\n",
" py_rgx_split($section_delimeter_pattern,D)->(SecSpan,Content),\n",
" as_str(SecSpan)->(Sec).\n",
"?Sections(A,B,C,D)\n",
"\n",
"PositiveSections(P,D,Sec,Content)<-Sections(P,D,Sec,Content),SectionTags(Sec,Tag),PositiveSectionTags(Tag).\n",
"?PositiveSections(A,B,C,D)"
"PositiveSections(P,D,Sec,Content)<-Sections(P,D,Sec,Content),SectionTags(Sec,Tag),PositiveSectionTags(Tag).\n"
]
},
{
Expand All @@ -450,8 +442,7 @@
"outputs": [],
"source": [
"%%spannerlog\n",
"Sents(P,S)<-Docs(P,D,\"target_concept\"),split_sentence(D)->(S).\n",
"?Sents(A,B)"
"Sents(P,S)<-Docs(P,D,\"target_concept\"),split_sentence(D)->(S).\n"
]
},
{
Expand Down Expand Up @@ -490,8 +481,7 @@
"outputs": [],
"source": [
"%%spannerlog\n",
"SentPairs(P,S1,S2)<-Sents(P,S1),Sents(P,S2),is_adjacent(S1,S2)->(True).\n",
"?SentPairs(A,B,C)"
"SentPairs(P,S1,S2)<-Sents(P,S1),Sents(P,S2),is_adjacent(S1,S2)->(True).\n"
]
},
{
Expand All @@ -501,13 +491,12 @@
"outputs": [],
"source": [
"%%spannerlog\n",
"# first we get the covid mentions and their surrounding sentences, using the span_contained ie function\n",
"# first we get the covid mentions and their surrounding sentences, using the py_span_contained ie function\n",
"# TODO CHANGE: on different version\n",
"CovidMentions(Path, Span) <- Docs(Path,D,\"target_concept\"), rgx(\"COVID-19\",D) -> (Span).\n",
"?CovidMentions(A,B)\n",
"CovidMentions(Path, Span) <- Docs(Path,D,\"target_concept\"), py_rgx(\"COVID-19\",D) -> (Span).\n",
"\n",
"# TODO CHANGE: on different version\n",
"CovidMentionSents(P,Mention,Sent)<-CovidMentions(P,Mention),Sents(P,Sent),span_contained(Mention,Sent)->(True).\n",
"?CovidMentionSents(A,B,C)"
"CovidMentionSents(P,Mention,Sent)<-CovidMentions(P,Mention),Sents(P,Sent),py_span_contained(Mention,Sent)->(True).\n"
]
},
{
Expand All @@ -525,31 +514,27 @@
" PositiveSections(Path,D,Title,Section),\n",
" CovidMentions(Path,Mention),\n",
" # TODO CHANGE: on different version\n",
" span_contained(Mention,Section)->(True).\n",
"\n",
"?CovidTags(A,B,C,D)\n",
" py_span_contained(Mention,Section)->(True).\n",
"\n",
"# Context rules tags\n",
"CovidTags(Path,Mention,Tag,'sentence context')<-\n",
" CovidMentionSents(Path,Mention,Sent),\n",
" SentenceContextRules(Pattern,Tag,DisambiguationPattern),\n",
" # TODO CHANGE: on different version\n",
" rgx(Pattern,Sent)->(ContextSpan),\n",
" py_rgx(Pattern,Sent)->(ContextSpan),\n",
" # TODO CHANGE: on different version\n",
" span_contained(Mention,ContextSpan)->(True),\n",
" py_span_contained(Mention,ContextSpan)->(True),\n",
" # TODO CHANGE: on different version\n",
" rgx_is_match(DisambiguationPattern,Sent)->(False).\n",
"\n",
"?CovidTags(A,B,C,D)\n",
" py_rgx_is_match(DisambiguationPattern,Sent)->(False).\n",
"\n",
"# post processing based on pattern\n",
"CovidTags(Path,Mention,Tag,'post pattern')<-\n",
" CovidMentionSents(Path,Mention,Sent),\n",
" PostprocessPatternRules(Pattern,Tag),\n",
" # TODO CHANGE: on different version\n",
" rgx(Pattern,Sent)->(ContextSpan),\n",
" py_rgx(Pattern,Sent)->(ContextSpan),\n",
" # TODO CHANGE: on different version\n",
" span_contained(Mention,ContextSpan)->(True).\n",
" py_span_contained(Mention,ContextSpan)->(True).\n",
"\n",
"# post processing based on pattern and existing attributes\n",
"# notice the recursive call to CovidTags\n",
Expand All @@ -558,21 +543,18 @@
" PostprocessRulesWithAttributes(Pattern,OldTag,Tag),\n",
" CovidMentionSents(Path,Mention,Sent),\n",
" # TODO CHANGE: on different version\n",
" rgx(Pattern,Sent)->(ContextSpan),\n",
" py_rgx(Pattern,Sent)->(ContextSpan),\n",
" # TODO CHANGE: on different version\n",
" span_contained(Mention,ContextSpan)->(True).\n",
" py_span_contained(Mention,ContextSpan)->(True).\n",
"\n",
"?CovidTags(A,B,C,D)\n",
"\n",
"# post processing based on pattern in the next sentence\n",
"CovidTags(Path,Mention,Tag,\"next sentence\")<-\n",
" CovidMentionSents(Path,Mention,Sent),\n",
" SentPairs(Path,Sent,NextSent),\n",
" PostprocessPatternRules(Pattern,Tag),\n",
" # TODO CHANGE: on different version\n",
" rgx(Pattern,NextSent)->(ContextSpan).\n",
"\n",
"?CovidTags(A,B,C,D)"
" py_rgx(Pattern,NextSent)->(ContextSpan).\n"
]
},
{
Expand Down Expand Up @@ -627,11 +609,9 @@
"%%spannerlog\n",
"AggregatedCovidTags(Path,Mention,agg_mention(Tag))<-\n",
" CovidTags(Path,Mention,Tag,Derivation).\n",
"?AggregatedCovidTags(A,B,C)\n",
"\n",
"DocumentTags(Path,agg_doc_tags(Tag))<-\n",
" AggregatedCovidTags(Path,Mention,Tag).\n",
"?DocumentTags(A,B)\n"
" AggregatedCovidTags(Path,Mention,Tag).\n"
]
},
{
Expand Down
1 change: 0 additions & 1 deletion nbs/callbacks/001_basic_ies.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,6 @@
"#| export\n",
"def span_contained(s1, s2):\n",
" \"\"\"yields True if s1 is contained in s2, otherwise yields False\"\"\"\n",
" print(s1, s2, s1.doc, s2.doc, s1.start, s2.start, s1.end, s2.end)\n",
" if s1.name == s2.name and s1.start >= s2.start and s1.end <= s2.end:\n",
" yield True\n",
" else:\n",
Expand Down
9 changes: 4 additions & 5 deletions spannerlib/banchmark/covid.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@
sess = get_magic_session()


sess.register('rgx', rgx, [str, Span], span_arity)
sess.register('rgx_split', rgx_split, [str, Span], [Span,Span])
sess.register('rgx_is_match', rgx_is_match, [str, Span], [bool])
sess.register('span_contained', span_contained, [Span, Span], [bool])
sess.register('py_rgx', rgx, [str, Span], span_arity)
sess.register('py_rgx_split', rgx_split, [str, Span], [Span,Span])
sess.register('py_rgx_is_match', rgx_is_match, [str, Span], [bool])
sess.register('py_span_contained', span_contained, [Span, Span], [bool])

import spacy
nlp = spacy.load("en_core_web_sm")
Expand Down Expand Up @@ -168,7 +168,6 @@ def rewrite_docs(docs,span_label,new_version):
sess.import_rel('Docs',raw_docs, scheme=[str, Span, str])
else:
sess.import_rel('Docs',raw_docs)
raw_docs

# %% ../../nbs/benchmark/001_covid.ipynb 15
lemma_tags = sess.export('?Lemmas(P,D,W,L)')
Expand Down
1 change: 0 additions & 1 deletion spannerlib/ie_func/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,6 @@ def as_str(obj):
# %% ../../nbs/callbacks/001_basic_ies.ipynb 30
def span_contained(s1, s2):
"""yields True if s1 is contained in s2, otherwise yields False"""
print(s1, s2, s1.doc, s2.doc, s1.start, s2.start, s1.end, s2.end)
if s1.name == s2.name and s1.start >= s2.start and s1.end <= s2.end:
yield True
else:
Expand Down

0 comments on commit f983818

Please sign in to comment.