From f9838184d946fe63335a39d7276d61c7c874aa19 Mon Sep 17 00:00:00 2001 From: Ofer Katz Date: Mon, 16 Dec 2024 04:01:39 +0200 Subject: [PATCH] WIP --- nbs/benchmark/001_covid.ipynb | 80 ++++++++++++------------------- nbs/callbacks/001_basic_ies.ipynb | 1 - spannerlib/banchmark/covid.py | 9 ++-- spannerlib/ie_func/basic.py | 1 - 4 files changed, 34 insertions(+), 57 deletions(-) diff --git a/nbs/benchmark/001_covid.ipynb b/nbs/benchmark/001_covid.ipynb index 3e387a02..1f51ecb2 100644 --- a/nbs/benchmark/001_covid.ipynb +++ b/nbs/benchmark/001_covid.ipynb @@ -61,10 +61,10 @@ "sess = get_magic_session()\n", "\n", "\n", - "sess.register('rgx', rgx, [str, Span], span_arity)\n", - "sess.register('rgx_split', rgx_split, [str, Span], [Span,Span])\n", - "sess.register('rgx_is_match', rgx_is_match, [str, Span], [bool])\n", - "sess.register('span_contained', span_contained, [Span, Span], [bool]) \n", + "sess.register('py_rgx', rgx, [str, Span], span_arity)\n", + "sess.register('py_rgx_split', rgx_split, [str, Span], [Span,Span])\n", + "sess.register('py_rgx_is_match', rgx_is_match, [str, Span], [bool])\n", + "sess.register('py_span_contained', span_contained, [Span, Span], [bool]) \n", "\n", "import spacy\n", "nlp = spacy.load(\"en_core_web_sm\")\n" @@ -269,8 +269,7 @@ "if VERSION in [\"SPANNERFLOW\", \"SPANNERFLOW_PYTHON_IE\"]:\n", " sess.import_rel('Docs',raw_docs, scheme=[str, Span, str])\n", "else:\n", - " sess.import_rel('Docs',raw_docs)\n", - "raw_docs" + " sess.import_rel('Docs',raw_docs)" ] }, { @@ -280,8 +279,7 @@ "outputs": [], "source": [ "%%spannerlog\n", - "Lemmas(P,D,Word,Lem)<-Docs(P,D,\"raw_text\"),lemma(D)->(Word,Lem).\n", - "?Lemmas(A,B,C,D)" + "Lemmas(P,D,Word,Lem)<-Docs(P,D,\"raw_text\"),lemma(D)->(Word,Lem)." ] }, { @@ -311,8 +309,7 @@ " Docs(Path,Doc,\"lemma\"),\n", " ConceptTagRules(Pattern, Label, \"lemma\"),\n", " # TODO CHANGE: on different version\n", - " rgx(Pattern,Doc) -> (Span).\n", - "?LemmaConceptMatches(A,B,C,D)" + " py_rgx(Pattern,Doc) -> (Span)." ] }, { @@ -338,8 +335,7 @@ "source": [ "%%spannerlog\n", "# here we get the spans of all POS\n", - "Pos(P,D,Word,Lem)<-Docs(P,D,\"lemma_concept\"),pos(D)->(Word,Lem).\n", - "?Pos(A,B,C,D)" + "Pos(P,D,Word,Lem)<-Docs(P,D,\"lemma_concept\"),pos(D)->(Word,Lem)." ] }, { @@ -354,9 +350,8 @@ " Docs(Path,Doc,\"lemma_concept\"),\n", " ConceptTagRules(Pattern, Label, \"pos\"),\n", " # TODO CHANGE: on different version\n", - " rgx(Pattern,Doc) -> (Span),\n", - " Pos(Path,Doc,Span,POSLabel).\n", - "?PosConceptMatches(A,B,C,D)" + " py_rgx(Pattern,Doc) -> (Span),\n", + " Pos(Path,Doc,Span,POSLabel)." ] }, { @@ -384,8 +379,7 @@ "TargetMatches(Path,Doc, Span, Label) <- \n", " Docs(Path,Doc,\"pos_concept\"),\n", " # TODO CHANGE: on different version\n", - " TargetTagRules(Pattern, Label), rgx(Pattern,Doc) -> (Span).\n", - "?TargetMatches(A,B,C,D)" + " TargetTagRules(Pattern, Label), py_rgx(Pattern,Doc) -> (Span)." ] }, { @@ -433,14 +427,12 @@ "outputs": [], "source": [ "%%spannerlog\n", - "# we get section spans and their content using our regex pattern and the rgx_split ie function\n", + "# we get section spans and their content using our regex pattern and the py_rgx_split ie function\n", "Sections(P,D,Sec,Content)<-Docs(P,D,\"target_concept\"),\n", - " rgx_split($section_delimeter_pattern,D)->(SecSpan,Content),\n", + " py_rgx_split($section_delimeter_pattern,D)->(SecSpan,Content),\n", " as_str(SecSpan)->(Sec).\n", - "?Sections(A,B,C,D)\n", "\n", - "PositiveSections(P,D,Sec,Content)<-Sections(P,D,Sec,Content),SectionTags(Sec,Tag),PositiveSectionTags(Tag).\n", - "?PositiveSections(A,B,C,D)" + "PositiveSections(P,D,Sec,Content)<-Sections(P,D,Sec,Content),SectionTags(Sec,Tag),PositiveSectionTags(Tag).\n" ] }, { @@ -450,8 +442,7 @@ "outputs": [], "source": [ "%%spannerlog\n", - "Sents(P,S)<-Docs(P,D,\"target_concept\"),split_sentence(D)->(S).\n", - "?Sents(A,B)" + "Sents(P,S)<-Docs(P,D,\"target_concept\"),split_sentence(D)->(S).\n" ] }, { @@ -490,8 +481,7 @@ "outputs": [], "source": [ "%%spannerlog\n", - "SentPairs(P,S1,S2)<-Sents(P,S1),Sents(P,S2),is_adjacent(S1,S2)->(True).\n", - "?SentPairs(A,B,C)" + "SentPairs(P,S1,S2)<-Sents(P,S1),Sents(P,S2),is_adjacent(S1,S2)->(True).\n" ] }, { @@ -501,13 +491,12 @@ "outputs": [], "source": [ "%%spannerlog\n", - "# first we get the covid mentions and their surrounding sentences, using the span_contained ie function\n", + "# first we get the covid mentions and their surrounding sentences, using the py_span_contained ie function\n", "# TODO CHANGE: on different version\n", - "CovidMentions(Path, Span) <- Docs(Path,D,\"target_concept\"), rgx(\"COVID-19\",D) -> (Span).\n", - "?CovidMentions(A,B)\n", + "CovidMentions(Path, Span) <- Docs(Path,D,\"target_concept\"), py_rgx(\"COVID-19\",D) -> (Span).\n", + "\n", "# TODO CHANGE: on different version\n", - "CovidMentionSents(P,Mention,Sent)<-CovidMentions(P,Mention),Sents(P,Sent),span_contained(Mention,Sent)->(True).\n", - "?CovidMentionSents(A,B,C)" + "CovidMentionSents(P,Mention,Sent)<-CovidMentions(P,Mention),Sents(P,Sent),py_span_contained(Mention,Sent)->(True).\n" ] }, { @@ -525,31 +514,27 @@ " PositiveSections(Path,D,Title,Section),\n", " CovidMentions(Path,Mention),\n", " # TODO CHANGE: on different version\n", - " span_contained(Mention,Section)->(True).\n", - "\n", - "?CovidTags(A,B,C,D)\n", + " py_span_contained(Mention,Section)->(True).\n", "\n", "# Context rules tags\n", "CovidTags(Path,Mention,Tag,'sentence context')<-\n", " CovidMentionSents(Path,Mention,Sent),\n", " SentenceContextRules(Pattern,Tag,DisambiguationPattern),\n", " # TODO CHANGE: on different version\n", - " rgx(Pattern,Sent)->(ContextSpan),\n", + " py_rgx(Pattern,Sent)->(ContextSpan),\n", " # TODO CHANGE: on different version\n", - " span_contained(Mention,ContextSpan)->(True),\n", + " py_span_contained(Mention,ContextSpan)->(True),\n", " # TODO CHANGE: on different version\n", - " rgx_is_match(DisambiguationPattern,Sent)->(False).\n", - "\n", - "?CovidTags(A,B,C,D)\n", + " py_rgx_is_match(DisambiguationPattern,Sent)->(False).\n", "\n", "# post processing based on pattern\n", "CovidTags(Path,Mention,Tag,'post pattern')<-\n", " CovidMentionSents(Path,Mention,Sent),\n", " PostprocessPatternRules(Pattern,Tag),\n", " # TODO CHANGE: on different version\n", - " rgx(Pattern,Sent)->(ContextSpan),\n", + " py_rgx(Pattern,Sent)->(ContextSpan),\n", " # TODO CHANGE: on different version\n", - " span_contained(Mention,ContextSpan)->(True).\n", + " py_span_contained(Mention,ContextSpan)->(True).\n", "\n", "# post processing based on pattern and existing attributes\n", "# notice the recursive call to CovidTags\n", @@ -558,11 +543,10 @@ " PostprocessRulesWithAttributes(Pattern,OldTag,Tag),\n", " CovidMentionSents(Path,Mention,Sent),\n", " # TODO CHANGE: on different version\n", - " rgx(Pattern,Sent)->(ContextSpan),\n", + " py_rgx(Pattern,Sent)->(ContextSpan),\n", " # TODO CHANGE: on different version\n", - " span_contained(Mention,ContextSpan)->(True).\n", + " py_span_contained(Mention,ContextSpan)->(True).\n", "\n", - "?CovidTags(A,B,C,D)\n", "\n", "# post processing based on pattern in the next sentence\n", "CovidTags(Path,Mention,Tag,\"next sentence\")<-\n", @@ -570,9 +554,7 @@ " SentPairs(Path,Sent,NextSent),\n", " PostprocessPatternRules(Pattern,Tag),\n", " # TODO CHANGE: on different version\n", - " rgx(Pattern,NextSent)->(ContextSpan).\n", - "\n", - "?CovidTags(A,B,C,D)" + " py_rgx(Pattern,NextSent)->(ContextSpan).\n" ] }, { @@ -627,11 +609,9 @@ "%%spannerlog\n", "AggregatedCovidTags(Path,Mention,agg_mention(Tag))<-\n", " CovidTags(Path,Mention,Tag,Derivation).\n", - "?AggregatedCovidTags(A,B,C)\n", "\n", "DocumentTags(Path,agg_doc_tags(Tag))<-\n", - " AggregatedCovidTags(Path,Mention,Tag).\n", - "?DocumentTags(A,B)\n" + " AggregatedCovidTags(Path,Mention,Tag).\n" ] }, { diff --git a/nbs/callbacks/001_basic_ies.ipynb b/nbs/callbacks/001_basic_ies.ipynb index 36bd6a42..d770eca9 100644 --- a/nbs/callbacks/001_basic_ies.ipynb +++ b/nbs/callbacks/001_basic_ies.ipynb @@ -522,7 +522,6 @@ "#| export\n", "def span_contained(s1, s2):\n", " \"\"\"yields True if s1 is contained in s2, otherwise yield False\"\"\"\n", - " print(s1, s2, s1.doc, s2.doc, s1.start, s2.start, s1.end, s2.end)\n", " if s1.name == s2.name and s1.start >= s2.start and s1.end <= s2.end:\n", " yield True\n", " else:\n", diff --git a/spannerlib/banchmark/covid.py b/spannerlib/banchmark/covid.py index c5ab9248..fd8b7a41 100644 --- a/spannerlib/banchmark/covid.py +++ b/spannerlib/banchmark/covid.py @@ -28,10 +28,10 @@ sess = get_magic_session() -sess.register('rgx', rgx, [str, Span], span_arity) -sess.register('rgx_split', rgx_split, [str, Span], [Span,Span]) -sess.register('rgx_is_match', rgx_is_match, [str, Span], [bool]) -sess.register('span_contained', span_contained, [Span, Span], [bool]) +sess.register('py_rgx', rgx, [str, Span], span_arity) +sess.register('py_rgx_split', rgx_split, [str, Span], [Span,Span]) +sess.register('py_rgx_is_match', rgx_is_match, [str, Span], [bool]) +sess.register('py_span_contained', span_contained, [Span, Span], [bool]) import spacy nlp = spacy.load("en_core_web_sm") @@ -168,7 +168,6 @@ def rewrite_docs(docs,span_label,new_version): sess.import_rel('Docs',raw_docs, scheme=[str, Span, str]) else: sess.import_rel('Docs',raw_docs) -raw_docs # %% ../../nbs/benchmark/001_covid.ipynb 15 lemma_tags = sess.export('?Lemmas(P,D,W,L)') diff --git a/spannerlib/ie_func/basic.py b/spannerlib/ie_func/basic.py index a0858219..5f26a8c5 100644 --- a/spannerlib/ie_func/basic.py +++ b/spannerlib/ie_func/basic.py @@ -190,7 +190,6 @@ def as_str(obj): # %% ../../nbs/callbacks/001_basic_ies.ipynb 30 def span_contained(s1, s2): """yields True if s1 is contained in s2, otherwise yield False""" - print(s1, s2, s1.doc, s2.doc, s1.start, s2.start, s1.end, s2.end) if s1.name == s2.name and s1.start >= s2.start and s1.end <= s2.end: yield True else: