From e5f281c335eb3a03bbf22c72f166de53b8fcc5ab Mon Sep 17 00:00:00 2001 From: Ido Tarazi Date: Sat, 7 Dec 2024 10:46:21 +0200 Subject: [PATCH] fixed bugs --- nbs/010_engine.ipynb | 3 + nbs/session_tests/001_basic_tests.ipynb | 6 +- nbs/session_tests/002_test_ie_functions.ipynb | 645 ++++- .../004_rewriting_a_real_codebase.ipynb | 2194 +++++++---------- nbs/tutorials/covid_data/covid_logic.pl | 48 - spannerlib/_modidx.py | 1 + spannerlib/engine.py | 3 + spannerlib/tutorials/covid.py | 22 +- 8 files changed, 1568 insertions(+), 1354 deletions(-) diff --git a/nbs/010_engine.ipynb b/nbs/010_engine.ipynb index b9c675b4..0b99a44c 100644 --- a/nbs/010_engine.ipynb +++ b/nbs/010_engine.ipynb @@ -256,6 +256,9 @@ " self.spannerflow_engine.delete_row(fact.name, fact.terms)\n", " # self.db[fact.name] = _pd_drop_row(df = self.db[fact.name],row_vals=fact.terms)\n", "\n", + " def get_span(self, document_id: str, start: int, end: int):\n", + " return self.spannerflow_engine.get_span(document_id, start, end)\n", + " \n", " def get_ie_function(self,name:str):\n", " return self.ie_functions.get(name,None)\n", "\n", diff --git a/nbs/session_tests/001_basic_tests.ipynb b/nbs/session_tests/001_basic_tests.ipynb index 22bbbd22..a2d24572 100644 --- a/nbs/session_tests/001_basic_tests.ipynb +++ b/nbs/session_tests/001_basic_tests.ipynb @@ -2304,9 +2304,13 @@ ], "metadata": { "kernelspec": { - "display_name": "python3", + "display_name": ".env", "language": "python", "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.13.0" } }, "nbformat": 4, diff --git a/nbs/session_tests/002_test_ie_functions.ipynb b/nbs/session_tests/002_test_ie_functions.ipynb index f94be0d1..22601eed 100644 --- a/nbs/session_tests/002_test_ie_functions.ipynb +++ b/nbs/session_tests/002_test_ie_functions.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -40,9 +40,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[[@doc,0,1) \"h\",\n", + " [@doc,0,2) \"he\",\n", + " [@doc,0,3) \"hel\",\n", + " [@doc,0,4) \"hell\",\n", + " [@doc,0,5) \"hello\"]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def yield_range_span(text: str,name:str):\n", " for i in range(len(text)):\n", @@ -53,9 +68,251 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", + "I0000 00:00:1732644923.528662 7967395 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, + { + "data": { + "text/plain": [ + "'?test_range(X)'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
X
0
1
2
3
4
\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0000 00:00:1732644927.191288 7967395 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, + { + "data": { + "text/plain": [ + "'?test_range_str(X)'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
X
string0
string1
string2
string3
string4
\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0000 00:00:1732644929.744600 7967395 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, + { + "data": { + "text/plain": [ + "'?test_range_span(S)'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S
h
he
hel
hell
hello
\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "def yield_range(num: int):\n", " for i in range(num):\n", @@ -104,9 +361,145 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'?test_range_raw(S)'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S
[@doc,0,1) \"h\"
[@doc,0,2) \"he\"
[@doc,0,3) \"hel\"
[@doc,0,4) \"hell\"
[@doc,0,5) \"hello\"
\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S
0(h)
1(h, e)
2(h, e, l)
3(h, e, l, l)
4(h, e, l, l, o)
\n", + "
" + ], + "text/plain": [ + " S\n", + "0 (h)\n", + "1 (h, e)\n", + "2 (h, e, l)\n", + "3 (h, e, l, l)\n", + "4 (h, e, l, l, o)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "\n", "sess.export('?test_range_raw(S)',display_results=True)" @@ -141,9 +534,147 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0000 00:00:1732645425.758613 8004413 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n", + "I0000 00:00:1732645426.997195 7967395 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, + { + "data": { + "text/plain": [ + "'?py_span_rel(X)'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
X
[@e0c903,0,2) \"aa\"
\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0000 00:00:1732645444.780671 7967395 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, + { + "data": { + "text/plain": [ + "'?py_string_rel(X)'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
X
aa
\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "sess = test_session(\n", "[\n", @@ -170,9 +701,85 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0000 00:00:1732645470.038415 8011530 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n", + "I0000 00:00:1732645471.258000 7967395 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, + { + "data": { + "text/plain": [ + "'?py_span_rel(X,DEBUG)'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XDEBUG
\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "AssertionError", + "evalue": "values: set() != {('aa', 'X=aa, const=aa')}\nvalues only in df1: set()\nvalues only in df2: {('aa', 'X=aa, const=aa')}", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m sess \u001b[38;5;241m=\u001b[39m \u001b[43mtest_session\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m[\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;250;43m \u001b[39;49m\u001b[38;5;124;43;03m\"\"\"\u001b[39;49;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;124;43;03m py_span_rel(X,DEBUG) <- rgx(\".+\",\"aa\") -> (X), print(\"X={0}, const={1}\",X,\"aa\")->(DEBUG).\u001b[39;49;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124;43;03m ?py_span_rel(X,DEBUG)\u001b[39;49;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;124;43;03m \"\"\"\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \n\u001b[1;32m 8\u001b[0m \u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m[\u001b[49m\n\u001b[1;32m 10\u001b[0m \n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mDataFrame\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43maa\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mX=aa, const=aa\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mX\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mDEBUG\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# _ = sess.export('?py_span_rel(X)',display_results=True)\u001b[39;00m\n", + "File \u001b[0;32m~/Documents/Technion/spanner/spannerlib/spannerlib/session.py:508\u001b[0m, in \u001b[0;36mtest_session\u001b[0;34m(code_strings, expected_outputs, ie_funcs, agg_funcs, csvs, debug, display_results)\u001b[0m\n\u001b[1;32m 506\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[1;32m 507\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(expected,pd\u001b[38;5;241m.\u001b[39mDataFrame) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(res,pd\u001b[38;5;241m.\u001b[39mDataFrame):\n\u001b[0;32m--> 508\u001b[0m \u001b[43massert_df_equals\u001b[49m\u001b[43m(\u001b[49m\u001b[43mres\u001b[49m\u001b[43m,\u001b[49m\u001b[43mexpected\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 509\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 510\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m res \u001b[38;5;241m==\u001b[39m expected, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexpected \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexpected\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mres\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Documents/Technion/spanner/spannerlib/spannerlib/utils.py:196\u001b[0m, in \u001b[0;36massert_df_equals\u001b[0;34m(df1, df2)\u001b[0m\n\u001b[1;32m 194\u001b[0m vals1 \u001b[38;5;241m=\u001b[39m serialize_df_values(df1)\n\u001b[1;32m 195\u001b[0m vals2 \u001b[38;5;241m=\u001b[39m serialize_df_values(df2) \n\u001b[0;32m--> 196\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m vals1\u001b[38;5;241m==\u001b[39mvals2 , (\n\u001b[1;32m 197\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalues: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvals1\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m != \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvals2\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 198\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalues only in df1: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvals1\u001b[38;5;241m-\u001b[39mvals2\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 199\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalues only in df2: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvals2\u001b[38;5;241m-\u001b[39mvals1\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 200\u001b[0m )\n\u001b[1;32m 201\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m df1\n", + "\u001b[0;31mAssertionError\u001b[0m: values: set() != {('aa', 'X=aa, const=aa')}\nvalues only in df1: set()\nvalues only in df2: {('aa', 'X=aa, const=aa')}" + ] + } + ], "source": [ "sess = test_session(\n", "[\n", @@ -310,9 +917,21 @@ ], "metadata": { "kernelspec": { - "display_name": "python3", + "display_name": ".env", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.0" } }, "nbformat": 4, diff --git a/nbs/tutorials/004_rewriting_a_real_codebase.ipynb b/nbs/tutorials/004_rewriting_a_real_codebase.ipynb index 8ed013db..50fc4bf4 100644 --- a/nbs/tutorials/004_rewriting_a_real_codebase.ipynb +++ b/nbs/tutorials/004_rewriting_a_real_codebase.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -28,17 +28,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", + "\n", + "\n" ], "text/plain": [ "" @@ -695,7 +709,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -715,7 +729,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -802,7 +816,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -826,7 +840,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -835,7 +849,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -846,7 +860,7 @@ " [@a6c01c,101,139) \"SARS-COV-2...\"]" ] }, - "execution_count": null, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -872,7 +886,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -899,7 +913,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -908,7 +922,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -929,7 +943,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -938,7 +952,7 @@ "[([@01e12d,0,4) \"sick\", 'ADJ'), ([@01e12d,5,8) \"boy\", 'NOUN')]" ] }, - "execution_count": null, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -950,14 +964,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "#| exports\n", - "sess.register('split_sentence',split_sentence,[(str,Span)],[Span])\n", - "sess.register('pos',pos_annotator,[(Span,str)],[Span,str])\n", - "sess.register('lemma',lemmatizer,[(Span,str)],[Span,str])" + "sess.register('split_sentence',split_sentence,[Span],[Span])\n", + "sess.register('pos',pos_annotator,[Span],[Span,str])\n", + "sess.register('lemma',lemmatizer,[Span],[Span,str])" ] }, { @@ -1035,7 +1049,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -1068,7 +1082,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1125,7 +1139,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1174,7 +1188,7 @@ "1 [@doc,22,28) \"friend\" 'nemesis'" ] }, - "execution_count": null, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1190,7 +1204,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1199,7 +1213,7 @@ "'hello darkness my young nemesis, I come ...'" ] }, - "execution_count": null, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1212,7 +1226,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -1278,7 +1292,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -1302,9 +1316,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", + "I0000 00:00:1733560524.989903 2260301 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, { "data": { "text/plain": [ @@ -1387,6 +1409,13 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0000 00:00:1733560527.945666 2260301 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, { "data": { "text/plain": [ @@ -1460,6 +1489,13 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0000 00:00:1733560529.633185 2260301 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, { "data": { "text/plain": [ @@ -1472,7 +1508,7 @@ { "data": { "text/plain": [ - "110" + "111" ] }, "metadata": {}, @@ -1614,7 +1650,7 @@ { "data": { "text/plain": [ - "176" + "177" ] }, "metadata": {}, @@ -1684,6 +1720,13 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0000 00:00:1733560535.108147 2260301 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, { "data": { "text/plain": [ @@ -1834,6 +1877,13 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0000 00:00:1733560538.065689 2260301 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, { "data": { "text/plain": [ @@ -1922,7 +1972,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -2030,7 +2080,7 @@ "9 sample3.txt Problem List: 1. Pneumonia 2. Novel Coronaviru... raw_text" ] }, - "execution_count": null, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2043,7 +2093,7 @@ " [p.name,p.read_text(),'raw_text'] for p in file_paths\n", "],columns=['Path','Doc','Version']\n", ")\n", - "sess.import_rel('Docs',raw_docs)\n", + "sess.import_rel('Docs',raw_docs, scheme=[str, Span, str])\n", "raw_docs" ] }, @@ -2069,9 +2119,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0000 00:00:1733560539.758622 2260301 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers\n" + ] + }, { "data": { "text/plain": [ @@ -2085,156 +2142,159 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
PDWordLemPDWordLem
sample1.txtPatient presents to be tested for COVID-19. His wife recently tested positive for novel coronavirus. SARS-COV-2 results came back positive.[@a6c01c,0,7) \"Patient\"patientsample1.txt[@a6c01c,0,139) \"Patient pr...\"[@a6c01c,0,7) \"Patient\"patient
sample1.txtPatient presents to be tested for COVID-19. His wife recently tested positive for novel coronavirus. SARS-COV-2 results came back positive.[@a6c01c,20,22) \"be\"besample1.txt[@a6c01c,0,139) \"Patient pr...\"[@a6c01c,20,22) \"be\"be
sample10.txtpatient was screened for cov-19. results came back positive.[@9f417c,0,7) \"patient\"patientsample10.txt[@9f417c,0,60) \"patient wa...\"[@9f417c,0,7) \"patient\"patient
sample10.txtpatient was screened for cov-19. results came back positive.[@9f417c,8,11) \"was\"besample10.txt[@9f417c,0,60) \"patient wa...\"[@9f417c,8,11) \"was\"be
sample2.txtThe patient was tested for Coronavirus 2019. Results are positive. Patient underwent no Coronavirus 2019 education.[@591f89,4,11) \"patient\"patientsample2.txt[@591f89,0,115) \"The patien...\"[@591f89,39,43) \"2019\"like_num
sample2.txtThe patient was tested for Coronavirus 2019. Results are positive. Patient underwent no Coronavirus 2019 education.[@591f89,12,15) \"was\"besample2.txt[@591f89,0,115) \"The patien...\"[@591f89,100,104) \"2019\"like_num
sample2.txtThe patient was tested for Coronavirus 2019. Results are positive. Patient underwent no Coronavirus 2019 education.[@591f89,39,43) \"2019\"like_numsample2.txt[@591f89,0,115) \"The patien...\"[@591f89,67,74) \"Patient\"patient
sample2.txtThe patient was tested for Coronavirus 2019. Results are positive. Patient underwent no Coronavirus 2019 education.[@591f89,53,56) \"are\"besample2.txt[@591f89,0,115) \"The patien...\"[@591f89,53,56) \"are\"be
sample2.txtThe patient was tested for Coronavirus 2019. Results are positive. Patient underwent no Coronavirus 2019 education.[@591f89,67,74) \"Patient\"patientsample2.txt[@591f89,0,115) \"The patien...\"[@591f89,4,11) \"patient\"patient
sample2.txtThe patient was tested for Coronavirus 2019. Results are positive. Patient underwent no Coronavirus 2019 education.[@591f89,100,104) \"2019\"like_numsample2.txt[@591f89,0,115) \"The patien...\"[@591f89,12,15) \"was\"be
sample3.txtProblem List: 1. Pneumonia 2. Novel Coronavirus 2019 [@45bf63,14,15) \"1\"like_numsample3.txt[@45bf63,0,53) \"Problem Li...\"[@45bf63,14,15) \"1\"like_num
sample3.txtProblem List: 1. Pneumonia 2. Novel Coronavirus 2019 [@45bf63,27,28) \"2\"like_numsample3.txt[@45bf63,0,53) \"Problem Li...\"[@45bf63,27,28) \"2\"like_num
sample3.txtProblem List: 1. Pneumonia 2. Novel Coronavirus 2019 [@45bf63,48,52) \"2019\"like_numsample3.txt[@45bf63,0,53) \"Problem Li...\"[@45bf63,48,52) \"2019\"like_num
sample6.txtThe patient have reported novel coronavirus. [@2473a3,4,11) \"patient\"patientsample6.txt[@2473a3,0,45) \"The patien...\"[@2473a3,12,16) \"have\"have
sample6.txtThe patient have reported novel coronavirus. [@2473a3,12,16) \"have\"havesample6.txt[@2473a3,0,45) \"The patien...\"[@2473a3,4,11) \"patient\"patient
sample8.txtPatient was sent for a covid test. Someone was tested positive.[@aad8ff,8,11) \"was\"besample8.txt[@aad8ff,0,63) \"Patient wa...\"[@aad8ff,8,11) \"was\"be
sample8.txtPatient was sent for a covid test. Someone was tested positive.[@aad8ff,43,46) \"was\"besample8.txt[@aad8ff,0,63) \"Patient wa...\"[@aad8ff,43,46) \"was\"be
sample9.txtPatient had contact patient with coronavirus. screening positive coronavirus.[@0e1178,8,11) \"had\"havesample9.txt[@0e1178,0,77) \"Patient ha...\"[@0e1178,12,19) \"contact\"contact
sample9.txtPatient had contact patient with coronavirus. screening positive coronavirus.[@0e1178,12,19) \"contact\"contactsample9.txt[@0e1178,0,77) \"Patient ha...\"[@0e1178,8,11) \"had\"have
sample9.txtPatient had contact patient with coronavirus. screening positive coronavirus.[@0e1178,20,27) \"patient\"patientsample9.txt[@0e1178,0,77) \"Patient ha...\"[@0e1178,20,27) \"patient\"patient
\n", "\n", "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'?CovidMentionSents(P,Mention,Sent)'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
PMentionSent
sample1.txt[@931cb5,34,42) \"COVID-19\"[@931cb5,0,43) \"patient pr...\"
sample1.txt[@931cb5,84,92) \"COVID-19\"[@931cb5,44,93) \"His family...\"
sample1.txt[@931cb5,94,102) \"COVID-19\"[@931cb5,94,130) \"COVID-19 r...\"
sample2.txt[@e4b074,26,34) \"COVID-19\"[@e4b074,0,44) \"The patien...\"
sample2.txt[@e4b074,87,95) \"COVID-19\"[@e4b074,66,115) \"patient un...\"
sample3.txt[@882253,44,52) \"COVID-19\"[@882253,44,61) \"COVID-19 l...\"
sample4.txt[@77c574,4,12) \"COVID-19\"[@77c574,0,23) \"neg COVID-...\"
sample5.txt[@ffb7c7,9,17) \"COVID-19\"[@ffb7c7,0,29) \"positive C...\"
sample6.txt[@b2612f,26,34) \"COVID-19\"[@b2612f,0,35) \"The patien...\"
sample8.txt[@3db2e4,22,30) \"COVID-19\"[@3db2e4,0,36) \"patient be...\"
sample9.txt[@6d2862,34,42) \"COVID-19\"[@6d2862,0,43) \"patient ha...\"
\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "%%spannerlog -a {slog_file}\n", "\n", @@ -4581,6 +4524,28 @@ "?CovidTags(Path,Mention,Tag,Derivation)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "' '.join(['cargo', 'build', '--manifest-path', '/Users/itarazi/Documents/Technion/spanner/spannerflow/spannerflow/Cargo.toml', '-p', 'spannerflow'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import networkx as nx\n", + "from graph_rewrite import draw\n", + "\n", + "graph, root = sess.export(\"?CovidTags(Path,Mention,Tag,Derivation)\", plan_query=True, draw_query=True)\n", + "draw(nx.reverse(graph))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -4643,178 +4608,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'?AggregatedCovidTags(Path,Mention,Tag)'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
PathMentionTag
sample1.txt[@931cb5,84,92) \"COVID-19\"negated
sample1.txt[@931cb5,94,102) \"COVID-19\"positive
sample2.txt[@e4b074,87,95) \"COVID-19\"IGNORE
sample3.txt[@882253,44,52) \"COVID-19\"positive
sample4.txt[@77c574,4,12) \"COVID-19\"IGNORE
sample5.txt[@ffb7c7,9,17) \"COVID-19\"positive
sample6.txt[@b2612f,26,34) \"COVID-19\"uncertain
\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'?DocumentTags(Path,Tag)'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
PathTag
sample1.txtPOS
sample2.txtUNK
sample3.txtPOS
sample4.txtUNK
sample5.txtPOS
sample6.txtUNK
\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "%%spannerlog -a {slog_file}\n", "AggregatedCovidTags(Path,Mention,agg_mention(Tag))<-\n", @@ -4832,82 +4626,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
PT
0sample1.txtPOS
1sample2.txtUNK
2sample3.txtPOS
3sample4.txtUNK
4sample5.txtPOS
5sample6.txtUNK
\n", - "
" - ], - "text/plain": [ - " P T\n", - "0 sample1.txt POS\n", - "1 sample2.txt UNK\n", - "2 sample3.txt POS\n", - "3 sample4.txt UNK\n", - "4 sample5.txt POS\n", - "5 sample6.txt UNK" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "#| export\n", "doc_tags = sess.export('?DocumentTags(P,T)')\n", @@ -4926,106 +4645,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
PT
0sample1.txtPOS
1sample10.txtUNK
2sample2.txtUNK
3sample3.txtPOS
4sample4.txtUNK
5sample5.txtPOS
6sample6.txtUNK
7sample7.txtUNK
8sample8.txtUNK
9sample9.txtUNK
\n", - "
" - ], - "text/plain": [ - " P T\n", - "0 sample1.txt POS\n", - "1 sample10.txt UNK\n", - "2 sample2.txt UNK\n", - "3 sample3.txt POS\n", - "4 sample4.txt UNK\n", - "5 sample5.txt POS\n", - "6 sample6.txt UNK\n", - "7 sample7.txt UNK\n", - "8 sample8.txt UNK\n", - "9 sample9.txt UNK" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "#| export\n", "paths = pd.DataFrame([p.name for p in file_paths],columns=['P'])\n", @@ -5630,9 +5250,21 @@ ], "metadata": { "kernelspec": { - "display_name": "python3", + "display_name": ".env", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" } }, "nbformat": 4, diff --git a/nbs/tutorials/covid_data/covid_logic.pl b/nbs/tutorials/covid_data/covid_logic.pl index d8350473..462571ea 100644 --- a/nbs/tutorials/covid_data/covid_logic.pl +++ b/nbs/tutorials/covid_data/covid_logic.pl @@ -28,55 +28,7 @@ Sents(P,S)<-Docs(P,D,"target_concept"),split_sentence(D)->(S). -SentPairs(P,S1,S2)<-Sents(P,S1),Sents(P,S2),expr_eval("{0}.end +1 == {1}.start",S1,S2)->(True). - # first we get the covid mentions and their surrounding sentences, using the span_contained ie function CovidMentions(Path, Span) <- Docs(Path,D,"target_concept"), rgx("COVID-19",D) -> (Span). CovidMentionSents(P,Mention,Sent)<-CovidMentions(P,Mention),Sents(P,Sent),span_contained(Mention,Sent)->(True). - -# note that for ease of debugging, we extended our head to track which rule a fact was derived from - -# a tag is positive if it is contained in a positive section -CovidTags(Path,Mention,'positive','section')<- - PositiveSections(Path,D,Title,Section), - CovidMentions(Path,Mention), - span_contained(Mention,Section)->(True). - -# Context rules tags -CovidTags(Path,Mention,Tag,'sentence context')<- - CovidMentionSents(Path,Mention,Sent), - SentenceContextRules(Pattern,Tag,DisambiguationPattern), - rgx(Pattern,Sent)->(ContextSpan), - span_contained(Mention,ContextSpan)->(True), - rgx_is_match(DisambiguationPattern,Sent)->(False). - -# post processing based on pattern -CovidTags(Path,Mention,Tag,'post pattern')<- - CovidMentionSents(Path,Mention,Sent), - PostprocessPatternRules(Pattern,Tag), - rgx(Pattern,Sent)->(ContextSpan), - span_contained(Mention,ContextSpan)->(True). - -# post processing based on pattern and existing attributes -# notice the recursive call to CovidTags -CovidTags(Path,Mention,Tag,"post attribute change")<- - CovidTags(Path,Mention,OldTag,Derivation), - PostprocessRulesWithAttributes(Pattern,OldTag,Tag), - CovidMentionSents(Path,Mention,Sent), - rgx(Pattern,Sent)->(ContextSpan), - span_contained(Mention,ContextSpan)->(True). - -# post processing based on pattern in the next sentence -CovidTags(Path,Mention,Tag,"next sentence")<- - CovidMentionSents(Path,Mention,Sent), - SentPairs(Path,Sent,NextSent), - PostprocessPatternRules(Pattern,Tag), - rgx(Pattern,NextSent)->(ContextSpan). - -AggregatedCovidTags(Path,Mention,agg_mention(Tag))<- - CovidTags(Path,Mention,Tag,Derivation). - -DocumentTags(Path,agg_doc_tags(Tag))<- - AggregatedCovidTags(Path,Mention,Tag). - diff --git a/spannerlib/_modidx.py b/spannerlib/_modidx.py index 3a5546e5..e78e31be 100644 --- a/spannerlib/_modidx.py +++ b/spannerlib/_modidx.py @@ -60,6 +60,7 @@ 'spannerlib.engine.Engine.get_ie_function': ( 'engine.html#engine.get_ie_function', 'spannerlib/engine.py'), 'spannerlib.engine.Engine.get_relation': ('engine.html#engine.get_relation', 'spannerlib/engine.py'), + 'spannerlib.engine.Engine.get_span': ('engine.html#engine.get_span', 'spannerlib/engine.py'), 'spannerlib.engine.Engine.get_var': ('engine.html#engine.get_var', 'spannerlib/engine.py'), 'spannerlib.engine.Engine.load_csv': ('engine.html#engine.load_csv', 'spannerlib/engine.py'), 'spannerlib.engine.Engine.plan_query': ('engine.html#engine.plan_query', 'spannerlib/engine.py'), diff --git a/spannerlib/engine.py b/spannerlib/engine.py index 1f9a0c67..4fc2dadc 100644 --- a/spannerlib/engine.py +++ b/spannerlib/engine.py @@ -173,6 +173,9 @@ def del_fact(self,fact:Relation): self.spannerflow_engine.delete_row(fact.name, fact.terms) # self.db[fact.name] = _pd_drop_row(df = self.db[fact.name],row_vals=fact.terms) + def get_span(self, document_id: str, start: int, end: int): + return self.spannerflow_engine.get_span(document_id, start, end) + def get_ie_function(self,name:str): return self.ie_functions.get(name,None) diff --git a/spannerlib/tutorials/covid.py b/spannerlib/tutorials/covid.py index 02f3a1bb..0424adc6 100644 --- a/spannerlib/tutorials/covid.py +++ b/spannerlib/tutorials/covid.py @@ -79,9 +79,9 @@ def __call__(self,text): pos_annotator = PosFromList(["NOUN", "PROPN", "PRON", "ADJ"]) # %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 49 -sess.register('split_sentence',split_sentence,[(str,Span)],[Span]) -sess.register('pos',pos_annotator,[(Span,str)],[Span,str]) -sess.register('lemma',lemmatizer,[(Span,str)],[Span,str]) +sess.register('split_sentence',split_sentence,[Span],[Span]) +sess.register('pos',pos_annotator,[Span],[Span,str]) +sess.register('lemma',lemmatizer,[Span],[Span,str]) # %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 55 def rewrite(text,span_label_pairs): @@ -144,20 +144,20 @@ def rewrite_docs(docs,span_label,new_version): [p.name,p.read_text(),'raw_text'] for p in file_paths ],columns=['Path','Doc','Version'] ) -sess.import_rel('Docs',raw_docs) +sess.import_rel('Docs',raw_docs, scheme=[str, Span, str]) raw_docs # %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 71 lemma_tags = sess.export('?Lemmas(P,D,W,L)') lemma_docs = rewrite_docs(raw_docs,lemma_tags,'lemma') -sess.import_rel('Docs',lemma_docs) +sess.import_rel('Docs',lemma_docs, scheme=[str, Span, str]) # %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 74 lemma_concept_matches = sess.export('?LemmaConceptMatches(Path,Doc,Span,Label)') display(lemma_concept_matches.map(repr).head()) lemma_concepts = rewrite_docs(lemma_docs,lemma_concept_matches,'lemma_concept') -sess.import_rel('Docs',lemma_concepts) +sess.import_rel('Docs',lemma_concepts, scheme=[str, Span, str]) lemma_concepts.head() # %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 78 @@ -165,14 +165,14 @@ def rewrite_docs(docs,span_label,new_version): display(pos_concept_matches.map(repr).head()) pos_concept_docs = rewrite_docs(lemma_concepts,pos_concept_matches,'pos_concept') -sess.import_rel('Docs',pos_concept_docs) +sess.import_rel('Docs',pos_concept_docs, scheme=[str, Span, str]) sess.export('?Docs("sample8.txt",D,V)') # %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 81 target_matches = sess.export('?TargetMatches(P,D,W,L)') display(target_matches.map(repr)) target_rule_docs = rewrite_docs(pos_concept_docs,target_matches,'target_concept') -sess.import_rel('Docs',target_rule_docs) +sess.import_rel('Docs',target_rule_docs, scheme=[str, Span, str]) # %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 87 section_tags = pd.read_csv(data_dir/'section_tags.csv',names=['literal','tag']) @@ -184,7 +184,7 @@ def rewrite_docs(docs,span_label,new_version): sess.import_var('section_delimeter_pattern',section_delimeter_pattern) section_delimeter_pattern -# %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 112 +# %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 111 def agg_mention(group): """ aggregates attribute groups of covid spans @@ -220,11 +220,11 @@ def AggDocumentTags(group): sess.register_agg('agg_mention',agg_mention,[str],[str]) sess.register_agg('agg_doc_tags',AggDocumentTags,[str],[str]) -# %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 114 +# %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 113 doc_tags = sess.export('?DocumentTags(P,T)') doc_tags -# %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 116 +# %% ../../nbs/tutorials/004_rewriting_a_real_codebase.ipynb 115 paths = pd.DataFrame([p.name for p in file_paths],columns=['P']) classification = paths.merge(doc_tags,on='P',how='outer') classification['T']=classification['T'].fillna('UNK')