jupyter-server · dlqqq · Jul 6, 2022 · Jul 13, 2022 · Jul 26, 2022 · Jul 26, 2022
diff --git a/jupyter_server/benchmarks/fileidmanager_benchmark.py b/jupyter_server/benchmarks/fileidmanager_benchmark.py
@@ -0,0 +1,133 @@
+import os
+import timeit
+
+from jupyter_core.paths import jupyter_data_dir
+
+from jupyter_server.services.contents.fileidmanager import FileIdManager
+
+db_path = os.path.join(jupyter_data_dir(), "file_id_manager_perftest.db")
+
+
+def build_setup(n, insert=True):
+    def setup():
+        try:
+            os.remove(db_path)
+        except:
+            pass
+        fid_manager = FileIdManager(db_path=db_path)
+
+        if not insert:
+            return
+
+        for i in range(n):
+            fid_manager.con.execute(
+                "INSERT INTO Files (path) VALUES (?)", (f"abracadabra/{i}.txt",)
+            )
+        fid_manager.con.commit()
+
+    return setup
+
+
+BATCH_SIZE = 100_000
+
+
+def build_test_index(n, single_transaction, batched=False):
+    def test_index():
+        fid_manager = FileIdManager(db_path=db_path)
+
+        if single_transaction:
+            if batched:
+                for batch_start in range(0, n, BATCH_SIZE):
+                    batch_end = batch_start + BATCH_SIZE
+                    fid_manager.con.execute(
+                        "INSERT INTO FILES (path) VALUES "
+                        + ",".join(
+                            [f'("abracadabra/{i}.txt")' for i in range(batch_start, batch_end)]
+                        )
+                    )
+            else:
+                for i in range(n):
+                    fid_manager.con.execute(
+                        "INSERT INTO Files (path) VALUES (?)", (f"abracadabra/{i}.txt",)
+                    )
+
+            fid_manager.con.commit()
+        else:
+            for i in range(n):
+                fid_manager.index(f"abracadabra/{i}.txt")
+
+    return test_index
+
+
+def test_copy():
+    fid_manager = FileIdManager(db_path=db_path)
+    fid_manager.copy("abracadabra", "shazam", recursive=True)
+
+
+def test_move():
+    fid_manager = FileIdManager(db_path=db_path)
+    fid_manager.move("abracadabra", "shazam", recursive=True)
+
+
+def test_delete():
+    fid_manager = FileIdManager(db_path=db_path)
+    fid_manager.delete("abracadabra", recursive=True)
+
+
+row_template = "{:<9,d} files | {:<8.4f} s"
+
+
+# too slow for 1k+
+print("Index benchmark (separate transactions)")
+for i in [100, 1_000]:
+    print(
+        row_template.format(
+            i,
+            timeit.timeit(
+                build_test_index(i, single_transaction=False),
+                build_setup(i, insert=False),
+                number=1,
+            ),
+        )
+    )
+
+print("Index benchmark (single transaction, atomic INSERTs)")
+for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
+    print(
+        row_template.format(
+            i,
+            timeit.timeit(
+                build_test_index(i, single_transaction=True, batched=False),
+                build_setup(i, insert=False),
+                number=1,
+            ),
+        )
+    )
+
+# suggested by https://stackoverflow.com/a/72527058/12548458
+# asymptotically faster because it reduces work being done by the SQLite VDBE https://www.sqlite.org/opcode.html
+# weird constant time factor that makes it sub-optimal for <1M records.
+print("Index benchmark (single transaction, batched INSERTs)")
+for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
+    print(
+        row_template.format(
+            i,
+            timeit.timeit(
+                build_test_index(i, single_transaction=True, batched=True),
+                build_setup(i, insert=False),
+                number=1,
+            ),
+        )
+    )
+
+print("Recursive move benchmark")
+for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
+    print(row_template.format(i, timeit.timeit(test_move, build_setup(i), number=1)))
+
+print("Recursive copy benchmark")
+for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
+    print(row_template.format(i, timeit.timeit(test_copy, build_setup(i), number=1)))
+
+print("Recursive delete benchmark")
+for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
+    print(row_template.format(i, timeit.timeit(test_delete, build_setup(i), number=1)))
diff --git a/jupyter_server/pytest_plugin.py b/jupyter_server/pytest_plugin.py
@@ -18,6 +18,7 @@
 
 from jupyter_server.extension import serverextension
 from jupyter_server.serverapp import ServerApp
+from jupyter_server.services.contents.fileidmanager import FileIdManager
 from jupyter_server.services.contents.filemanager import FileContentsManager
 from jupyter_server.services.contents.largefilemanager import LargeFileManager
 from jupyter_server.utils import url_path_join
@@ -457,6 +458,27 @@ def jp_large_contents_manager(tmp_path):
     return LargeFileManager(root_dir=str(tmp_path))
 
 
+@pytest.fixture
+def fid_db_path(jp_data_dir):
+    """Fixture that returns the file ID DB path used for tests."""
+    return str(jp_data_dir / "fileidmanager_test.db")
+
+
+@pytest.fixture(autouse=True)
+def delete_db(fid_db_path):
+    """Fixture that automatically deletes the DB file before each test."""
+    try:
+        os.remove(fid_db_path)
+    except OSError:
+        pass
+
+
+@pytest.fixture
+def fid_manager(fid_db_path):
+    """Fixture returning a test-configured instance of `FileIdManager`."""
+    return FileIdManager(db_path=fid_db_path)
+
+
 @pytest.fixture
 def jp_create_notebook(jp_root_dir):
     """Creates a notebook in the test's home directory."""

diff --git a/jupyter_server/serverapp.py b/jupyter_server/serverapp.py
@@ -111,6 +111,7 @@
 )
 from jupyter_server.log import log_request
 from jupyter_server.services.config import ConfigManager
+from jupyter_server.services.contents.fileidmanager import FileIdManager
 from jupyter_server.services.contents.filemanager import (
     AsyncFileContentsManager,
     FileContentsManager,
@@ -1886,9 +1887,9 @@ def init_configurables(self):
             connection_dir=self.runtime_dir,
             kernel_spec_manager=self.kernel_spec_manager,
         )
+        self.file_id_manager = FileIdManager(parent=self, log=self.log)
         self.contents_manager = self.contents_manager_class(
-            parent=self,
-            log=self.log,
+            parent=self, log=self.log, file_id_manager=self.file_id_manager
         )
         self.session_manager = self.session_manager_class(
             parent=self,
@@ -2508,6 +2509,11 @@ async def cleanup_extensions(self):
         self.log.info(extension_msg % n_extensions)
         await run_sync_in_loop(self.extension_manager.stop_all_extensions())
 
+    def cleanup_file_id_manager(self):
+        if not getattr(self, "file_id_manager", None):
+            return
+        self.file_id_manager._cleanup()
+
     def running_server_info(self, kernel_count=True):
         "Return the current working directory and the server url information"
         info = self.contents_manager.info_string() + "\n"
@@ -2780,6 +2786,7 @@ async def _cleanup(self):
         self.remove_browser_open_files()
         await self.cleanup_extensions()
         await self.cleanup_kernels()
+        self.cleanup_file_id_manager()
         if getattr(self, "session_manager", None):
             self.session_manager.close()
         if getattr(self, "event_bus", None):

diff --git a/jupyter_server/services/contents/fileidmanager.py b/jupyter_server/services/contents/fileidmanager.py
@@ -0,0 +1,126 @@
+import os
+import sqlite3
+
+from jupyter_core.paths import jupyter_data_dir
+from traitlets import Unicode
+from traitlets.config.configurable import LoggingConfigurable
+
+
+class FileIdManager(LoggingConfigurable):
+    db_path = Unicode(
 database_filepath = Unicode( 
     default_value=":memory:", 
     help=( 
         "The filesystem path to SQLite Database file " 
         "(e.g. /path/to/session_database.db). By default, the session " 
         "database is stored in-memory (i.e. `:memory:` setting from sqlite3) " 
         "and does not persist when the current Jupyter Server shuts down." 
     ), 
 ).tag(config=True) 
 @validate("database_filepath") 
 def _validate_database_filepath(self, proposal): 
     value = proposal["value"] 
     if value == ":memory:": 
         return value 
     path = pathlib.Path(value) 
     if path.exists(): 
         # Verify that the database path is not a directory. 
         if path.is_dir(): 
             raise TraitError( 
                 "`database_filepath` expected a file path, but the given path is a directory." 
             ) 
         # Verify that database path is an SQLite 3 Database by checking its header. 
         with open(value, "rb") as f: 
             header = f.read(100) 
         if not header.startswith(b"SQLite format 3") and not header == b"": 
             raise TraitError("The given file is not an SQLite database file.") 
     return value 
 database_filepath = Unicode( 
     default_value=":memory:", 
     help=( 
         "The filesystem path to SQLite Database file " 
         "(e.g. /path/to/session_database.db). By default, the session " 
         "database is stored in-memory (i.e. `:memory:` setting from sqlite3) " 
         "and does not persist when the current Jupyter Server shuts down." 
     ), 
 ).tag(config=True) 
 @validate("database_filepath") 
 def _validate_database_filepath(self, proposal): 
     value = proposal["value"] 
     if value == ":memory:": 
         return value 
     path = pathlib.Path(value) 
     if path.exists(): 
         # Verify that the database path is not a directory. 
         if path.is_dir(): 
             raise TraitError( 
                 "`database_filepath` expected a file path, but the given path is a directory." 
             ) 
         # Verify that database path is an SQLite 3 Database by checking its header. 
         with open(value, "rb") as f: 
             header = f.read(100) 
  
         if not header.startswith(b"SQLite format 3") and not header == b"": 
             raise TraitError("The given file is not an SQLite database file.") 
     return value 
+        default_value=os.path.join(jupyter_data_dir(), "file_id_manager.db"),
+        help=(
+            "The path of the DB file used by `FileIdManager`. "
+            "Defaults to `jupyter_data_dir()/file_id_manager.db`."
+        ),
+        config=True,
+    )
+
+    def __init__(self, *args, **kwargs):
+        # pass args and kwargs to parent Configurable
+        super().__init__(*args, **kwargs)
+        # initialize connection with db
+        self.con = sqlite3.connect(self.db_path)
+        self.log.debug("Creating File ID tables and indices")
+        self.con.execute(
+            "CREATE TABLE IF NOT EXISTS Files(id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE)"
+        )
+        self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_path ON FILES (path)")
+        self.con.commit()
+
+    def _normalize_path(self, path):
+        """Normalizes a given file path."""
+        path = os.path.normcase(path)
+        path = os.path.normpath(path)
+        return path
+
+    def index(self, path):
+        """Adds the file path to the Files table, then returns the file ID. If
+        the file is already indexed, the file ID is immediately returned."""
+        path = self._normalize_path(path)
+        existing_id = self.get_id(path)
+        if existing_id is not None:
+            return existing_id
+
+        cursor = self.con.execute("INSERT INTO Files (path) VALUES (?)", (path,))
+        self.con.commit()
+        return cursor.lastrowid
+
+    def get_id(self, path):
+        """Retrieves the file ID associated with a file path. Returns None if
+        the file path has not yet been indexed."""
+        path = self._normalize_path(path)
+        row = self.con.execute("SELECT id FROM Files WHERE path = ?", (path,)).fetchone()
+        self.con.commit()
+        return row[0] if row else None
+
+    def get_path(self, id):
+        """Retrieves the file path associated with a file ID. Returns None if
+        the ID does not exist in the Files table."""
+        row = self.con.execute("SELECT path FROM Files WHERE id = ?", (id,)).fetchone()
+        self.con.commit()
+        return row[0] if row else None
+
+    def move(self, old_path, new_path, recursive=False):
+        """Handles file moves by updating the file path of the associated file
+        ID.  Returns the file ID."""
+        old_path = self._normalize_path(old_path)
+        new_path = self._normalize_path(new_path)
+        self.log.debug(f"Moving file from ${old_path} to ${new_path}")
+
+        if recursive:
+            old_path_glob = os.path.join(old_path, "*")
+            self.con.execute(
+                "UPDATE Files SET path = ? || substr(path, ?) WHERE path GLOB ?",
+                (new_path, len(old_path) + 1, old_path_glob),
+            )
+            self.con.commit()
+
+        id = self.get_id(old_path)
+        if id is None:
+            return self.index(new_path)
+        else:
+            self.con.execute("UPDATE Files SET path = ? WHERE id = ?", (new_path, id))
+            self.con.commit()
+            return id
+
+    def copy(self, from_path, to_path, recursive=False):
+        """Handles file copies by creating a new record in the Files table.
+        Returns the file ID associated with `new_path`. Also indexes `old_path`
+        if record does not exist in Files table. TODO: emit to event bus to
+        inform client extensions to copy records associated with old file ID to
+        the new file ID."""
+        from_path = self._normalize_path(from_path)
+        to_path = self._normalize_path(to_path)
+        self.log.debug(f"Copying file from ${from_path} to ${to_path}")
+
+        if recursive:
+            from_path_glob = os.path.join(from_path, "*")
+            self.con.execute(
+                "INSERT INTO Files (path) SELECT (? || substr(path, ?)) FROM Files WHERE path GLOB ?",
+                (to_path, len(from_path) + 1, from_path_glob),
+            )
+            self.con.commit()
+
+        self.index(from_path)
+        return self.index(to_path)
+
+    def delete(self, path, recursive=False):
+        """Handles file deletions by deleting the associated record in the File
+        table. Returns None."""
+        path = self._normalize_path(path)
+        self.log.debug(f"Deleting file {path}")
+
+        if recursive:
+            path_glob = os.path.join(path, "*")
+            self.con.execute("DELETE FROM Files WHERE path GLOB ?", (path_glob,))
+            self.con.commit()
+
+        self.con.execute("DELETE FROM Files WHERE path = ?", (path,))
+        self.con.commit()
+
+    def _cleanup(self):
+        """Cleans up `FileIdManager` by committing any pending transactions and
+        closing the connection."""
+        self.con.commit()
+        self.con.close()
diff --git a/jupyter_server/services/contents/filemanager.py b/jupyter_server/services/contents/filemanager.py
@@ -395,6 +395,10 @@ def get(self, path, content=True, type=None, format=None):
             if type == "directory":
                 raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type")
             model = self._file_model(path, content=content, format=format)
+
+        # append file ID to model
+        model["id"] = self.file_id_manager.index(path)
+
         return model
 
     def _save_directory(self, os_path, model, path=""):

diff --git a/jupyter_server/services/contents/handlers.py b/jupyter_server/services/contents/handlers.py
@@ -271,7 +271,7 @@ async def delete(self, path=""):
         if await ensure_async(cm.is_hidden(path)) and not cm.allow_hidden:
             raise web.HTTPError(400, f"Cannot delete file or directory {path!r}")
 
-        self.log.warning("delete %s", path)
+        self.log.warning("Deleting file at %s", path)
         await ensure_async(cm.delete(path))
         self.set_status(204)
         self.finish()