diff --git a/jupyter_server/benchmarks/fileidmanager_benchmark.py b/jupyter_server/benchmarks/fileidmanager_benchmark.py new file mode 100644 index 0000000000..18c6079263 --- /dev/null +++ b/jupyter_server/benchmarks/fileidmanager_benchmark.py @@ -0,0 +1,133 @@ +import os +import timeit + +from jupyter_core.paths import jupyter_data_dir + +from jupyter_server.services.contents.fileidmanager import FileIdManager + +db_path = os.path.join(jupyter_data_dir(), "file_id_manager_perftest.db") + + +def build_setup(n, insert=True): + def setup(): + try: + os.remove(db_path) + except: + pass + fid_manager = FileIdManager(db_path=db_path) + + if not insert: + return + + for i in range(n): + fid_manager.con.execute( + "INSERT INTO Files (path) VALUES (?)", (f"abracadabra/{i}.txt",) + ) + fid_manager.con.commit() + + return setup + + +BATCH_SIZE = 100_000 + + +def build_test_index(n, single_transaction, batched=False): + def test_index(): + fid_manager = FileIdManager(db_path=db_path) + + if single_transaction: + if batched: + for batch_start in range(0, n, BATCH_SIZE): + batch_end = batch_start + BATCH_SIZE + fid_manager.con.execute( + "INSERT INTO FILES (path) VALUES " + + ",".join( + [f'("abracadabra/{i}.txt")' for i in range(batch_start, batch_end)] + ) + ) + else: + for i in range(n): + fid_manager.con.execute( + "INSERT INTO Files (path) VALUES (?)", (f"abracadabra/{i}.txt",) + ) + + fid_manager.con.commit() + else: + for i in range(n): + fid_manager.index(f"abracadabra/{i}.txt") + + return test_index + + +def test_copy(): + fid_manager = FileIdManager(db_path=db_path) + fid_manager.copy("abracadabra", "shazam", recursive=True) + + +def test_move(): + fid_manager = FileIdManager(db_path=db_path) + fid_manager.move("abracadabra", "shazam", recursive=True) + + +def test_delete(): + fid_manager = FileIdManager(db_path=db_path) + fid_manager.delete("abracadabra", recursive=True) + + +row_template = "{:<9,d} files | {:<8.4f} s" + + +# too slow for 1k+ +print("Index benchmark 
(separate transactions)") +for i in [100, 1_000]: + print( + row_template.format( + i, + timeit.timeit( + build_test_index(i, single_transaction=False), + build_setup(i, insert=False), + number=1, + ), + ) + ) + +print("Index benchmark (single transaction, atomic INSERTs)") +for i in [100, 1_000, 10_000, 100_000, 1_000_000]: + print( + row_template.format( + i, + timeit.timeit( + build_test_index(i, single_transaction=True, batched=False), + build_setup(i, insert=False), + number=1, + ), + ) + ) + +# suggested by https://stackoverflow.com/a/72527058/12548458 +# asymptotically faster because it reduces work being done by the SQLite VDBE https://www.sqlite.org/opcode.html +# weird constant time factor that makes it sub-optimal for <1M records. +print("Index benchmark (single transaction, batched INSERTs)") +for i in [100, 1_000, 10_000, 100_000, 1_000_000]: + print( + row_template.format( + i, + timeit.timeit( + build_test_index(i, single_transaction=True, batched=True), + build_setup(i, insert=False), + number=1, + ), + ) + ) + +print("Recursive move benchmark") +for i in [100, 1_000, 10_000, 100_000, 1_000_000]: + print(row_template.format(i, timeit.timeit(test_move, build_setup(i), number=1))) + +print("Recursive copy benchmark") +for i in [100, 1_000, 10_000, 100_000, 1_000_000]: + print(row_template.format(i, timeit.timeit(test_copy, build_setup(i), number=1))) + +print("Recursive delete benchmark") +for i in [100, 1_000, 10_000, 100_000, 1_000_000]: + print(row_template.format(i, timeit.timeit(test_delete, build_setup(i), number=1))) diff --git a/jupyter_server/pytest_plugin.py b/jupyter_server/pytest_plugin.py index e5d0d49907..57c039027b 100644 --- a/jupyter_server/pytest_plugin.py +++ b/jupyter_server/pytest_plugin.py @@ -18,6 +18,7 @@ from jupyter_server.extension import serverextension from jupyter_server.serverapp import ServerApp +from jupyter_server.services.contents.fileidmanager import FileIdManager from jupyter_server.services.contents.filemanager 
import FileContentsManager from jupyter_server.services.contents.largefilemanager import LargeFileManager from jupyter_server.utils import url_path_join @@ -457,6 +458,34 @@ def jp_large_contents_manager(tmp_path): return LargeFileManager(root_dir=str(tmp_path)) +@pytest.fixture +def fid_db_path(jp_data_dir): + """Fixture that returns the file ID DB path used for tests.""" + return str(jp_data_dir / "fileidmanager_test.db") + + +@pytest.fixture(autouse=True) +def delete_fid_db(fid_db_path): + """Fixture that automatically deletes the DB file after each test.""" + yield + try: + os.remove(fid_db_path) + except OSError: + pass + + +@pytest.fixture +def fid_manager(fid_db_path, jp_root_dir): + """Fixture returning a test-configured instance of `FileIdManager`.""" + fid_manager = FileIdManager(db_path=fid_db_path, root_dir=str(jp_root_dir)) + # disable journal so no temp journal file is created under `tmp_path`. + # reduces test flakiness since sometimes journal file has same ino and + # crtime as a deleted file, so FID manager detects it wrongly as a move + # also makes tests run faster :) + fid_manager.con.execute("PRAGMA journal_mode = OFF") + return fid_manager + + @pytest.fixture def jp_create_notebook(jp_root_dir): """Creates a notebook in the test's home directory.""" diff --git a/jupyter_server/serverapp.py b/jupyter_server/serverapp.py index a5f9b99d10..7978cd6640 100644 --- a/jupyter_server/serverapp.py +++ b/jupyter_server/serverapp.py @@ -111,6 +111,7 @@ ) from jupyter_server.log import log_request from jupyter_server.services.config import ConfigManager +from jupyter_server.services.contents.fileidmanager import FileIdManager from jupyter_server.services.contents.filemanager import ( AsyncFileContentsManager, FileContentsManager, @@ -1886,9 +1887,9 @@ def init_configurables(self): connection_dir=self.runtime_dir, kernel_spec_manager=self.kernel_spec_manager, ) + self.file_id_manager = FileIdManager(parent=self, log=self.log, root_dir=self.root_dir) 
self.contents_manager = self.contents_manager_class( - parent=self, - log=self.log, + parent=self, log=self.log, file_id_manager=self.file_id_manager ) self.session_manager = self.session_manager_class( parent=self, @@ -2508,6 +2509,11 @@ async def cleanup_extensions(self): self.log.info(extension_msg % n_extensions) await run_sync_in_loop(self.extension_manager.stop_all_extensions()) + def cleanup_file_id_manager(self): + if not getattr(self, "file_id_manager", None): + return + self.file_id_manager._cleanup() + def running_server_info(self, kernel_count=True): "Return the current working directory and the server url information" info = self.contents_manager.info_string() + "\n" @@ -2780,6 +2786,7 @@ async def _cleanup(self): self.remove_browser_open_files() await self.cleanup_extensions() await self.cleanup_kernels() + self.cleanup_file_id_manager() if getattr(self, "session_manager", None): self.session_manager.close() if getattr(self, "event_bus", None): diff --git a/jupyter_server/services/contents/fileidmanager.py b/jupyter_server/services/contents/fileidmanager.py new file mode 100644 index 0000000000..0f8d6a79c4 --- /dev/null +++ b/jupyter_server/services/contents/fileidmanager.py @@ -0,0 +1,452 @@ +import os +import sqlite3 +import stat +from collections import deque +from typing import Deque, Optional + +from jupyter_core.paths import jupyter_data_dir +from traitlets import TraitError, Unicode, validate +from traitlets.config.configurable import LoggingConfigurable + + +class StatStruct: + ino: int + crtime: Optional[int] + mtime: int + is_dir: bool + is_symlink: bool + + +class FileIdManager(LoggingConfigurable): + """ + Manager that supports tracks files across their lifetime by associating + each with a unique file ID, which is maintained across filesystem operations. + + Notes + ----- + + All private helper methods prefixed with an underscore (except `__init__()`) + do NOT commit their SQL statements in a transaction via `self.con.commit()`. 
+ This responsibility is delegated to the public method calling them to + increase performance. Committing multiple SQL transactions in serial is much + slower than committing a single SQL transaction wrapping all SQL statements + performed during a method's procedure body. + """ + + root_dir = Unicode( + help=("The root being served by Jupyter server. Must be an absolute path."), config=True + ) + + db_path = Unicode( + default_value=os.path.join(jupyter_data_dir(), "file_id_manager.db"), + help=( + "The path of the DB file used by `FileIdManager`. " + "Defaults to `jupyter_data_dir()/file_id_manager.db`." + ), + config=True, + ) + + def __init__(self, **kwargs): + # pass args and kwargs to parent Configurable + super().__init__(**kwargs) + # initialize connection with db + self.con = sqlite3.connect(self.db_path) + self.log.debug("FileIdManager : Creating File ID tables and indices") + self.con.execute( + "CREATE TABLE IF NOT EXISTS Files(" + "id INTEGER PRIMARY KEY AUTOINCREMENT, " + # uniqueness constraint relaxed here because we need to keep records + # of deleted files which may occupy same path + "path TEXT NOT NULL, " + "ino INTEGER NOT NULL UNIQUE, " + "crtime INTEGER, " + "mtime INTEGER NOT NULL, " + "is_dir TINYINT NOT NULL" + ")" + ) + self._index_all() + # no need to index ino as it is autoindexed by sqlite via UNIQUE constraint + self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_path ON Files (path)") + self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_is_dir ON Files (is_dir)") + self.con.commit() + + @validate("root_dir", "db_path") + def _validate_abspath_traits(self, proposal): + if proposal["value"] is None: + raise TraitError("FileIdManager : %s must not be None" % proposal["trait"].name) + if not os.path.isabs(proposal["value"]): + raise TraitError("FileIdManager : %s must be an absolute path" % proposal["trait"].name) + return self._normalize_path(proposal["value"]) + + def _index_all(self): + """Recursively indexes all directories 
under the server root.""" + self._index_dir_recursively(self.root_dir, self._stat(self.root_dir)) + + def _index_dir_recursively(self, dir_path, stat_info): + """Recursively indexes all directories under a given path.""" + self.index(dir_path, stat_info=stat_info, commit=False) + + with os.scandir(dir_path) as scan_iter: + for entry in scan_iter: + if entry.is_dir(): + self._index_dir_recursively(entry.path, self._stat(entry.path)) + scan_iter.close() + + def _sync_all(self): + """ + Syncs Files table with the filesystem and ensures that the correct path + is associated with each file ID. Does so by iterating through all + indexed directories and syncing the contents of all dirty directories. + + Notes + ----- + A dirty directory is a directory that is either: + - unindexed + - indexed but with different `mtime` + + Dirty directories contain possibly indexed but moved files as children. + Hence we need to call _sync_file() on their contents via _sync_dir(). + Indexed directories with mtime difference are handled in this method + body. Unindexed dirty directories are handled immediately when + encountered in _sync_dir(). + + sync_deque is an additional deque of directories that should be checked + for dirtiness, and is appended to whenever _sync_file() encounters an + indexed directory that was moved out-of-band. This is necessary because + the SELECT query is not guaranteed to include the new paths following + the move. 
+ """ + sync_deque: Deque = deque() + cursor = self.con.execute("SELECT id, path, mtime FROM Files WHERE is_dir = 1") + dir = cursor.fetchone() + while dir: + id, path, old_mtime = dir + stat_info = self._stat(path) + + # ignores directories that no longer exist + if stat_info is not None: + new_mtime = stat_info.mtime + dir_dirty = new_mtime != old_mtime + if dir_dirty: + self._sync_dir(path, sync_deque) + self._update(id, stat_info) + + dir = sync_deque.popleft() if sync_deque else cursor.fetchone() + + def _sync_dir(self, dir_path, sync_deque): + """ + Syncs the contents of a directory. If a child directory is dirty because + it is unindexed, then the contents of that child directory are synced. + See _sync_all() for more on dirty directories. + + Parameters + ---------- + dir_path : string + Path of the directory to sync contents of. + + sync_deque: deque + Deque of directory records to be checked for dirtiness in + _sync_all(). + """ + with os.scandir(dir_path) as scan_iter: + for entry in scan_iter: + stat_info = self._stat(entry.path) + id = self._sync_file(entry.path, stat_info, sync_deque) + + # if entry is unindexed directory, create new record and sync + # contents recursively. + if stat_info.is_dir and id is None: + self._create(entry.path, stat_info) + self._sync_dir(entry.path, sync_deque) + + scan_iter.close() + + def _sync_file(self, path, stat_info, sync_deque=None): + """ + Syncs the file at `path` with the Files table by detecting whether the + file was previously indexed but moved. Updates the record with the new + path. This ensures that the file at path is associated with the correct + file ID. This method does nothing if the file at `path` was not + previously indexed. + + Parameters + ---------- + path : string + Path of the file to sync. + + stat_info : StatStruct + Stat info of the file to sync. + + sync_deque : deque, optional + Deque of directory records to be checked for dirtiness in + _sync_all(). 
If specified, this method appends to sync_deque any + moved indexed directory and all of its children recursively. + + Returns + ------- + id : int, optional + ID of the file if it is a real file (not a symlink) and it was + previously indexed. None otherwise. + """ + # if file is symlink, do nothing + if stat_info.is_symlink: + return None + + src = self.con.execute( + "SELECT id, path, crtime, mtime FROM Files WHERE ino = ?", (stat_info.ino,) + ).fetchone() + + # if no record with matching ino, then return None + if not src: + return None + + id, old_path, src_crtime, src_mtime = src + src_timestamp = src_crtime if src_crtime is not None else src_mtime + dst_timestamp = stat_info.crtime if stat_info.crtime is not None else stat_info.mtime + + # if record has identical ino and crtime/mtime to an existing record, + # update it with new destination path and stat info, returning its id + if src_timestamp == dst_timestamp: + self._update_with_path(id, stat_info, path) + + # update paths of indexed children under moved directories + if stat_info.is_dir and old_path != path: + self._move_recursive(old_path, path, sync_deque) + if sync_deque is not None: + sync_deque.appendleft((id, path, src_mtime)) + + return id + + # otherwise delete the existing record with identical `ino`, since inos + # must be unique. 
then return None
+        self.con.execute("DELETE FROM Files WHERE id = ?", (id,))
+        return None
+
+    def _normalize_path(self, path):
+        """Normalizes a given file path."""
+        if not os.path.isabs(path):
+            path = os.path.join(self.root_dir, path)
+        path = os.path.normcase(path)
+        path = os.path.normpath(path)
+        return path
+
+    def _parse_raw_stat(self, raw_stat):
+        """Accepts an `os.stat_result` object and returns a `StatStruct`
+        object."""
+        stat_info = StatStruct()
+
+        stat_info.ino = raw_stat.st_ino
+        stat_info.crtime = (
+            raw_stat.st_ctime_ns
+            if os.name == "nt"
+            # st_birthtime_ns is not supported, so we have to compute it manually
+            else int(raw_stat.st_birthtime * 1e9)
+            if hasattr(raw_stat, "st_birthtime")
+            else None
+        )
+        stat_info.mtime = raw_stat.st_mtime_ns
+        stat_info.is_dir = stat.S_ISDIR(raw_stat.st_mode)
+        stat_info.is_symlink = stat.S_ISLNK(raw_stat.st_mode)
+
+        return stat_info
+
+    def _stat(self, path):
+        """Returns stat info on a path in a StatStruct object. Returns None if
+        file does not exist at path."""
+        try:
+            raw_stat = os.lstat(path)
+        except OSError:
+            return None
+
+        return self._parse_raw_stat(raw_stat)
+
+    def _create(self, path, stat_info):
+        """Creates a record given its path and stat info. Returns the new file
+        ID."""
+        cursor = self.con.execute(
+            "INSERT INTO Files (path, ino, crtime, mtime, is_dir) VALUES (?, ?, ?, ?, ?)",
+            (path, stat_info.ino, stat_info.crtime, stat_info.mtime, stat_info.is_dir),
+        )
+
+        return cursor.lastrowid
+
+    def _update_with_path(self, id, stat_info, path):
+        """Same as _update(), but accepts and updates path."""
+        self.con.execute(
+            "UPDATE Files SET path = ?, ino = ?, crtime = ?, mtime = ? WHERE id = ?",
+            (path, stat_info.ino, stat_info.crtime, stat_info.mtime, id),
+        )
+
+    def _update(self, id, stat_info):
+        """Updates a record given its file ID and stat info."""
+        # updating `ino` and `crtime` is a conscious design decision because
+        # this method is called by `move()`. 
these values are only preserved by + # fs moves done via the `rename()` syscall, like `mv`. we don't care how + # the contents manager moves a file; it could be deleting and creating a + # new file (which will change the stat info). + self.con.execute( + "UPDATE Files SET ino = ?, crtime = ?, mtime = ? WHERE id = ?", + (stat_info.ino, stat_info.crtime, stat_info.mtime, id), + ) + + def index(self, path, stat_info=None, commit=True): + """Returns the file ID for the file at `path`, creating a new file ID if + one does not exist. Returns None only if file does not exist at path.""" + path = self._normalize_path(path) + stat_info = stat_info or self._stat(path) + if not stat_info: + return None + + # if file is symlink, then index the path it refers to instead + if stat_info.is_symlink: + return self.index(os.path.realpath(path)) + + # sync file at path and return file ID if it exists + id = self._sync_file(path, stat_info) + if id is not None: + return id + + # otherwise, create a new record and return the file ID + id = self._create(path, stat_info) + if commit: + self.con.commit() + return id + + def get_id(self, path): + """Retrieves the file ID associated with a file path. Returns None if + the file has not yet been indexed or does not exist at the given + path.""" + path = self._normalize_path(path) + stat_info = self._stat(path) + if not stat_info: + return None + + # then sync file at path and retrieve id, if any + id = self._sync_file(path, stat_info) + self.con.commit() + return id + + def get_path(self, id): + """Retrieves the file path associated with a file ID. 
Returns None if + the ID does not exist in the Files table or if the corresponding path no + longer has a file.""" + self._sync_all() + self.con.commit() + row = self.con.execute("SELECT path FROM Files WHERE id = ?", (id,)).fetchone() + path = row and row[0] + + # if no record associated with ID or if file no longer exists at path, return None + if path is None or self._stat(path) is None: + return None + + return path + + def _move_recursive(self, old_path, new_path, sync_deque=None): + """Updates path of all indexed files prefixed with `old_path` and + replaces the prefix with `new_path`. If `sync_deque` is specified, moved + indexed directories are appended to `sync_deque`.""" + old_path_glob = os.path.join(old_path, "*") + records = self.con.execute( + "SELECT id, path, mtime FROM Files WHERE path GLOB ?", (old_path_glob,) + ).fetchall() + + for record in records: + id, old_recpath, mtime = record + new_recpath = os.path.join(new_path, os.path.relpath(old_recpath, start=old_path)) + stat_info = self._stat(new_recpath) + if not stat_info: + continue + + self._update_with_path(id, stat_info, new_recpath) + + if sync_deque is not None and stat_info.is_dir: + sync_deque.append((id, new_recpath, mtime)) + + def move(self, old_path, new_path, recursive=False): + """Handles file moves by updating the file path of the associated file + ID. Returns the file ID. 
Returns None if file does not exist at new_path."""
+        old_path = self._normalize_path(old_path)
+        new_path = self._normalize_path(new_path)
+
+        # verify file exists at new_path
+        stat_info = self._stat(new_path)
+        if stat_info is None:
+            return None
+
+        self.log.debug(f"FileIdManager : Moving file from {old_path} to {new_path}")
+
+        if recursive:
+            self._move_recursive(old_path, new_path)
+
+        # attempt to fetch ID associated with old path
+        # we avoid using get_id() here since that will always return None as file no longer exists at old path
+        row = self.con.execute("SELECT id FROM Files WHERE path = ?", (old_path,)).fetchone()
+        if row is None:
+            # if no existing record, create a new one
+            id = self._create(new_path, stat_info)
+            self.con.commit()
+            return id
+        else:
+            # update existing record with new path and stat info
+            # TODO: make sure is_dir for existing record matches that of file at new_path
+            id = row[0]
+            self._update_with_path(id, stat_info, new_path)
+            self.con.commit()
+            return id
+
+    def copy(self, from_path, to_path, recursive=False):
+        """Handles file copies by creating a new record in the Files table.
+        Returns the file ID associated with `to_path`. Also indexes `from_path`
+        if record does not exist in Files table. 
TODO: emit to event bus to
+        inform client extensions to copy records associated with old file ID to
+        the new file ID."""
+        from_path = self._normalize_path(from_path)
+        to_path = self._normalize_path(to_path)
+        self.log.debug(f"FileIdManager : Copying file from {from_path} to {to_path}")
+
+        if recursive:
+            from_path_glob = os.path.join(from_path, "*")
+            records = self.con.execute(
+                "SELECT path FROM Files WHERE path GLOB ?", (from_path_glob,)
+            ).fetchall()
+            for record in records:
+                if not record:
+                    continue
+                (from_recpath,) = record
+                to_recpath = os.path.join(to_path, os.path.basename(from_recpath))
+                stat_info = self._stat(to_recpath)
+                if not stat_info:
+                    continue
+                self.con.execute(
+                    "INSERT INTO Files (path, ino, crtime, mtime, is_dir) VALUES (?, ?, ?, ?, ?)",
+                    (
+                        to_recpath,
+                        stat_info.ino,
+                        stat_info.crtime,
+                        stat_info.mtime,
+                        stat_info.is_dir,
+                    ),
+                )
+
+        self.index(from_path, commit=False)
+        # transaction committed in index()
+        return self.index(to_path)
+
+    def delete(self, path, recursive=False):
+        """Handles file deletions by deleting the associated record in the File
+        table. 
Returns None.""" + path = self._normalize_path(path) + self.log.debug(f"FileIdManager : Deleting file {path}") + + if recursive: + path_glob = os.path.join(path, "*") + self.con.execute("DELETE FROM Files WHERE path GLOB ?", (path_glob,)) + + self.con.execute("DELETE FROM Files WHERE path = ?", (path,)) + self.con.commit() + + def _cleanup(self): + """Cleans up `FileIdManager` by committing any pending transactions and + closing the connection.""" + self.con.commit() + self.con.close() diff --git a/jupyter_server/services/contents/filemanager.py b/jupyter_server/services/contents/filemanager.py index b04ab4a3dc..f44d3797d0 100644 --- a/jupyter_server/services/contents/filemanager.py +++ b/jupyter_server/services/contents/filemanager.py @@ -395,6 +395,10 @@ def get(self, path, content=True, type=None, format=None): if type == "directory": raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type") model = self._file_model(path, content=content, format=format) + + # append file ID to model + model["id"] = self.file_id_manager.index(path) + return model def _save_directory(self, os_path, model, path=""): diff --git a/jupyter_server/services/contents/handlers.py b/jupyter_server/services/contents/handlers.py index 462cbff35e..8ad02a2b96 100644 --- a/jupyter_server/services/contents/handlers.py +++ b/jupyter_server/services/contents/handlers.py @@ -271,7 +271,7 @@ async def delete(self, path=""): if await ensure_async(cm.is_hidden(path)) and not cm.allow_hidden: raise web.HTTPError(400, f"Cannot delete file or directory {path!r}") - self.log.warning("delete %s", path) + self.log.warning("Deleting file at %s", path) await ensure_async(cm.delete(path)) self.set_status(204) self.finish() diff --git a/jupyter_server/services/contents/manager.py b/jupyter_server/services/contents/manager.py index 7bd6450803..a8dd3f98df 100644 --- a/jupyter_server/services/contents/manager.py +++ b/jupyter_server/services/contents/manager.py @@ -30,6 +30,7 @@ from 
...files.handlers import FilesHandler from .checkpoints import AsyncCheckpoints, Checkpoints +from .fileidmanager import FileIdManager copy_pat = re.compile(r"\-Copy\d*\.") @@ -59,6 +60,13 @@ class ContentsManager(LoggingConfigurable): notary = Instance(sign.NotebookNotary) + file_id_manager = Instance( + FileIdManager, + args=(), + kw={"root_dir": root_dir}, + help="File ID manager instance to use. Defaults to `FileIdManager`.", + ) + def _notary_default(self): return sign.NotebookNotary(parent=self) @@ -414,12 +422,16 @@ def delete(self, path): path = path.strip("/") if not path: raise HTTPError(400, "Can't delete root") + is_dir = self.dir_exists(path) self.delete_file(path) + self.file_id_manager.delete(path, recursive=is_dir) self.checkpoints.delete_all_checkpoints(path) def rename(self, old_path, new_path): """Rename a file and any checkpoints associated with that file.""" + is_dir = self.dir_exists(old_path) self.rename_file(old_path, new_path) + self.file_id_manager.move(old_path, new_path, recursive=is_dir) self.checkpoints.rename_all_checkpoints(old_path, new_path) def update(self, model, path): @@ -615,7 +627,9 @@ def copy(self, from_path, to_path=None): else: raise HTTPError(404, "No such directory: %s" % to_path) + is_dir = self.dir_exists(from_path) model = self.save(model, to_path) + self.file_id_manager.copy(from_path, to_path, recursive=is_dir) return model def log_info(self): @@ -817,12 +831,16 @@ async def delete(self, path): if not path: raise HTTPError(400, "Can't delete root") + is_dir = await ensure_async(self.dir_exists(path)) await self.delete_file(path) + self.file_id_manager.delete(path, recursive=is_dir) await self.checkpoints.delete_all_checkpoints(path) async def rename(self, old_path, new_path): """Rename a file and any checkpoints associated with that file.""" + is_dir = await ensure_async(self.dir_exists(old_path)) await self.rename_file(old_path, new_path) + self.file_id_manager.move(old_path, new_path, recursive=is_dir) await 
self.checkpoints.rename_all_checkpoints(old_path, new_path) async def update(self, model, path): @@ -984,7 +1002,9 @@ async def copy(self, from_path, to_path=None): else: raise HTTPError(404, "No such directory: %s" % to_path) + is_dir = await ensure_async(self.dir_exists(from_path)) model = await self.save(model, to_path) + self.file_id_manager.copy(from_path, to_path, recursive=is_dir) return model async def trust_notebook(self, path): diff --git a/tests/services/contents/test_fileidmanager.py b/tests/services/contents/test_fileidmanager.py new file mode 100644 index 0000000000..c4fbdd2966 --- /dev/null +++ b/tests/services/contents/test_fileidmanager.py @@ -0,0 +1,376 @@ +import os +import shutil +from pathlib import Path + +import pytest +from traitlets import TraitError + +from jupyter_server.services.contents.fileidmanager import FileIdManager + + +@pytest.fixture +def test_path(jp_root_dir): + path = os.path.join(jp_root_dir, "test_path") + os.mkdir(path) + return path + + +@pytest.fixture +def test_path_child(test_path): + path = os.path.join(test_path, "child") + Path(path).touch() + return path + + +@pytest.fixture +def old_path(jp_root_dir): + """Fixture for source path to be moved/copied via FID manager""" + path = os.path.join(jp_root_dir, "old_path") + os.mkdir(path) + return path + + +@pytest.fixture +def old_path_child(old_path): + path = os.path.join(old_path, "child") + os.mkdir(path) + return path + + +@pytest.fixture +def old_path_grandchild(old_path_child): + path = os.path.join(old_path_child, "grandchild") + os.mkdir(path) + return path + + +@pytest.fixture +def new_path(jp_root_dir): + """Fixture for destination path for a FID manager move/copy operation""" + return os.path.join(jp_root_dir, "new_path") + + +@pytest.fixture +def new_path_child(new_path): + return os.path.join(new_path, "child") + + +@pytest.fixture +def new_path_grandchild(new_path_child): + return os.path.join(new_path_child, "grandchild") + + +def 
get_id_nosync(fid_manager, path): + row = fid_manager.con.execute("SELECT id FROM Files WHERE path = ?", (path,)).fetchone() + return row and row[0] + + +def get_path_nosync(fid_manager, id): + row = fid_manager.con.execute("SELECT path FROM Files WHERE id = ?", (id,)).fetchone() + return row and row[0] + + +def test_validates_root_dir(fid_db_path): + with pytest.raises(TraitError, match="must be an absolute path"): + FileIdManager(root_dir=os.path.join("some", "rel", "path"), db_path=fid_db_path) + + +def test_validates_db_path(jp_root_dir): + with pytest.raises(TraitError, match="must be an absolute path"): + FileIdManager(root_dir=str(jp_root_dir), db_path=os.path.join("some", "rel", "path")) + + +def test_index(fid_manager, test_path): + id = fid_manager.index(test_path) + assert id is not None + + +def test_index_already_indexed(fid_manager, test_path): + id = fid_manager.index(test_path) + assert id == fid_manager.index(test_path) + + +def test_index_symlink(fid_manager, test_path, jp_root_dir): + link_path = os.path.join(jp_root_dir, "link_path") + os.symlink(test_path, link_path) + id = fid_manager.index(link_path) + + # we want to assert that the "real path" is the only path associated with an + # ID. get_path() *sometimes* returns the real path if _sync_file() happens + # to be called on the real path after the symlink path when _sync_all() is + # run, causing this test to flakily pass when it shouldn't. 
+ assert get_path_nosync(fid_manager, id) == test_path + + +# test out-of-band move detection for FIM.index() +def test_index_oob_move(fid_manager, old_path, new_path): + id = fid_manager.index(old_path) + os.rename(old_path, new_path) + assert fid_manager.index(new_path) == id + + +@pytest.fixture +def stub_stat_crtime(fid_manager, request): + """Fixture that stubs the _stat() method on fid_manager to always return a + StatStruct with a fixed crtime.""" + if hasattr(request, "param") and not request.param: + return False + + stat_real = fid_manager._stat + + def stat_stub(path): + stat = stat_real(path) + if stat: + stat.crtime = 123456789 + return stat + + fid_manager._stat = stat_stub + return True + + +# sync file should work even after directory mtime changes when children are +# added/removed/renamed on platforms supporting crtime +def test_index_crtime(fid_manager, test_path, stub_stat_crtime): + stat = os.stat(test_path) + id = fid_manager.index(test_path) + os.utime(test_path, ns=(stat.st_atime_ns, stat.st_mtime_ns + 1000)) + + assert fid_manager.index(test_path) == id + + +def test_getters_indexed(fid_manager, test_path): + id = fid_manager.index(test_path) + + assert fid_manager.get_id(test_path) == id + assert fid_manager.get_path(id) == test_path + + +def test_getters_nonnormalized(fid_manager, test_path): + path1 = os.path.join(test_path, "file") + path2 = os.path.join(test_path, "some_dir", "..", "file") + path3 = os.path.join(test_path, ".", ".", ".", "file") + Path(path1).touch() + + id = fid_manager.index(path1) + + assert fid_manager.get_id(path1) == id + assert fid_manager.get_id(path2) == id + assert fid_manager.get_id(path3) == id + + +def test_getters_oob_delete(fid_manager, test_path): + id = fid_manager.index(test_path) + os.rmdir(test_path) + assert id is not None + assert fid_manager.get_id(test_path) == None + assert fid_manager.get_path(id) == None + + +def test_get_id_unindexed(fid_manager, test_path_child): + assert 
fid_manager.get_id(test_path_child) == None + + +# test out-of-band move detection for FIM.get_id() +def test_get_id_oob_move(fid_manager, old_path, new_path): + id = fid_manager.index(old_path) + os.rename(old_path, new_path) + assert fid_manager.get_id(new_path) == id + + +def test_get_id_oob_move_recursive(fid_manager, old_path, old_path_child, new_path, new_path_child): + parent_id = fid_manager.index(old_path) + child_id = fid_manager.index(old_path_child) + + os.rename(old_path, new_path) + + assert fid_manager.get_id(new_path) == parent_id + assert fid_manager.get_id(new_path_child) == child_id + + +# make sure that out-of-band moves are detected even when a new file is created +# at the old path. this is what forces relaxation of the UNIQUE constraint on +# path column, since we need to keep records of deleted files that used to +# occupy a path, which is possibly occupied by a new file. +def test_get_id_oob_move_new_file_at_old_path(fid_manager, old_path, new_path, jp_root_dir): + old_id = fid_manager.index(old_path) + other_path = os.path.join(jp_root_dir, "other_path") + + os.rename(old_path, new_path) + Path(old_path).touch() + other_id = fid_manager.index(old_path) + os.rename(old_path, other_path) + + assert other_id != old_id + assert fid_manager.get_id(new_path) == old_id + assert fid_manager.get_path(old_id) == new_path + assert fid_manager.get_id(other_path) == other_id + + +def test_get_path_oob_move(fid_manager, old_path, new_path): + id = fid_manager.index(old_path) + os.rename(old_path, new_path) + assert fid_manager.get_path(id) == new_path + + +def test_get_path_oob_move_recursive( + fid_manager, old_path, old_path_child, new_path, new_path_child +): + id = fid_manager.index(old_path) + child_id = fid_manager.index(old_path_child) + + os.rename(old_path, new_path) + + assert fid_manager.get_path(id) == new_path + assert fid_manager.get_path(child_id) == new_path_child + + +def test_get_path_oob_move_into_unindexed( + fid_manager, old_path, 
old_path_child, new_path, new_path_child +): + fid_manager.index(old_path) + id = fid_manager.index(old_path_child) + + os.mkdir(new_path) + os.rename(old_path_child, new_path_child) + + assert fid_manager.get_path(id) == new_path_child + + +# move file into an indexed-but-moved directory +# this test should work regardless of whether crtime is supported on platform +@pytest.mark.parametrize("stub_stat_crtime", [True, False], indirect=["stub_stat_crtime"]) +def test_get_path_oob_move_nested(fid_manager, old_path, new_path, jp_root_dir, stub_stat_crtime): + old_test_path = os.path.join(jp_root_dir, "test_path") + new_test_path = os.path.join(new_path, "test_path") + Path(old_test_path).touch() + stat = os.stat(old_test_path) + fid_manager.index(old_path) + id = fid_manager.index(old_test_path) + + os.rename(old_path, new_path) + os.rename(old_test_path, new_test_path) + # ensure new_path has different mtime after moving test_path. moving a file + # into an indexed-but-moved dir has a chance of not changing the dir's + # mtime. since we fallback to mtime, this makes the dir look unindexed and + # causes this test to flakily pass when it should not. 
+ os.utime(new_path, ns=(stat.st_atime_ns, stat.st_mtime_ns + 1000)) + + assert fid_manager.get_path(id) == new_test_path + + +# move file into directory within an indexed-but-moved directory +# this test should work regardless of whether crtime is supported on platform +@pytest.mark.parametrize("stub_stat_crtime", [True, False], indirect=["stub_stat_crtime"]) +def test_get_path_oob_move_deeply_nested( + fid_manager, old_path, new_path, old_path_child, new_path_child, jp_root_dir, stub_stat_crtime +): + old_test_path = os.path.join(jp_root_dir, "test_path") + new_test_path = os.path.join(new_path_child, "test_path") + Path(old_test_path).touch() + stat = os.stat(old_test_path) + fid_manager.index(old_path) + fid_manager.index(old_path_child) + id = fid_manager.index(old_test_path) + + os.rename(old_path, new_path) + os.rename(old_test_path, new_test_path) + os.utime(new_path_child, ns=(stat.st_atime_ns, stat.st_mtime_ns + 1000)) + + assert fid_manager.get_path(id) == new_test_path + + +def test_move_unindexed(fid_manager, old_path, new_path): + os.rename(old_path, new_path) + id = fid_manager.move(old_path, new_path) + + assert id is not None + assert fid_manager.get_id(old_path) is None + assert fid_manager.get_id(new_path) == id + assert fid_manager.get_path(id) == new_path + + +def test_move_indexed(fid_manager, old_path, new_path): + old_id = fid_manager.index(old_path) + + os.rename(old_path, new_path) + new_id = fid_manager.move(old_path, new_path) + + assert old_id == new_id + assert fid_manager.get_id(old_path) is None + assert fid_manager.get_id(new_path) == new_id + assert fid_manager.get_path(old_id) == new_path + + +# test for disjoint move handling +# disjoint move: any out-of-band move that does not preserve stat info +def test_disjoint_move_indexed(fid_manager, old_path, new_path): + old_id = fid_manager.index(old_path) + + os.rmdir(old_path) + os.mkdir(new_path) + new_id = fid_manager.move(old_path, new_path) + + assert old_id == new_id + + +def
test_move_recursive( + fid_manager, + old_path, + old_path_child, + old_path_grandchild, + new_path, + new_path_child, + new_path_grandchild, +): + parent_id = fid_manager.index(old_path) + child_id = fid_manager.index(old_path_child) + grandchild_id = fid_manager.index(old_path_grandchild) + + os.rename(old_path, new_path) + fid_manager.move(old_path, new_path, recursive=True) + + # we avoid using get_id() here as it auto-corrects wrong path updates via + # its out-of-band move detection logic. too smart for its own good! + assert get_id_nosync(fid_manager, new_path) == parent_id + assert get_id_nosync(fid_manager, new_path_child) == child_id + assert get_id_nosync(fid_manager, new_path_grandchild) == grandchild_id + + +def test_copy(fid_manager, old_path, new_path): + shutil.copytree(old_path, new_path) + new_id = fid_manager.copy(old_path, new_path) + old_id = fid_manager.get_id(old_path) + + assert old_id is not None + assert new_id is not None + assert old_id != new_id + + +def test_copy_recursive(fid_manager, old_path, old_path_child, new_path, new_path_child): + fid_manager.index(old_path) + fid_manager.index(old_path_child) + + shutil.copytree(old_path, new_path) + fid_manager.copy(old_path, new_path, recursive=True) + + assert fid_manager.get_id(new_path_child) is not None + + +def test_delete(fid_manager, test_path): + id = fid_manager.index(test_path) + + shutil.rmtree(test_path) + fid_manager.delete(test_path) + + assert fid_manager.get_id(test_path) is None + assert fid_manager.get_path(id) is None + + +def test_delete_recursive(fid_manager, test_path, test_path_child): + fid_manager.index(test_path) + fid_manager.index(test_path_child) + + shutil.rmtree(test_path) + fid_manager.delete(test_path, recursive=True) + + assert fid_manager.get_id(test_path_child) is None diff --git a/tests/services/contents/test_manager.py b/tests/services/contents/test_manager.py index d2d06a2513..074cc39ac9 100644 --- a/tests/services/contents/test_manager.py +++ 
b/tests/services/contents/test_manager.py @@ -2,7 +2,7 @@ import sys import time from itertools import combinations -from typing import Dict, Optional, Tuple +from typing import Any, Dict, Optional, Tuple from unittest.mock import patch import pytest @@ -28,14 +28,25 @@ (AsyncFileContentsManager, False), ] ) -def jp_contents_manager(request, tmp_path): +def jp_contents_manager(request, tmp_path, fid_manager): contents_manager, use_atomic_writing = request.param - return contents_manager(root_dir=str(tmp_path), use_atomic_writing=use_atomic_writing) + return contents_manager( + root_dir=str(tmp_path), use_atomic_writing=use_atomic_writing, file_id_manager=fid_manager + ) @pytest.fixture(params=[FileContentsManager, AsyncFileContentsManager]) -def jp_file_contents_manager_class(request, tmp_path): - return request.param +def jp_file_contents_manager_class(request, tmp_path, fid_manager): + # mypy bugs out with dynamic base class + # https://github.com/python/mypy/issues/5865 + Klass: Any = request.param + + class WrappedKlass(Klass): + file_id_manager = fid_manager + # def __init__(self, *args, **kwargs): + # return Klass(*args, file_id_manager=fid_manager, **kwargs) + + return WrappedKlass # -------------- Functions ----------------------------