Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

draft: File ID Manager base implementation #921

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 133 additions & 0 deletions jupyter_server/benchmarks/fileidmanager_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import os
import timeit

from jupyter_core.paths import jupyter_data_dir

from jupyter_server.services.contents.fileidmanager import FileIdManager

db_path = os.path.join(jupyter_data_dir(), "file_id_manager_perftest.db")


def build_setup(n, insert=True):
def setup():
try:
os.remove(db_path)
except:
pass
fid_manager = FileIdManager(db_path=db_path)

if not insert:
return

for i in range(n):
fid_manager.con.execute(
"INSERT INTO Files (path) VALUES (?)", (f"abracadabra/{i}.txt",)
)
fid_manager.con.commit()

return setup


BATCH_SIZE = 100_000


def build_test_index(n, single_transaction, batched=False):
def test_index():
fid_manager = FileIdManager(db_path=db_path)

if single_transaction:
if batched:
for batch_start in range(0, n, BATCH_SIZE):
batch_end = batch_start + BATCH_SIZE
fid_manager.con.execute(
"INSERT INTO FILES (path) VALUES "
+ ",".join(
[f'("abracadabra/{i}.txt")' for i in range(batch_start, batch_end)]
)
)
else:
for i in range(n):
fid_manager.con.execute(
"INSERT INTO Files (path) VALUES (?)", (f"abracadabra/{i}.txt",)
)

fid_manager.con.commit()
else:
for i in range(n):
fid_manager.index(f"abracadabra/{i}.txt")

return test_index


def test_copy():
fid_manager = FileIdManager(db_path=db_path)
fid_manager.copy("abracadabra", "shazam", recursive=True)


def test_move():
fid_manager = FileIdManager(db_path=db_path)
fid_manager.move("abracadabra", "shazam", recursive=True)


def test_delete():
fid_manager = FileIdManager(db_path=db_path)
fid_manager.delete("abracadabra", recursive=True)


row_template = "{:<9,d} files | {:<8.4f} s"


# too slow for 1k+
print("Index benchmark (separate transactions)")
for i in [100, 1_000]:
print(
row_template.format(
i,
timeit.timeit(
build_test_index(i, single_transaction=False),
build_setup(i, insert=False),
number=1,
),
)
)

print("Index benchmark (single transaction, atomic INSERTs)")
for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
print(
row_template.format(
i,
timeit.timeit(
build_test_index(i, single_transaction=True, batched=False),
build_setup(i, insert=False),
number=1,
),
)
)

# suggested by https://stackoverflow.com/a/72527058/12548458
# asymptotically faster because it reduces work being done by the SQLite VDBE https://www.sqlite.org/opcode.html
# weird constant time factor that makes it sub-optimal for <1M records.
print("Index benchmark (single transaction, batched INSERTs)")
for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
print(
row_template.format(
i,
timeit.timeit(
build_test_index(i, single_transaction=True, batched=True),
build_setup(i, insert=False),
number=1,
),
)
)

print("Recursive move benchmark")
for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
print(row_template.format(i, timeit.timeit(test_move, build_setup(i), number=1)))

print("Recursive copy benchmark")
for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
print(row_template.format(i, timeit.timeit(test_copy, build_setup(i), number=1)))

print("Recursive delete benchmark")
for i in [100, 1_000, 10_000, 100_000, 1_000_000]:
print(row_template.format(i, timeit.timeit(test_delete, build_setup(i), number=1)))
22 changes: 22 additions & 0 deletions jupyter_server/pytest_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from jupyter_server.extension import serverextension
from jupyter_server.serverapp import ServerApp
from jupyter_server.services.contents.fileidmanager import FileIdManager
from jupyter_server.services.contents.filemanager import FileContentsManager
from jupyter_server.services.contents.largefilemanager import LargeFileManager
from jupyter_server.utils import url_path_join
Expand Down Expand Up @@ -457,6 +458,27 @@ def jp_large_contents_manager(tmp_path):
return LargeFileManager(root_dir=str(tmp_path))


@pytest.fixture
def fid_db_path(jp_data_dir):
"""Fixture that returns the file ID DB path used for tests."""
return str(jp_data_dir / "fileidmanager_test.db")


@pytest.fixture(autouse=True)
def delete_db(fid_db_path):
"""Fixture that automatically deletes the DB file before each test."""
try:
os.remove(fid_db_path)
except OSError:
pass


@pytest.fixture
def fid_manager(fid_db_path):
"""Fixture returning a test-configured instance of `FileIdManager`."""
return FileIdManager(db_path=fid_db_path)


@pytest.fixture
def jp_create_notebook(jp_root_dir):
"""Creates a notebook in the test's home directory."""
Expand Down
11 changes: 9 additions & 2 deletions jupyter_server/serverapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@
)
from jupyter_server.log import log_request
from jupyter_server.services.config import ConfigManager
from jupyter_server.services.contents.fileidmanager import FileIdManager
from jupyter_server.services.contents.filemanager import (
AsyncFileContentsManager,
FileContentsManager,
Expand Down Expand Up @@ -1886,9 +1887,9 @@ def init_configurables(self):
connection_dir=self.runtime_dir,
kernel_spec_manager=self.kernel_spec_manager,
)
self.file_id_manager = FileIdManager(parent=self, log=self.log)
Copy link
Member

@kevin-bates kevin-bates Aug 1, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably define a file_id_manager_class configurable like the other managers define so folks can BYO their own FID manager. This also suggests defining an abstract base class that is then referenced in the klass attribute which allows folks to either derive from FileIdManager or essentially implement their own.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! I forgot to define FileIdManager as a trait on ServerApp. Before I fix this, is there any reason to prefer

self.file_id_manager_class = Type(klass=FileIdManager)
...
self.file_id_manager = self.file_id_manager_class(*args)

over just using

self.file_id_manager = Instance(klass=FileIdManager, args=(*args))

Copy link
Member

@kevin-bates kevin-bates Aug 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think either is what we want.

Both imply that one has to derive from FileIdManager - which we shouldn't impose on anyone. So klass should reference an ABC (e.g., FileIdManagerABC) that only defines the public APIs as abstract. It could also contain validation logic for the db-path trait, etc.

So the user would invoke the server using --ServerApp.file_id_manager_class = my.package.MyFileIdManager and the trait would ensure that my.package.MyFileIdManager is an instance of FileIdManagerABC and whatever interacts with the file ID manager class would be relegated to the methods defined in the ABC.

Folks that want to slightly adjust the OOTB implementation of FileIdManager (or override all of its methods entirely) are free to do so via subclassing as they are still an instance of the ABC.

For example, with the last set of changes, an entirely different implementation is required if any non-filesystem-based ContentsManager is configured (unfortunately).

self.contents_manager = self.contents_manager_class(
parent=self,
log=self.log,
parent=self, log=self.log, file_id_manager=self.file_id_manager
)
self.session_manager = self.session_manager_class(
parent=self,
Expand Down Expand Up @@ -2508,6 +2509,11 @@ async def cleanup_extensions(self):
self.log.info(extension_msg % n_extensions)
await run_sync_in_loop(self.extension_manager.stop_all_extensions())

def cleanup_file_id_manager(self):
if not getattr(self, "file_id_manager", None):
return
self.file_id_manager._cleanup()
dlqqq marked this conversation as resolved.
Show resolved Hide resolved

def running_server_info(self, kernel_count=True):
"Return the current working directory and the server url information"
info = self.contents_manager.info_string() + "\n"
Expand Down Expand Up @@ -2780,6 +2786,7 @@ async def _cleanup(self):
self.remove_browser_open_files()
await self.cleanup_extensions()
await self.cleanup_kernels()
self.cleanup_file_id_manager()
if getattr(self, "session_manager", None):
self.session_manager.close()
if getattr(self, "event_bus", None):
Expand Down
126 changes: 126 additions & 0 deletions jupyter_server/services/contents/fileidmanager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import os
import sqlite3

from jupyter_core.paths import jupyter_data_dir
from traitlets import Unicode
from traitlets.config.configurable import LoggingConfigurable


class FileIdManager(LoggingConfigurable):
db_path = Unicode(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would recommend that we name this trait database_filepath to match the SessionManager:

database_filepath = Unicode(
default_value=":memory:",
help=(
"The filesystem path to SQLite Database file "
"(e.g. /path/to/session_database.db). By default, the session "
"database is stored in-memory (i.e. `:memory:` setting from sqlite3) "
"and does not persist when the current Jupyter Server shuts down."
),
).tag(config=True)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is also some logic here you could borrow to validate this trait:

@validate("database_filepath")
def _validate_database_filepath(self, proposal):
value = proposal["value"]
if value == ":memory:":
return value
path = pathlib.Path(value)
if path.exists():
# Verify that the database path is not a directory.
if path.is_dir():
raise TraitError(
"`database_filepath` expected a file path, but the given path is a directory."
)
# Verify that database path is an SQLite 3 Database by checking its header.
with open(value, "rb") as f:
header = f.read(100)
if not header.startswith(b"SQLite format 3") and not header == b"":
raise TraitError("The given file is not an SQLite database file.")
return value

default_value=os.path.join(jupyter_data_dir(), "file_id_manager.db"),
help=(
"The path of the DB file used by `FileIdManager`. "
"Defaults to `jupyter_data_dir()/file_id_manager.db`."
),
config=True,
)

def __init__(self, *args, **kwargs):
# pass args and kwargs to parent Configurable
super().__init__(*args, **kwargs)
dlqqq marked this conversation as resolved.
Show resolved Hide resolved
# initialize connection with db
self.con = sqlite3.connect(self.db_path)
dlqqq marked this conversation as resolved.
Show resolved Hide resolved
self.log.debug("Creating File ID tables and indices")
self.con.execute(
"CREATE TABLE IF NOT EXISTS Files(id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE)"
dlqqq marked this conversation as resolved.
Show resolved Hide resolved
)
self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_path ON FILES (path)")
self.con.commit()

def _normalize_path(self, path):
"""Normalizes a given file path."""
path = os.path.normcase(path)
path = os.path.normpath(path)
return path

def index(self, path):
"""Adds the file path to the Files table, then returns the file ID. If
the file is already indexed, the file ID is immediately returned."""
path = self._normalize_path(path)
existing_id = self.get_id(path)
if existing_id is not None:
return existing_id

cursor = self.con.execute("INSERT INTO Files (path) VALUES (?)", (path,))
self.con.commit()
return cursor.lastrowid

def get_id(self, path):
"""Retrieves the file ID associated with a file path. Returns None if
the file path has not yet been indexed."""
path = self._normalize_path(path)
row = self.con.execute("SELECT id FROM Files WHERE path = ?", (path,)).fetchone()
self.con.commit()
return row[0] if row else None

def get_path(self, id):
"""Retrieves the file path associated with a file ID. Returns None if
the ID does not exist in the Files table."""
row = self.con.execute("SELECT path FROM Files WHERE id = ?", (id,)).fetchone()
self.con.commit()
dlqqq marked this conversation as resolved.
Show resolved Hide resolved
return row[0] if row else None

def move(self, old_path, new_path, recursive=False):
"""Handles file moves by updating the file path of the associated file
ID. Returns the file ID."""
old_path = self._normalize_path(old_path)
new_path = self._normalize_path(new_path)
self.log.debug(f"Moving file from ${old_path} to ${new_path}")

if recursive:
old_path_glob = os.path.join(old_path, "*")
self.con.execute(
"UPDATE Files SET path = ? || substr(path, ?) WHERE path GLOB ?",
(new_path, len(old_path) + 1, old_path_glob),
)
self.con.commit()

id = self.get_id(old_path)
if id is None:
return self.index(new_path)
else:
self.con.execute("UPDATE Files SET path = ? WHERE id = ?", (new_path, id))
self.con.commit()
return id
dlqqq marked this conversation as resolved.
Show resolved Hide resolved

def copy(self, from_path, to_path, recursive=False):
"""Handles file copies by creating a new record in the Files table.
Returns the file ID associated with `new_path`. Also indexes `old_path`
if record does not exist in Files table. TODO: emit to event bus to
inform client extensions to copy records associated with old file ID to
the new file ID."""
from_path = self._normalize_path(from_path)
to_path = self._normalize_path(to_path)
self.log.debug(f"Copying file from ${from_path} to ${to_path}")

if recursive:
from_path_glob = os.path.join(from_path, "*")
self.con.execute(
"INSERT INTO Files (path) SELECT (? || substr(path, ?)) FROM Files WHERE path GLOB ?",
(to_path, len(from_path) + 1, from_path_glob),
)
self.con.commit()

self.index(from_path)
return self.index(to_path)
dlqqq marked this conversation as resolved.
Show resolved Hide resolved

def delete(self, path, recursive=False):
"""Handles file deletions by deleting the associated record in the File
table. Returns None."""
path = self._normalize_path(path)
self.log.debug(f"Deleting file {path}")

if recursive:
path_glob = os.path.join(path, "*")
self.con.execute("DELETE FROM Files WHERE path GLOB ?", (path_glob,))
self.con.commit()
dlqqq marked this conversation as resolved.
Show resolved Hide resolved

self.con.execute("DELETE FROM Files WHERE path = ?", (path,))
self.con.commit()

def _cleanup(self):
"""Cleans up `FileIdManager` by committing any pending transactions and
closing the connection."""
self.con.commit()
self.con.close()
4 changes: 4 additions & 0 deletions jupyter_server/services/contents/filemanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,10 @@ def get(self, path, content=True, type=None, format=None):
if type == "directory":
raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type")
model = self._file_model(path, content=content, format=format)

# append file ID to model
model["id"] = self.file_id_manager.index(path)
dlqqq marked this conversation as resolved.
Show resolved Hide resolved

return model

def _save_directory(self, os_path, model, path=""):
Expand Down
2 changes: 1 addition & 1 deletion jupyter_server/services/contents/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ async def delete(self, path=""):
if await ensure_async(cm.is_hidden(path)) and not cm.allow_hidden:
raise web.HTTPError(400, f"Cannot delete file or directory {path!r}")

self.log.warning("delete %s", path)
self.log.warning("Deleting file at %s", path)
await ensure_async(cm.delete(path))
self.set_status(204)
self.finish()
Expand Down
Loading