From b318e5f3d26854952b5c99519965286cbcbe9b0c Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 13 Nov 2021 18:09:22 -0800 Subject: [PATCH 1/3] Reserved columns now have underscore prefix, refs #14 --- git_history/cli.py | 54 +++------- tests/test_git_history.py | 218 ++++++++++++++++---------------------- 2 files changed, 109 insertions(+), 163 deletions(-) diff --git a/git_history/cli.py b/git_history/cli.py index 5574022..1c05729 100644 --- a/git_history/cli.py +++ b/git_history/cli.py @@ -5,6 +5,7 @@ import sqlite_utils import textwrap from pathlib import Path +from .utils import fix_reserved_columns def iterate_file_versions( @@ -181,17 +182,13 @@ def file( # If --id is specified, do things a bit differently if ids: - # If '--id id' is only option, 'id' is not a reserved column - id_is_reserved = list(ids) != ["id"] # Any ids that are reserved columns must be renamed fixed_ids = set( fix_reserved_columns( {id: 1 for id in ids}, - allow_id=not id_is_reserved, - allow_banned=True, ).keys() ) - items_insert_extra_kwargs["pk"] = "id" + items_insert_extra_kwargs["pk"] = "_id" # Check all items have those columns _ids_set = set(ids) bad_items = [ @@ -209,7 +206,7 @@ def file( items_insert_extra_kwargs["replace"] = True # Which of these are new versions of things we have seen before for item in items: - item = fix_reserved_columns(item, allow_id=not id_is_reserved) + item = fix_reserved_columns(item) item_id = _hash(dict((id, item.get(id)) for id in fixed_ids)) if item_id in item_ids_in_this_version: if not ignore_duplicate_ids: @@ -240,9 +237,9 @@ def file( id_last_hash[item_id] = item_hash version = id_versions.get(item_id, 0) + 1 id_versions[item_id] = version - items_to_add.append(dict(item, id=item_id)) + items_to_add.append(dict(item, _id=item_id)) versions.append( - dict(item, item=item_id, version=version, commit=git_hash) + dict(item, _item=item_id, _version=version, _commit=git_hash) ) # Only add the items that had no new version @@ -252,15 +249,17 @@ def file( # not ids - so just check them for banned columns and add the item["commit"] for item in items: item = fix_reserved_columns(item) - item["commit"] = git_hash + item["_commit"] = git_hash # In this case item table needs a foreign key on 'commit' - items_insert_extra_kwargs["foreign_keys"] = (("commit", "commits", "hash"),) + items_insert_extra_kwargs["foreign_keys"] = ( + ("_commit", "commits", "hash"), + ) # insert items if items: db["items"].insert_all( items, - column_order=("id",), + column_order=("_id",), alter=True, **items_insert_extra_kwargs, ) @@ -269,11 +268,14 @@ def file( if versions: db["item_versions"].insert_all( versions, - pk=("item", "version"), + pk=("_item", "_version"), alter=True, replace=True, - column_order=("item", "version", "commit"), - foreign_keys=(("item", "items", "id"), ("commit", "commits", "hash")), + column_order=("_item", "_version", "_commit"), + foreign_keys=( + ("_item", "items", "_id"), + ("_commit", "commits", "hash"), + ), ) @@ -283,27 +285,3 @@ def _hash(record): "utf8" ) ).hexdigest() - - -def fix_reserved_columns(item, allow_id=False, allow_banned=False): - reserved = {"item", "version", "commit", "rowid"} - banned = {"id_", "item_", "version_", "commit_"} - if not allow_id: - reserved.add("id") - if not allow_banned and any(key in banned for key in item): - raise click.ClickException( - "Column {} is one of these banned columns: {}\n{}".format( - sorted([key for key in item if key in banned]), - sorted(banned), - json.dumps(item, indent=4, default=str), - ) - ) - if not any(key in reserved for key in item): - return item - new_item = {} - for key in item: - if key in reserved: - new_item[key + "_"] = item[key] - else: - new_item[key] = item[key] - return new_item diff --git a/tests/test_git_history.py b/tests/test_git_history.py index d0183d7..8df6153 100644 --- a/tests/test_git_history.py +++ b/tests/test_git_history.py @@ -29,17 +29,17 @@ def repo(tmpdir): json.dumps( [ { - "id": 1, - "item": "Gin", - "version": "v1", - "commit": "commit1", + "_id": 1, + "_item": "Gin", + "_version": "v1", + "_commit": "commit1", "rowid": 5, }, { - "id": 2, - "item": "Tonic", - "version": "v1", - "commit": "commit1", + "_id": 2, + "_item": "Tonic", + "_version": "v1", + "_commit": "commit1", "rowid": 6, }, ] @@ -50,8 +50,8 @@ def repo(tmpdir): json.dumps( [ { - "id_": 1, - "version_": "Gin", + "_id_": 1, + "_version_": "Gin", } ] ), @@ -110,9 +110,27 @@ def repo(tmpdir): (repo_dir / "items-with-reserved-columns.json").write_text( json.dumps( [ - {"id": 1, "item": "Gin", "version": "v1", "commit": "commit1"}, - {"id": 2, "item": "Tonic 2", "version": "v1", "commit": "commit1"}, - {"id": 3, "item": "Rum", "version": "v1", "commit": "commit1"}, + { + "_id": 1, + "_item": "Gin", + "_version": "v1", + "_commit": "commit1", + "rowid": 5, + }, + { + "_id": 2, + "_item": "Tonic 2", + "_version": "v1", + "_commit": "commit1", + "rowid": 6, + }, + { + "_id": 3, + "_item": "Rum", + "_version": "v1", + "_commit": "commit1", + "rowid": 7, + }, ] ), "utf-8", @@ -138,7 +156,7 @@ def test_file_without_id(repo, tmpdir): "CREATE TABLE [items] (\n" " [item_id] INTEGER,\n" " [name] TEXT,\n" - " [commit] TEXT REFERENCES [commits]([hash])\n" + " [_commit] TEXT REFERENCES [commits]([hash])\n" ");" ) assert db["commits"].count == 2 @@ -176,29 +194,29 @@ def test_file_with_id(repo, tmpdir): " [commit_at] TEXT\n" ");\n" "CREATE TABLE [items] (\n" - " [id] TEXT PRIMARY KEY,\n" + " [_id] TEXT PRIMARY KEY,\n" " [item_id] INTEGER,\n" " [name] TEXT\n" ");\n" "CREATE TABLE [item_versions] (\n" - " [item] TEXT REFERENCES [items]([id]),\n" - " [version] INTEGER,\n" - " [commit] TEXT REFERENCES [commits]([hash]),\n" + " [_item] TEXT REFERENCES [items]([_id]),\n" + " [_version] INTEGER,\n" + " [_commit] TEXT REFERENCES [commits]([hash]),\n" " [item_id] INTEGER,\n" " [name] TEXT,\n" - " PRIMARY KEY ([item], [version])\n" + " PRIMARY KEY ([_item], [_version])\n" ");" ) assert db["commits"].count == 2 # Should have no duplicates item_versions = [ - r for r in db.query("select item_id, version, name from item_versions") + r for r in db.query("select item_id, _version, name from item_versions") ] assert item_versions == [ - {"item_id": 1, "version": 1, "name": "Gin"}, - {"item_id": 2, "version": 1, "name": "Tonic"}, - {"item_id": 2, "version": 2, "name": "Tonic 2"}, - {"item_id": 3, "version": 1, "name": "Rum"}, + {"item_id": 1, "_version": 1, "name": "Gin"}, + {"item_id": 2, "_version": 1, "name": "Tonic"}, + {"item_id": 2, "_version": 2, "name": "Tonic 2"}, + {"item_id": 3, "_version": 1, "name": "Rum"}, ] @@ -215,7 +233,7 @@ def test_file_with_reserved_columns(repo, tmpdir): "--repo", str(repo), "--id", - "id", + "_id", ], catch_exceptions=False, ) @@ -227,113 +245,63 @@ def test_file_with_reserved_columns(repo, tmpdir): " [commit_at] TEXT\n" ");\n" "CREATE TABLE [items] (\n" - " [id] TEXT PRIMARY KEY,\n" - " [item_] TEXT,\n" - " [version_] TEXT,\n" - " [commit_] TEXT,\n" + " [_id] TEXT PRIMARY KEY,\n" + " [_id_] INTEGER,\n" + " [_item_] TEXT,\n" + " [_version_] TEXT,\n" + " [_commit_] TEXT,\n" " [rowid_] INTEGER\n" ");\n" "CREATE TABLE [item_versions] (\n" - " [item] TEXT REFERENCES [items]([id]),\n" - " [version] INTEGER,\n" - " [commit] TEXT REFERENCES [commits]([hash]),\n" - " [id] INTEGER,\n" - " [item_] TEXT,\n" - " [version_] TEXT,\n" - " [commit_] TEXT,\n" + " [_item] TEXT REFERENCES [items]([_id]),\n" + " [_version] INTEGER,\n" + " [_commit] TEXT REFERENCES [commits]([hash]),\n" + " [_id_] INTEGER,\n" + " [_item_] TEXT,\n" + " [_version_] TEXT,\n" + " [_commit_] TEXT,\n" " [rowid_] INTEGER,\n" - " PRIMARY KEY ([item], [version])\n" + " PRIMARY KEY ([_item], [_version])\n" ");" ) item_versions = [ - r for r in db.query("select id, item_, version_, commit_ from item_versions") + r + for r in db.query( + "select _id_, _item_, _version_, _commit_, rowid_ from item_versions" + ) ] assert item_versions == [ - {"id": 1, "item_": "Gin", "version_": "v1", "commit_": "commit1"}, - {"id": 2, "item_": "Tonic", "version_": "v1", "commit_": "commit1"}, - {"id": 1, "item_": "Gin", "version_": "v1", "commit_": "commit1"}, - {"id": 2, "item_": "Tonic 2", "version_": "v1", "commit_": "commit1"}, - {"id": 3, "item_": "Rum", "version_": "v1", "commit_": "commit1"}, + { + "_id_": 1, + "_item_": "Gin", + "_version_": "v1", + "_commit_": "commit1", + "rowid_": 5, + }, + { + "_id_": 2, + "_item_": "Tonic", + "_version_": "v1", + "_commit_": "commit1", + "rowid_": 6, + }, + { + "_id_": 2, + "_item_": "Tonic 2", + "_version_": "v1", + "_commit_": "commit1", + "rowid_": 6, + }, + { + "_id_": 3, + "_item_": "Rum", + "_version_": "v1", + "_commit_": "commit1", + "rowid_": 7, + }, ] -def test_more_than_one_id_makes_id_reserved(repo, tmpdir): - # If we use "--id id --id version" then id is converted to id_ - # so we can add our own id_ that is a hash of those two columns - runner = CliRunner() - db_path = str(tmpdir / "db.db") - with runner.isolated_filesystem(): - result = runner.invoke( - cli, - [ - "file", - db_path, - str(repo / "items-with-reserved-columns.json"), - "--repo", - str(repo), - "--id", - "id", - "--id", - "version", - ], - catch_exceptions=False, - ) - assert result.exit_code == 0 - db = sqlite_utils.Database(db_path) - assert db.schema == ( - "CREATE TABLE [commits] (\n" - " [hash] TEXT PRIMARY KEY,\n" - " [commit_at] TEXT\n" - ");\n" - "CREATE TABLE [items] (\n" - " [id] TEXT PRIMARY KEY,\n" - " [id_] INTEGER,\n" - " [item_] TEXT,\n" - " [version_] TEXT,\n" - " [commit_] TEXT,\n" - " [rowid_] INTEGER\n" - ");\n" - "CREATE TABLE [item_versions] (\n" - " [item] TEXT REFERENCES [items]([id]),\n" - " [version] INTEGER,\n" - " [commit] TEXT REFERENCES [commits]([hash]),\n" - " [id_] INTEGER,\n" - " [item_] TEXT,\n" - " [version_] TEXT,\n" - " [commit_] TEXT,\n" - " [rowid_] INTEGER,\n" - " PRIMARY KEY ([item], [version])\n" - ");" - ) - - -@pytest.mark.parametrize("specify_id", (True, False)) -def test_file_with_banned_columns(repo, tmpdir, specify_id): - runner = CliRunner() - db_path = str(tmpdir / "db.db") - with runner.isolated_filesystem(): - result = runner.invoke( - cli, - [ - "file", - db_path, - str(repo / "items-with-banned-columns.json"), - "--repo", - str(repo), - ] - + (["--id", "id_"] if specify_id else []), - catch_exceptions=False, - ) - assert result.exit_code == 1 - assert result.output.strip() == ( - "Error: Column ['id_', 'version_'] is one of these banned columns: ['commit_', 'id_', 'item_', 'version_']\n" - "{\n" - ' "id_": 1,\n' - ' "version_": "Gin"\n' - "}" - ) - - @pytest.mark.parametrize("file", ("trees.csv", "trees.tsv")) def test_csv_tsv(repo, tmpdir, file): runner = CliRunner() @@ -361,16 +329,16 @@ def test_csv_tsv(repo, tmpdir, file): " [commit_at] TEXT\n" ");\n" "CREATE TABLE [items] (\n" - " [id] TEXT PRIMARY KEY,\n" + " [_id] TEXT PRIMARY KEY,\n" " [TreeID] TEXT,\n" " [name] TEXT\n" ");\n" "CREATE TABLE [item_versions] (\n" - " [item] TEXT REFERENCES [items]([id]),\n" - " [version] INTEGER,\n" - " [commit] TEXT REFERENCES [commits]([hash]),\n" + " [_item] TEXT REFERENCES [items]([_id]),\n" + " [_version] INTEGER,\n" + " [_commit] TEXT REFERENCES [commits]([hash]),\n" " [TreeID] TEXT,\n" " [name] TEXT,\n" - " PRIMARY KEY ([item], [version])\n" + " PRIMARY KEY ([_item], [_version])\n" ");" ) From 34ec48e23598475cfa4e97505dadc3a022d8fa87 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 13 Nov 2021 18:10:41 -0800 Subject: [PATCH 2/3] Added utils module, ref #14 --- git_history/utils.py | 19 +++++++++++++++++++ tests/test_utils.py | 27 +++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 git_history/utils.py create mode 100644 tests/test_utils.py diff --git a/git_history/utils.py b/git_history/utils.py new file mode 100644 index 0000000..29df477 --- /dev/null +++ b/git_history/utils.py @@ -0,0 +1,19 @@ +import re + +RESERVED = ("_id", "_item", "_version", "_commit", "rowid") +reserved_with_suffix_re = re.compile("^({})_*$".format("|".join(RESERVED))) + + +def fix_reserved_columns(item): + if not any(reserved_with_suffix_re.match(key) for key in item): + return item + + return {_fix_key(key): item[key] for key in item} + + +def _fix_key(key): + # Add a trailing _ if it's reserved or reserved with _ suffix + if reserved_with_suffix_re.match(key): + return key + "_" + else: + return key diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..5148c8c --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,27 @@ +from git_history.utils import fix_reserved_columns +import pytest + + +@pytest.mark.parametrize( + "column,expected", + ( + ("_id", "_id_"), + ("_item", "_item_"), + ("_version", "_version_"), + ("_commit", "_commit_"), + ("rowid", "rowid_"), + ("rowid_", "rowid__"), + ("_id__", "_id___"), + ), +) +def test_fix_reserved_columns(column, expected): + item = {column: 1} + fixed = fix_reserved_columns(item) + assert fixed == {expected: 1} + assert item is not fixed + + +def test_fix_reserved_columns_unchanged_if_no_reserved(): + item = {"id": 1, "version": "v2"} + fixed = fix_reserved_columns(item) + assert item is fixed From 66031db81d2ab3a1786f6c72f4bc59aa58a24cfc Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 13 Nov 2021 22:01:34 -0800 Subject: [PATCH 3/3] Updated documentation to reflect new _id columns, refs #14 --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index bd781b7..6098096 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Change directory into the GitHub repository in question and run the following: This will create a new SQLite database in the `incidents.db` file with two tables: - `commits` containing a row for every commit, with a `hash` column and the `commit_at` date. -- `items` containing a row for every item in every version of the `filename.json` file - with an extra `commit` column that is a foreign key back to the `commits` table. +- `items` containing a row for every item in every version of the `filename.json` file - with an extra `_commit` column that is a foreign key back to the `commits` table. If you have 10 historic versions of the `incidents.json` file and each one contains 30 incidents, you will end up with 10 * 30 = 300 rows in your `items` table. @@ -48,9 +48,9 @@ The `items` table will contain just the most recent version of each row, de-dupl The `item_versions` table will contain a row for each captured differing version of that item, plus the following columns: -- `item` as a foreign key to the `items` table -- `commit` as a foreign key to the `commits` table -- `version` as the numeric version number, starting at 1 and incrementing for each captured version +- `_item` as a foreign key to the `items` table +- `_commit` as a foreign key to the `commits` table +- `_version` as the numeric version number, starting at 1 and incrementing for each captured version If you have already imported history, the command will skip any commits that it has seen already and just process new ones. This means that even though an initial import could be slow subsequent imports should run a lot faster. @@ -66,9 +66,9 @@ Additional options: - `--ignore-duplicate-ids` - if a single version of a file has the same ID in it more than once, the tool will exit with an error. Use this option to ignore this and instead pick just the first of the two duplicates. - `--silent` - don't show the progress bar. -Note that `id`, `item`, `version`, `commit` and `rowid` are reserved column names that are used by this tool. If your data contains any of these they will be renamed to `id_`, `item_`, `version_`, `commit_` or `rowid_` to avoid clashing with the reserved columns. +Note that `_id`, `_item`, `_version`, `_commit` and `rowid` are considered column names for the purposes of this tool. If your data contains any of these they will be renamed to `_id_`, `_item_`, `_version_`, `_commit_` or `_rowid_` to avoid clashing with the reserved columns. -There is one exception: if you have an `id` column and use `--id id` without specifying more than one ID column, your ìd` column will be used as the item ID but will not be renamed. +If you have a column with a name such as `_commit_` it will be renamed too, adding an additional trailing underscore, so `_commit_` becomes `_commit__` and `_commit__` becomes `_commit__`. ### CSV and TSV data