From b318e5f3d26854952b5c99519965286cbcbe9b0c Mon Sep 17 00:00:00 2001
From: Simon Willison <swillison@gmail.com>
Date: Sat, 13 Nov 2021 18:09:22 -0800
Subject: [PATCH 1/3] Reserved columns now have underscore prefix, refs #14

---
 git_history/cli.py        |  54 +++-------
 tests/test_git_history.py | 218 ++++++++++++++++----------------------
 2 files changed, 109 insertions(+), 163 deletions(-)

diff --git a/git_history/cli.py b/git_history/cli.py
index 5574022..1c05729 100644
--- a/git_history/cli.py
+++ b/git_history/cli.py
@@ -5,6 +5,7 @@
 import sqlite_utils
 import textwrap
 from pathlib import Path
+from .utils import fix_reserved_columns
 
 
 def iterate_file_versions(
@@ -181,17 +182,13 @@ def file(
 
         # If --id is specified, do things a bit differently
         if ids:
-            # If '--id id' is only option, 'id' is not a reserved column
-            id_is_reserved = list(ids) != ["id"]
             # Any ids that are reserved columns must be renamed
             fixed_ids = set(
                 fix_reserved_columns(
                     {id: 1 for id in ids},
-                    allow_id=not id_is_reserved,
-                    allow_banned=True,
                 ).keys()
             )
-            items_insert_extra_kwargs["pk"] = "id"
+            items_insert_extra_kwargs["pk"] = "_id"
             # Check all items have those columns
             _ids_set = set(ids)
             bad_items = [
@@ -209,7 +206,7 @@ def file(
             items_insert_extra_kwargs["replace"] = True
             # Which of these are new versions of things we have seen before
             for item in items:
-                item = fix_reserved_columns(item, allow_id=not id_is_reserved)
+                item = fix_reserved_columns(item)
                 item_id = _hash(dict((id, item.get(id)) for id in fixed_ids))
                 if item_id in item_ids_in_this_version:
                     if not ignore_duplicate_ids:
@@ -240,9 +237,9 @@ def file(
                     id_last_hash[item_id] = item_hash
                     version = id_versions.get(item_id, 0) + 1
                     id_versions[item_id] = version
-                    items_to_add.append(dict(item, id=item_id))
+                    items_to_add.append(dict(item, _id=item_id))
                     versions.append(
-                        dict(item, item=item_id, version=version, commit=git_hash)
+                        dict(item, _item=item_id, _version=version, _commit=git_hash)
                     )
 
             # Only add the items that had no new version
@@ -252,15 +249,17 @@ def file(
             # not ids - so just check them for banned columns and add the item["commit"]
             for item in items:
                 item = fix_reserved_columns(item)
-                item["commit"] = git_hash
+                item["_commit"] = git_hash
             # In this case item table needs a foreign key on 'commit'
-            items_insert_extra_kwargs["foreign_keys"] = (("commit", "commits", "hash"),)
+            items_insert_extra_kwargs["foreign_keys"] = (
+                ("_commit", "commits", "hash"),
+            )
 
         # insert items
         if items:
             db["items"].insert_all(
                 items,
-                column_order=("id",),
+                column_order=("_id",),
                 alter=True,
                 **items_insert_extra_kwargs,
             )
@@ -269,11 +268,14 @@ def file(
         if versions:
             db["item_versions"].insert_all(
                 versions,
-                pk=("item", "version"),
+                pk=("_item", "_version"),
                 alter=True,
                 replace=True,
-                column_order=("item", "version", "commit"),
-                foreign_keys=(("item", "items", "id"), ("commit", "commits", "hash")),
+                column_order=("_item", "_version", "_commit"),
+                foreign_keys=(
+                    ("_item", "items", "_id"),
+                    ("_commit", "commits", "hash"),
+                ),
             )
 
 
@@ -283,27 +285,3 @@ def _hash(record):
             "utf8"
         )
     ).hexdigest()
-
-
-def fix_reserved_columns(item, allow_id=False, allow_banned=False):
-    reserved = {"item", "version", "commit", "rowid"}
-    banned = {"id_", "item_", "version_", "commit_"}
-    if not allow_id:
-        reserved.add("id")
-    if not allow_banned and any(key in banned for key in item):
-        raise click.ClickException(
-            "Column {} is one of these banned columns: {}\n{}".format(
-                sorted([key for key in item if key in banned]),
-                sorted(banned),
-                json.dumps(item, indent=4, default=str),
-            )
-        )
-    if not any(key in reserved for key in item):
-        return item
-    new_item = {}
-    for key in item:
-        if key in reserved:
-            new_item[key + "_"] = item[key]
-        else:
-            new_item[key] = item[key]
-    return new_item
diff --git a/tests/test_git_history.py b/tests/test_git_history.py
index d0183d7..8df6153 100644
--- a/tests/test_git_history.py
+++ b/tests/test_git_history.py
@@ -29,17 +29,17 @@ def repo(tmpdir):
         json.dumps(
             [
                 {
-                    "id": 1,
-                    "item": "Gin",
-                    "version": "v1",
-                    "commit": "commit1",
+                    "_id": 1,
+                    "_item": "Gin",
+                    "_version": "v1",
+                    "_commit": "commit1",
                     "rowid": 5,
                 },
                 {
-                    "id": 2,
-                    "item": "Tonic",
-                    "version": "v1",
-                    "commit": "commit1",
+                    "_id": 2,
+                    "_item": "Tonic",
+                    "_version": "v1",
+                    "_commit": "commit1",
                     "rowid": 6,
                 },
             ]
@@ -50,8 +50,8 @@ def repo(tmpdir):
         json.dumps(
             [
                 {
-                    "id_": 1,
-                    "version_": "Gin",
+                    "_id_": 1,
+                    "_version_": "Gin",
                 }
             ]
         ),
@@ -110,9 +110,27 @@ def repo(tmpdir):
     (repo_dir / "items-with-reserved-columns.json").write_text(
         json.dumps(
             [
-                {"id": 1, "item": "Gin", "version": "v1", "commit": "commit1"},
-                {"id": 2, "item": "Tonic 2", "version": "v1", "commit": "commit1"},
-                {"id": 3, "item": "Rum", "version": "v1", "commit": "commit1"},
+                {
+                    "_id": 1,
+                    "_item": "Gin",
+                    "_version": "v1",
+                    "_commit": "commit1",
+                    "rowid": 5,
+                },
+                {
+                    "_id": 2,
+                    "_item": "Tonic 2",
+                    "_version": "v1",
+                    "_commit": "commit1",
+                    "rowid": 6,
+                },
+                {
+                    "_id": 3,
+                    "_item": "Rum",
+                    "_version": "v1",
+                    "_commit": "commit1",
+                    "rowid": 7,
+                },
             ]
         ),
         "utf-8",
@@ -138,7 +156,7 @@ def test_file_without_id(repo, tmpdir):
         "CREATE TABLE [items] (\n"
         "   [item_id] INTEGER,\n"
         "   [name] TEXT,\n"
-        "   [commit] TEXT REFERENCES [commits]([hash])\n"
+        "   [_commit] TEXT REFERENCES [commits]([hash])\n"
         ");"
     )
     assert db["commits"].count == 2
@@ -176,29 +194,29 @@ def test_file_with_id(repo, tmpdir):
         "   [commit_at] TEXT\n"
         ");\n"
         "CREATE TABLE [items] (\n"
-        "   [id] TEXT PRIMARY KEY,\n"
+        "   [_id] TEXT PRIMARY KEY,\n"
         "   [item_id] INTEGER,\n"
         "   [name] TEXT\n"
         ");\n"
         "CREATE TABLE [item_versions] (\n"
-        "   [item] TEXT REFERENCES [items]([id]),\n"
-        "   [version] INTEGER,\n"
-        "   [commit] TEXT REFERENCES [commits]([hash]),\n"
+        "   [_item] TEXT REFERENCES [items]([_id]),\n"
+        "   [_version] INTEGER,\n"
+        "   [_commit] TEXT REFERENCES [commits]([hash]),\n"
         "   [item_id] INTEGER,\n"
         "   [name] TEXT,\n"
-        "   PRIMARY KEY ([item], [version])\n"
+        "   PRIMARY KEY ([_item], [_version])\n"
         ");"
     )
     assert db["commits"].count == 2
     # Should have no duplicates
     item_versions = [
-        r for r in db.query("select item_id, version, name from item_versions")
+        r for r in db.query("select item_id, _version, name from item_versions")
     ]
     assert item_versions == [
-        {"item_id": 1, "version": 1, "name": "Gin"},
-        {"item_id": 2, "version": 1, "name": "Tonic"},
-        {"item_id": 2, "version": 2, "name": "Tonic 2"},
-        {"item_id": 3, "version": 1, "name": "Rum"},
+        {"item_id": 1, "_version": 1, "name": "Gin"},
+        {"item_id": 2, "_version": 1, "name": "Tonic"},
+        {"item_id": 2, "_version": 2, "name": "Tonic 2"},
+        {"item_id": 3, "_version": 1, "name": "Rum"},
     ]
 
 
@@ -215,7 +233,7 @@ def test_file_with_reserved_columns(repo, tmpdir):
                 "--repo",
                 str(repo),
                 "--id",
-                "id",
+                "_id",
             ],
             catch_exceptions=False,
         )
@@ -227,113 +245,63 @@ def test_file_with_reserved_columns(repo, tmpdir):
         "   [commit_at] TEXT\n"
         ");\n"
         "CREATE TABLE [items] (\n"
-        "   [id] TEXT PRIMARY KEY,\n"
-        "   [item_] TEXT,\n"
-        "   [version_] TEXT,\n"
-        "   [commit_] TEXT,\n"
+        "   [_id] TEXT PRIMARY KEY,\n"
+        "   [_id_] INTEGER,\n"
+        "   [_item_] TEXT,\n"
+        "   [_version_] TEXT,\n"
+        "   [_commit_] TEXT,\n"
         "   [rowid_] INTEGER\n"
         ");\n"
         "CREATE TABLE [item_versions] (\n"
-        "   [item] TEXT REFERENCES [items]([id]),\n"
-        "   [version] INTEGER,\n"
-        "   [commit] TEXT REFERENCES [commits]([hash]),\n"
-        "   [id] INTEGER,\n"
-        "   [item_] TEXT,\n"
-        "   [version_] TEXT,\n"
-        "   [commit_] TEXT,\n"
+        "   [_item] TEXT REFERENCES [items]([_id]),\n"
+        "   [_version] INTEGER,\n"
+        "   [_commit] TEXT REFERENCES [commits]([hash]),\n"
+        "   [_id_] INTEGER,\n"
+        "   [_item_] TEXT,\n"
+        "   [_version_] TEXT,\n"
+        "   [_commit_] TEXT,\n"
         "   [rowid_] INTEGER,\n"
-        "   PRIMARY KEY ([item], [version])\n"
+        "   PRIMARY KEY ([_item], [_version])\n"
         ");"
     )
     item_versions = [
-        r for r in db.query("select id, item_, version_, commit_ from item_versions")
+        r
+        for r in db.query(
+            "select _id_, _item_, _version_, _commit_, rowid_ from item_versions"
+        )
     ]
     assert item_versions == [
-        {"id": 1, "item_": "Gin", "version_": "v1", "commit_": "commit1"},
-        {"id": 2, "item_": "Tonic", "version_": "v1", "commit_": "commit1"},
-        {"id": 1, "item_": "Gin", "version_": "v1", "commit_": "commit1"},
-        {"id": 2, "item_": "Tonic 2", "version_": "v1", "commit_": "commit1"},
-        {"id": 3, "item_": "Rum", "version_": "v1", "commit_": "commit1"},
+        {
+            "_id_": 1,
+            "_item_": "Gin",
+            "_version_": "v1",
+            "_commit_": "commit1",
+            "rowid_": 5,
+        },
+        {
+            "_id_": 2,
+            "_item_": "Tonic",
+            "_version_": "v1",
+            "_commit_": "commit1",
+            "rowid_": 6,
+        },
+        {
+            "_id_": 2,
+            "_item_": "Tonic 2",
+            "_version_": "v1",
+            "_commit_": "commit1",
+            "rowid_": 6,
+        },
+        {
+            "_id_": 3,
+            "_item_": "Rum",
+            "_version_": "v1",
+            "_commit_": "commit1",
+            "rowid_": 7,
+        },
     ]
 
 
-def test_more_than_one_id_makes_id_reserved(repo, tmpdir):
-    # If we use "--id id --id version" then id is converted to id_
-    # so we can add our own id_ that is a hash of those two columns
-    runner = CliRunner()
-    db_path = str(tmpdir / "db.db")
-    with runner.isolated_filesystem():
-        result = runner.invoke(
-            cli,
-            [
-                "file",
-                db_path,
-                str(repo / "items-with-reserved-columns.json"),
-                "--repo",
-                str(repo),
-                "--id",
-                "id",
-                "--id",
-                "version",
-            ],
-            catch_exceptions=False,
-        )
-    assert result.exit_code == 0
-    db = sqlite_utils.Database(db_path)
-    assert db.schema == (
-        "CREATE TABLE [commits] (\n"
-        "   [hash] TEXT PRIMARY KEY,\n"
-        "   [commit_at] TEXT\n"
-        ");\n"
-        "CREATE TABLE [items] (\n"
-        "   [id] TEXT PRIMARY KEY,\n"
-        "   [id_] INTEGER,\n"
-        "   [item_] TEXT,\n"
-        "   [version_] TEXT,\n"
-        "   [commit_] TEXT,\n"
-        "   [rowid_] INTEGER\n"
-        ");\n"
-        "CREATE TABLE [item_versions] (\n"
-        "   [item] TEXT REFERENCES [items]([id]),\n"
-        "   [version] INTEGER,\n"
-        "   [commit] TEXT REFERENCES [commits]([hash]),\n"
-        "   [id_] INTEGER,\n"
-        "   [item_] TEXT,\n"
-        "   [version_] TEXT,\n"
-        "   [commit_] TEXT,\n"
-        "   [rowid_] INTEGER,\n"
-        "   PRIMARY KEY ([item], [version])\n"
-        ");"
-    )
-
-
-@pytest.mark.parametrize("specify_id", (True, False))
-def test_file_with_banned_columns(repo, tmpdir, specify_id):
-    runner = CliRunner()
-    db_path = str(tmpdir / "db.db")
-    with runner.isolated_filesystem():
-        result = runner.invoke(
-            cli,
-            [
-                "file",
-                db_path,
-                str(repo / "items-with-banned-columns.json"),
-                "--repo",
-                str(repo),
-            ]
-            + (["--id", "id_"] if specify_id else []),
-            catch_exceptions=False,
-        )
-    assert result.exit_code == 1
-    assert result.output.strip() == (
-        "Error: Column ['id_', 'version_'] is one of these banned columns: ['commit_', 'id_', 'item_', 'version_']\n"
-        "{\n"
-        '    "id_": 1,\n'
-        '    "version_": "Gin"\n'
-        "}"
-    )
-
-
 @pytest.mark.parametrize("file", ("trees.csv", "trees.tsv"))
 def test_csv_tsv(repo, tmpdir, file):
     runner = CliRunner()
@@ -361,16 +329,16 @@ def test_csv_tsv(repo, tmpdir, file):
         "   [commit_at] TEXT\n"
         ");\n"
         "CREATE TABLE [items] (\n"
-        "   [id] TEXT PRIMARY KEY,\n"
+        "   [_id] TEXT PRIMARY KEY,\n"
         "   [TreeID] TEXT,\n"
         "   [name] TEXT\n"
         ");\n"
         "CREATE TABLE [item_versions] (\n"
-        "   [item] TEXT REFERENCES [items]([id]),\n"
-        "   [version] INTEGER,\n"
-        "   [commit] TEXT REFERENCES [commits]([hash]),\n"
+        "   [_item] TEXT REFERENCES [items]([_id]),\n"
+        "   [_version] INTEGER,\n"
+        "   [_commit] TEXT REFERENCES [commits]([hash]),\n"
         "   [TreeID] TEXT,\n"
         "   [name] TEXT,\n"
-        "   PRIMARY KEY ([item], [version])\n"
+        "   PRIMARY KEY ([_item], [_version])\n"
         ");"
     )

From 34ec48e23598475cfa4e97505dadc3a022d8fa87 Mon Sep 17 00:00:00 2001
From: Simon Willison <swillison@gmail.com>
Date: Sat, 13 Nov 2021 18:10:41 -0800
Subject: [PATCH 2/3] Added utils module, ref #14

---
 git_history/utils.py | 19 +++++++++++++++++++
 tests/test_utils.py  | 27 +++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)
 create mode 100644 git_history/utils.py
 create mode 100644 tests/test_utils.py

diff --git a/git_history/utils.py b/git_history/utils.py
new file mode 100644
index 0000000..29df477
--- /dev/null
+++ b/git_history/utils.py
@@ -0,0 +1,19 @@
+import re
+
+RESERVED = ("_id", "_item", "_version", "_commit", "rowid")
+reserved_with_suffix_re = re.compile("^({})_*$".format("|".join(RESERVED)))
+
+
+def fix_reserved_columns(item):
+    if not any(reserved_with_suffix_re.match(key) for key in item):
+        return item
+
+    return {_fix_key(key): item[key] for key in item}
+
+
+def _fix_key(key):
+    # Add a trailing _ if it's reserved or reserved with _ suffix
+    if reserved_with_suffix_re.match(key):
+        return key + "_"
+    else:
+        return key
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..5148c8c
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,27 @@
+from git_history.utils import fix_reserved_columns
+import pytest
+
+
+@pytest.mark.parametrize(
+    "column,expected",
+    (
+        ("_id", "_id_"),
+        ("_item", "_item_"),
+        ("_version", "_version_"),
+        ("_commit", "_commit_"),
+        ("rowid", "rowid_"),
+        ("rowid_", "rowid__"),
+        ("_id__", "_id___"),
+    ),
+)
+def test_fix_reserved_columns(column, expected):
+    item = {column: 1}
+    fixed = fix_reserved_columns(item)
+    assert fixed == {expected: 1}
+    assert item is not fixed
+
+
+def test_fix_reserved_columns_unchanged_if_no_reserved():
+    item = {"id": 1, "version": "v2"}
+    fixed = fix_reserved_columns(item)
+    assert item is fixed

From 66031db81d2ab3a1786f6c72f4bc59aa58a24cfc Mon Sep 17 00:00:00 2001
From: Simon Willison <swillison@gmail.com>
Date: Sat, 13 Nov 2021 22:01:34 -0800
Subject: [PATCH 3/3] Updated documentation to reflect new _id columns, refs
 #14

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index bd781b7..6098096 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Change directory into the GitHub repository in question and run the following:
 This will create a new SQLite database in the `incidents.db` file with two tables:
 
 - `commits` containing a row for every commit, with a `hash` column and the `commit_at` date.
-- `items` containing a row for every item in every version of the `filename.json` file - with an extra `commit` column that is a foreign key back to the `commits` table.
+- `items` containing a row for every item in every version of the `filename.json` file - with an extra `_commit` column that is a foreign key back to the `commits` table.
 
 If you have 10 historic versions of the `incidents.json` file and each one contains 30 incidents, you will end up with 10 * 30 = 300 rows in your `items` table.
 
@@ -48,9 +48,9 @@ The `items` table will contain just the most recent version of each row, de-dupl
 
 The `item_versions` table will contain a row for each captured differing version of that item, plus the following columns:
 
-- `item` as a foreign key to the `items` table
-- `commit` as a foreign key to the `commits` table
-- `version` as the numeric version number, starting at 1 and incrementing for each captured version
+- `_item` as a foreign key to the `items` table
+- `_commit` as a foreign key to the `commits` table
+- `_version` as the numeric version number, starting at 1 and incrementing for each captured version
 
 If you have already imported history, the command will skip any commits that it has seen already and just process new ones. This means that even though an initial import could be slow subsequent imports should run a lot faster.
 
@@ -66,9 +66,9 @@ Additional options:
 - `--ignore-duplicate-ids` - if a single version of a file has the same ID in it more than once, the tool will exit with an error. Use this option to ignore this and instead pick just the first of the two duplicates.
 - `--silent` - don't show the progress bar.
 
-Note that `id`, `item`, `version`, `commit` and `rowid` are reserved column names that are used by this tool. If your data contains any of these they will be renamed to `id_`, `item_`, `version_`, `commit_` or `rowid_` to avoid clashing with the reserved columns.
+Note that `_id`, `_item`, `_version`, `_commit` and `rowid` are considered column names for the purposes of this tool. If your data contains any of these they will be renamed to `_id_`, `_item_`, `_version_`, `_commit_` or `_rowid_` to avoid clashing with the reserved columns.
 
-There is one exception: if you have an `id` column and use `--id id` without specifying more than one ID column, your ìd` column will be used as the item ID but will not be renamed.
+If you have a column with a name such as `_commit_` it will be renamed too, adding an additional trailing underscore, so `_commit_` becomes `_commit__` and `_commit__` becomes `_commit__`.
 
 ### CSV and TSV data