From d6457770a46f8a17172b351e7e7bdc9cb09f4698 Mon Sep 17 00:00:00 2001
From: Simon Willison <swillison@gmail.com>
Date: Thu, 18 Nov 2021 23:37:13 -0800
Subject: [PATCH 1/4] Initial prototype with integer IDs, refs #12

---
 git_history/cli.py        | 99 +++++++++++++++++----------------------
 git_history/utils.py      |  2 +-
 setup.py                  |  2 +-
 tests/test_git_history.py | 58 ++++++++++++++++-------
 tests/test_utils.py       |  1 +
 5 files changed, 87 insertions(+), 75 deletions(-)

diff --git a/git_history/cli.py b/git_history/cli.py
index a05602e..232e3a3 100644
--- a/git_history/cli.py
+++ b/git_history/cli.py
@@ -113,7 +113,7 @@ def file(
             decoded = content.decode("utf-8")
             dialect = csv.Sniffer().sniff(decoded[:512])
             reader = csv.DictReader(io.StringIO(decoded), dialect=dialect)
-            return list(reader)
+            return reader
         """
         )
         imports = ["io", "csv"]
@@ -140,9 +140,8 @@ def file(
     resolved_filepath = str(Path(filepath).resolve())
     resolved_repo = str(Path(repo).resolve())
     db = sqlite_utils.Database(database)
-    seen_hashes = set()
-    id_versions = {}
-    id_last_hash = {}
+    item_hash_id_versions = {}
+    item_hash_id_last_hash = {}
     for git_commit_at, git_hash, content in iterate_file_versions(
         resolved_repo,
         resolved_filepath,
@@ -154,13 +153,10 @@ def file(
         else set(),
         show_progress=not silent,
     ):
-        if git_hash not in seen_hashes:
-            seen_hashes.add(git_hash)
-            db["commits"].insert(
-                {"hash": git_hash, "commit_at": git_commit_at.isoformat()},
-                pk="hash",
-                replace=True,
-            )
+        commit_id = db["commits"].lookup(
+            {"hash": git_hash},
+            {"commit_at": git_commit_at.isoformat()},
+        )
         if not content.strip():
             # Skip empty JSON files
             continue
@@ -178,7 +174,6 @@ def file(
                 new_items.append(new_item)
             items = new_items
 
-        items_insert_extra_kwargs = {}
         versions = []
 
         # If --id is specified, do things a bit differently
@@ -189,7 +184,6 @@ def file(
                     {id: 1 for id in ids},
                 ).keys()
             )
-            items_insert_extra_kwargs["pk"] = "_id"
             # Check all items have those columns
             _ids_set = set(ids)
             bad_items = [
@@ -201,15 +195,13 @@ def file(
                         git_hash, json.dumps(bad_items[:5], indent=4, default=str)
                     )
                 )
-            # Also ensure there are not TWO items in this file with the same ID
-            item_ids_in_this_version = set()
-            items_to_add = []
-            items_insert_extra_kwargs["replace"] = True
+            # Also ensure there are not TWO items in this commit with the same ID
+            item_hash_ids_in_this_commit = set()
             # Which of these are new versions of things we have seen before
             for item in items:
                 item = fix_reserved_columns(item)
-                item_id = _hash(dict((id, item.get(id)) for id in fixed_ids))
-                if item_id in item_ids_in_this_version:
+                item_hash_id = _hash(dict((id, item.get(id)) for id in fixed_ids))
+                if item_hash_id in item_hash_ids_in_this_commit:
                     if not ignore_duplicate_ids:
                         raise click.ClickException(
                             "Commit: {} - found multiple items with the same ID:\n{}".format(
@@ -221,7 +213,7 @@ def file(
                                         if _hash(
                                             dict((id, item.get(id)) for id in fixed_ids)
                                         )
-                                        == item_id
+                                        == item_hash
                                     ][:5],
                                     indent=4,
                                     default=str,
@@ -230,53 +222,48 @@ def file(
                         )
                     else:
                         continue
-                item_ids_in_this_version.add(item_id)
+                item_hash_ids_in_this_commit.add(item_hash_id)
+
                 # Has it changed since last time we saw it?
                 item_hash = _hash(item)
-                if id_last_hash.get(item_id) != item_hash:
-                    # It's either new or the content has changed
-                    id_last_hash[item_id] = item_hash
-                    version = id_versions.get(item_id, 0) + 1
-                    id_versions[item_id] = version
-                    items_to_add.append(dict(item, _id=item_id))
-                    versions.append(
-                        dict(item, _item=item_id, _version=version, _commit=git_hash)
-                    )
-
-            # Only add the items that had no new version
-            items = items_to_add
+                if item_hash_id_last_hash.get(item_hash_id) != item_hash:
+                    # It's either new or the content has changed - so insert it
+                    item_hash_id_last_hash[item_hash_id] = item_hash
+                    version = item_hash_id_versions.get(item_hash_id, 0) + 1
+                    item_hash_id_versions[item_hash_id] = version
 
+                    # Add or fetch item
+                    item_to_insert = dict(
+                        item, _item_hash_id=item_hash_id, _commit=commit_id
+                    )
+                    item_id = db["items"].lookup(
+                        {"_item_hash_id": item_hash_id},
+                        item_to_insert,
+                        column_order=("_id", "_item_hash_id"),
+                        pk="_id",
+                    )
+                    db["item_versions"].insert(
+                        dict(item, _item=item_id, _version=version, _commit=commit_id),
+                        pk=("_item", "_version"),
+                        alter=True,
+                        replace=True,
+                        column_order=("_item", "_version", "_commit"),
+                        foreign_keys=(
+                            ("_item", "items", "_id"),
+                            ("_commit", "commits", "id"),
+                        ),
+                    )
         else:
-            # not ids - so just check them for banned columns and add the item["commit"]
+            # no --id - so just correct for reserved columns and add item["_commit"]
             for item in items:
                 item = fix_reserved_columns(item)
-                item["_commit"] = git_hash
+                item["_commit"] = commit_id
             # In this case item table needs a foreign key on 'commit'
-            items_insert_extra_kwargs["foreign_keys"] = (
-                ("_commit", "commits", "hash"),
-            )
-
-        # insert items
-        if items:
             db["items"].insert_all(
                 items,
                 column_order=("_id",),
                 alter=True,
-                **items_insert_extra_kwargs,
-            )
-
-        # insert versions
-        if versions:
-            db["item_versions"].insert_all(
-                versions,
-                pk=("_item", "_version"),
-                alter=True,
-                replace=True,
-                column_order=("_item", "_version", "_commit"),
-                foreign_keys=(
-                    ("_item", "items", "_id"),
-                    ("_commit", "commits", "hash"),
-                ),
+                foreign_keys=(("_commit", "commits", "id"),),
             )
 
 
diff --git a/git_history/utils.py b/git_history/utils.py
index 29df477..071deb2 100644
--- a/git_history/utils.py
+++ b/git_history/utils.py
@@ -1,6 +1,6 @@
 import re
 
-RESERVED = ("_id", "_item", "_version", "_commit", "rowid")
+RESERVED = ("_id", "_item", "_version", "_commit", "_item_hash", "rowid")
 reserved_with_suffix_re = re.compile("^({})_*$".format("|".join(RESERVED)))
 
 
diff --git a/setup.py b/setup.py
index 2f5cc56..9fa5fd7 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,7 @@ def get_long_description():
         [console_scripts]
         git-history=git_history.cli:cli
     """,
-    install_requires=["click", "GitPython", "sqlite-utils>=3.17.1"],
+    install_requires=["click", "GitPython", "sqlite-utils>=3.19a0"],
     extras_require={"test": ["pytest"]},
     python_requires=">=3.6",
 )
diff --git a/tests/test_git_history.py b/tests/test_git_history.py
index a646bb3..1dc6711 100644
--- a/tests/test_git_history.py
+++ b/tests/test_git_history.py
@@ -150,13 +150,16 @@ def test_file_without_id(repo, tmpdir):
     db = sqlite_utils.Database(db_path)
     assert db.schema == (
         "CREATE TABLE [commits] (\n"
-        "   [hash] TEXT PRIMARY KEY,\n"
+        "   [id] INTEGER PRIMARY KEY,\n"
+        "   [hash] TEXT,\n"
         "   [commit_at] TEXT\n"
         ");\n"
+        "CREATE UNIQUE INDEX [idx_commits_hash]\n"
+        "    ON [commits] ([hash]);\n"
         "CREATE TABLE [items] (\n"
         "   [item_id] INTEGER,\n"
         "   [name] TEXT,\n"
-        "   [_commit] TEXT REFERENCES [commits]([hash])\n"
+        "   [_commit] INTEGER REFERENCES [commits]([id])\n"
         ");"
     )
     assert db["commits"].count == 2
@@ -190,18 +193,25 @@ def test_file_with_id(repo, tmpdir):
     db = sqlite_utils.Database(db_path)
     assert db.schema == (
         "CREATE TABLE [commits] (\n"
-        "   [hash] TEXT PRIMARY KEY,\n"
+        "   [id] INTEGER PRIMARY KEY,\n"
+        "   [hash] TEXT,\n"
         "   [commit_at] TEXT\n"
         ");\n"
+        "CREATE UNIQUE INDEX [idx_commits_hash]\n"
+        "    ON [commits] ([hash]);\n"
         "CREATE TABLE [items] (\n"
-        "   [_id] TEXT PRIMARY KEY,\n"
+        "   [_id] INTEGER PRIMARY KEY,\n"
+        "   [_item_hash_id] TEXT,\n"
         "   [item_id] INTEGER,\n"
-        "   [name] TEXT\n"
+        "   [name] TEXT,\n"
+        "   [_commit] INTEGER\n"
         ");\n"
+        "CREATE UNIQUE INDEX [idx_items__item_hash_id]\n"
+        "    ON [items] ([_item_hash_id]);\n"
         "CREATE TABLE [item_versions] (\n"
-        "   [_item] TEXT REFERENCES [items]([_id]),\n"
+        "   [_item] INTEGER REFERENCES [items]([_id]),\n"
         "   [_version] INTEGER,\n"
-        "   [_commit] TEXT REFERENCES [commits]([hash]),\n"
+        "   [_commit] INTEGER REFERENCES [commits]([id]),\n"
         "   [item_id] INTEGER,\n"
         "   [name] TEXT,\n"
         "   PRIMARY KEY ([_item], [_version])\n"
@@ -241,21 +251,28 @@ def test_file_with_reserved_columns(repo, tmpdir):
     db = sqlite_utils.Database(db_path)
     assert db.schema == (
         "CREATE TABLE [commits] (\n"
-        "   [hash] TEXT PRIMARY KEY,\n"
+        "   [id] INTEGER PRIMARY KEY,\n"
+        "   [hash] TEXT,\n"
         "   [commit_at] TEXT\n"
         ");\n"
+        "CREATE UNIQUE INDEX [idx_commits_hash]\n"
+        "    ON [commits] ([hash]);\n"
         "CREATE TABLE [items] (\n"
-        "   [_id] TEXT PRIMARY KEY,\n"
+        "   [_id] INTEGER PRIMARY KEY,\n"
+        "   [_item_hash_id] TEXT,\n"
         "   [_id_] INTEGER,\n"
         "   [_item_] TEXT,\n"
         "   [_version_] TEXT,\n"
         "   [_commit_] TEXT,\n"
-        "   [rowid_] INTEGER\n"
+        "   [rowid_] INTEGER,\n"
+        "   [_commit] INTEGER\n"
         ");\n"
+        "CREATE UNIQUE INDEX [idx_items__item_hash_id]\n"
+        "    ON [items] ([_item_hash_id]);\n"
         "CREATE TABLE [item_versions] (\n"
-        "   [_item] TEXT REFERENCES [items]([_id]),\n"
+        "   [_item] INTEGER REFERENCES [items]([_id]),\n"
         "   [_version] INTEGER,\n"
-        "   [_commit] TEXT REFERENCES [commits]([hash]),\n"
+        "   [_commit] INTEGER REFERENCES [commits]([id]),\n"
         "   [_id_] INTEGER,\n"
         "   [_item_] TEXT,\n"
         "   [_version_] TEXT,\n"
@@ -325,18 +342,25 @@ def test_csv_tsv(repo, tmpdir, file):
     db = sqlite_utils.Database(db_path)
     assert db.schema == (
         "CREATE TABLE [commits] (\n"
-        "   [hash] TEXT PRIMARY KEY,\n"
+        "   [id] INTEGER PRIMARY KEY,\n"
+        "   [hash] TEXT,\n"
         "   [commit_at] TEXT\n"
         ");\n"
+        "CREATE UNIQUE INDEX [idx_commits_hash]\n"
+        "    ON [commits] ([hash]);\n"
         "CREATE TABLE [items] (\n"
-        "   [_id] TEXT PRIMARY KEY,\n"
+        "   [_id] INTEGER PRIMARY KEY,\n"
+        "   [_item_hash_id] TEXT,\n"
         "   [TreeID] TEXT,\n"
-        "   [name] TEXT\n"
+        "   [name] TEXT,\n"
+        "   [_commit] INTEGER\n"
         ");\n"
+        "CREATE UNIQUE INDEX [idx_items__item_hash_id]\n"
+        "    ON [items] ([_item_hash_id]);\n"
         "CREATE TABLE [item_versions] (\n"
-        "   [_item] TEXT REFERENCES [items]([_id]),\n"
+        "   [_item] INTEGER REFERENCES [items]([_id]),\n"
         "   [_version] INTEGER,\n"
-        "   [_commit] TEXT REFERENCES [commits]([hash]),\n"
+        "   [_commit] INTEGER REFERENCES [commits]([id]),\n"
         "   [TreeID] TEXT,\n"
         "   [name] TEXT,\n"
         "   PRIMARY KEY ([_item], [_version])\n"
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 5148c8c..038d8fb 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -9,6 +9,7 @@
         ("_item", "_item_"),
         ("_version", "_version_"),
         ("_commit", "_commit_"),
+        ("_item_hash", "_item_hash_"),
         ("rowid", "rowid_"),
         ("rowid_", "rowid__"),
         ("_id__", "_id___"),

From d254815a98747f2f73831a6bc7c135773bd08ea2 Mon Sep 17 00:00:00 2001
From: Simon Willison <swillison@gmail.com>
Date: Sat, 20 Nov 2021 20:28:58 -0800
Subject: [PATCH 2/4] Rename _item_hash to _item_id, refs #20

---
 git_history/cli.py        | 32 +++++++++++++++-----------------
 git_history/utils.py      |  2 +-
 tests/test_git_history.py | 18 +++++++++---------
 tests/test_utils.py       |  2 +-
 4 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/git_history/cli.py b/git_history/cli.py
index 232e3a3..0356e3d 100644
--- a/git_history/cli.py
+++ b/git_history/cli.py
@@ -140,8 +140,8 @@ def file(
     resolved_filepath = str(Path(filepath).resolve())
     resolved_repo = str(Path(repo).resolve())
     db = sqlite_utils.Database(database)
-    item_hash_id_versions = {}
-    item_hash_id_last_hash = {}
+    item_id_versions = {}
+    item_id_last_hash = {}
     for git_commit_at, git_hash, content in iterate_file_versions(
         resolved_repo,
         resolved_filepath,
@@ -196,12 +196,12 @@ def file(
                     )
                 )
             # Also ensure there are not TWO items in this commit with the same ID
-            item_hash_ids_in_this_commit = set()
+            item_ids_in_this_commit = set()
             # Which of these are new versions of things we have seen before
             for item in items:
                 item = fix_reserved_columns(item)
-                item_hash_id = _hash(dict((id, item.get(id)) for id in fixed_ids))
-                if item_hash_id in item_hash_ids_in_this_commit:
+                item_id = _hash(dict((id, item.get(id)) for id in fixed_ids))
+                if item_id in item_ids_in_this_commit:
                     if not ignore_duplicate_ids:
                         raise click.ClickException(
                             "Commit: {} - found multiple items with the same ID:\n{}".format(
@@ -213,7 +213,7 @@ def file(
                                         if _hash(
                                             dict((id, item.get(id)) for id in fixed_ids)
                                         )
-                                        == item_hash
+                                        == item_id
                                     ][:5],
                                     indent=4,
                                     default=str,
@@ -222,24 +222,22 @@ def file(
                         )
                     else:
                         continue
-                item_hash_ids_in_this_commit.add(item_hash_id)
+                item_ids_in_this_commit.add(item_id)
 
                 # Has it changed since last time we saw it?
-                item_hash = _hash(item)
-                if item_hash_id_last_hash.get(item_hash_id) != item_hash:
+                item_full_hash = _hash(item)
+                if item_id_last_hash.get(item_id) != item_full_hash:
                     # It's either new or the content has changed - so insert it
-                    item_hash_id_last_hash[item_hash_id] = item_hash
-                    version = item_hash_id_versions.get(item_hash_id, 0) + 1
-                    item_hash_id_versions[item_hash_id] = version
+                    item_id_last_hash[item_id] = item_full_hash
+                    version = item_id_versions.get(item_id, 0) + 1
+                    item_id_versions[item_id] = version
 
                     # Add or fetch item
-                    item_to_insert = dict(
-                        item, _item_hash_id=item_hash_id, _commit=commit_id
-                    )
+                    item_to_insert = dict(item, _item_id=item_id, _commit=commit_id)
                     item_id = db["items"].lookup(
-                        {"_item_hash_id": item_hash_id},
+                        {"_item_id": item_id},
                         item_to_insert,
-                        column_order=("_id", "_item_hash_id"),
+                        column_order=("_id", "_item_id"),
                         pk="_id",
                     )
                     db["item_versions"].insert(
diff --git a/git_history/utils.py b/git_history/utils.py
index 071deb2..733f25c 100644
--- a/git_history/utils.py
+++ b/git_history/utils.py
@@ -1,6 +1,6 @@
 import re
 
-RESERVED = ("_id", "_item", "_version", "_commit", "_item_hash", "rowid")
+RESERVED = ("_id", "_item", "_version", "_commit", "_item_id", "rowid")
 reserved_with_suffix_re = re.compile("^({})_*$".format("|".join(RESERVED)))
 
 
diff --git a/tests/test_git_history.py b/tests/test_git_history.py
index 1dc6711..716f58e 100644
--- a/tests/test_git_history.py
+++ b/tests/test_git_history.py
@@ -201,13 +201,13 @@ def test_file_with_id(repo, tmpdir):
         "    ON [commits] ([hash]);\n"
         "CREATE TABLE [items] (\n"
         "   [_id] INTEGER PRIMARY KEY,\n"
-        "   [_item_hash_id] TEXT,\n"
+        "   [_item_id] TEXT,\n"
         "   [item_id] INTEGER,\n"
         "   [name] TEXT,\n"
         "   [_commit] INTEGER\n"
         ");\n"
-        "CREATE UNIQUE INDEX [idx_items__item_hash_id]\n"
-        "    ON [items] ([_item_hash_id]);\n"
+        "CREATE UNIQUE INDEX [idx_items__item_id]\n"
+        "    ON [items] ([_item_id]);\n"
         "CREATE TABLE [item_versions] (\n"
         "   [_item] INTEGER REFERENCES [items]([_id]),\n"
         "   [_version] INTEGER,\n"
@@ -259,7 +259,7 @@ def test_file_with_reserved_columns(repo, tmpdir):
         "    ON [commits] ([hash]);\n"
         "CREATE TABLE [items] (\n"
         "   [_id] INTEGER PRIMARY KEY,\n"
-        "   [_item_hash_id] TEXT,\n"
+        "   [_item_id] TEXT,\n"
         "   [_id_] INTEGER,\n"
         "   [_item_] TEXT,\n"
         "   [_version_] TEXT,\n"
@@ -267,8 +267,8 @@ def test_file_with_reserved_columns(repo, tmpdir):
         "   [rowid_] INTEGER,\n"
         "   [_commit] INTEGER\n"
         ");\n"
-        "CREATE UNIQUE INDEX [idx_items__item_hash_id]\n"
-        "    ON [items] ([_item_hash_id]);\n"
+        "CREATE UNIQUE INDEX [idx_items__item_id]\n"
+        "    ON [items] ([_item_id]);\n"
         "CREATE TABLE [item_versions] (\n"
         "   [_item] INTEGER REFERENCES [items]([_id]),\n"
         "   [_version] INTEGER,\n"
@@ -350,13 +350,13 @@ def test_csv_tsv(repo, tmpdir, file):
         "    ON [commits] ([hash]);\n"
         "CREATE TABLE [items] (\n"
         "   [_id] INTEGER PRIMARY KEY,\n"
-        "   [_item_hash_id] TEXT,\n"
+        "   [_item_id] TEXT,\n"
         "   [TreeID] TEXT,\n"
         "   [name] TEXT,\n"
         "   [_commit] INTEGER\n"
         ");\n"
-        "CREATE UNIQUE INDEX [idx_items__item_hash_id]\n"
-        "    ON [items] ([_item_hash_id]);\n"
+        "CREATE UNIQUE INDEX [idx_items__item_id]\n"
+        "    ON [items] ([_item_id]);\n"
         "CREATE TABLE [item_versions] (\n"
         "   [_item] INTEGER REFERENCES [items]([_id]),\n"
         "   [_version] INTEGER,\n"
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 038d8fb..fbcf245 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -9,7 +9,7 @@
         ("_item", "_item_"),
         ("_version", "_version_"),
         ("_commit", "_commit_"),
-        ("_item_hash", "_item_hash_"),
+        ("_item_id", "_item_id_"),
         ("rowid", "rowid_"),
         ("rowid_", "rowid__"),
         ("_id__", "_id___"),

From f7bddd539e1fc22d2968a70657b0341194b6c48e Mon Sep 17 00:00:00 2001
From: Simon Willison <swillison@gmail.com>
Date: Sat, 20 Nov 2021 20:43:27 -0800
Subject: [PATCH 3/4] Depend on sqlite-utils 3.19, refs #20

Using new feature from https://github.com/simonw/sqlite-utils/issues/342
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9fa5fd7..11b923b 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,7 @@ def get_long_description():
         [console_scripts]
         git-history=git_history.cli:cli
     """,
-    install_requires=["click", "GitPython", "sqlite-utils>=3.19a0"],
+    install_requires=["click", "GitPython", "sqlite-utils>=3.19"],
     extras_require={"test": ["pytest"]},
     python_requires=">=3.6",
 )

From f419e2a02f3805a4f290c5c3dfd5c56793aa744a Mon Sep 17 00:00:00 2001
From: Simon Willison <swillison@gmail.com>
Date: Sat, 20 Nov 2021 21:00:37 -0800
Subject: [PATCH 4/4] Include table schemas in README, refs #20

---
 README.md | 95 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 74 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index a0030a2..2ed7a8b 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,22 @@ The `file` command analyzes the history of an individual file within the reposit
 
 The file is assumed to contain multiple objects - for example, the results of scraping an electricity outage map or a CSV file full of records.
 
-Assuming you have a file called `incidents.json` that is a JSON array of objects, with multiple versions of that file recorded in a repository.
+Assume you have a file called `incidents.json` that is a JSON array of objects, with multiple versions of that file recorded in a repository. Each version of that file might look something like this:
+
+```json
+[
+    {
+        "IncidentID": "abc123",
+        "Location": "Corner of 4th and Vermont",
+        "Type": "fire"
+    },
+    {
+        "IncidentID": "cde448",
+        "Location": "555 West Example Drive",
+        "Type": "medical"
+    }
+]
+```
 
 Change directory into the GitHub repository in question and run the following:
 
@@ -32,6 +47,24 @@ This will create a new SQLite database in the `incidents.db` file with two table
 - `commits` containing a row for every commit, with a `hash` column and the `commit_at` date.
 - `items` containing a row for every item in every version of the `filename.json` file - with an extra `_commit` column that is a foreign key back to the `commits` table.
 
+The database schema for this example will look like this:
+
+```sql
+CREATE TABLE [commits] (
+   [id] INTEGER PRIMARY KEY,
+   [hash] TEXT,
+   [commit_at] TEXT
+);
+CREATE UNIQUE INDEX [idx_commits_hash]
+    ON [commits] ([hash]);
+CREATE TABLE [items] (
+   [IncidentID] TEXT,
+   [Location] TEXT,
+   [Type] TEXT,
+   [_commit] INTEGER REFERENCES [commits]([id])
+);
+```
+
 If you have 10 historic versions of the `incidents.json` file and each one contains 30 incidents, you will end up with 10 * 30 = 300 rows in your `items` table.
 
 ### De-duplicating items using IDs
@@ -44,13 +77,48 @@ If there is a unique identifier column called `IncidentID` you could run the fol
 
 This will create three tables - `commits`, `items` and `item_versions`.
 
-The `items` table will contain just the most recent version of each row, de-duplicated by ID.
+This time the schema will look like this:
+
+```sql
+CREATE TABLE [commits] (
+   [id] INTEGER PRIMARY KEY,
+   [hash] TEXT,
+   [commit_at] TEXT
+);
+CREATE UNIQUE INDEX [idx_commits_hash]
+    ON [commits] ([hash]);
+CREATE TABLE [items] (
+   [_id] INTEGER PRIMARY KEY,
+   [_item_id] TEXT,
+   [IncidentID] TEXT,
+   [Location] TEXT,
+   [Type] TEXT,
+   [_commit] INTEGER
+);
+CREATE UNIQUE INDEX [idx_items__item_id]
+    ON [items] ([_item_id]);
+CREATE TABLE [item_versions] (
+   [_item] INTEGER REFERENCES [items]([_id]),
+   [_version] INTEGER,
+   [_commit] INTEGER REFERENCES [commits]([id]),
+   [IncidentID] TEXT,
+   [Location] TEXT,
+   [Type] TEXT,
+   PRIMARY KEY ([_item], [_version])
+);
+```
+
+The `items` table will contain the most recent version of each row, de-duplicated by ID, plus the following additional columns:
+
+- `_id` - an integer primary key, used as a foreign key from the `item_versions` table.
+- `_item_id` - a hash of the values of the columns specified using the `--id` option to the command. This is used for de-duplication when processing new versions.
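+- `_commit` - the integer ID of a row in the `commits` table. Unlike `item_versions._commit`, the schema shown above does not declare this column as a foreign key constraint.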
 
 The `item_versions` table will contain a row for each captured differing version of that item, plus the following columns:
 
-- `_item` as a foreign key to the `items` table
-- `_commit` as a foreign key to the `commits` table
-- `_version` as the numeric version number, starting at 1 and incrementing for each captured version
+- `_item` - a foreign key to the `items` table.
+- `_version` - the numeric version number, starting at 1 and incrementing for each captured version.
+- `_commit` - a foreign key to the `commits` table.
 
 If you have already imported history, the command will skip any commits that it has seen already and just process new ones. This means that even though an initial import could be slow subsequent imports should run a lot faster.
 
@@ -80,22 +148,7 @@ If the data in your repository is a CSV or TSV file you can process it by adding
 
 If your data is not already either CSV/TSV or a flat JSON array, you can reshape it using the `--convert` option.
 
-The format needed by this tool is an array of dictionaries that looks like this:
-
-```json
-[
-    {
-        "id": "552",
-        "name": "Hawthorne Fire",
-        "engines": 3
-    },
-    {
-        "id": "556",
-        "name": "Merlin Fire",
-        "engines": 1
-    }
-]
-```
+The format needed by this tool is an array of dictionaries, as demonstrated by the `incidents.json` example above.
 
 If your data does not fit this shape, you can provide a snippet of Python code to converts the on-disk content of each stored file into a Python list of dictionaries.