Use integer primary keys for smaller tables #20

Merged: 4 commits, Nov 21, 2021

95 changes: 74 additions & 21 deletions README.md
@@ -21,7 +21,22 @@ The `file` command analyzes the history of an individual file within the repository

The file is assumed to contain multiple objects - for example, the results of scraping an electricity outage map or a CSV file full of records.

Assuming you have a file called `incidents.json` that is a JSON array of objects, with multiple versions of that file recorded in a repository.
Assuming you have a file called `incidents.json` that is a JSON array of objects, with multiple versions of that file recorded in a repository. Each version of that file might look something like this:

```json
[
{
"IncidentID": "abc123",
"Location": "Corner of 4th and Vermont",
"Type": "fire"
},
{
"IncidentID": "cde448",
"Location": "555 West Example Drive",
"Type": "medical"
}
]
```

Change directory into the GitHub repository in question and run the following:
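
A sketch of that command, assuming the `git-history file` entry point registered in `setup.py` and implemented in `cli.py` below, and the `incidents.db` database name used in the rest of this example:

```bash
# Build (or update) incidents.db from every recorded version of
# incidents.json in the current repository's history
git-history file incidents.db incidents.json
```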

@@ -32,6 +47,24 @@ This will create a new SQLite database in the `incidents.db` file with two tables:
- `commits` containing a row for every commit, with a `hash` column and the `commit_at` date.
- `items` containing a row for every item in every version of the `filename.json` file - with an extra `_commit` column that is a foreign key back to the `commits` table.

The database schema for this example will look like this:

```sql
CREATE TABLE [commits] (
[id] INTEGER PRIMARY KEY,
[hash] TEXT,
[commit_at] TEXT
);
CREATE UNIQUE INDEX [idx_commits_hash]
ON [commits] ([hash]);
CREATE TABLE [items] (
[IncidentID] TEXT,
[Location] TEXT,
[Type] TEXT,
[_commit] INTEGER REFERENCES [commits]([id])
);
```

If you have 10 historic versions of the `incidents.json` file and each one contains 30 incidents, you will end up with 10 * 30 = 300 rows in your `items` table.
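
Because each row in `items` carries a `_commit` foreign key, a query along these lines (a sketch against the schema shown above) reports how many items each captured version of the file contained:

```sql
-- Number of items captured per commit, newest first
select commits.commit_at, count(*) as item_count
from items
join commits on commits.id = items._commit
group by commits.id, commits.commit_at
order by commits.commit_at desc;
```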

### De-duplicating items using IDs
@@ -44,13 +77,48 @@ If there is a unique identifier column called `IncidentID` you could run the following:
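
That command is presumably the same `git-history file` invocation as above with the `--id` option added (the option the `cli.py` changes below handle); a sketch:

```bash
# De-duplicate items on their IncidentID column while importing
git-history file incidents.db incidents.json --id IncidentID
```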

This will create three tables - `commits`, `items` and `item_versions`.

The `items` table will contain just the most recent version of each row, de-duplicated by ID.
This time the schema will look like this:

```sql
CREATE TABLE [commits] (
[id] INTEGER PRIMARY KEY,
[hash] TEXT,
[commit_at] TEXT
);
CREATE UNIQUE INDEX [idx_commits_hash]
ON [commits] ([hash]);
CREATE TABLE [items] (
[_id] INTEGER PRIMARY KEY,
[_item_id] TEXT,
[IncidentID] TEXT,
[Location] TEXT,
[Type] TEXT,
[_commit] INTEGER
);
CREATE UNIQUE INDEX [idx_items__item_id]
ON [items] ([_item_id]);
CREATE TABLE [item_versions] (
[_item] INTEGER REFERENCES [items]([_id]),
[_version] INTEGER,
[_commit] INTEGER REFERENCES [commits]([id]),
[IncidentID] TEXT,
[Location] TEXT,
[Type] TEXT,
PRIMARY KEY ([_item], [_version])
);
```

The `items` table will contain the most recent version of each row, de-duplicated by ID, plus the following additional columns:

- `_id` - an integer primary key, referenced as a foreign key by the `item_versions` table.
- `_item_id` - a hash of the values of the columns specified using the `--id` option to the command. This is used for de-duplication when processing new versions.
- `_commit` - a foreign key to the `commits` table.

The `item_versions` table will contain a row for each distinct captured version of each item, plus the following columns (see the example query after this list):

- `_item` as a foreign key to the `items` table
- `_commit` as a foreign key to the `commits` table
- `_version` as the numeric version number, starting at 1 and incrementing for each captured version
- `_item` - a foreign key to the `items` table.
- `_version` - the numeric version number, starting at 1 and incrementing for each captured version.
- `_commit` - a foreign key to the `commits` table.
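
As an example of how the three tables fit together, a query along these lines (a sketch against the schema above, using the `abc123` IncidentID from the earlier sample data) lists every captured version of a single incident alongside the commit it came from:

```sql
-- Every captured version of one incident, oldest first
select
    item_versions._version,
    commits.commit_at,
    item_versions.Location,
    item_versions.Type
from item_versions
join items on items._id = item_versions._item
join commits on commits.id = item_versions._commit
where items.IncidentID = 'abc123'
order by item_versions._version;
```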

If you have already imported history, the command will skip any commits that it has seen already and just process new ones. This means that even though an initial import could be slow, subsequent imports should run a lot faster.

@@ -80,22 +148,7 @@ If the data in your repository is a CSV or TSV file you can process it by adding

If your data is not already either CSV/TSV or a flat JSON array, you can reshape it using the `--convert` option.

The format needed by this tool is an array of dictionaries that looks like this:

```json
[
{
"id": "552",
"name": "Hawthorne Fire",
"engines": 3
},
{
"id": "556",
"name": "Merlin Fire",
"engines": 1
}
]
```
The format needed by this tool is an array of dictionaries, as demonstrated by the `incidents.json` example above.

If your data does not fit this shape, you can provide a snippet of Python code that converts the on-disk content of each stored file into a Python list of dictionaries.
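
A minimal sketch of the logic such a snippet needs to implement, written here as a standalone function. The real `--convert` snippet is just the function body and receives the raw file bytes as a variable named `content` (as the generated CSV-handling code in `cli.py` below suggests); the top-level `"incidents"` key here is hypothetical:

```python
import json


def convert(content):
    """Reshape one stored version of the file into a flat list of dicts."""
    data = json.loads(content)
    # Unwrap the (hypothetical) wrapper key so each record becomes one row
    return data["incidents"]
```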

95 changes: 40 additions & 55 deletions git_history/cli.py
@@ -113,7 +113,7 @@ def file(
decoded = content.decode("utf-8")
dialect = csv.Sniffer().sniff(decoded[:512])
reader = csv.DictReader(io.StringIO(decoded), dialect=dialect)
return list(reader)
return reader
"""
)
imports = ["io", "csv"]
@@ -140,9 +140,8 @@ def file(
resolved_filepath = str(Path(filepath).resolve())
resolved_repo = str(Path(repo).resolve())
db = sqlite_utils.Database(database)
seen_hashes = set()
id_versions = {}
id_last_hash = {}
item_id_versions = {}
item_id_last_hash = {}
for git_commit_at, git_hash, content in iterate_file_versions(
resolved_repo,
resolved_filepath,
@@ -154,13 +153,10 @@
else set(),
show_progress=not silent,
):
if git_hash not in seen_hashes:
seen_hashes.add(git_hash)
db["commits"].insert(
{"hash": git_hash, "commit_at": git_commit_at.isoformat()},
pk="hash",
replace=True,
)
commit_id = db["commits"].lookup(
{"hash": git_hash},
{"commit_at": git_commit_at.isoformat()},
)
if not content.strip():
# Skip empty JSON files
continue
@@ -178,7 +174,6 @@
new_items.append(new_item)
items = new_items

items_insert_extra_kwargs = {}
versions = []

# If --id is specified, do things a bit differently
@@ -189,7 +184,6 @@
{id: 1 for id in ids},
).keys()
)
items_insert_extra_kwargs["pk"] = "_id"
# Check all items have those columns
_ids_set = set(ids)
bad_items = [
@@ -201,15 +195,13 @@
git_hash, json.dumps(bad_items[:5], indent=4, default=str)
)
)
# Also ensure there are not TWO items in this file with the same ID
item_ids_in_this_version = set()
items_to_add = []
items_insert_extra_kwargs["replace"] = True
# Also ensure there are not TWO items in this commit with the same ID
item_ids_in_this_commit = set()
# Which of these are new versions of things we have seen before
for item in items:
item = fix_reserved_columns(item)
item_id = _hash(dict((id, item.get(id)) for id in fixed_ids))
if item_id in item_ids_in_this_version:
if item_id in item_ids_in_this_commit:
if not ignore_duplicate_ids:
raise click.ClickException(
"Commit: {} - found multiple items with the same ID:\n{}".format(
@@ -230,53 +222,46 @@
)
else:
continue
item_ids_in_this_version.add(item_id)
# Has it changed since last time we saw it?
item_hash = _hash(item)
if id_last_hash.get(item_id) != item_hash:
# It's either new or the content has changed
id_last_hash[item_id] = item_hash
version = id_versions.get(item_id, 0) + 1
id_versions[item_id] = version
items_to_add.append(dict(item, _id=item_id))
versions.append(
dict(item, _item=item_id, _version=version, _commit=git_hash)
)
item_ids_in_this_commit.add(item_id)

# Only add the items that had no new version
items = items_to_add
# Has it changed since last time we saw it?
item_full_hash = _hash(item)
if item_id_last_hash.get(item_id) != item_full_hash:
# It's either new or the content has changed - so insert it
item_id_last_hash[item_id] = item_full_hash
version = item_id_versions.get(item_id, 0) + 1
item_id_versions[item_id] = version

# Add or fetch item
item_to_insert = dict(item, _item_id=item_id, _commit=commit_id)
item_id = db["items"].lookup(
{"_item_id": item_id},
item_to_insert,
column_order=("_id", "_item_id"),
pk="_id",
)
db["item_versions"].insert(
dict(item, _item=item_id, _version=version, _commit=commit_id),
pk=("_item", "_version"),
alter=True,
replace=True,
column_order=("_item", "_version", "_commit"),
foreign_keys=(
("_item", "items", "_id"),
("_commit", "commits", "id"),
),
)
else:
# not ids - so just check them for banned columns and add the item["commit"]
# no --id - so just correct for reserved columns and add item["_commit"]
for item in items:
item = fix_reserved_columns(item)
item["_commit"] = git_hash
item["_commit"] = commit_id
# In this case item table needs a foreign key on 'commit'
items_insert_extra_kwargs["foreign_keys"] = (
("_commit", "commits", "hash"),
)

# insert items
if items:
db["items"].insert_all(
items,
column_order=("_id",),
alter=True,
**items_insert_extra_kwargs,
)

# insert versions
if versions:
db["item_versions"].insert_all(
versions,
pk=("_item", "_version"),
alter=True,
replace=True,
column_order=("_item", "_version", "_commit"),
foreign_keys=(
("_item", "items", "_id"),
("_commit", "commits", "hash"),
),
foreign_keys=(("_commit", "commits", "id"),),
)


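
The heart of the change above is swapping string commit hashes for integer primary keys obtained from the sqlite-utils `lookup()` method, which is presumably also why `setup.py` below raises the minimum `sqlite-utils` version. A small sketch of what that call does, with illustrative values:

```python
import sqlite_utils

db = sqlite_utils.Database("incidents.db")

# lookup() returns the integer primary key of the row matching the first
# dictionary, creating the row (with the extra columns from the second
# dictionary) if no match exists yet. Re-processing a commit that was
# already recorded therefore just returns its existing id.
commit_id = db["commits"].lookup(
    {"hash": "0123abc"},                         # illustrative commit hash
    {"commit_at": "2021-11-21T00:00:00+00:00"},  # illustrative timestamp
)
print(commit_id)  # e.g. 1
```

This is what produces the `commits` schema shown in the README above: an integer `id` primary key plus a unique index on `hash`.
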
2 changes: 1 addition & 1 deletion git_history/utils.py
@@ -1,6 +1,6 @@
import re

RESERVED = ("_id", "_item", "_version", "_commit", "rowid")
RESERVED = ("_id", "_item", "_version", "_commit", "_item_id", "rowid")
reserved_with_suffix_re = re.compile("^({})_*$".format("|".join(RESERVED)))


2 changes: 1 addition & 1 deletion setup.py
@@ -31,7 +31,7 @@ def get_long_description():
[console_scripts]
git-history=git_history.cli:cli
""",
install_requires=["click", "GitPython", "sqlite-utils>=3.17.1"],
install_requires=["click", "GitPython", "sqlite-utils>=3.19"],
extras_require={"test": ["pytest"]},
python_requires=">=3.6",
)