Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Underscore prefix for reserved columns #15

Merged
merged 3 commits into from
Nov 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ Change directory into the GitHub repository in question and run the following:
This will create a new SQLite database in the `incidents.db` file with two tables:

- `commits` containing a row for every commit, with a `hash` column and the `commit_at` date.
- `items` containing a row for every item in every version of the `filename.json` file - with an extra `commit` column that is a foreign key back to the `commits` table.
- `items` containing a row for every item in every version of the `filename.json` file - with an extra `_commit` column that is a foreign key back to the `commits` table.

If you have 10 historic versions of the `incidents.json` file and each one contains 30 incidents, you will end up with 10 * 30 = 300 rows in your `items` table.

Expand All @@ -48,9 +48,9 @@ The `items` table will contain just the most recent version of each row, de-dupl

The `item_versions` table will contain a row for each captured differing version of that item, plus the following columns:

- `item` as a foreign key to the `items` table
- `commit` as a foreign key to the `commits` table
- `version` as the numeric version number, starting at 1 and incrementing for each captured version
- `_item` as a foreign key to the `items` table
- `_commit` as a foreign key to the `commits` table
- `_version` as the numeric version number, starting at 1 and incrementing for each captured version

If you have already imported history, the command will skip any commits that it has seen already and just process new ones. This means that even though an initial import could be slow subsequent imports should run a lot faster.

Expand All @@ -66,9 +66,9 @@ Additional options:
- `--ignore-duplicate-ids` - if a single version of a file has the same ID in it more than once, the tool will exit with an error. Use this option to ignore this and instead pick just the first of the two duplicates.
- `--silent` - don't show the progress bar.

Note that `id`, `item`, `version`, `commit` and `rowid` are reserved column names that are used by this tool. If your data contains any of these they will be renamed to `id_`, `item_`, `version_`, `commit_` or `rowid_` to avoid clashing with the reserved columns.
Note that `_id`, `_item`, `_version`, `_commit` and `rowid` are reserved column names that are used by this tool. If your data contains any of these they will be renamed to `_id_`, `_item_`, `_version_`, `_commit_` or `rowid_` to avoid clashing with the reserved columns.

There is one exception: if you have an `id` column and use `--id id` without specifying more than one ID column, your `id` column will be used as the item ID but will not be renamed.
If you have a column with a name such as `_commit_` it will be renamed too, adding an additional trailing underscore, so `_commit_` becomes `_commit__` and `_commit__` becomes `_commit___`.

### CSV and TSV data

Expand Down
54 changes: 16 additions & 38 deletions git_history/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import sqlite_utils
import textwrap
from pathlib import Path
from .utils import fix_reserved_columns


def iterate_file_versions(
Expand Down Expand Up @@ -181,17 +182,13 @@ def file(

# If --id is specified, do things a bit differently
if ids:
# If '--id id' is only option, 'id' is not a reserved column
id_is_reserved = list(ids) != ["id"]
# Any ids that are reserved columns must be renamed
fixed_ids = set(
fix_reserved_columns(
{id: 1 for id in ids},
allow_id=not id_is_reserved,
allow_banned=True,
).keys()
)
items_insert_extra_kwargs["pk"] = "id"
items_insert_extra_kwargs["pk"] = "_id"
# Check all items have those columns
_ids_set = set(ids)
bad_items = [
Expand All @@ -209,7 +206,7 @@ def file(
items_insert_extra_kwargs["replace"] = True
# Which of these are new versions of things we have seen before
for item in items:
item = fix_reserved_columns(item, allow_id=not id_is_reserved)
item = fix_reserved_columns(item)
item_id = _hash(dict((id, item.get(id)) for id in fixed_ids))
if item_id in item_ids_in_this_version:
if not ignore_duplicate_ids:
Expand Down Expand Up @@ -240,9 +237,9 @@ def file(
id_last_hash[item_id] = item_hash
version = id_versions.get(item_id, 0) + 1
id_versions[item_id] = version
items_to_add.append(dict(item, id=item_id))
items_to_add.append(dict(item, _id=item_id))
versions.append(
dict(item, item=item_id, version=version, commit=git_hash)
dict(item, _item=item_id, _version=version, _commit=git_hash)
)

# Only add the items that had no new version
Expand All @@ -252,15 +249,17 @@ def file(
# not ids - so just check them for banned columns and add the item["commit"]
for item in items:
item = fix_reserved_columns(item)
item["commit"] = git_hash
item["_commit"] = git_hash
# In this case item table needs a foreign key on 'commit'
items_insert_extra_kwargs["foreign_keys"] = (("commit", "commits", "hash"),)
items_insert_extra_kwargs["foreign_keys"] = (
("_commit", "commits", "hash"),
)

# insert items
if items:
db["items"].insert_all(
items,
column_order=("id",),
column_order=("_id",),
alter=True,
**items_insert_extra_kwargs,
)
Expand All @@ -269,11 +268,14 @@ def file(
if versions:
db["item_versions"].insert_all(
versions,
pk=("item", "version"),
pk=("_item", "_version"),
alter=True,
replace=True,
column_order=("item", "version", "commit"),
foreign_keys=(("item", "items", "id"), ("commit", "commits", "hash")),
column_order=("_item", "_version", "_commit"),
foreign_keys=(
("_item", "items", "_id"),
("_commit", "commits", "hash"),
),
)


Expand All @@ -283,27 +285,3 @@ def _hash(record):
"utf8"
)
).hexdigest()


def fix_reserved_columns(item, allow_id=False, allow_banned=False):
    """Rename keys of ``item`` that clash with this tool's reserved columns.

    Reserved keys ("item", "version", "commit", "rowid", plus "id" unless
    ``allow_id`` is true) get a trailing underscore appended. The
    underscore-suffixed names themselves ("id_", "item_", "version_",
    "commit_") are banned in the input, because a rename could collide
    with them.

    Args:
        item: mapping of column name to value.
        allow_id: when true, an "id" key is left alone.
        allow_banned: when true, skip the banned-column check.

    Returns:
        ``item`` itself (the same object) if it has no reserved keys,
        otherwise a new dict with the reserved keys renamed and the
        original key order preserved.

    Raises:
        click.ClickException: if ``allow_banned`` is false and ``item``
            contains one of the banned column names.
    """
    banned = {"id_", "item_", "version_", "commit_"}
    reserved = {"item", "version", "commit", "rowid"}
    if not allow_id:
        reserved |= {"id"}

    if not allow_banned:
        offending = sorted(key for key in item if key in banned)
        if offending:
            raise click.ClickException(
                "Column {} is one of these banned columns: {}\n{}".format(
                    offending,
                    sorted(banned),
                    json.dumps(item, indent=4, default=str),
                )
            )

    # Nothing to rename: hand back the caller's own object untouched.
    if reserved.isdisjoint(item):
        return item

    return {
        (key + "_" if key in reserved else key): item[key]
        for key in item
    }
19 changes: 19 additions & 0 deletions git_history/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import re

# Column names this tool creates itself; incoming data must not use them.
RESERVED = ("_id", "_item", "_version", "_commit", "rowid")
# Matches a reserved name followed by zero or more underscores. The pattern is
# anchored with \Z rather than $ because $ also matches just before a trailing
# newline, which would wrongly classify a key such as "_id\n" as reserved.
reserved_with_suffix_re = re.compile(r"^({})_*\Z".format("|".join(RESERVED)))


def fix_reserved_columns(item):
    """Rename keys of ``item`` that clash with reserved column names.

    Every key that is a reserved name, or a reserved name followed by any
    number of underscores, gains one extra trailing underscore. Because the
    whole family shifts by one ("_id" -> "_id_", "_id_" -> "_id__"), the
    renamed keys cannot collide with each other.

    Args:
        item: mapping of column name to value.

    Returns:
        ``item`` itself (the same object, not a copy) when no key needs
        renaming; otherwise a new dict with the affected keys renamed and
        the original key order preserved.
    """
    if not any(reserved_with_suffix_re.match(key) for key in item):
        return item

    return {_fix_key(key): item[key] for key in item}


def _fix_key(key):
    """Return ``key`` with a trailing "_" added if it is reserved.

    A key counts as reserved when it is one of ``RESERVED`` optionally
    followed by underscores; any other key is returned unchanged.
    """
    if reserved_with_suffix_re.match(key):
        return key + "_"
    return key
Loading