From 8d3ebd996c18f3963b189dda56ef2506a50ad0fd Mon Sep 17 00:00:00 2001 From: Sean Coates Date: Fri, 31 Dec 2021 15:46:11 -0500 Subject: [PATCH 1/2] Use GitPython's tree[path] notation to get the blob. This bypasses the behaviour of a sometimes-empty commit.tree.blobs and avoids the filtered comprehension. --- git_history/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/git_history/cli.py b/git_history/cli.py index f3a4c40..41a88d4 100644 --- a/git_history/cli.py +++ b/git_history/cli.py @@ -27,7 +27,7 @@ def iterate_file_versions( if progress_bar: progress_bar.update(1) try: - blob = [b for b in commit.tree.blobs if b.name == relative_path][0] + blob = commit.tree[relative_path] yield commit.committed_datetime, commit.hexsha, blob.data_stream.read() except IndexError: # This commit doesn't have a copy of the requested file From 2776b049a288eed1625f48facce4f466ac42fc02 Mon Sep 17 00:00:00 2001 From: Sean Coates Date: Fri, 31 Dec 2021 15:49:08 -0500 Subject: [PATCH 2/2] Add --re-encode to allow importing files that are not in utf-8 (e.g. a file in iso-8859-1/latin-1) --- README.md | 1 + git_history/cli.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 70e35b1..d0b4665 100644 --- a/README.md +++ b/README.md @@ -273,6 +273,7 @@ If you have a column with a name such as `_commit_` it will be renamed too, addi - `--namespace TEXT` - use this if you wish to include the history of multiple different files in the same database. The default is `item` but you can set it to something else, which will produce tables with names like `yournamespace` and `yournamespace_version`. - `--wal` - Enable WAL mode on the created database file. Use this if you plan to run queries against the database while `git-history` is creating it. - `--silent` - don't show the progress bar. +- `--re-encode TEXT` - re-encode the incoming data from this character encoding (e.g. 
`--re-encode=iso-8859-1` to convert from `Latin-1`); you can use `file -I filename` to get the encoding on some distributions. ### CSV and TSV data diff --git a/git_history/cli.py b/git_history/cli.py index 41a88d4..22d63a7 100644 --- a/git_history/cli.py +++ b/git_history/cli.py @@ -122,6 +122,11 @@ def cli(): is_flag=True, help="Don't show progress bar", ) +@click.option( + "--re-encode", + type=str, + help="Re-encodes incoming data from this charset (e.g. --re-encode=iso-8859-1)", +) @click.version_option() def file( database, @@ -143,6 +148,7 @@ def file( wal, debug, silent, + re_encode, ): "Analyze the history of a specific file and write it to SQLite" if csv_ and convert: @@ -231,7 +237,10 @@ def column_id(column): # list() to resolve generators for repeated access later try: - items = list(convert_function(content)) + if re_encode: + items = list(convert_function(content.decode(re_encode).encode("utf-8"))) + else: + items = list(convert_function(content)) except Exception: print("\nError in commit: {}".format(git_hash)) raise