From 85d15d2d0a6162a172f70e7e98adb552b56563f9 Mon Sep 17 00:00:00 2001 From: Pierre Tardy Date: Mon, 29 Mar 2021 12:15:55 +0200 Subject: [PATCH 1/2] Switch data_source dependency to GitPython While neither pygit2 or GitPython have a formidable API and documentation GitPython has the advantage of being pure python and support same proxy feature as git. Signed-off-by: Pierre Tardy --- etc/nix/flake.nix | 2 +- requirements.txt | 2 +- vulnerabilities/data_source.py | 82 +++++---- vulnerabilities/tests/test_data_source.py | 212 ++++++---------------- 4 files changed, 103 insertions(+), 195 deletions(-) diff --git a/etc/nix/flake.nix b/etc/nix/flake.nix index 10cc4c7cd..1a6504538 100644 --- a/etc/nix/flake.nix +++ b/etc/nix/flake.nix @@ -74,7 +74,7 @@ name = "vulnerablecode-${version}"; src = vulnerablecode-src; dontConfigure = true; # do not use ./configure - propagatedBuildInputs = [ pythonEnv postgresql ]; + propagatedBuildInputs = [ pythonEnv postgresql gitMinimal]; postPatch = '' # Make sure the pycodestyle binary in $PATH is used. diff --git a/requirements.txt b/requirements.txt index a8a1da1d1..d64e488d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,7 +33,7 @@ psycopg2==2.8.4 ptyprocess==0.6.0 py==1.8.0 pycparser==2.20 -pygit2==1.5.0 +gitpython==3.1.14 Pygments==2.7.4 pyparsing==2.4.5 pytest==5.3.2 diff --git a/vulnerabilities/data_source.py b/vulnerabilities/data_source.py index c911518ec..03d3e3b85 100644 --- a/vulnerabilities/data_source.py +++ b/vulnerabilities/data_source.py @@ -39,7 +39,7 @@ from typing import Tuple import xml.etree.ElementTree as ET -import pygit2 +from git import Repo, DiffIndex from packageurl import PackageURL from vulnerabilities.oval_parser import OvalParser @@ -319,35 +319,41 @@ def _collect_file_changes( file_ext: Optional[str], ) -> Tuple[Set[str], Set[str]]: - previous_commit = None added_files, updated_files = set(), set() - for commit in self._repo.walk(self._repo.head.target, pygit2.GIT_SORT_TIME): - commit_time = commit.commit_time + commit.commit_time_offset # convert to UTC - - if commit_time < self.cutoff_timestamp: + # find the most ancient commit we need to diff with + cutoff_commit = None + for commit in self._repo.iter_commits(self._repo.head): + if commit.committed_date < self.cutoff_timestamp: break + cutoff_commit = commit - if previous_commit is None: - previous_commit = commit - continue + if cutoff_commit is None: + return added_files, updated_files - for d in commit.tree.diff_to_tree(previous_commit.tree).deltas: - if not _include_file(d.new_file.path, subdir, recursive, file_ext) or d.is_binary: - continue + def _is_binary(d: DiffIndex): + if not d.b_blob: + return False + try: + d.b_blob.data_stream.read().decode() + except UnicodeDecodeError: + return True + return False + + for d in cutoff_commit.diff(self._repo.head.commit): + if not _include_file(d.b_path, subdir, recursive, file_ext) or _is_binary(d): + continue - abspath = os.path.join(self.config.working_directory, d.new_file.path) - # TODO - # Just filtering on the two status values for "added" and "modified" is too - # simplistic. This does not cover file renames, copies & - # deletions. - if d.status == pygit2.GIT_DELTA_ADDED: + abspath = os.path.join(self.config.working_directory, d.b_path) + if d.new_file: + added_files.add(abspath) + elif d.a_blob and d.b_blob: + if d.a_path != d.b_path: + # consider moved files as added added_files.add(abspath) - elif d.status == pygit2.GIT_DELTA_MODIFIED: + elif d.a_blob != d.b_blob: updated_files.add(abspath) - previous_commit = commit - # Any file that has been added and then updated inside the window of the git history we # looked at, should be considered "added", not "updated", since it does not exist in the # database yet. @@ -364,19 +370,16 @@ def _ensure_working_directory(self) -> None: os.mkdir(self.config.working_directory) def _ensure_repository(self) -> None: - repodir = pygit2.discover_repository(self.config.working_directory) - if repodir is None: + if not os.path.exists(os.path.join(self.config.working_directory, ".git")): self._clone_repository() return - - self._repo = pygit2.Repository(repodir) + self._repo = Repo(self.config.working_directory) if self.config.branch is None: - self.config.branch = self._repo.head.shorthand - branch = self._repo.branches[self.config.branch] - - if not branch.is_checked_out(): - self._repo.checkout(branch) + self.config.branch = str(self._repo.active_branch) + branch = self.config.branch + self._repo.head.reference = self._repo.heads[branch] + self._repo.head.reset(index=True, working_tree=True) remote = self._find_or_add_remote() self._update_from_remote(remote, branch) @@ -384,9 +387,9 @@ def _ensure_repository(self) -> None: def _clone_repository(self) -> None: kwargs = {} if self.config.branch: - kwargs["checkout_branch"] = self.config.branch + kwargs["branch"] = self.config.branch - self._repo = pygit2.clone_repository( + self._repo = Repo.clone_from( self.config.repository_url, self.config.working_directory, **kwargs ) @@ -398,20 +401,19 @@ def _find_or_add_remote(self): break if remote is None: - remote = self._repo.remotes.create( - "added_by_vulnerablecode", self.config.repository_url + remote = self._repo.create_remote( + "added_by_vulnerablecode", url=self.config.repository_url ) return remote def _update_from_remote(self, remote, branch) -> None: - progress = remote.fetch() - if progress.received_objects == 0: + fetch_info = remote.fetch() + if len(fetch_info) == 0: return - - remote_branch = self._repo.branches[f"{remote.name}/{self.config.branch}"] - branch.set_target(remote_branch.target) - self._repo.checkout(branch, strategy=pygit2.GIT_CHECKOUT_FORCE) + branch = self._repo.branches[branch] + branch.set_reference(remote.refs[branch.name]) + self._repo.head.reset(index=True, working_tree=True) def _include_file( diff --git a/vulnerabilities/tests/test_data_source.py b/vulnerabilities/tests/test_data_source.py index 0d1b786b2..9ad19cd93 100644 --- a/vulnerabilities/tests/test_data_source.py +++ b/vulnerabilities/tests/test_data_source.py @@ -29,7 +29,8 @@ from unittest.mock import patch import xml.etree.ElementTree as ET -import pygit2 +import git + import pytest from packageurl import PackageURL @@ -51,8 +52,37 @@ def load_oval_data(): return etrees_of_oval +@pytest.fixture +def clone_url(tmp_path): + git_dir = tmp_path / "git_dir" + repo = git.Repo.init(str(git_dir)) + new_file_path = str(git_dir / "file") + open(new_file_path, "wb").close() + repo.index.add([new_file_path]) + repo.index.commit("Added a new file") + try: + yield str(git_dir) + finally: + shutil.rmtree(git_dir) + + +@pytest.fixture +def clone_url2(tmp_path): + git_dir = tmp_path / "git_dir2" + repo = git.Repo.init(str(git_dir)) + new_file_path = str(git_dir / "file2") + open(new_file_path, "wb").close() + repo.index.add([new_file_path]) + repo.index.commit("Added a new file") + + try: + yield str(git_dir) + finally: + shutil.rmtree(git_dir) + + def mk_ds(**kwargs): - # just for convenience, since this is a manadory parameter we always pass a value + # just for convenience, since this is a mandatory parameter we always pass a value if "repository_url" not in kwargs: kwargs["repository_url"] = "asdf" @@ -105,179 +135,55 @@ def test_GitDataSource_validate_configuration_working_directory_must_exist_when_ mk_ds(working_directory="/does/not/exist", create_working_directory=False) -@patch("os.path.exists", return_value=False) -@patch("shutil.rmtree") -@patch("pygit2.clone_repository") -@patch("os.mkdir") -def test_GitDataSource_contextmgr_working_directory_is_created_and_removed( - mkdir, clone_repository, rmtree, _ -): - - wd = "/some/working/directory" - ds = mk_ds(working_directory=wd, create_working_directory=True, remove_working_directory=True) - - with ds: - assert wd == ds.config.working_directory - assert mkdir.called_with(wd) - - assert clone_repository.called_with("asdf", wd, checkout_branch=ds.config.branch) - assert rmtree.called_with(wd) - - -@patch("shutil.rmtree") -@patch("pygit2.clone_repository") -@patch("tempfile.mkdtemp", return_value="/fake/tempdir") -def test_GitDataSource_contextmgr_calls_mkdtemp_if_working_directory_is_not_set(mkdtemp, *_): - - ds = mk_ds() +def test_GitDataSource_contextmgr_working_directory_is_created_and_removed(tmp_path, clone_url): - with ds: - assert mkdtemp.called - assert ds.config.working_directory == "/fake/tempdir" - - -@patch("os.path.exists", return_value=True) -@patch("pygit2.Repository") -@patch("pygit2.clone_repository") -@patch("pygit2.discover_repository", return_value="/fake/tempdir/.git") -def test_GitDataSource_contextmgr_uses_existing_repository( - discover_repository, - clone_repository, - _, - no_mkdir, - no_rmtree, -): + wd = tmp_path / "working" ds = mk_ds( - working_directory="/fake/tempdir", - create_working_directory=False, - remove_working_directory=False, + working_directory=str(wd), + create_working_directory=True, + remove_working_directory=True, + repository_url=clone_url, ) with ds: - assert discover_repository.called - assert not clone_repository.called - - -@patch("os.path.exists", return_value=True) -@patch("pygit2.discover_repository", return_value="/fake/tempdir/.git") -@patch("pygit2.Repository") -def test_GitDataSource_contextmgr_switches_branch( - Repository, discover_repository, exists, no_mkdir, no_rmtree -): - - mock_repo, mock_remote, mock_remote_origin, mock_remote_other = ( - MagicMock(), - MagicMock(), - MagicMock(), - MagicMock(), - ) - mock_master, mock_custom, mock_remote_custom = MagicMock(), MagicMock(), MagicMock() - - mock_master.is_checked_out = lambda: True - mock_custom.is_checked_out = lambda: False - mock_repo.branches = { - "master": mock_master, - "custom": mock_custom, - "origin/custom": mock_remote_custom, - } - - repository_url = "https://foo/bar/baz.git" - mock_remote.url = repository_url - mock_remote.name = "origin" - mock_repo.remotes = [mock_remote] - Repository.return_value = mock_repo - - ds = mk_ds( - repository_url=repository_url, - working_directory="/fake/tempdir", - create_working_directory=False, - remove_working_directory=False, - branch="custom", - ) + assert str(wd) == ds.config.working_directory + assert (wd / ".git").exists() + assert (wd / "file").exists() - with ds: - mock_repo.checkout.assert_called_with(mock_custom, strategy=pygit2.GIT_CHECKOUT_FORCE) + assert not (wd / ".git").exists() -@patch("os.path.exists", return_value=True) -@patch("pygit2.discover_repository", return_value="/fake/tempdir/.git") -@patch("pygit2.Repository") -def test_GitDataSource_contextmgr_fetches_from_remote_in_already_cloned_repository( - Repository, - discover_repository, - exists, - no_mkdir, - no_rmtree, +@patch("tempfile.mkdtemp") +def test_GitDataSource_contextmgr_calls_mkdtemp_if_working_directory_is_not_set( + mkdtemp, tmp_path, clone_url ): - mock_repo, mock_remote_origin, mock_remote_other = MagicMock(), MagicMock(), MagicMock() - mock_branch, mock_remote_branch = MagicMock(), MagicMock() - - repository_url = "https://foo/bar/baz.git" - mock_remote_origin.url = repository_url - mock_remote_origin.name = "origin" - mock_remote_other.url = "https://some/other/url.git" - mock_repo.remotes = [mock_remote_other, mock_remote_origin] - mock_remote_branch.target = "asdf" - mock_remote_branch.shorthand = "origin/master" - mock_repo.head.shorthand = "master" - mock_repo.branches = {"master": mock_branch, "origin/master": mock_remote_branch} - - Repository.return_value = mock_repo - - ds = mk_ds( - repository_url=repository_url, - working_directory="/fake/tempdir", - create_working_directory=False, - remove_working_directory=False, - ) + mkdtemp.return_value = str(tmp_path / "working") + ds = mk_ds(repository_url=clone_url) with ds: - assert mock_remote_origin.fetch.called - assert not mock_remote_other.fetch.called + assert mkdtemp.called + assert ds.config.working_directory == str(tmp_path / "working") -@patch("os.path.exists", return_value=True) -@patch("pygit2.discover_repository", return_value="/fake/tempdir/.git") -@patch("pygit2.Repository") -def test_GitDataSource_contextmgr_adds_missing_remote_to_already_cloned_repository( - Repository, - discover_repository, - exists, +def test_GitDataSource_contextmgr_uses_existing_repository( + clone_url, + clone_url2, no_mkdir, no_rmtree, ): - mock_repo, mock_remote, mock_branch, mock_remote_branch = ( - MagicMock(), - MagicMock(), - MagicMock(), - MagicMock(), - ) - - repository_url = "https://foo/bar/baz.git" - mock_remote_branch.target = "asdf" - mock_remote_branch.shorthand = "added_by_vulnerablecode/master" - mock_repo.head.shorthand = "master" - mock_repo.branches = { - "master": mock_branch, - "added_by_vulnerablecode/master": mock_remote_branch, - } - - mock_remote.name = "added_by_vulnerablecode" - mock_remote.url = repository_url - mock_repo.remotes = MagicMock() - mock_repo.remotes.create.return_value = mock_remote - Repository.return_value = mock_repo - ds = mk_ds( - repository_url=repository_url, - working_directory="/fake/tempdir", + working_directory=clone_url, + repository_url=clone_url2, create_working_directory=False, remove_working_directory=False, ) with ds: - mock_repo.remotes.create.assert_called_once_with("added_by_vulnerablecode", repository_url) + # also make sure we switch the branch (original do not have file2) + assert os.path.exists(os.path.join(ds.config.working_directory, "file2")) + + assert os.path.exists(ds.config.working_directory) def test__include_file(): @@ -375,7 +281,7 @@ def test_file_changes_include_fixed_advisories(self): # pick a date that includes commit 9889ed0831b4fb4beb7675de361926d2e9a99c20 # ("Fix patched version for RUSTSEC-2020-0008") last_run_date = datetime.datetime( - year=2020, month=3, day=31, hour=19, tzinfo=datetime.timezone.utc + year=2020, month=3, day=31, hour=17, minute=40, tzinfo=datetime.timezone.utc ) ds = self.mk_ds(last_run_date=last_run_date, cutoff_date=None) From 9be7cee9bded3bf772002e0e99360159e0e2f003 Mon Sep 17 00:00:00 2001 From: Pierre Tardy Date: Thu, 15 Apr 2021 21:28:12 +0200 Subject: [PATCH 2/2] use binaryornot instead of handmade function Signed-off-by: Pierre Tardy --- requirements.txt | 1 + vulnerabilities/data_source.py | 12 +++--------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index d64e488d5..0a0414b06 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ asgiref==3.2.7 attrs==19.3.0 backcall==0.1.0 beautifulsoup4==4.7.1 +binaryornot==0.4.4 cached-property==1.5.1 cffi==1.14.0 contextlib2==0.5.5 diff --git a/vulnerabilities/data_source.py b/vulnerabilities/data_source.py index 03d3e3b85..d1e4fd903 100644 --- a/vulnerabilities/data_source.py +++ b/vulnerabilities/data_source.py @@ -27,6 +27,8 @@ import shutil import tempfile import traceback +import xml.etree.ElementTree as ET +from binaryornot.helpers import is_binary_string from datetime import datetime from pathlib import Path from typing import Any @@ -37,8 +39,6 @@ from typing import Optional from typing import Set from typing import Tuple -import xml.etree.ElementTree as ET - from git import Repo, DiffIndex from packageurl import PackageURL @@ -332,13 +332,7 @@ def _collect_file_changes( return added_files, updated_files def _is_binary(d: DiffIndex): - if not d.b_blob: - return False - try: - d.b_blob.data_stream.read().decode() - except UnicodeDecodeError: - return True - return False + return is_binary_string(d.b_blob.data_stream.read(1024)) for d in cutoff_commit.diff(self._repo.head.commit): if not _include_file(d.b_path, subdir, recursive, file_ext) or _is_binary(d):