From 0640abfe9c879ad7051e907f5392f242a8092fec Mon Sep 17 00:00:00 2001 From: Ryan Morshead Date: Fri, 26 May 2023 11:59:25 -0600 Subject: [PATCH 1/5] fix urljoin for arbitrary URL protocols This works around an issue with urllib.parse.urljoin where it only handles relative URLs for protocols contained in urllib.parse.uses_relative. As it happens common protocols used with git, like ssh or git+ssh are not in that list. --- src/poetry/vcs/git/backend.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/poetry/vcs/git/backend.py b/src/poetry/vcs/git/backend.py index 3d58850f468..7a8d29d140d 100644 --- a/src/poetry/vcs/git/backend.py +++ b/src/poetry/vcs/git/backend.py @@ -7,7 +7,7 @@ from pathlib import Path from subprocess import CalledProcessError from typing import TYPE_CHECKING -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse, urlunparse from dulwich import porcelain from dulwich.client import HTTPUnauthorized @@ -348,7 +348,7 @@ def _clone_submodules(cls, repo: Repo) -> None: url_string = url.decode("utf-8") if relative_submodule_regex.search(url_string): - url_string = urljoin(f"{Git.get_remote_url(repo)}/", url_string) + url_string = _urljoin(f"{Git.get_remote_url(repo)}/", url_string) source_root = path_absolute.parent source_root.mkdir(parents=True, exist_ok=True) @@ -456,3 +456,22 @@ def clone( # fallback to legacy git client return cls._clone_legacy(url=url, refspec=refspec, target=target) + + +def _urljoin(base: str, path: str) -> str: + """Allow any URL to be joined with a path + + This works around an issue with urllib.parse.urljoin where it only handles + relative URLs for protocols contained in urllib.parse.uses_relative. As it + happens common protocols used with git, like ssh or git+ssh are not in that + list. + + Thus we need to implement our own version of urljoin that handles all URLs + protocols. This is accomplished by using urlparse and urlunparse to split + the URL into its components, join the path, and then reassemble the URL. + + See: https://github.com/python-poetry/poetry/issues/6499#issuecomment-1564712609 + """ + parsed_base = urlparse(base) + new = parsed_base._replace(path=urljoin(parsed_base.path, path)) + return urlunparse(new) From 52d1d60da6a05b9244daa6131cc11811e6189a49 Mon Sep 17 00:00:00 2001 From: Ryan Morshead Date: Fri, 26 May 2023 13:16:51 -0600 Subject: [PATCH 2/5] fix whitespace --- src/poetry/vcs/git/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/poetry/vcs/git/backend.py b/src/poetry/vcs/git/backend.py index 7a8d29d140d..b0116f65766 100644 --- a/src/poetry/vcs/git/backend.py +++ b/src/poetry/vcs/git/backend.py @@ -460,7 +460,7 @@ def clone( def _urljoin(base: str, path: str) -> str: """Allow any URL to be joined with a path - + This works around an issue with urllib.parse.urljoin where it only handles relative URLs for protocols contained in urllib.parse.uses_relative. As it happens common protocols used with git, like ssh or git+ssh are not in that From f56b347951bbf6860bce88339baf987730109c62 Mon Sep 17 00:00:00 2001 From: Ryan Morshead Date: Fri, 26 May 2023 13:19:59 -0600 Subject: [PATCH 3/5] fix docstring --- src/poetry/vcs/git/backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/poetry/vcs/git/backend.py b/src/poetry/vcs/git/backend.py index b0116f65766..7f303a71a08 100644 --- a/src/poetry/vcs/git/backend.py +++ b/src/poetry/vcs/git/backend.py @@ -459,7 +459,8 @@ def clone( def _urljoin(base: str, path: str) -> str: - """Allow any URL to be joined with a path + """ + Allow any URL to be joined with a path This works around an issue with urllib.parse.urljoin where it only handles relative URLs for protocols contained in urllib.parse.uses_relative. As it From 5f0585f0688c92cc27a9de2211d216efcd3cfcec Mon Sep 17 00:00:00 2001 From: Ryan Morshead Date: Fri, 26 May 2023 13:33:53 -0600 Subject: [PATCH 4/5] run ruff --- src/poetry/vcs/git/backend.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/poetry/vcs/git/backend.py b/src/poetry/vcs/git/backend.py index 7f303a71a08..a3aa0509280 100644 --- a/src/poetry/vcs/git/backend.py +++ b/src/poetry/vcs/git/backend.py @@ -7,7 +7,9 @@ from pathlib import Path from subprocess import CalledProcessError from typing import TYPE_CHECKING -from urllib.parse import urljoin, urlparse, urlunparse +from urllib.parse import urljoin +from urllib.parse import urlparse +from urllib.parse import urlunparse from dulwich import porcelain from dulwich.client import HTTPUnauthorized From b215d826012a02cbcc75d4e9be8ddd09f8f2015e Mon Sep 17 00:00:00 2001 From: Ryan Morshead Date: Tue, 30 May 2023 15:02:50 -0600 Subject: [PATCH 5/5] refactor for newly added tests --- src/poetry/vcs/git/backend.py | 105 ++++++++++++++---------- tests/integration/test_utils_vcs_git.py | 31 +++++++ 2 files changed, 94 insertions(+), 42 deletions(-) diff --git a/src/poetry/vcs/git/backend.py b/src/poetry/vcs/git/backend.py index a3aa0509280..e84588c0995 100644 --- a/src/poetry/vcs/git/backend.py +++ b/src/poetry/vcs/git/backend.py @@ -32,6 +32,9 @@ logger = logging.getLogger(__name__) +# A relative URL by definition starts with ../ or ./ +RELATIVE_SUBMODULE_REGEX = re.compile(r"^\.{1,2}/") + def is_revision_sha(revision: str | None) -> bool: return re.match(r"^\b[0-9a-f]{5,40}\b$", revision or "") is not None @@ -332,49 +335,59 @@ def _clone_submodules(cls, repo: Repo) -> None: Helper method to identify configured submodules and clone them recursively. """ repo_root = Path(repo.path) - modules_config = repo_root.joinpath(".gitmodules") - - # A relative URL by definition starts with ../ or ./ - relative_submodule_regex = re.compile(r"^\.{1,2}/") - - if modules_config.exists(): - config = ConfigFile.from_path(str(modules_config)) - - url: bytes - path: bytes - submodules = parse_submodules(config) - - for path, url, name in submodules: - path_relative = Path(path.decode("utf-8")) - path_absolute = repo_root.joinpath(path_relative) - - url_string = url.decode("utf-8") - if relative_submodule_regex.search(url_string): - url_string = _urljoin(f"{Git.get_remote_url(repo)}/", url_string) - - source_root = path_absolute.parent - source_root.mkdir(parents=True, exist_ok=True) - - with repo: - try: - revision = repo.open_index()[path].sha.decode("utf-8") - except KeyError: - logger.debug( - "Skip submodule %s in %s, path %s not found", - name, - repo.path, - path, - ) - continue - - cls.clone( - url=url_string, - source_root=source_root, - name=path_relative.name, + for submodule in cls._get_submodules(repo): + path_absolute = repo_root / submodule.path + source_root = path_absolute.parent + source_root.mkdir(parents=True, exist_ok=True) + cls.clone( + url=submodule.url, + source_root=source_root, + name=path_absolute.name, + revision=submodule.revision, + clean=path_absolute.exists() + and not path_absolute.joinpath(".git").is_dir(), + ) + + @classmethod + def _get_submodules(cls, repo: Repo) -> list[SubmoduleInfo]: + modules_config = Path(repo.path, ".gitmodules") + + if not modules_config.exists(): + return [] + + config = ConfigFile.from_path(str(modules_config)) + + submodules: list[SubmoduleInfo] = [] + for path, url, name in parse_submodules(config): + url_str = url.decode("utf-8") + path_str = path.decode("utf-8") + name_str = name.decode("utf-8") + + if RELATIVE_SUBMODULE_REGEX.search(url_str): + url_str = urlpathjoin(f"{cls.get_remote_url(repo)}/", url_str) + + with repo: + try: + revision = repo.open_index()[path].sha.decode("utf-8") + except KeyError: + logger.debug( + "Skip submodule %s in %s, path %s not found", + name_str, + repo.path, + path_str, + ) + continue + + submodules.append( + SubmoduleInfo( + path=path_str, + url=url_str, + name=name_str, revision=revision, - clean=path_absolute.exists() - and not path_absolute.joinpath(".git").is_dir(), ) + ) + + return submodules @staticmethod def is_using_legacy_client() -> bool: @@ -460,7 +473,7 @@ def clone( return cls._clone_legacy(url=url, refspec=refspec, target=target) -def _urljoin(base: str, path: str) -> str: +def urlpathjoin(base: str, path: str) -> str: """ Allow any URL to be joined with a path @@ -478,3 +491,11 @@ def _urljoin(base: str, path: str) -> str: parsed_base = urlparse(base) new = parsed_base._replace(path=urljoin(parsed_base.path, path)) return urlunparse(new) + + +@dataclasses.dataclass +class SubmoduleInfo: + path: str + url: str + name: str + revision: str diff --git a/tests/integration/test_utils_vcs_git.py b/tests/integration/test_utils_vcs_git.py index dfc59bffee7..09df801c93f 100644 --- a/tests/integration/test_utils_vcs_git.py +++ b/tests/integration/test_utils_vcs_git.py @@ -8,6 +8,8 @@ from pathlib import Path from typing import TYPE_CHECKING from typing import Iterator +from urllib.parse import urlparse +from urllib.parse import urlunparse import pytest @@ -389,3 +391,32 @@ def test_system_git_called_when_configured( target=path, refspec=GitRefSpec(branch="0.1", revision=None, tag=None, ref=b"HEAD"), ) + + +def test_relative_submodules_with_ssh( + source_url: str, tmpdir: Path, mocker: MockerFixture +) -> None: + target = tmpdir / "temp" + ssh_source_url = urlunparse(urlparse(source_url)._replace(scheme="ssh")) + + repo_with_unresolved_submodules = Git._clone( + url=source_url, + refspec=GitRefSpec(branch="relative_submodule"), + target=target, + ) + + # construct fake git config + fake_config = ConfigFile( + {(b"remote", b"origin"): {b"url": ssh_source_url.encode("utf-8")}} + ) + # trick Git into thinking remote.origin is an ssh url + mock_get_config = mocker.patch.object(repo_with_unresolved_submodules, "get_config") + mock_get_config.return_value = fake_config + + submodules = Git._get_submodules(repo_with_unresolved_submodules) + + assert [s.url for s in submodules] == [ + "https://github.com/pypa/sample-namespace-packages.git", + "ssh://github.com/python-poetry/test-fixture-vcs-repository.git", + "ssh://github.com/python-poetry/test-fixture-vcs-repository.git", + ]