Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable partial checkout of only some files from a repo #455

Merged
merged 3 commits into from
Oct 19, 2023
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions tools/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import logging
import subprocess as sp
from typing import Optional
from typing import Optional, List
import os

from sisyphus import *

Expand All @@ -14,7 +15,7 @@ class CloneGitRepositoryJob(Job):
Clone a git repository given optional branch name and commit hash
"""

__sis_hash_exclude__ = {"clone_submodules": False}
__sis_hash_exclude__ = {"clone_submodules": False, "files_to_checkout": None}

def __init__(
self,
Expand All @@ -23,6 +24,7 @@ def __init__(
commit: Optional[str] = None,
checkout_folder_name: str = "repository",
clone_submodules: bool = False,
files_to_checkout: Optional[List[str]] = None,
):
"""

Expand All @@ -31,27 +33,44 @@ def __init__(
:param commit: Git commit hash
:param checkout_folder_name: Name of the output path repository folder
:param clone_submodules: Flag to clone submodules if set to True
:param files_to_checkout: List of files to be checked out sparsely. If not set, the entire repo is checked out (default behaviour).
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would the user need to use multiple files from different commits at some point? Maybe this parameter can be specified as a dictionary of commits to lists of files. Would that be confusing for the user?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that is a bit too much for my use-case because all the commits are specified at repo-level. I will not add this now, but if everyone thinks it is useful, please let me know.

"""
self.url = url
self.branch = branch
self.commit = commit
self.clone_submodules = clone_submodules
self.files_to_checkout = files_to_checkout

self.out_repository = self.output_path(checkout_folder_name, True)

# Yields the job's only task: a Task named "run", created with mini_task=True.
def tasks(self):
yield Task("run", mini_task=True)

def run(self):
args = ["git", "clone", self.url]
if self.files_to_checkout is not None:
args = ["git", "clone", "--no-checkout", self.url]
else:
args = ["git", "clone", self.url]
if self.branch is not None:
args.extend(["-b", self.branch])
repository_dir = self.out_repository.get_path()
args += [repository_dir]
logging.info("running command: %s" % " ".join(args))
sp.run(args, check=True)

if self.commit is not None:
if self.files_to_checkout is not None:
args = ["git", "checkout"]
commit = self.commit if self.commit is not None else ""
moothiringote marked this conversation as resolved.
Show resolved Hide resolved
args.extend([commit, "--"] + self.files_to_checkout)
logging.info("running command: %s" % " ".join(args))
sp.run(args, cwd=repository_dir, check=True)
for file in self.files_to_checkout:
# some files may be links, so download the original file to avoid a missing symlink target
if os.path.islink(f"{repository_dir}/{file}"):
args = ["git", "checkout", commit, "--", os.path.realpath(f"{repository_dir}/{file}")]
logging.info("running command: %s" % " ".join(args))
sp.run(args, cwd=repository_dir, check=True)
elif self.commit is not None:
args = ["git", "checkout", self.commit]
logging.info("running command: %s" % " ".join(args))
sp.run(args, cwd=repository_dir, check=True)
Expand Down
Loading