Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/dependency repo caching #699

Merged
merged 11 commits into from
May 8, 2024
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

TODO add summary

## NEW FUNCTIONALITY

* `dependencies`: GitHub and ViashHub repositories now get properly cached (PR #699).
The cache is stored in the `~/.viash/repositories` directory, using sparse-checkout to fetch only the necessary files.
During a build, the cache is checked for the repository. If it is found and still up to date, the repository is not cloned again; instead, the cache is copied to a temporary folder, where the files are checked out from the sparse checkout.

# Viash 0.9.0-RC3 (2024-04-26): Various bug fixes and minor improvements

Mainly fixes for code changes from previous release candidates. Some additional minor fixes and QoL improvements are included.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,97 +17,117 @@

package io.viash.config.dependencies

import io.viash.helpers.{IO, Exec, Logging}
import io.viash.helpers.{IO, Exec, Logging, Git}
import java.io.File
import java.nio.file.Paths
import io.viash.exceptions.CheckoutException
import io.viash.helpers.SysEnv
import java.nio.file.Path

trait AbstractGitRepository extends Repository with Logging {
val uri: String
val storePath: String

@inline
protected def getLoggers(fn: String) = {
  // Single logger that forwards every line to debug, prefixed with the caller-supplied label
  val prefixedDebug: String => Unit = (line: String) => debug(s"$fn: $line")
  Seq[String => Unit](prefixedDebug)
}

// Produces an AbstractGitRepository from the given type, tag, path and localPath values.
// Within this trait it is only invoked as copyRepo(localPath = ...), to point the result at another folder.
// NOTE(review): the remaining arguments presumably default to this repository's current values — confirm where the defaults are declared.
def copyRepo(
`type`: String,
tag: Option[String],
path: Option[String],
localPath: String
): AbstractGitRepository

// True when `git show-ref` finds refs/heads/<name> in the repository at cwd (exit code 0)
def hasBranch(name: String, cwd: Option[File]): Boolean = {
  val showRefCommand = List("git", "show-ref", "--verify", "--quiet", s"refs/heads/$name")
  Exec.runCatch(showRefCommand, cwd = cwd).exitValue == 0
}

// True when `git show-ref` finds refs/tags/<name> in the repository at cwd (exit code 0)
def hasTag(name: String, cwd: Option[File]): Boolean = {
  val showRefCommand = List("git", "show-ref", "--verify", "--quiet", s"refs/tags/$name")
  Exec.runCatch(showRefCommand, cwd = cwd).exitValue == 0
}

): AbstractGitRepository

// Repository-specific sub path under the target/dependencies folder: <type>/<storePath>/<tag>.
// A missing tag contributes an empty path element.
def subOutputPath: String = {
  val tagElement = tag.getOrElse("")
  Paths.get(`type`, storePath, tagElement).toString()
}

protected def doGitClone(uri: String, cwd: Option[File]): Exec.ExecOutput = {
val singleBranch = tag match {
case None => List("--single-branch")
case Some(value) => List("--single-branch", "--branch", value)
}

val loggers = Seq[String => Unit] { (str: String) => {info(str)} }
Exec.runCatch(
List("git", "clone", uri, "--no-checkout", "--depth", "1") ++ singleBranch :+ ".",
cwd = cwd,
loggers = loggers,
)
// URI handed to Git.getRemoteHash (cache validation) and Git.cloneSparseAndShallow (fresh clone).
def getCheckoutUri(): String
// Folder name of this repository inside the cache.
// None makes fullCachePath return None, in which case the cache is neither read nor written.
def getCacheIdentifier(): Option[String]
// Location of this repository in the cache: <viashHome>/repositories/<cacheIdentifier>.
// None when the repository has no cache identifier.
def fullCachePath: Option[Path] = {
  for (identifier <- getCacheIdentifier())
    yield Paths.get(SysEnv.viashHome).resolve("repositories").resolve(identifier)
}

protected def checkGitAuthentication(uri: String): Boolean = {
val res = Exec.runCatch(
List("git", "ls-remote", uri),
)
res.exitValue == 0
// Returns this repository pointed at its cache folder when that folder exists as a directory.
// No freshness check is done here; see checkCacheStillValid for that.
def findInCache(): Option[AbstractGitRepository] = {
  fullCachePath
    .filter(candidate => candidate.toFile.isDirectory())
    .map { cacheDir =>
      debug(s"Found in cache: $cacheDir")
      copyRepo(localPath = cacheDir.toString)
    }
}

def getCheckoutUri(): String
// Decides whether the repository at localPath is still current.
// A path already marked as validated is accepted without contacting the remote;
// otherwise the remote hash must be known and equal to the local commit, and a match is remembered.
def checkCacheStillValid(): Boolean = {
  AbstractGitRepository.isValidatedCache(localPath) || {
    val checkoutUri = getCheckoutUri()
    val remoteHash = Git.getRemoteHash(checkoutUri, tag)
    val localHash = Git.getCommit(Paths.get(localPath).toFile())
    debug(s"remoteHash: $remoteHash localHash: $localHash")
    val upToDate = remoteHash.isDefined && remoteHash == localHash
    if (upToDate)
      AbstractGitRepository.markValidatedCache(localPath)
    upToDate
  }
}

// Clone of single branch with depth 1 but without checking out files
// The clone is made into a new temporary folder; the result is this repository with localPath set to that folder.
// Throws CheckoutException when the clone command exits with a non-zero code.
// NOTE(review): this block shows both the pre-change and post-change lines of the diff
// (info/debug logging and two `val out` definitions) — only one of each pair belongs in the final file.
def checkoutSparse(): AbstractGitRepository = {
val temporaryFolder = IO.makeTemp("viash_hub_repo")
val cwd = Some(temporaryFolder.toFile)

// URI chosen by the concrete repository type
val uri = getCheckoutUri()

info(s"temporaryFolder: $temporaryFolder uri: $uri")
debug(s"temporaryFolder: $temporaryFolder uri: $uri")

val out = doGitClone(uri, cwd)
val out = Git.cloneSparseAndShallow(uri, tag, temporaryFolder.toFile)
// A failed clone leaves nothing usable, so abort the checkout
if (out.exitValue != 0)
throw new CheckoutException(this)

copyRepo(localPath = temporaryFolder.toString)
}

// Provides this repository in a temporary folder, going through the cache when possible.
//  - Cached copy found and still valid: the cache folder is copied to a new temporary folder,
//    so later checkouts never modify the cache itself.
//  - Otherwise: a new sparse clone is made and, when the repository has a cache path,
//    the clone replaces whatever is stored at that path and is marked as validated.
// Returns the repository with localPath pointing at the temporary folder.
def getSparseRepoInTemp(): AbstractGitRepository = {
  info(s"Fetching repo for $uri")
  findInCache().filter(_.checkCacheStillValid()) match {
    case Some(cached) =>
      debug(s"Using cached repo from ${cached.localPath}")
      val workingCopy = IO.makeTemp("viash_hub_repo")
      IO.copyFolder(cached.localPath, workingCopy.toString)
      cached.copyRepo(localPath = workingCopy.toString)
    case None =>
      debug(s"Cache either not present or outdated; checkout repository")
      val fresh = checkoutSparse()
      // Repositories without a cache path are simply not stored
      fresh.fullCachePath.foreach { cachePath =>
        // Log the path itself rather than the Option wrapping it
        debug(s"Copying repo to cache $cachePath")
        val cacheDir = cachePath.toFile()
        // Drop an outdated cache entry before writing the new one
        if (cacheDir.exists())
          IO.deleteRecursively(cachePath)
        cacheDir.mkdirs()
        IO.copyFolder(fresh.localPath, cachePath.toString)
        // Freshly written, so no remote comparison is needed for the rest of this run
        AbstractGitRepository.markValidatedCache(cachePath.toString)
      }
      fresh
  }
}

// Checkout of files from already cloned repository. Limit file checkout to the path that was specified
def checkout(): AbstractGitRepository = {
val pathStr = path.getOrElse(".")
val cwd = Some(Paths.get(localPath).toFile)
val localPathFile = Paths.get(localPath).toFile
val checkoutName = tag match {
case Some(name) if hasBranch(name, cwd) => s"origin/$name"
case Some(name) if hasTag(name, cwd) => s"tags/$name"
case Some(name) if Git.hasBranch(name, localPathFile) => s"origin/$name"
case Some(name) if Git.hasTag(name, localPathFile) => s"tags/$name"
case _ => "origin/HEAD"
}

val out = Exec.runCatch(
List("git", "checkout", checkoutName, "--", pathStr),
cwd = cwd
)
val out = Git.checkout(checkoutName, path, localPathFile)

info(s"checkout out: ${out.command} ${out.exitValue} ${out.output}")
if (out.exitValue != 0)
warn(s"checkout out: ${out.command} ${out.exitValue} ${out.output}")

if (path.isDefined)
copyRepo(localPath = Paths.get(localPath, path.get).toString)
Expand All @@ -116,3 +136,17 @@ trait AbstractGitRepository extends Repository with Logging {
this
}
}

// Keeps track, in memory, of cache entries that were already validated or freshly written,
// so they are not compared against the remote again.
// Callers in the companion trait pass the cache folder path as the identifier.
object AbstractGitRepository extends Logging {
  // A set gives duplicate-free storage and direct membership checks
  private val validatedCaches = scala.collection.mutable.Set[String]()

  // Remembers the given cache entry as validated; marking it twice has no further effect
  private def markValidatedCache(cacheIdentifier: String): Unit = {
    debug(s"Marking cache as validated: $cacheIdentifier")
    validatedCaches += cacheIdentifier
  }

  // Tells whether the given cache entry was marked as validated before
  private def isValidatedCache(cacheIdentifier: String): Boolean = {
    val res = validatedCaches.contains(cacheIdentifier)
    debug(s"Cache is validated: $cacheIdentifier $res")
    res
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import io.viash.schemas._
trait GitRepositoryTrait extends AbstractGitRepository {

// Plain git repositories use the configured uri as-is
def getCheckoutUri(): String = uri
// No cache identifier is provided for plain git repositories
def getCacheIdentifier(): Option[String] = None

// Strip the protocol and user credentials
// First the leading part through "://" is removed, then everything through the last "@" (greedy match).
// NOTE(review): an "@" occurring later in the uri would also be stripped by the second pattern — confirm this cannot happen.
val storePath = uri.replaceFirst("^.+://", "").replaceFirst(".+@", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package io.viash.config.dependencies

import io.viash.schemas._
import io.viash.helpers.Git

trait GithubRepositoryTrait extends AbstractGitRepository {

Expand All @@ -26,18 +27,20 @@ trait GithubRepositoryTrait extends AbstractGitRepository {
val repo: String


override def getCheckoutUri(): String = {
if (checkGitAuthentication(uri_nouser)) {
def getCheckoutUri(): String = {
if (Git.checkGitAuthentication(uri_nouser)) {
// First try https with bad user & password to disable asking credentials
// If successful, do checkout without the dummy credentials, don't want to store them in the repo remote address
uri
} else if (checkGitAuthentication(uri_ssh)) {
} else if (Git.checkGitAuthentication(uri_ssh)) {
// Checkout with ssh key
uri_ssh
} else {
uri
}
}
// Cache folder name: "github-<repo with '/' replaced by '-'>", followed by "-<tag>" when a tag is set
def getCacheIdentifier(): Option[String] = {
  val repoSlug = repo.replace("/", "-")
  val tagSuffix = tag.fold("")(t => s"-$t")
  Some(s"github-$repoSlug$tagSuffix")
}

lazy val uri = s"https://github.com/$repo.git"
// SSH remote for the repository, used by getCheckoutUri when the https check fails
lazy val uri_ssh = s"git@github.com:$repo.git"
Expand Down
36 changes: 3 additions & 33 deletions src/main/scala/io/viash/config/dependencies/Repository.scala
Original file line number Diff line number Diff line change
Expand Up @@ -100,38 +100,11 @@ object Repository extends Logging {
}
}

// A poor man's approach to caching. The cache is only valid within this run of Viash.
// However, it solves the issue of having to fetch the same repository over and over again, now we just do it once per run.
// When proper multi-session caching would need to check for changed code bases, now we get this virtually for free.
// We just fetched a code base and we have to assume it will not change within this session.
private val cachedRepos = scala.collection.mutable.ListBuffer[Repository]()
// Looks up a previously stored repository that equals `repo` when localPath is ignored.
// Returns the requested repo with the stored localPath filled in, or None when nothing matches.
private def getCachedRepository(repo: Repository): Option[Repository] = {
  cachedRepos
    // localPath is blanked for the comparison, as it is the information being looked up
    .find(candidate => candidate.copyRepo(localPath = "").equals(repo))
    .map(_.localPath)
    .map(storedPath => repo.copyRepo(localPath = storedPath))
}
// Stores the repository for reuse, except local repositories whose path does not start with "/":
// identical relative paths next to different configs *might* resolve to different locations.
private def storeRepositoryInCache(repo: Repository) = {
  val reliablyCacheable = repo match {
    case local: LocalRepository => local.path.forall(_.startsWith("/"))
    case _ => true
  }
  if (reliablyCacheable)
    cachedRepos.append(repo)
}
def get(repo: Repository, configDir: Path, packageRootDir: Option[Path]): Repository = {

def cache(repo: Repository, configDir: Path, packageRootDir: Option[Path]): Repository = {

// Check if we can get a locally cached version of the repo
val existingRepo = getCachedRepository(repo)
if (existingRepo.isDefined)
return existingRepo.get

// No cache found so fetch it
val newRepo = repo match {
repo match {
case r: AbstractGitRepository => {
val r2 = r.checkoutSparse()
val r2 = r.getSparseRepoInTemp()
val r3 = r2.checkout()
// Stopgap solution to be able to use built repositories which were not built with dependency aware Viash version.
// TODO remove this section once it's deemed no longer necessary
Expand All @@ -155,8 +128,5 @@ object Repository extends Logging {
case r => r
}

// Store the newly fetched repo in the cache
storeRepositoryInCache(newRepo)
newRepo
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,30 @@
package io.viash.config.dependencies

import io.viash.schemas._
import io.viash.helpers.Git

trait ViashhubRepositoryTrait extends AbstractGitRepository {

@description("The name of the Viash-Hub repository.")
@example("repo: openpipelines-bio/openpipeline", "yaml")
val repo: String

override def getCheckoutUri(): String = {
if (checkGitAuthentication(uri_nouser)) {
def getCheckoutUri(): String = {
if (Git.checkGitAuthentication(uri_nouser)) {
// First try https with bad user & password to disable asking credentials
// If successful, do checkout without the dummy credentials, don't want to store them in the repo remote address
uri
} else if (checkGitAuthentication(uri_ssh)) {
} else if (Git.checkGitAuthentication(uri_ssh)) {
// Checkout with ssh key
uri_ssh
} else {
uri
}
}

// Cache folder name: "viashhub-<repo with '/' replaced by '-'>", followed by "-<tag>" when a tag is set
def getCacheIdentifier(): Option[String] = {
  val repoSlug = repo.replace("/", "-")
  val tagSuffix = tag.fold("")(t => s"-$t")
  Some(s"viashhub-$repoSlug$tagSuffix")
}

lazy val uri = s"https://viash-hub.com/$repo.git"
// SSH remote for the repository, used by getCheckoutUri when the https check fails
lazy val uri_ssh = s"git@viash-hub.com:$repo.git"
val fakeCredentials = "nouser:nopass@" // obfuscate the credentials a bit so we don't trigger GitGuardian
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/io/viash/helpers/DependencyResolver.scala
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ object DependencyResolver extends Logging {
.map{d =>
val repo = d.repository.toOption.get
val configDir = Paths.get(config2.build_info.get.config).getParent()
val localRepoPath = Repository.cache(repo, configDir, packageRootDir)
val localRepoPath = Repository.get(repo, configDir, packageRootDir)
d.copy(repository = Right(localRepoPath))
}
)(config2)
Expand Down
Loading
Loading