diff --git a/filesystems/http/src/main/scala/cromwell/filesystems/http/HttpPathBuilder.scala b/filesystems/http/src/main/scala/cromwell/filesystems/http/HttpPathBuilder.scala index fa1861dd81a..29910cd43ac 100644 --- a/filesystems/http/src/main/scala/cromwell/filesystems/http/HttpPathBuilder.scala +++ b/filesystems/http/src/main/scala/cromwell/filesystems/http/HttpPathBuilder.scala @@ -55,6 +55,8 @@ case class HttpPath(nioPath: NioPath) extends Path { override def pathWithoutScheme: String = pathAsString.replaceFirst("http[s]?://", "") + def pathWithoutSchemeOrQueryOrFragment: String = pathWithoutScheme.split("[?#]").head + def fetchSize(implicit executionContext: ExecutionContext, actorSystem: ActorSystem): Future[Long] = Http().singleRequest(HttpRequest(uri = pathAsString, method = HttpMethods.HEAD)).map { response => response.discardEntityBytes() diff --git a/supportedBackends/tes/src/main/scala/cromwell/backend/impl/tes/TesAsyncBackendJobExecutionActor.scala b/supportedBackends/tes/src/main/scala/cromwell/backend/impl/tes/TesAsyncBackendJobExecutionActor.scala index 44b24e89980..f8e0e2044cc 100644 --- a/supportedBackends/tes/src/main/scala/cromwell/backend/impl/tes/TesAsyncBackendJobExecutionActor.scala +++ b/supportedBackends/tes/src/main/scala/cromwell/backend/impl/tes/TesAsyncBackendJobExecutionActor.scala @@ -37,6 +37,7 @@ import cromwell.core.retry.Retry._ import cromwell.core.retry.SimpleExponentialBackoff import cromwell.filesystems.blob.{BlobPath, WSMBlobSasTokenGenerator} import cromwell.filesystems.drs.{DrsPath, DrsResolver} +import cromwell.filesystems.http.HttpPath import net.ceedubs.ficus.Ficus._ import wom.values.WomFile @@ -176,6 +177,46 @@ object TesAsyncBackendJobExecutionActor { ) ) } + + def mapInputPath(path: Path, tesJobPaths: TesJobPaths, commandDirectory: Path): String = + path match { + case drsPath: DrsPath => + val filepath = DrsResolver.getContainerRelativePath(drsPath).unsafeRunSync() + tesJobPaths.containerExec(commandDirectory, filepath) + case httpPath: HttpPath => + // Strip the query params and anything after a # from http paths when turning them into local paths + tesJobPaths.callInputsDockerRoot + .resolve(httpPath.pathWithoutSchemeOrQueryOrFragment.stripPrefix("/")) + .pathAsString + case path: Path if path.startsWith(tesJobPaths.workflowPaths.DockerRoot) => + path.pathAsString + case path: Path if path.equals(tesJobPaths.callExecutionRoot) => + commandDirectory.pathAsString + case path: Path if path.startsWith(tesJobPaths.callExecutionRoot) => + tesJobPaths.containerExec(commandDirectory, path.name) + case path: Path if path.startsWith(tesJobPaths.callRoot) => + tesJobPaths.callDockerRoot.resolve(path.name).pathAsString + case path: BlobPath if path.startsWith(tesJobPaths.workflowPaths.workflowRoot) => + // Blob paths can get really long, which causes problems for some tools. If this input file + // lives in the workflow execution directory, strip off that prefix from the path we're + // generating inside `inputs/` to keep the total path length under control. + // In Terra on Azure, this saves us 200+ characters. + tesJobPaths.callInputsDockerRoot + .resolve( + path.pathStringWithoutPrefix(tesJobPaths.workflowPaths.workflowRoot) + ) + .pathAsString + case path: BlobPath if path.startsWith(tesJobPaths.workflowPaths.executionRoot) => + // See comment above... if this file is in the execution root, strip that off. + // In Terra on Azure, this saves us 160+ characters. + tesJobPaths.callInputsDockerRoot + .resolve( + path.pathStringWithoutPrefix(tesJobPaths.workflowPaths.executionRoot) + ) + .pathAsString + case _ => + tesJobPaths.callInputsDockerRoot.resolve(path.pathWithoutScheme.stripPrefix("/")).pathAsString + } } class TesAsyncBackendJobExecutionActor(override val standardParams: StandardAsyncExecutionActorParams) @@ -262,37 +303,8 @@ class TesAsyncBackendJobExecutionActor(override val standardParams: StandardAsyn override def mapCommandLineJobInputWomFile(womFile: WomFile): WomFile = womFile.mapFile(value => getPath(value) match { - case Success(drsPath: DrsPath) => - val filepath = DrsResolver.getContainerRelativePath(drsPath).unsafeRunSync() - tesJobPaths.containerExec(commandDirectory, filepath) - case Success(path: Path) if path.startsWith(tesJobPaths.workflowPaths.DockerRoot) => - path.pathAsString - case Success(path: Path) if path.equals(tesJobPaths.callExecutionRoot) => - commandDirectory.pathAsString - case Success(path: Path) if path.startsWith(tesJobPaths.callExecutionRoot) => - tesJobPaths.containerExec(commandDirectory, path.name) - case Success(path: Path) if path.startsWith(tesJobPaths.callRoot) => - tesJobPaths.callDockerRoot.resolve(path.name).pathAsString - case Success(path: BlobPath) if path.startsWith(tesJobPaths.workflowPaths.workflowRoot) => - // Blob paths can get really long, which causes problems for some tools. If this input file - // lives in the workflow execution directory, strip off that prefix from the path we're - // generating inside `inputs/` to keep the total path length under control. - // In Terra on Azure, this saves us 200+ characters. - tesJobPaths.callInputsDockerRoot - .resolve( - path.pathStringWithoutPrefix(tesJobPaths.workflowPaths.workflowRoot) - ) - .pathAsString - case Success(path: BlobPath) if path.startsWith(tesJobPaths.workflowPaths.executionRoot) => - // See comment above... if this file is in the execution root, strip that off. - // In Terra on Azure, this saves us 160+ characters. - tesJobPaths.callInputsDockerRoot - .resolve( - path.pathStringWithoutPrefix(tesJobPaths.workflowPaths.executionRoot) - ) - .pathAsString case Success(path: Path) => - tesJobPaths.callInputsDockerRoot.resolve(path.pathWithoutScheme.stripPrefix("/")).pathAsString + TesAsyncBackendJobExecutionActor.mapInputPath(path, tesJobPaths, commandDirectory) case _ => value } diff --git a/supportedBackends/tes/src/test/scala/cromwell/backend/impl/tes/TesAsyncBackendJobExecutionActorSpec.scala b/supportedBackends/tes/src/test/scala/cromwell/backend/impl/tes/TesAsyncBackendJobExecutionActorSpec.scala index 53ba95606d0..f920f3633a7 100644 --- a/supportedBackends/tes/src/test/scala/cromwell/backend/impl/tes/TesAsyncBackendJobExecutionActorSpec.scala +++ b/supportedBackends/tes/src/test/scala/cromwell/backend/impl/tes/TesAsyncBackendJobExecutionActorSpec.scala @@ -1,18 +1,27 @@ package cromwell.backend.impl.tes import common.mock.MockSugar +import cromwell.backend.BackendJobDescriptorKey +import cromwell.backend.BackendSpec.buildWdlWorkflowDescriptor import cromwell.core.logging.JobLogger -import cromwell.core.path.NioPath +import cromwell.core.path.{DefaultPathBuilder, NioPath} import cromwell.filesystems.blob.{BlobFileSystemManager, BlobPath, WSMBlobSasTokenGenerator} +import cromwell.filesystems.http.HttpPathBuilder import org.mockito.ArgumentMatchers.any import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers +import org.scalatest.prop.TableDrivenPropertyChecks +import wom.graph.CommandCallNode import java.time.Duration import java.time.temporal.ChronoUnit import scala.util.{Failure, Try} -class TesAsyncBackendJobExecutionActorSpec extends AnyFlatSpec with Matchers with MockSugar { +class TesAsyncBackendJobExecutionActorSpec + extends AnyFlatSpec + with Matchers + with MockSugar + with TableDrivenPropertyChecks { behavior of "TesAsyncBackendJobExecutionActor" val fullyQualifiedName = "this.name.is.more.than.qualified" @@ -175,4 +184,53 @@ class TesAsyncBackendJobExecutionActorSpec extends AnyFlatSpec with Matchers wit generatedBashScript should include(echoCommandSubstring) generatedBashScript should include(exportCommandSubstring) } + + private val httpPathTestCases = Table( + ("test name", "http path", "local path in input dir"), + ( + "strip simple kv query params", + "http://example.com/my_sample.bam?k1=v1&k2=v2", + "example.com/my_sample.bam" + ), + ( + "handle http paths without query params", + "http://example.com/my_sample.bam", + "example.com/my_sample.bam" + ), + ( + "handle http paths without params but with a ?", + "http://example.com/my_sample.bam?", + "example.com/my_sample.bam" + ), + ( + "handle a blob file with SAS token attached", + "https://lzbc096764ae93ffff9f406e.blob.core.windows.net/sc-a7f7a9e0-2dcf-465c-997b-a276090a52da/workspace-services/cbas/terra-app-2f577477-763b-4e27-8e28-b03d91b6f3be/cromwell-workflow-logs/workflow.c621a5df-37f1-422d-b91a-1a65f6112a6a.log?sv=2023-11-03&spr=https&st=2024-04-09T23%3A35%3A37Z&se=2024-04-10T07%3A50%3A37Z&sr=c&sp=racwdlt&sig=REDACTEDS&rscd=100067995116984528334", + "lzbc096764ae93ffff9f406e.blob.core.windows.net/sc-a7f7a9e0-2dcf-465c-997b-a276090a52da/workspace-services/cbas/terra-app-2f577477-763b-4e27-8e28-b03d91b6f3be/cromwell-workflow-logs/workflow.c621a5df-37f1-422d-b91a-1a65f6112a6a.log" + ), + ( + "handle an http path with fragment", + "http://example.com/my_sample.bam#my_favorite_part", + "example.com/my_sample.bam" + ), + ( + "handle an http path with fragment and query params", + "http://example.com/my_sample.bam?k=yourface#my_favorite_part", + "example.com/my_sample.bam" + ) + ) + + forAll(httpPathTestCases) { (testName, httpPath, localPathInInputDir) => + it should testName in { + val wd = buildWdlWorkflowDescriptor(TestWorkflows.HelloWorld) + val call: CommandCallNode = wd.callable.taskCallNodes.head + val jobKey = BackendJobDescriptorKey(call, None, 1) + val jobPaths = TesJobPaths(jobKey, wd, TesTestConfig.backendConfig) + val commandDirectory = DefaultPathBuilder.build("/my/command/dir").get + val httpBuilder = new HttpPathBuilder() + + val httpPathWithParams = httpBuilder.build(httpPath) + val actual = TesAsyncBackendJobExecutionActor.mapInputPath(httpPathWithParams.get, jobPaths, commandDirectory) + actual shouldBe s"${jobPaths.callInputsDockerRoot}/$localPathInInputDir" + } + } }