From 31c63a956911fda095359de4d06ec740305f6a1f Mon Sep 17 00:00:00 2001 From: Richard van Heest Date: Tue, 7 Aug 2018 14:42:23 +0200 Subject: [PATCH] Complete and validate (#13) * isComplete implementation * IntelliJ formatting * isValid implementation * fix compiler warning * make docs more clear * bullit list in scaladoc * virtually valid type definition + documentation * clarify documentation on isComplete and isValid * formatting + imports * isVirtuallyValid is no longer part of this issue * formatting * extra tests for isComplete and isValid * formatting --- src/main/scala/nl/knaw/dans/bag/DansBag.scala | 35 +++ .../scala/nl/knaw/dans/bag/v0/DansV0Bag.scala | 28 +- .../dans/bag/fixtures/FetchFileMetadata.scala | 10 + .../bag/fixtures/TestSupportFixture.scala | 1 + .../nl/knaw/dans/bag/v0/DansV0BagSpec.scala | 265 +++++++++++++++++- 5 files changed, 331 insertions(+), 8 deletions(-) diff --git a/src/main/scala/nl/knaw/dans/bag/DansBag.scala b/src/main/scala/nl/knaw/dans/bag/DansBag.scala index 00522863..3aeb2e28 100644 --- a/src/main/scala/nl/knaw/dans/bag/DansBag.scala +++ b/src/main/scala/nl/knaw/dans/bag/DansBag.scala @@ -539,6 +539,41 @@ trait DansBag { * `scala.util.Failure` otherwise */ def save(): Try[Unit] + + /** + * Verifies if a bag is complete. + * According to the BagIt version 16 specs, a bag is complete when: + * - bagit.txt is present + * - data/ directory is present + * - at least one payload manifest exists + * - all fetch items are present in the bag + * - all files in every payload manifest must be present + * - all files in every tag manifest must be present + * - (>= V1.0 bag) all files in the payload directory are listed in all payload manifests + * - (< V1.0 bag) all files in the payload directory are listed in at least one payload manifest + * + * @return either `Unit` (if the bag is complete) or `String` (containing the error message if + * the bag is not complete) + */ + def isComplete: Either[String, Unit] + + /** + * Verifies if a bag is valid. + * According to the BagIt v16 specs, a bag is valid when: + * - the bag is complete + * - every checksum in every payload manifest has been successfully verified against the + * contents of the corresponding file + * - every checksum in every tag manifest has been successfully verified against the + * contents of the corresponding file + * + * Due to calculating the checksums for all files, this method may take some time to complete and + * return. It is therefore strongly advised to wrap a call to this method in a `Promise`/`Future`, + * `Observable` or any other desired data structure that deals with latency in a proper way. + * + * @return either `Unit` (if the bag is valid) or `String` (containing the error message if + * the bag is not valid) + */ + def isValid: Either[String, Unit] } object DansBag { diff --git a/src/main/scala/nl/knaw/dans/bag/v0/DansV0Bag.scala b/src/main/scala/nl/knaw/dans/bag/v0/DansV0Bag.scala index e4291ec1..537d7fa6 100644 --- a/src/main/scala/nl/knaw/dans/bag/v0/DansV0Bag.scala +++ b/src/main/scala/nl/knaw/dans/bag/v0/DansV0Bag.scala @@ -23,9 +23,10 @@ import java.util.{ UUID, Set => jSet } import better.files.{ CloseableOps, Disposable, File, Files, ManagedResource } import gov.loc.repository.bagit.creator.BagCreator -import gov.loc.repository.bagit.domain.{ Version, Bag => LocBag, Manifest => LocManifest, Metadata => LocMetadata, FetchItem => LocFetchItem } +import gov.loc.repository.bagit.domain.{ Version, Bag => LocBag, FetchItem => LocFetchItem, Manifest => LocManifest, Metadata => LocMetadata } import gov.loc.repository.bagit.reader.BagReader import gov.loc.repository.bagit.util.PathUtils +import gov.loc.repository.bagit.verify.BagVerifier import gov.loc.repository.bagit.writer.{ BagitFileWriter, FetchWriter, ManifestWriter, MetadataWriter } import nl.knaw.dans.bag.ChecksumAlgorithm.{ ChecksumAlgorithm, locDeconverter } import nl.knaw.dans.bag.{ ChecksumAlgorithm, DansBag, FetchItem, RelativePath, betterFileToPath } @@ -185,7 +186,7 @@ class DansV0Bag private(private[v0] val locBag: LocBag) extends DansBag { .map(_.asScala) .collect { case Seq(userId) => userId - case userIds if userIds.size > 1 => throw new IllegalStateException(s"Only one EASY-User-Account allowed; found ${userIds.size}") + case userIds if userIds.size > 1 => throw new IllegalStateException(s"Only one EASY-User-Account allowed; found ${ userIds.size }") } } @@ -209,7 +210,9 @@ class DansV0Bag private(private[v0] val locBag: LocBag) extends DansBag { /** * @inheritdoc */ - override def fetchFiles: Seq[FetchItem] = locBag.getItemsToFetch.asScala.map(fetch => fetch: FetchItem) + override def fetchFiles: Seq[FetchItem] = { + locBag.getItemsToFetch.asScala.map(fetch => fetch: FetchItem) + } /** * @inheritdoc @@ -453,7 +456,8 @@ class DansV0Bag private(private[v0] val locBag: LocBag) extends DansBag { /** * @inheritdoc */ - override def addPayloadFile(inputStream: InputStream)(pathInData: RelativePath): Try[DansV0Bag] = Try { + override def addPayloadFile(inputStream: InputStream) + (pathInData: RelativePath): Try[DansV0Bag] = Try { val file = pathInData(data) if (file.exists) @@ -633,6 +637,22 @@ class DansV0Bag private(private[v0] val locBag: LocBag) extends DansBag { ManifestWriter.writeTagManifests(locBag.getTagManifests, baseDir, baseDir, fileEncoding) } + /** + * @inheritdoc + */ + override def isComplete: Either[String, Unit] = { + Try { new ManagedResource(new BagVerifier()).apply(_.isComplete(this.locBag, false)) } + .toEither.left.map(_.getMessage) + } + + /** + * @inheritdoc + */ + override def isValid: Either[String, Unit] = { + Try { new ManagedResource(new BagVerifier()).apply(_.isValid(this.locBag, false)) } + .toEither.left.map(_.getMessage) + } + protected def validateURL(url: URL): Unit = { if (url.getProtocol != "http" && url.getProtocol != "https") throw new IllegalArgumentException("url can only have protocol 'http' or 'https'") diff --git a/src/test/scala/nl/knaw/dans/bag/fixtures/FetchFileMetadata.scala b/src/test/scala/nl/knaw/dans/bag/fixtures/FetchFileMetadata.scala index 14dd34a4..979abf28 100644 --- a/src/test/scala/nl/knaw/dans/bag/fixtures/FetchFileMetadata.scala +++ b/src/test/scala/nl/knaw/dans/bag/fixtures/FetchFileMetadata.scala @@ -17,11 +17,17 @@ package nl.knaw.dans.bag.fixtures import java.net.URL +import better.files.File import nl.knaw.dans.bag.RelativePath trait FetchFileMetadata { this: TestSupportFixture => + def findFile(pathRelativeToTestResources: String): File = { + File(getClass.getResource(pathRelativeToTestResources)) + } + + val lipsum1File: File = findFile("/fetch-files/lipsum1.txt") val lipsum1URL: URL = new URL("https://raw.githubusercontent.com/rvanheest-DANS-KNAW/" + "dans-bag-lib/c8681a5932e4081dfec95680abefc9a07740a97a/src/test/resources/fetch-files/" + "lipsum1.txt") @@ -33,6 +39,7 @@ trait FetchFileMetadata { "19d5cdbb6f0f84be9145643ac793dd63eaee407e51d6dacd5ba3c1d8f6fe07da" val lipsum1Size: Long = 485L + val lipsum2File: File = findFile("/fetch-files/lipsum2.txt") val lipsum2URL: URL = new URL("https://raw.githubusercontent.com/rvanheest-DANS-KNAW/" + "dans-bag-lib/c8681a5932e4081dfec95680abefc9a07740a97a/src/test/resources/fetch-files/" + "lipsum2.txt") @@ -44,6 +51,7 @@ trait FetchFileMetadata { "d4086396119e2ebaadd79548d623d8dfa350f854af6271da68311abb1c0e73e8" val lipsum2Size: Long = 989L + val lipsum3File: File = findFile("/fetch-files/lipsum3.txt") val lipsum3URL: URL = new URL("https://raw.githubusercontent.com/rvanheest-DANS-KNAW/" + "dans-bag-lib/c8681a5932e4081dfec95680abefc9a07740a97a/src/test/resources/fetch-files/" + "lipsum3.txt") @@ -55,6 +63,7 @@ trait FetchFileMetadata { "647c11784b75eed3a20ac60efda146f040daeaa3837fa3aba9a2b94e30da0965" val lipsum3Size: Long = 1641L + val lipsum4File: File = findFile("/fetch-files/lipsum4.txt") val lipsum4URL: URL = new URL("https://raw.githubusercontent.com/rvanheest-DANS-KNAW/" + "dans-bag-lib/c8681a5932e4081dfec95680abefc9a07740a97a/src/test/resources/fetch-files/" + "lipsum4.txt") @@ -66,6 +75,7 @@ trait FetchFileMetadata { "e7e3639cba8c3a864fea8a6f58fc0a37310b3f4550466d8ff2c9ba0835bb3231" val lipsum4Size: Long = 2095L + val lipsum5File: File = findFile("/fetch-files/lipsum5.txt") val lipsum5URL: URL = new URL("https://raw.githubusercontent.com/rvanheest-DANS-KNAW/" + "dans-bag-lib/c8681a5932e4081dfec95680abefc9a07740a97a/src/test/resources/fetch-files/" + "lipsum5.txt") diff --git a/src/test/scala/nl/knaw/dans/bag/fixtures/TestSupportFixture.scala b/src/test/scala/nl/knaw/dans/bag/fixtures/TestSupportFixture.scala index 9619b50e..f836c4f6 100644 --- a/src/test/scala/nl/knaw/dans/bag/fixtures/TestSupportFixture.scala +++ b/src/test/scala/nl/knaw/dans/bag/fixtures/TestSupportFixture.scala @@ -21,4 +21,5 @@ trait TestSupportFixture extends FlatSpec with Matchers with Inside with OptionValues + with EitherValues with Inspectors diff --git a/src/test/scala/nl/knaw/dans/bag/v0/DansV0BagSpec.scala b/src/test/scala/nl/knaw/dans/bag/v0/DansV0BagSpec.scala index 1e75ffad..add0eb34 100644 --- a/src/test/scala/nl/knaw/dans/bag/v0/DansV0BagSpec.scala +++ b/src/test/scala/nl/knaw/dans/bag/v0/DansV0BagSpec.scala @@ -22,7 +22,7 @@ import java.nio.file.{ FileAlreadyExistsException, NoSuchFileException } import java.util.UUID import better.files.File -import gov.loc.repository.bagit.conformance.{ BagitWarning, BagLinter } +import gov.loc.repository.bagit.conformance.{ BagLinter, BagitWarning } import gov.loc.repository.bagit.domain.Version import gov.loc.repository.bagit.verify.BagVerifier import nl.knaw.dans.bag.ChecksumAlgorithm.ChecksumAlgorithm @@ -32,7 +32,6 @@ import nl.knaw.dans.bag.fixtures._ import org.joda.time.format.ISODateTimeFormat import org.joda.time.{ DateTime, DateTimeZone } -import scala.collection import scala.collection.JavaConverters._ import scala.language.{ existentials, implicitConversions, postfixOps } import scala.util.{ Failure, Success, Try } @@ -1239,11 +1238,11 @@ class DansV0BagSpec extends TestSupportFixture case Failure(e: InvalidChecksumException) => e should have message s"checksum (${ ChecksumAlgorithm.SHA1 }) of the downloaded file was 'invalid-checksum' but should be '$checksum'" - bag.list.withFilter(_.isDirectory).map(_.name).toList should contain only ( + bag.list.withFilter(_.isDirectory).map(_.name).toList should contain only( "data", "metadata", ) - bag.fetchFiles should contain (fetchItem) + bag.fetchFiles should contain(fetchItem) x.toJava shouldNot exist } } @@ -2828,4 +2827,262 @@ class DansV0BagSpec extends TestSupportFixture } } } + + "isComplete" should "succeed on a complete bag" in { + val bag = simpleBagV0() + + (bag / "bagit.txt").toJava should exist + bag.data.toJava should exist + bag.glob("manifest-*.txt").toList should not be empty + bag.fetchFiles shouldBe empty + + bag.isComplete shouldBe 'right + } + + it should "succeed on a complete bag with resolved fetch files" in { + val bag = fetchBagV0() + bag.data / "sub" createDirectory() + lipsum1File.copyTo(bag.data / "sub" / "u") + lipsum2File.copyTo(bag.data / "sub" / "v") + lipsum3File.copyTo(bag.data / "y-old") + lipsum4File.copyTo(bag.data / "x") + + (bag / "bagit.txt").toJava should exist + bag.data.toJava should exist + bag.glob("manifest-*.txt").toList should not be empty + every(bag.fetchFiles.map(_.file.toJava)) should exist + + bag.isComplete shouldBe 'right + } + + it should "fail when bagit.txt does not exists" in { + val bag = simpleBagV0() + + bag / "bagit.txt" delete() + + bag.isComplete.left.value shouldBe s"File [${ bag / "bagit.txt" }] should exist but it doesn't!" + } + + it should "fail when no data/ directory is present" in pendingUntilFixed { // TODO https://github.com/LibraryOfCongress/bagit-java/issues/123 + val bag = simpleBagV0() + + bag.data.delete() + + bag.isComplete.left.value shouldBe s"File [${ bag.data }] should exist but it doesn't!" + } + + it should "fail when no payload manifest exists" in { + val bag = multipleManifestsBagV0() + + bag.glob("manifest-*.txt").foreach(_.delete()) + + bag.isComplete.left.value shouldBe "Bag does not contain a payload manifest file!" + } + + it should "fail when not all fetch files are resolved" in { + val bag = fetchBagV0() + + bag.isComplete.left.value shouldBe s"Fetch item [$lipsum1URL 12 ${ bag.data / "sub" / "u" }] has not been fetched!" + } + + it should "fail when not all files in the payload manifests are present" in { + val bag = simpleBagV0() + val x = bag.data / "x" + + forEvery(bag.payloadManifests) { + case (_, manifest) => + manifest should contain key x + } + + x.delete() + + bag.isComplete.left.value shouldBe s"Manifest(s) contains file(s) [$x] but they don't exist!" + } + + it should "fail when not all files in the tag manifests are present" in { + val bag = simpleBagV0() + val bagInfo = bag / "bag-info.txt" + + forEvery(bag.tagManifests) { + case (_, manifest) => + manifest should contain key bagInfo + } + + bagInfo.delete() + + bag.isComplete.left.value shouldBe s"Manifest(s) contains file(s) [$bagInfo] but they don't exist!" + } + + it should "fail when not all files are listed in all payload manifests" in { + val bagDir = multipleManifestsBagDirV0 + bagDir / "manifest-sha1.txt" writeText "" + + val bag = multipleManifestsBagV0().withBagitVersion(1, 0) + + bag.payloadManifests(ChecksumAlgorithm.SHA1) shouldBe empty + + bag.isComplete.left.value should ( + startWith("File [") and + endWith("] is in the payload directory but isn't listed in manifest manifest-sha1.txt!") + ) + } + + "isValid" should "succeed on a valid bag" in { + val bag = simpleBagV0() + + // check isComplete verifications + (bag / "bagit.txt").toJava should exist + bag.data.toJava should exist + bag.glob("manifest-*.txt").toList should not be empty + bag.fetchFiles shouldBe empty + + // check payload manifest verification + forEvery(bag.payloadManifests) { + case (algorithm, manifest) => + forEvery(manifest) { + case (file, checksum) => + file.checksum(algorithm).toLowerCase shouldBe checksum + } + } + + // check tag manifest verification + forEvery(bag.tagManifests) { + case (algorithm, manifest) => + forEvery(manifest) { + case (file, checksum) => + file.checksum(algorithm).toLowerCase shouldBe checksum + } + } + + bag.isValid shouldBe 'right + } + + it should "succeed on a valid bag with resolved fetch files" in { + val bag = fetchBagV0() + bag.data / "sub" createDirectory() + lipsum1File.copyTo(bag.data / "sub" / "u") + lipsum2File.copyTo(bag.data / "sub" / "v") + lipsum3File.copyTo(bag.data / "y-old") + lipsum4File.copyTo(bag.data / "x") + + // check isComplete verifications + (bag / "bagit.txt").toJava should exist + bag.data.toJava should exist + bag.glob("manifest-*.txt").toList should not be empty + every(bag.fetchFiles.map(_.file.toJava)) should exist + + // check payload manifest verification + forEvery(bag.payloadManifests) { + case (algorithm, manifest) => + forEvery(manifest) { + case (file, checksum) => + file.checksum(algorithm).toLowerCase shouldBe checksum + } + } + + // check tag manifest verification + forEvery(bag.tagManifests) { + case (algorithm, manifest) => + forEvery(manifest) { + case (file, checksum) => + file.checksum(algorithm).toLowerCase shouldBe checksum + } + } + + bag.isValid shouldBe 'right + } + + it should "fail when bagit.txt does not exists" in { + val bag = simpleBagV0() + + bag / "bagit.txt" delete() + + bag.isValid.left.value shouldBe s"File [${ bag / "bagit.txt" }] should exist but it doesn't!" + } + + it should "fail when no data/ directory is present" in pendingUntilFixed { // TODO https://github.com/LibraryOfCongress/bagit-java/issues/123 + val bag = simpleBagV0() + + bag.data.delete() + + bag.isValid.left.value shouldBe s"File [${ bag.data }] should exist but it doesn't!" + } + + it should "fail when no payload manifest exists" in { + val bag = multipleManifestsBagV0() + + bag.glob("manifest-*.txt").foreach(_.delete()) + + bag.isValid.left.value shouldBe "Bag does not contain a payload manifest file!" + } + + it should "fail when not all fetch files are resolved" in { + val bag = fetchBagV0() + + bag.isValid.left.value shouldBe s"Fetch item [$lipsum1URL 12 ${ bag.data / "sub" / "u" }] has not been fetched!" + } + + it should "fail when not all files in the payload manifests are present" in { + val bag = simpleBagV0() + val x = bag.data / "x" + + forEvery(bag.payloadManifests) { + case (_, manifest) => + manifest should contain key x + } + + x.delete() + + bag.isValid.left.value shouldBe s"Manifest(s) contains file(s) [$x] but they don't exist!" + } + + it should "fail when not all files in the tag manifests are present" in { + val bag = simpleBagV0() + val bagInfo = bag / "bag-info.txt" + + forEvery(bag.tagManifests) { + case (_, manifest) => + manifest should contain key bagInfo + } + + bagInfo.delete() + + bag.isValid.left.value shouldBe s"Manifest(s) contains file(s) [$bagInfo] but they don't exist!" + } + + it should "fail when not all files are listed in all payload manifests" in { + val bagDir = multipleManifestsBagDirV0 + bagDir / "manifest-sha1.txt" writeText "" + + val bag = multipleManifestsBagV0().withBagitVersion(1, 0) + + bag.payloadManifests(ChecksumAlgorithm.SHA1) shouldBe empty + + bag.isValid.left.value should ( + startWith("File [") and + endWith("] is in the payload directory but isn't listed in manifest manifest-sha1.txt!") + ) + } + + it should "fail when the bag contains an invalid payload checksum" in { + val bag = simpleBagV0() + val x = bag.data / "x" + val oldSha1 = x.sha1.toLowerCase + + x writeText "this causes an invalid checksum for the file" + val newSha1 = x.sha1.toLowerCase + + bag.isValid.left.value shouldBe s"File [$x] is suppose to have a [SHA-1] hash of [$oldSha1] but was computed [$newSha1]." + } + + it should "fail when the bag contains an invalid tag checksum" in { + val bag = simpleBagV0() + val datasetxml = bag / "metadata" / "dataset.xml" + val oldSha1 = datasetxml.sha1.toLowerCase + + datasetxml writeText "this causes an invalid checksum for the file" + val newSha1 = datasetxml.sha1.toLowerCase + + bag.isValid.left.value shouldBe s"File [$datasetxml] is suppose to have a [SHA-1] hash of [$oldSha1] but was computed [$newSha1]." + } }