Skip to content

Commit

Permalink
GCP/Datalake validation improvements (#1022)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidsloan authored Jan 4, 2024
1 parent 66bfd59 commit 7ca74af
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 51 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@ import cats.data.Validated
import io.lenses.streamreactor.connect.cloud.common.model.location.CloudLocation
import io.lenses.streamreactor.connect.cloud.common.model.location.CloudLocationValidator

/**
* This is a best-efforts validator for Datalake Container names. It won't validate DNS, ownership etc but it will allow the sink to fail fast in case validation fails on the broad rules.
*/
object DatalakeLocationValidator extends CloudLocationValidator {
private val ContainerNamePattern = "^[a-z0-9][a-z0-9-]{1,61}[a-z0-9]$".r

private val ContainerNamePattern = "^[a-z0-9](?!.*--)[a-z0-9-]{1,61}[a-z0-9]$".r

def validate(location: CloudLocation): Validated[Throwable, CloudLocation] =
Validated.fromEither(
Expand All @@ -29,11 +33,21 @@ object DatalakeLocationValidator extends CloudLocationValidator {
} yield location,
)

/**
  * Best-effort check of a Datalake container name against `ContainerNamePattern`.
  *
  * From [[https://learn.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata Microsoft Datalake Docs]]
  * A container name must be a valid DNS name, conforming to the following naming rules:
  * <ul>
  * <li>Container names must start or end with a letter or number, and can contain only letters, numbers, and the hyphen/minus (-) character.</li>
  * <li>Every hyphen/minus (-) character must be immediately preceded and followed by a letter or number; consecutive hyphens aren't permitted in container names.</li>
  * <li>All letters in a container name must be lowercase.</li>
  * <li>Container names must be from 3 through 63 characters long.</li>
  * </ul>
  *
  * @param bucketName the container name to check
  * @return `Valid(bucketName)` when the name matches the pattern, otherwise `Invalid` wrapping an
  *         `IllegalArgumentException` whose message is "Invalid bucket name"
  */
private def validateBucketName(bucketName: String): Validated[Throwable, String] =
  if (ContainerNamePattern.matches(bucketName)) {
    Validated.Valid(bucketName)
  } else {
    // Single result expression only: the earlier "Nested prefix not currently supported"
    // value was a leftover that was constructed and then discarded.
    Validated.Invalid(new IllegalArgumentException("Invalid bucket name"))
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -16,31 +16,62 @@
package io.lenses.streamreactor.connect.datalake.model.location

import cats.data.Validated
import cats.implicits.catsSyntaxOptionId
import cats.implicits.none
import io.lenses.streamreactor.connect.cloud.common.model.location.CloudLocation
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers
import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks

/**
  * Table-driven tests for [[DatalakeLocationValidator]].
  *
  * One test is registered per row of each table, so a failing name shows up
  * individually in the test report.
  */
class DatalakeLocationValidatorTest
    extends AnyFunSuite
    with Matchers
    with ValidatedValues
    with ScalaCheckPropertyChecks {

  // Names expected to be accepted by the validator.
  private val validBucketNames =
    Table(
      "bucketName",
      "abc123",
      "container1",
      "name-123",
      "a-b-c-4",
      "x12",
      "my-container-99",
    )

  // Names expected to be rejected, paired with the expected tail of the error message.
  private val invalidBucketNames =
    Table(
      ("bucketName", "prompt"),
      ("-abc", "Invalid bucket name"),
      ("container!", "Invalid bucket name"),
      ("name--123", "Invalid bucket name"),
      ("AaBbCc", "Invalid bucket name"),
      ("my container", "Invalid bucket name"),
      ("12345678901234567890123456789012345678901234567890123456789012345", "Invalid bucket name"),
      ("ab_c", "Invalid bucket name"),
      ("x1", "Invalid bucket name"),
    )

  private implicit val validator: DatalakeLocationValidator.type = DatalakeLocationValidator

  forAll(validBucketNames) {
    bN: String =>
      test(s"allow valid bucket name : $bN") {
        val validLocation = CloudLocation(bN, Some("valid-prefix"))
        val result: Validated[Throwable, CloudLocation] =
          DatalakeLocationValidator.validate(validLocation)
        result.value should be(validLocation)
      }
  }

  forAll(invalidBucketNames) {
    (bN: String, prompt: String) =>
      test(s"disallow invalid bucket names : $bN") {
        val invalidLocation = CloudLocation(bN, Some("valid-prefix"))
        val result: Validated[Throwable, CloudLocation] =
          DatalakeLocationValidator.validate(invalidLocation)
        val message = result.leftValue.getMessage
        message should startWith("Invalid bucket name")
        message should endWith(prompt)
      }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright 2017-2024 Lenses.io Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.lenses.streamreactor.connect.datalake.model.location

import cats.data.Validated
import org.scalatest.Assertion
import org.scalatest.Assertions
import org.scalatest.matchers.should.Matchers

/**
  * ScalaTest helpers for unwrapping `cats.data.Validated` in tests, in the spirit of
  * ScalaTest's `EitherValues`.
  *
  * Mix the trait into a suite, or import the members of the companion object.
  */
trait ValidatedValues extends Assertions with Matchers {

  implicit class ValidatedOps[E, A](validated: Validated[E, A]) {

    // NOTE(review): cats' Validated appears to define its own Boolean `isValid` / `isInvalid`;
    // if so, those members win over this extension at call sites and these two assertions are
    // only reachable by constructing ValidatedOps explicitly. Confirm before relying on them.

    /** Asserts that the wrapped value is `Valid`; the failure message shows the actual value. */
    def isValid: Assertion =
      assert(validated.isValid, s"Expected Valid, but got $validated")

    /** Asserts that the wrapped value is `Invalid`; the failure message shows the actual value. */
    def isInvalid: Assertion =
      assert(validated.isInvalid, s"Expected Invalid, but got $validated")

    /**
      * Returns the `Valid` payload.
      *
      * @throws NoSuchElementException if the value is `Invalid`; the message includes the error
      *                                so the test output shows why validation failed
      */
    def value: A =
      validated match {
        case Validated.Valid(a)   => a
        case Validated.Invalid(e) => throw new NoSuchElementException(s"Validated is Invalid: $e")
      }

    /**
      * Returns the `Invalid` payload.
      *
      * @throws NoSuchElementException if the value is `Valid`; the message includes the
      *                                unexpectedly valid value
      */
    def leftValue: E =
      validated match {
        case Validated.Invalid(e) => e
        case Validated.Valid(a)   => throw new NoSuchElementException(s"Validated is Valid: $a")
      }
  }

}

/** Import-friendly access to the same helpers without mixing in the trait. */
object ValidatedValues extends ValidatedValues
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,56 @@ import cats.data.Validated
import io.lenses.streamreactor.connect.cloud.common.model.location.CloudLocation
import io.lenses.streamreactor.connect.cloud.common.model.location.CloudLocationValidator

/**
* This is a best-efforts validator for GCP bucket names. It won't validate DNS, ownership etc but it will allow the sink to fail fast in case an obvious error is made (eg. IP addresses used).
*/
object GCPStorageLocationValidator extends CloudLocationValidator {

  // Dot-free names: 3-63 characters, alphanumeric at both ends.
  private val ContainerNamePattern = "^[a-z0-9][a-z0-9-\\_\\.]{1,61}[a-z0-9]$".r

  // Dotted-decimal shape only; octet ranges are deliberately not checked (best effort).
  private val IPPattern = "^([0-9]{1,3}\\.){3}([0-9]{1,3})$".r

  // Dotted names: same alphabet and start/end rule as dot-free names, without the 63-character cap.
  private val DottedNamePattern = "^[a-z0-9][a-z0-9._-]*[a-z0-9]$".r

  private val MaxComponentLength  = 63
  private val MaxDottedNameLength = 222

  /**
    * Validates the bucket of the given location.
    *
    * @param location the location whose `bucket` is checked
    * @return `Valid(location)` when the bucket name passes, otherwise the `Invalid` produced by the name check
    */
  def validate(location: CloudLocation): Validated[Throwable, CloudLocation] =
    validateBucketName(location.bucket).map(_ => location)

  /**
    * From [[https://cloud.google.com/storage/docs/buckets#naming Google Cloud Docs]]
    * Your bucket names must meet the following requirements:
    * <ul>
    * <li>Bucket names can only contain lowercase letters, numeric characters, dashes (-), underscores (_), and dots (.). Spaces are not allowed. Names containing dots require verification.</li>
    * <li>Bucket names must start and end with a number or letter.</li>
    * <li>Bucket names must contain 3-63 characters. Names containing dots can contain up to 222 characters, but each dot-separated component can be no longer than 63 characters.</li>
    * <li>Bucket names cannot be represented as an IP address in dotted-decimal notation (for example, 192.168.5.4).</li>
    * <li>Bucket names cannot begin with the "goog" prefix.</li>
    * <li>Bucket names cannot contain "google" or close misspellings, such as "g00gle".</li>
    * </ul>
    *
    * Rules are checked in the order: reserved words, IP address, then shape/length.
    *
    * @param bucketName the bucket name to check
    * @return `Valid(bucketName)` or `Invalid` wrapping an `IllegalArgumentException` naming the broken rule
    */
  private def validateBucketName(bucketName: String): Validated[Throwable, String] =
    if (bucketName.contains("google") || bucketName.contains("g00gle") || bucketName.startsWith("goog")) {
      invalid("Bucket name cannot contain 'google' or variants")
    } else if (IPPattern.matches(bucketName)) {
      invalid("Bucket name should not be an IP address")
    } else if (bucketName.contains(".")) {
      validateDottedName(bucketName)
    } else if (!ContainerNamePattern.matches(bucketName)) {
      invalid("Bucket name should match regex")
    } else {
      Validated.Valid(bucketName)
    }

  /**
    * Checks a name that contains at least one dot: allowed characters, alphanumeric start/end,
    * every dot-separated component at most 63 characters, and at most 222 characters overall.
    */
  private def validateDottedName(bucketName: String): Validated[Throwable, String] = {
    val wellFormed =
      DottedNamePattern.matches(bucketName) &&
        bucketName.split('.').forall(_.length <= MaxComponentLength)

    if (!wellFormed) {
      invalid("Bucket name should match regex")
    } else if (bucketName.length > MaxDottedNameLength) {
      invalid("Bucket name containing dots should be less than 222 characters")
    } else {
      Validated.Valid(bucketName)
    }
  }

  // The message has no closing parenthesis on purpose: it is kept exactly as before because
  // callers/tests match on the message suffix.
  private def invalid(rule: String): Validated[Throwable, String] =
    Validated.Invalid(new IllegalArgumentException(s"Invalid bucket name (Rule: $rule"))

}
Original file line number Diff line number Diff line change
Expand Up @@ -20,40 +20,61 @@ import io.lenses.streamreactor.connect.cloud.common.model.location.CloudLocation
import io.lenses.streamreactor.connect.cloud.common.utils.SampleData.cloudLocationValidator
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers
import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks

/**
  * Table-driven tests for [[GCPStorageLocationValidator]].
  *
  * One test is registered per row of each table, so a failing name shows up
  * individually in the test report.
  */
class GCPStorageLocationValidatorTest
    extends AnyFunSuite
    with Matchers
    with ValidatedValues
    with ScalaCheckPropertyChecks {

  // Names expected to be accepted by the validator.
  private val validBucketNames =
    Table(
      "bucketName",
      "my-travel-maps",
      "0f75d593-8e7b-4418-a5ba-cb2970f0b91e",
      "valid_bucket_name",
      "dot.valid.bucket.name",
      "123",
    )

  // Names expected to be rejected, paired with the expected tail of the error message.
  private val invalidBucketNames =
    Table(
      ("bucketName", "prompt"),
      ("My-Travel-Maps", "Bucket name should match regex"),
      ("my_google_bucket", "Bucket name cannot contain 'google' or variants"),
      ("test bucket", "Bucket name should match regex"),
      ("invalid bucket name with space", "Bucket name should match regex"),
      ("192.168.5.4", "Bucket name should not be an IP address"),
      ("goog_bucket", "Bucket name cannot contain 'google' or variants"),
      ("g00gle_bucket", "Bucket name cannot contain 'google' or variants"),
      ("test_bucket_name_with_65_characters_xxxxxxxxxxxxxxxxxxxxxxxxxxxxx", "Rule: Bucket name should match regex"),
      ("test_bucket.name.with.225.characters.xxxxxxxxxxxxxxxxxxxxxxxxxxxxx.xxxxxxxxxxxxxxxxxxxxxxxx.xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
       "Bucket name containing dots should be less than 222 characters",
      ),
    )

  forAll(validBucketNames) {
    bN: String =>
      test(s"allow valid bucket name : $bN") {
        val validLocation = CloudLocation(bN, Some("valid-prefix"))
        val result: Validated[Throwable, CloudLocation] =
          GCPStorageLocationValidator.validate(validLocation)
        result.value should be(validLocation)
      }
  }

  forAll(invalidBucketNames) {
    (bN: String, prompt: String) =>
      test(s"disallow invalid bucket names : $bN") {
        val invalidLocation = CloudLocation(bN, Some("valid-prefix"))
        val result: Validated[Throwable, CloudLocation] =
          GCPStorageLocationValidator.validate(invalidLocation)
        val message = result.leftValue.getMessage
        message should startWith("Invalid bucket name")
        message should endWith(prompt)
      }
  }

}

0 comments on commit 7ca74af

Please sign in to comment.