From d46a27d8c10fa76736da45f7c78e0645c0d60f0d Mon Sep 17 00:00:00 2001 From: Spencer Schrock Date: Tue, 8 Nov 2022 15:15:35 -0800 Subject: [PATCH 1/7] add prefix marker file to config Signed-off-by: Spencer Schrock --- cron/config/config.yaml | 10 +++++++++- cron/config/config_test.go | 23 ++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/cron/config/config.yaml b/cron/config/config.yaml index d0846c1d1ad..e9d387e8da4 100644 --- a/cron/config/config.yaml +++ b/cron/config/config.yaml @@ -22,12 +22,20 @@ shard-size: 10 webhook-url: metric-exporter: stackdriver result-data-bucket-url: gs://ossf-scorecard-data2 + +# TODO temporarily leaving old variables until changes propagate to production input-bucket-url: gs://ossf-scorecard-input-projects # Can be used to specify directories within a bucket. Can be empty. input-bucket-prefix: additional-params: - criticality: + input-bucket: + url: gs://ossf-scorecard-input-projects + # Optional prefix to limit files used as input files within a bucket (e.g. a specific file or directory) + prefix: + # Optional file to read a prefix from, instead of statically defining prefix above (note: prefix must be blank to use this option) + # This is good in situations where the prefix changes frequently (e.g. always using the most recent folder in a bucket) + prefix-file: scorecard: # API results bucket diff --git a/cron/config/config_test.go b/cron/config/config_test.go index 96260c5e804..80ab95143b3 100644 --- a/cron/config/config_test.go +++ b/cron/config/config_test.go @@ -38,14 +38,20 @@ const ( prodShardSize int = 10 prodMetricExporter string = "stackdriver" // Raw results. - prodRawBucket = "gs://ossf-scorecard-rawdata" - prodRawBigQueryTable = "scorecard-rawdata" - prodAPIBucketURL = "gs://ossf-scorecard-cron-results" - prodInputBucketURL = "gs://ossf-scorecard-input-projects" - prodInputBucketPrefix = "" + prodRawBucket = "gs://ossf-scorecard-rawdata" + prodRawBigQueryTable = "scorecard-rawdata" + prodAPIBucketURL = "gs://ossf-scorecard-cron-results" + prodInputBucketURL = "gs://ossf-scorecard-input-projects" + prodInputBucketPrefix = "" + prodInputBucketPrefixFile = "" ) var ( + prodInputBucketParams = map[string]string{ + "url": prodInputBucketURL, + "prefix": prodInputBucketPrefix, + "prefix-file": prodInputBucketPrefixFile, + } prodScorecardParams = map[string]string{ "api-results-bucket-url": prodAPIBucketURL, "blacklisted-checks": prodBlacklistedChecks, @@ -53,10 +59,9 @@ var ( "raw-bigquery-table": prodRawBigQueryTable, "raw-result-data-bucket-url": prodRawBucket, } - prodCriticalityParams map[string]string = nil - prodAdditionalParams = map[string]map[string]string{ - "scorecard": prodScorecardParams, - "criticality": prodCriticalityParams, + prodAdditionalParams = map[string]map[string]string{ + "input-bucket": prodInputBucketParams, + "scorecard": prodScorecardParams, } ) From f852145f1310821fa3627599321bcbd7eafbf218 Mon Sep 17 00:00:00 2001 From: Spencer Schrock Date: Tue, 8 Nov 2022 16:05:14 -0800 Subject: [PATCH 2/7] Read the new config values, if they exist. Signed-off-by: Spencer Schrock --- cron/config/config.go | 23 ++++++++++++++++++----- cron/config/config_test.go | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 5 deletions(-) diff --git a/cron/config/config.go b/cron/config/config.go index 34a501338bf..6b5d3b863cc 100644 --- a/cron/config/config.go +++ b/cron/config/config.go @@ -41,6 +41,8 @@ const ( configDefault string = "" configUsage string = "Location of config file. Required" + inputBucketName string = "input-bucket" + projectID string = "SCORECARD_PROJECT_ID" requestTopicURL string = "SCORECARD_REQUEST_TOPIC_URL" requestSubscriptionURL string = "SCORECARD_REQUEST_SUBSCRIPTION_URL" @@ -300,16 +302,27 @@ func GetAPIResultsBucketURL() (string, error) { // GetInputBucketURL() returns the bucket URL for input files. func GetInputBucketURL() (string, error) { - return getStringConfigValue(inputBucketURL, configYAML, "InputBucketURL", "input-bucket-url") + bucketParams, err := GetAdditionalParams(inputBucketName) + bURL, ok := bucketParams["url"] + if err != nil || !ok { + // TODO temporarily falling back to old variables until changes propagate to production + return getStringConfigValue(inputBucketURL, configYAML, "InputBucketURL", "input-bucket-url") + } + return bURL, nil } // GetInputBucketPrefix() returns the prefix used when fetching files from a bucket. func GetInputBucketPrefix() (string, error) { - prefix, err := getStringConfigValue(inputBucketPrefix, configYAML, "InputBucketPrefix", "input-bucket-prefix") - if err != nil && !errors.Is(err, ErrorEmptyConfigValue) { - return "", err + bucketParams, err := GetAdditionalParams(inputBucketName) + if err != nil { + // TODO temporarily falling back to old variables until changes propagate to production + prefix, err := getStringConfigValue(inputBucketPrefix, configYAML, "InputBucketPrefix", "input-bucket-prefix") + if err != nil && !errors.Is(err, ErrorEmptyConfigValue) { + return "", err + } + return prefix, nil } - return prefix, nil + return bucketParams["prefix"], nil } func GetAdditionalParams(subMapName string) (map[string]string, error) { diff --git a/cron/config/config_test.go b/cron/config/config_test.go index 80ab95143b3..3f9851d470b 100644 --- a/cron/config/config_test.go +++ b/cron/config/config_test.go @@ -479,3 +479,41 @@ func TestEnvVarName(t *testing.T) { }) } } + +func TestGetAdditionalParams(t *testing.T) { + t.Parallel() + //nolint:govet + tests := []struct { + name string + mapName string + want map[string]string + wantErr bool + }{ + { + name: "scorecard values", + mapName: "scorecard", + want: prodScorecardParams, + wantErr: false, + }, + { + name: "nonexistant value", + mapName: "this-value-should-never-exist", + want: map[string]string{}, + wantErr: true, + }, + } + for _, testcase := range tests { + testcase := testcase + t.Run(testcase.name, func(t *testing.T) { + t.Parallel() + got, err := GetAdditionalParams(testcase.mapName) + if testcase.wantErr != (err != nil) { + t.Fatalf("unexpected error value for GetAdditionalParams: %v", err) + } + if !cmp.Equal(got, testcase.want) { + diff := cmp.Diff(got, testcase.want) + t.Errorf("test failed: expected - %v, got - %v. \n%s", testcase.want, got, diff) + } + }) + } +} From 82501b46b68faa0ad55fff1cec3c8ea52ed689a1 Mon Sep 17 00:00:00 2001 From: Spencer Schrock Date: Tue, 8 Nov 2022 16:28:12 -0800 Subject: [PATCH 3/7] Add function to fetch prefix file config value. Signed-off-by: Spencer Schrock --- cron/config/config.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cron/config/config.go b/cron/config/config.go index 6b5d3b863cc..14177c194de 100644 --- a/cron/config/config.go +++ b/cron/config/config.go @@ -325,6 +325,15 @@ func GetInputBucketPrefix() (string, error) { return bucketParams["prefix"], nil } +// GetInputBucketPrefixFile() returns the file whose contents specify the prefix to use. +func GetInputBucketPrefixFile() (string, error) { + bucketParams, err := GetAdditionalParams(inputBucketName) + if err != nil { + return "", fmt.Errorf("getting config for %s: %w", inputBucketName, err) + } + return bucketParams["prefix-file"], nil +} + func GetAdditionalParams(subMapName string) (map[string]string, error) { return getMapConfigValue(configYAML, "AdditionalParams", "additional-params", subMapName) } From 821d8720bcf5fd5a42fa1084912d47458940b76d Mon Sep 17 00:00:00 2001 From: Spencer Schrock Date: Tue, 8 Nov 2022 16:32:28 -0800 Subject: [PATCH 4/7] Read prefix file if prefix not set. Signed-off-by: Spencer Schrock --- cron/internal/controller/bucket.go | 92 ++++++++++++++++++++++++++++++ cron/internal/controller/main.go | 37 ------------ 2 files changed, 92 insertions(+), 37 deletions(-) create mode 100644 cron/internal/controller/bucket.go diff --git a/cron/internal/controller/bucket.go b/cron/internal/controller/bucket.go new file mode 100644 index 00000000000..82dbda4d683 --- /dev/null +++ b/cron/internal/controller/bucket.go @@ -0,0 +1,92 @@ +// Copyright 2021 Security Scorecard Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package main implements the PubSub controller. +package main + +import ( + "bytes" + "context" + "fmt" + "strings" + + "github.com/ossf/scorecard/v4/cron/config" + "github.com/ossf/scorecard/v4/cron/data" +) + +// getPrefix returns the prefix used when reading input files from a bucket. +// If "prefix" is set, the value is used irrespective of the value of "prefix-file". +// Otherwise, the contents of "prefix-file" (if set) are used. +func getPrefix(ctx context.Context, bucket string) (string, error) { + prefix, err := config.GetInputBucketPrefix() + if err != nil { + return "", fmt.Errorf("config.GetInputBucketPrefix: %w", err) + } + if prefix != "" { + // prioritize prefix if set + return prefix, nil + } + + prefixFile, err := config.GetInputBucketPrefixFile() + if err != nil { + return "", fmt.Errorf("config.GetInputBucketPrefixFile: %w", err) + } + if prefixFile == "" { + // cant read a file which doesnt exist, but the value is optional so no error + return "", nil + } + + b, err := data.GetBlobContent(ctx, bucket, prefixFile) + if err != nil { + return "", fmt.Errorf("fetching contents of prefix-file: %w", err) + } + s := string(b) + return strings.TrimSpace(s), nil +} + +func bucketFiles(ctx context.Context) data.Iterator { + var iters []data.Iterator + + bucket, err := config.GetInputBucketURL() + if err != nil { + panic(err) + } + prefix, err := getPrefix(ctx, bucket) + if err != nil { + panic(err) + } + + files, err := data.GetBlobKeysWithPrefix(ctx, bucket, prefix) + if err != nil { + panic(err) + } + + for _, f := range files { + b, err := data.GetBlobContent(ctx, bucket, f) + if err != nil { + panic(err) + } + r := bytes.NewReader(b) + i, err := data.MakeIteratorFrom(r) + if err != nil { + panic(err) + } + iters = append(iters, i) + } + iter, err := data.MakeNestedIterator(iters) + if err != nil { + panic(err) + } + return iter +} diff --git a/cron/internal/controller/main.go b/cron/internal/controller/main.go index da37de62c8e..4ed44d7c9e1 100644 --- a/cron/internal/controller/main.go +++ b/cron/internal/controller/main.go @@ -16,7 +16,6 @@ package main import ( - "bytes" "context" "flag" "fmt" @@ -104,42 +103,6 @@ func localFiles(filenames []string) data.Iterator { return iter } -func bucketFiles(ctx context.Context) data.Iterator { - var iters []data.Iterator - - bucket, err := config.GetInputBucketURL() - if err != nil { - panic(err) - } - prefix, err := config.GetInputBucketPrefix() - if err != nil { - panic(err) - } - - files, err := data.GetBlobKeysWithPrefix(ctx, bucket, prefix) - if err != nil { - panic(err) - } - - for _, f := range files { - b, err := data.GetBlobContent(ctx, bucket, f) - if err != nil { - panic(err) - } - r := bytes.NewReader(b) - i, err := data.MakeIteratorFrom(r) - if err != nil { - panic(err) - } - iters = append(iters, i) - } - iter, err := data.MakeNestedIterator(iters) - if err != nil { - panic(err) - } - return iter -} - func main() { ctx := context.Background() t := time.Now() From 5a0505a2c6c2969a7a6d69e4e02553047076e58e Mon Sep 17 00:00:00 2001 From: Spencer Schrock Date: Tue, 8 Nov 2022 16:40:34 -0800 Subject: [PATCH 5/7] Add tests to verify how List works with various prefixes Signed-off-by: Spencer Schrock --- cron/data/blob_test.go | 17 ++++++++++++++++- .../testdata/blob_test/subdir/nested/key5.txt | 0 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 cron/data/testdata/blob_test/subdir/nested/key5.txt diff --git a/cron/data/blob_test.go b/cron/data/blob_test.go index 205c7b47ffb..2c2121a5c50 100644 --- a/cron/data/blob_test.go +++ b/cron/data/blob_test.go @@ -128,11 +128,26 @@ func TestBlobKeysPrefix(t *testing.T) { { name: "no prefix", prefix: "", - want: []string{"key1.txt", "key2.txt", "key3.txt", "subdir/key4.txt"}, + want: []string{"key1.txt", "key2.txt", "key3.txt", "subdir/key4.txt", "subdir/nested/key5.txt"}, }, { name: "subdir prefix", prefix: "subdir/", + want: []string{"subdir/key4.txt", "subdir/nested/key5.txt"}, + }, + { + name: "subdir prefix no terminal slash", + prefix: "subdir", + want: []string{"subdir/key4.txt", "subdir/nested/key5.txt"}, + }, + { + name: "nested prefix", + prefix: "subdir/nested/", + want: []string{"subdir/nested/key5.txt"}, + }, + { + name: "file prefix", + prefix: "subdir/key4.txt", want: []string{"subdir/key4.txt"}, }, } diff --git a/cron/data/testdata/blob_test/subdir/nested/key5.txt b/cron/data/testdata/blob_test/subdir/nested/key5.txt new file mode 100644 index 00000000000..e69de29bb2d From 9e62f24ea1582526fe21b4ab8815d0a52c00a0ad Mon Sep 17 00:00:00 2001 From: Spencer Schrock Date: Wed, 9 Nov 2022 09:52:53 -0800 Subject: [PATCH 6/7] Add tests for getPrefix Signed-off-by: Spencer Schrock --- cron/config/config.go | 17 ++-- cron/internal/controller/bucket.go | 2 +- cron/internal/controller/bucket_test.go | 95 +++++++++++++++++++ .../controller/testdata/getPrefix/marker | 1 + 4 files changed, 105 insertions(+), 10 deletions(-) create mode 100644 cron/internal/controller/bucket_test.go create mode 100644 cron/internal/controller/testdata/getPrefix/marker diff --git a/cron/config/config.go b/cron/config/config.go index 14177c194de..cbc47293313 100644 --- a/cron/config/config.go +++ b/cron/config/config.go @@ -37,11 +37,10 @@ const ( // TransferStatusFilename file identifies if shard transfer to BigQuery is completed. TransferStatusFilename string = ".transfer_complete" - configFlag string = "config" - configDefault string = "" - configUsage string = "Location of config file. Required" - - inputBucketName string = "input-bucket" + configFlag string = "config" + configDefault string = "" + configUsage string = "Location of config file. Required" + inputBucketParams string = "input-bucket" projectID string = "SCORECARD_PROJECT_ID" requestTopicURL string = "SCORECARD_REQUEST_TOPIC_URL" @@ -302,7 +301,7 @@ func GetAPIResultsBucketURL() (string, error) { // GetInputBucketURL() returns the bucket URL for input files. func GetInputBucketURL() (string, error) { - bucketParams, err := GetAdditionalParams(inputBucketName) + bucketParams, err := GetAdditionalParams(inputBucketParams) bURL, ok := bucketParams["url"] if err != nil || !ok { // TODO temporarily falling back to old variables until changes propagate to production @@ -313,7 +312,7 @@ func GetInputBucketURL() (string, error) { // GetInputBucketPrefix() returns the prefix used when fetching files from a bucket. func GetInputBucketPrefix() (string, error) { - bucketParams, err := GetAdditionalParams(inputBucketName) + bucketParams, err := GetAdditionalParams(inputBucketParams) if err != nil { // TODO temporarily falling back to old variables until changes propagate to production prefix, err := getStringConfigValue(inputBucketPrefix, configYAML, "InputBucketPrefix", "input-bucket-prefix") @@ -327,9 +326,9 @@ func GetInputBucketPrefix() (string, error) { // GetInputBucketPrefixFile() returns the file whose contents specify the prefix to use. func GetInputBucketPrefixFile() (string, error) { - bucketParams, err := GetAdditionalParams(inputBucketName) + bucketParams, err := GetAdditionalParams(inputBucketParams) if err != nil { - return "", fmt.Errorf("getting config for %s: %w", inputBucketName, err) + return "", fmt.Errorf("getting config for %s: %w", inputBucketParams, err) } return bucketParams["prefix-file"], nil } diff --git a/cron/internal/controller/bucket.go b/cron/internal/controller/bucket.go index 82dbda4d683..203f872112e 100644 --- a/cron/internal/controller/bucket.go +++ b/cron/internal/controller/bucket.go @@ -1,4 +1,4 @@ -// Copyright 2021 Security Scorecard Authors +// Copyright 2022 Security Scorecard Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/cron/internal/controller/bucket_test.go b/cron/internal/controller/bucket_test.go new file mode 100644 index 00000000000..38e78b419e9 --- /dev/null +++ b/cron/internal/controller/bucket_test.go @@ -0,0 +1,95 @@ +// Copyright 2022 Security Scorecard Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "path/filepath" + "testing" +) + +//nolint:tparallel,paralleltest // since t.Setenv is used +func TestGetPrefix(t *testing.T) { + //nolint:govet + testcases := []struct { + name string + url string + prefix string + prefixFile string + want string + wantErr bool + }{ + { + name: "no prefix", + url: "testdata/getPrefix", + prefix: "", + prefixFile: "", + want: "", + wantErr: false, + }, + { + name: "prefix set", + url: "testdata/getPrefix", + prefix: "foo", + prefixFile: "", + want: "foo", + wantErr: false, + }, + { + name: "prefix file set", + url: "testdata/getPrefix", + prefix: "", + prefixFile: "marker", + want: "bar", + wantErr: false, + }, + { + name: "prefix and prefix file set", + url: "testdata/getPrefix", + prefix: "foo", + prefixFile: "marker", + want: "foo", + wantErr: false, + }, + { + name: "non existent prefix file", + url: "testdata/getPrefix", + prefix: "", + prefixFile: "baz", + want: "", + wantErr: true, + }, + } + + for _, testcase := range testcases { + testcase := testcase + t.Run(testcase.name, func(t *testing.T) { + t.Setenv("INPUT_BUCKET_URL", testcase.url) + t.Setenv("INPUT_BUCKET_PREFIX", testcase.prefix) + t.Setenv("INPUT_BUCKET_PREFIX_FILE", testcase.prefixFile) + testdataPath, err := filepath.Abs(testcase.url) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + got, err := getPrefix(context.Background(), "file:///"+testdataPath) + if (err != nil) != testcase.wantErr { + t.Errorf("unexpected error produced: %v", err) + } + if got != testcase.want { + t.Errorf("test failed: expected - %s, got = %s", testcase.want, got) + } + }) + } +} diff --git a/cron/internal/controller/testdata/getPrefix/marker b/cron/internal/controller/testdata/getPrefix/marker new file mode 100644 index 00000000000..5716ca5987c --- /dev/null +++ b/cron/internal/controller/testdata/getPrefix/marker @@ -0,0 +1 @@ +bar From 937a80aae188996313474ad50aa9dab185b45098 Mon Sep 17 00:00:00 2001 From: Spencer Schrock Date: Wed, 9 Nov 2022 13:12:13 -0800 Subject: [PATCH 7/7] Remove panics from iterator helper functions Signed-off-by: Spencer Schrock --- cron/internal/controller/bucket.go | 16 ++++++++-------- cron/internal/controller/main.go | 18 +++++++++++------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/cron/internal/controller/bucket.go b/cron/internal/controller/bucket.go index 203f872112e..0cf7bbfdf80 100644 --- a/cron/internal/controller/bucket.go +++ b/cron/internal/controller/bucket.go @@ -55,38 +55,38 @@ func getPrefix(ctx context.Context, bucket string) (string, error) { return strings.TrimSpace(s), nil } -func bucketFiles(ctx context.Context) data.Iterator { +func bucketFiles(ctx context.Context) (data.Iterator, error) { var iters []data.Iterator bucket, err := config.GetInputBucketURL() if err != nil { - panic(err) + return nil, fmt.Errorf("config.GetInputBucketURL: %w", err) } prefix, err := getPrefix(ctx, bucket) if err != nil { - panic(err) + return nil, fmt.Errorf("getPrefix: %w", err) } files, err := data.GetBlobKeysWithPrefix(ctx, bucket, prefix) if err != nil { - panic(err) + return nil, fmt.Errorf("data.GetBlobKeysWithPrefix: %w", err) } for _, f := range files { b, err := data.GetBlobContent(ctx, bucket, f) if err != nil { - panic(err) + return nil, fmt.Errorf("data.GetBlobContent: %w", err) } r := bytes.NewReader(b) i, err := data.MakeIteratorFrom(r) if err != nil { - panic(err) + return nil, fmt.Errorf("data.MakeIteratorFrom: %w", err) } iters = append(iters, i) } iter, err := data.MakeNestedIterator(iters) if err != nil { - panic(err) + return nil, fmt.Errorf("data.MakeNestedIterator: %w", err) } - return iter + return iter, nil } diff --git a/cron/internal/controller/main.go b/cron/internal/controller/main.go index 4ed44d7c9e1..628a25894cf 100644 --- a/cron/internal/controller/main.go +++ b/cron/internal/controller/main.go @@ -83,24 +83,24 @@ func publishToRepoRequestTopic(iter data.Iterator, topicPublisher pubsub.Publish return shardNum, nil } -func localFiles(filenames []string) data.Iterator { +func localFiles(filenames []string) (data.Iterator, error) { var iters []data.Iterator for _, filename := range filenames { f, err := os.Open(filename) if err != nil { - panic(err) + return nil, fmt.Errorf("unable to open input file: %w", err) } i, err := data.MakeIteratorFrom(f) if err != nil { - panic(err) + return nil, fmt.Errorf("data.MakeIteratorFrom: %w", err) } iters = append(iters, i) } iter, err := data.MakeNestedIterator(iters) if err != nil { - panic(err) + return nil, fmt.Errorf("data.MakeNestedIterator: %w", err) } - return iter + return iter, nil } func main() { @@ -138,10 +138,14 @@ func main() { var reader data.Iterator if useLocalFiles := len(flag.Args()) > 0; useLocalFiles { - reader = localFiles(flag.Args()) + reader, err = localFiles(flag.Args()) } else { - reader = bucketFiles(ctx) + reader, err = bucketFiles(ctx) } + if err != nil { + panic(err) + } + shardNum, err := publishToRepoRequestTopic(reader, topicPublisher, shardSize, t) if err != nil { panic(err)