From 1a4fdb9fd28b6c88f1f4328346e2c81274f0eb78 Mon Sep 17 00:00:00 2001 From: Spencer Schrock Date: Thu, 3 Nov 2022 12:59:43 -0700 Subject: [PATCH] enable more performant isText Signed-off-by: Spencer Schrock --- checks/raw/binary_artifact.go | 35 ++++++++--------------------------- cron/k8s/worker.release.yaml | 2 -- 2 files changed, 8 insertions(+), 29 deletions(-) diff --git a/checks/raw/binary_artifact.go b/checks/raw/binary_artifact.go index 15b61cf61f0..ea63aaa187a 100644 --- a/checks/raw/binary_artifact.go +++ b/checks/raw/binary_artifact.go @@ -16,12 +16,9 @@ package raw import ( "fmt" - "log" - "os" "path/filepath" "regexp" "strings" - "unicode" "unicode/utf8" semver "github.com/Masterminds/semver/v3" @@ -158,12 +155,7 @@ var checkBinaryFileContent fileparser.DoWhileTrueOnFileContent = func(path strin } exists2 := binaryFileTypes[strings.ReplaceAll(filepath.Ext(path), ".", "")] - // TODO remove the comparison after testing in release-test worker - isTextFile := isText(content) - if _, enabled := os.LookupEnv("SCORECARD_COMPARE_ISTEXT"); enabled && isTextFile != isText2(content) { - log.Printf("isText implementations differ (raw.isText: %t) for file: %s", isTextFile, path) - } - if !isTextFile && exists2 { + if !isText(content) && exists2 { *pfiles = append(*pfiles, checker.File{ Path: path, Type: checker.FileTypeBinary, @@ -174,22 +166,11 @@ var checkBinaryFileContent fileparser.DoWhileTrueOnFileContent = func(path strin return true, nil } -// TODO: refine this function. -func isText(content []byte) bool { - for _, c := range string(content) { - if c == '\t' || c == '\n' || c == '\r' { - continue - } - if !unicode.IsPrint(c) { - return false - } - } - return true -} - -// TODO decine between isText and isText2 after testing in release-test worker -// modified version of golang.org/x/tools/godoc/util. -func isText2(s []byte) bool { +// determines if the first 1024 bytes are text +// +// A version of golang.org/x/tools/godoc/util modified to allow carriage returns +// and utf8.RuneError (0xFFFD), as the file may not be utf8 encoded. +func isText(s []byte) bool { const max = 1024 // at least utf8.UTFMax if len(s) > max { s = s[0:max] @@ -199,8 +180,8 @@ func isText2(s []byte) bool { // last char may be incomplete - ignore break } - if c == 0xFFFD || c < ' ' && c != '\n' && c != '\t' && c != '\r' { - // decoding error or control character - not a text file + if c < ' ' && c != '\n' && c != '\t' && c != '\r' { + // control character - not a text file return false } } diff --git a/cron/k8s/worker.release.yaml b/cron/k8s/worker.release.yaml index 335366eee95..2924ad3f490 100644 --- a/cron/k8s/worker.release.yaml +++ b/cron/k8s/worker.release.yaml @@ -44,8 +44,6 @@ spec: value: "10.4.4.210:80" - name: "SCORECARD_API_RESULTS_BUCKET_URL" value: "gs://ossf-scorecard-cron-releasetest-results" - - name: SCORECARD_COMPARE_ISTEXT - value: "1" resources: requests: memory: 5Gi