Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parsing of apk databases with large entries #1365

Merged
merged 2 commits into from
Nov 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
342 changes: 224 additions & 118 deletions syft/pkg/cataloger/apkdb/parse_apk_db.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,10 @@ package apkdb
import (
"bufio"
"fmt"
"io"
"path"
"strconv"
"strings"

"github.com/mitchellh/mapstructure"

"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/file"
Expand All @@ -22,157 +19,266 @@ import (
// integrity check
var _ generic.Parser = parseApkDB

// parseApkDb parses individual packages from a given Alpine DB file. For more information on specific fields
// see https://wiki.alpinelinux.org/wiki/Apk_spec .
// parseApkDB parses packages from a given APK installed DB file. For more
// information on specific fields, see https://wiki.alpinelinux.org/wiki/Apk_spec.
//
//nolint:funlen
func parseApkDB(_ source.FileResolver, env *generic.Environment, reader source.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
// larger capacity for the scanner.
const maxScannerCapacity = 1024 * 1024
// a new larger buffer for the scanner
bufScan := make([]byte, maxScannerCapacity)
pkgs := make([]pkg.Package, 0)

scanner := bufio.NewScanner(reader)
scanner.Buffer(bufScan, maxScannerCapacity)
onDoubleLF := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
for i := 0; i < len(data); i++ {
if i > 0 && data[i-1] == '\n' && data[i] == '\n' {
return i + 1, data[:i-1], nil

var apks []pkg.ApkMetadata
var currentEntry pkg.ApkMetadata
entryParsingInProgress := false
fileParsingCtx := newApkFileParsingContext()

// creating a dedicated append-like function here instead of using `append(...)`
// below since there is nontrivial logic to be performed for each finalized apk
// entry.
appendApk := func(p pkg.ApkMetadata) {
if files := fileParsingCtx.files; len(files) >= 1 {
// attached accumulated files to current package
p.Files = files

// reset file parsing for next use
fileParsingCtx = newApkFileParsingContext()
}

nilFieldsToEmptySlice(&p)
apks = append(apks, p)
}

for scanner.Scan() {
line := scanner.Text()

if line == "" {
// i.e. apk entry separator

if entryParsingInProgress {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: I might be reading the flow wrong here:

"If entry parsing In Progress (true)" then the entry is complete and we append?

Does the opposite sound better?

"If entry parsing is no longer in progress (false)" then currentEntry is complete and we append.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"If entry parsing In Progress (true)" then the entry is complete and we append?

Yes (almost) — "If entry parsing has been in progress AND we've now encountered an empty line, then call the entry complete NOW".

We don't know that parsing has completed for a given package entry until we enter the conditional branch on line 57, where the conditions on both line 53 and line 56 are evaluated to true.

The idea here is that each successive line of the file has a field that's applied to the same package entry unless the entry has been completely parsed, i.e. an empty line has been found. And only in that case would we start applying data to a new package entry (if there's more package data left in the file).

The if check on line 56 there is needed because if we encounter an empty line, but we hadn't been working on an entry, we shouldn't add anything to the package collection.

Very unopinionated on the variable name, so if you have something that makes more sense here, I can go ahead and update it!

// current entry is complete
appendApk(currentEntry)
}

entryParsingInProgress = false

// zero-out currentEntry for use by any future entry
currentEntry = pkg.ApkMetadata{}

continue
}
if !atEOF {
return 0, nil, nil

field := parseApkField(line)
if field == nil {
log.Warnf("unable to parse field data from line %q", line)
continue
}
// deliver the last token (which could be an empty string)
return 0, data, bufio.ErrFinalToken

entryParsingInProgress = true

field.apply(&currentEntry, fileParsingCtx)
}

if entryParsingInProgress {
// There was no final empty line, so currentEntry hasn't been added to the
// collection yet; but we've now reached the end of scanning, so let's be sure to
// add currentEntry to the collection.
appendApk(currentEntry)
}

if err := scanner.Err(); err != nil {
return nil, nil, fmt.Errorf("failed to parse APK installed DB file: %w", err)
}

var r *linux.Release
if env != nil {
r = env.LinuxRelease
}

scanner.Split(onDoubleLF)
for scanner.Scan() {
metadata, err := parseApkDBEntry(strings.NewReader(scanner.Text()))
if err != nil {
return nil, nil, err
}
if metadata != nil {
pkgs = append(pkgs, newPackage(*metadata, r, reader.Location))
}
pkgs := make([]pkg.Package, 0, len(apks))
for _, apk := range apks {
pkgs = append(pkgs, newPackage(apk, r, reader.Location))
}

if err := scanner.Err(); err != nil {
return nil, nil, fmt.Errorf("failed to parse APK DB file: %w", err)
return pkgs, discoverPackageDependencies(pkgs), nil
}

func parseApkField(line string) *apkField {
parts := strings.SplitN(line, ":", 2)
if len(parts) != 2 {
return nil
}

return pkgs, discoverPackageDependencies(pkgs), nil
f := apkField{
name: parts[0],
value: parts[1],
}

return &f
}

type apkField struct {
name string
value string
}

// parseApkDBEntry reads and parses a single pkg.ApkMetadata element from the stream, returning nil if their are no more entries.
//
//nolint:funlen
func parseApkDBEntry(reader io.Reader) (*pkg.ApkMetadata, error) {
var entry pkg.ApkMetadata
pkgFields := make(map[string]interface{})
func (f apkField) apply(p *pkg.ApkMetadata, ctx *apkFileParsingContext) {
switch f.name {
// APKINDEX field parsing

// We want sane defaults for collections, i.e. an empty array instead of null.
pkgFields["D"] = []string{}
pkgFields["p"] = []string{}
files := make([]pkg.ApkFileRecord, 0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are now defaulted in parseListValue

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah good catch, there's a bit of overlap here. Maybe I should adjust parseListValue to return nil for an empty list, and leave the nil-to-empty-slice conversion in nilFieldsToEmptySlice, since only the latter is handling an empty files list currently... 🤔

Also, question (that maybe I knew the answer to at one point?):

Why is it necessary to use empty slices (e.g. []string{} instead of nil) here in the parsing logic? I know sometimes we've wanted to ensure JSON output uses [] instead of null, but that's more of a presentation concern, and I'm not remembering other reasons why this is important...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh sorry - I added this comment for myself to track where this had been moved to keep track of the diff.

I have no opinion either way on the overlap nilFieldsToEmptySlice and parseListValue and defer to you where the default of []string{} should live.

The presentation concern is valid, but I was not around when this was initially written. Another concern could be guarding against any kind of exceptions that may occur downstream for any functions that access these fields after assembly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adjusted for the overlap in f9cf2fc.

The "why []string{}?" question can be a lingering curiosity of mine 😄 — not needed for this PR in particular

case "P":
p.Package = f.value
case "o":
p.OriginPackage = f.value
case "m":
p.Maintainer = f.value
case "V":
p.Version = f.value
case "L":
p.License = f.value
case "A":
p.Architecture = f.value
case "U":
p.URL = f.value
case "T":
p.Description = f.value
case "S":
i, err := strconv.Atoi(f.value)
if err != nil {
log.Warnf("unable to parse value %q for field %q: %w", f.value, f.name, err)
return
}

var fileRecord *pkg.ApkFileRecord
lastFile := "/"
p.Size = i
case "I":
i, err := strconv.Atoi(f.value)
if err != nil {
log.Warnf("unable to parse value %q for field %q: %w", f.value, f.name, err)
return
}

scanner := bufio.NewScanner(reader)
for scanner.Scan() {
line := scanner.Text()
fields := strings.SplitN(line, ":", 2)
if len(fields) != 2 {
continue
p.InstalledSize = i
case "D":
deps := parseListValue(f.value)
p.Dependencies = deps
case "p":
provides := parseListValue(f.value)
p.Provides = provides
case "C":
p.Checksum = f.value
case "c":
p.GitCommit = f.value

// File/directory field parsing:

case "F":
directory := path.Join("/", f.value)

ctx.files = append(ctx.files, pkg.ApkFileRecord{Path: directory})
ctx.indexOfLatestDirectory = len(ctx.files) - 1
case "M":
i := ctx.indexOfLatestDirectory
latest := ctx.files[i]

var ok bool
latest.OwnerUID, latest.OwnerGID, latest.Permissions, ok = processFileInfo(f.value)
if !ok {
log.Warnf("unexpected value for APK ACL field %q: %q", f.name, f.value)
return
}

key := fields[0]
value := strings.TrimSpace(fields[1])
// save updated directory
ctx.files[i] = latest
case "R":
var regularFile string

switch key {
case "D", "p":
entries := strings.Split(value, " ")
pkgFields[key] = entries
case "F":
currentFile := "/" + value
dirIndex := ctx.indexOfLatestDirectory
if dirIndex < 0 {
regularFile = path.Join("/", f.value)
} else {
latestDirPath := ctx.files[dirIndex].Path
regularFile = path.Join(latestDirPath, f.value)
}

newFileRecord := pkg.ApkFileRecord{
Path: currentFile,
}
files = append(files, newFileRecord)
fileRecord = &files[len(files)-1]
ctx.files = append(ctx.files, pkg.ApkFileRecord{Path: regularFile})
ctx.indexOfLatestRegularFile = len(ctx.files) - 1
case "a":
i := ctx.indexOfLatestRegularFile
latest := ctx.files[i]

// future aux references are relative to previous "F" records
lastFile = currentFile
continue
case "R":
newFileRecord := pkg.ApkFileRecord{
Path: path.Join(lastFile, value),
}
files = append(files, newFileRecord)
fileRecord = &files[len(files)-1]
case "a", "M":
ownershipFields := strings.Split(value, ":")
if len(ownershipFields) < 3 {
log.Warnf("unexpected APK ownership field: %q", value)
continue
}
if fileRecord == nil {
log.Warnf("ownership field with no parent record: %q", value)
continue
}
fileRecord.OwnerUID = ownershipFields[0]
fileRecord.OwnerGID = ownershipFields[1]
fileRecord.Permissions = ownershipFields[2]
// note: there are more optional fields available that we are not capturing, e.g.:
// "0:0:755:Q1JaDEHQHBbizhEzoWK1YxuraNU/4="
case "Z":
if fileRecord == nil {
log.Warnf("checksum field with no parent record: %q", value)
continue
}
fileRecord.Digest = processChecksum(value)
case "I", "S":
// coerce to integer
iVal, err := strconv.Atoi(value)
if err != nil {
return nil, fmt.Errorf("failed to parse APK int: '%+v'", value)
}
pkgFields[key] = iVal
default:
pkgFields[key] = value
var ok bool
latest.OwnerUID, latest.OwnerGID, latest.Permissions, ok = processFileInfo(f.value)
if !ok {
log.Warnf("unexpected value for APK ACL field %q: %q", f.name, f.value)
return
}

// save updated file
ctx.files[i] = latest
case "Z":
i := ctx.indexOfLatestRegularFile
latest := ctx.files[i]
latest.Digest = processChecksum(f.value)

// save updated file
ctx.files[i] = latest
}
Copy link
Contributor

@spiffcs spiffcs Nov 28, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to add a default case for apply here to indicate if f.name is unknown or is that overkill?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can! Is an extra/unknown field something we'd want to warn users about? (And if so, why? Just curious)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might have more context on the why - How often if at all do new field names get added or removed here?

If it's super stable and no additional information will be added in the future then it's probably not worth the default case.

The why from my side is the proactive => "Hey! This new type of entry has been added and syft doesn't handle it" kind of message.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I gotcha, yeah.

AFAIK the format of the installed DB is very stable. I believe most of the current efforts on apk itself are focused on the upcoming apkv3, which has an entirely new encoding of this data altogether. The single letter field thing (e.g. P:, Z:, etc.) is an apkv2 thing (not v3) — so while the apkv2 format itself is probably not going to change, eventually apkv2 will probably be phased out. And when apkv3 is eventually rolled out, Syft will need an entirely new parser in order to support it.

So based on your thoughts, I'd suggest not handling extra fields explicitly right now, since I don't think that's very likely to happen?

}

func processFileInfo(v string) (uid, gid, perms string, ok bool) {
ok = false

fileInfo := strings.Split(v, ":")
if len(fileInfo) < 3 {
return
}

decoder, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
// By default, mapstructure compares field names in a *case-insensitive* manner.
// That would be the wrong approach here, since these apk files use case
// *sensitive* field names (e.g. 'P' vs. 'p').
MatchName: func(mapKey, fieldName string) bool {
return mapKey == fieldName
},
Result: &entry,
})
if err != nil {
return nil, err
uid = fileInfo[0]
gid = fileInfo[1]
perms = fileInfo[2]

// note: there are more optional fields available that we are not capturing,
// e.g.: "0:0:755:Q1JaDEHQHBbizhEzoWK1YxuraNU/4="

ok = true
return
}

// apkFileParsingContext helps keep track of what file data has been captured so far for the APK currently being parsed.
type apkFileParsingContext struct {
files []pkg.ApkFileRecord
indexOfLatestDirectory int
indexOfLatestRegularFile int
}

func newApkFileParsingContext() *apkFileParsingContext {
return &apkFileParsingContext{
indexOfLatestDirectory: -1, // no directories yet
indexOfLatestRegularFile: -1, // no regular files yet
}
}

if err := decoder.Decode(pkgFields); err != nil {
return nil, fmt.Errorf("unable to parse APK metadata: %w", err)
// parseListValue parses a space-separated list from an apk entry field value.
func parseListValue(value string) []string {
items := strings.Split(value, " ")
if len(items) >= 1 {
return items
}
if entry.Package == "" {
return nil, nil

return nil
}

func nilFieldsToEmptySlice(p *pkg.ApkMetadata) {
if p.Dependencies == nil {
p.Dependencies = []string{}
}

entry.Files = files
if p.Provides == nil {
p.Provides = []string{}
}

return &entry, nil
if p.Files == nil {
p.Files = []pkg.ApkFileRecord{}
}
}

func processChecksum(value string) *file.Digest {
Expand Down
Loading