-
Notifications
You must be signed in to change notification settings - Fork 594
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix parsing of apk databases with large entries #1365
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,13 +3,10 @@ package apkdb | |
import ( | ||
"bufio" | ||
"fmt" | ||
"io" | ||
"path" | ||
"strconv" | ||
"strings" | ||
|
||
"github.com/mitchellh/mapstructure" | ||
|
||
"github.com/anchore/syft/internal/log" | ||
"github.com/anchore/syft/syft/artifact" | ||
"github.com/anchore/syft/syft/file" | ||
|
@@ -22,157 +19,266 @@ import ( | |
// integrity check | ||
var _ generic.Parser = parseApkDB | ||
|
||
// parseApkDb parses individual packages from a given Alpine DB file. For more information on specific fields | ||
// see https://wiki.alpinelinux.org/wiki/Apk_spec . | ||
// parseApkDB parses packages from a given APK installed DB file. For more | ||
// information on specific fields, see https://wiki.alpinelinux.org/wiki/Apk_spec. | ||
// | ||
//nolint:funlen | ||
func parseApkDB(_ source.FileResolver, env *generic.Environment, reader source.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { | ||
// larger capacity for the scanner. | ||
const maxScannerCapacity = 1024 * 1024 | ||
// a new larger buffer for the scanner | ||
bufScan := make([]byte, maxScannerCapacity) | ||
pkgs := make([]pkg.Package, 0) | ||
|
||
scanner := bufio.NewScanner(reader) | ||
scanner.Buffer(bufScan, maxScannerCapacity) | ||
onDoubleLF := func(data []byte, atEOF bool) (advance int, token []byte, err error) { | ||
for i := 0; i < len(data); i++ { | ||
if i > 0 && data[i-1] == '\n' && data[i] == '\n' { | ||
return i + 1, data[:i-1], nil | ||
|
||
var apks []pkg.ApkMetadata | ||
var currentEntry pkg.ApkMetadata | ||
entryParsingInProgress := false | ||
fileParsingCtx := newApkFileParsingContext() | ||
|
||
// creating a dedicated append-like function here instead of using `append(...)` | ||
// below since there is nontrivial logic to be performed for each finalized apk | ||
// entry. | ||
appendApk := func(p pkg.ApkMetadata) { | ||
if files := fileParsingCtx.files; len(files) >= 1 { | ||
// attached accumulated files to current package | ||
p.Files = files | ||
|
||
// reset file parsing for next use | ||
fileParsingCtx = newApkFileParsingContext() | ||
} | ||
|
||
nilFieldsToEmptySlice(&p) | ||
apks = append(apks, p) | ||
} | ||
|
||
for scanner.Scan() { | ||
line := scanner.Text() | ||
|
||
if line == "" { | ||
// i.e. apk entry separator | ||
|
||
if entryParsingInProgress { | ||
// current entry is complete | ||
appendApk(currentEntry) | ||
} | ||
|
||
entryParsingInProgress = false | ||
|
||
// zero-out currentEntry for use by any future entry | ||
currentEntry = pkg.ApkMetadata{} | ||
|
||
continue | ||
} | ||
if !atEOF { | ||
return 0, nil, nil | ||
|
||
field := parseApkField(line) | ||
if field == nil { | ||
log.Warnf("unable to parse field data from line %q", line) | ||
continue | ||
} | ||
// deliver the last token (which could be an empty string) | ||
return 0, data, bufio.ErrFinalToken | ||
|
||
entryParsingInProgress = true | ||
|
||
field.apply(¤tEntry, fileParsingCtx) | ||
} | ||
|
||
if entryParsingInProgress { | ||
// There was no final empty line, so currentEntry hasn't been added to the | ||
// collection yet; but we've now reached the end of scanning, so let's be sure to | ||
// add currentEntry to the collection. | ||
appendApk(currentEntry) | ||
} | ||
|
||
if err := scanner.Err(); err != nil { | ||
return nil, nil, fmt.Errorf("failed to parse APK installed DB file: %w", err) | ||
} | ||
|
||
var r *linux.Release | ||
if env != nil { | ||
r = env.LinuxRelease | ||
} | ||
|
||
scanner.Split(onDoubleLF) | ||
for scanner.Scan() { | ||
metadata, err := parseApkDBEntry(strings.NewReader(scanner.Text())) | ||
if err != nil { | ||
return nil, nil, err | ||
} | ||
if metadata != nil { | ||
pkgs = append(pkgs, newPackage(*metadata, r, reader.Location)) | ||
} | ||
pkgs := make([]pkg.Package, 0, len(apks)) | ||
for _, apk := range apks { | ||
pkgs = append(pkgs, newPackage(apk, r, reader.Location)) | ||
} | ||
|
||
if err := scanner.Err(); err != nil { | ||
return nil, nil, fmt.Errorf("failed to parse APK DB file: %w", err) | ||
return pkgs, discoverPackageDependencies(pkgs), nil | ||
} | ||
|
||
func parseApkField(line string) *apkField { | ||
parts := strings.SplitN(line, ":", 2) | ||
if len(parts) != 2 { | ||
return nil | ||
} | ||
|
||
return pkgs, discoverPackageDependencies(pkgs), nil | ||
f := apkField{ | ||
name: parts[0], | ||
value: parts[1], | ||
} | ||
|
||
return &f | ||
} | ||
|
||
type apkField struct { | ||
name string | ||
value string | ||
} | ||
|
||
// parseApkDBEntry reads and parses a single pkg.ApkMetadata element from the stream, returning nil if their are no more entries. | ||
// | ||
//nolint:funlen | ||
func parseApkDBEntry(reader io.Reader) (*pkg.ApkMetadata, error) { | ||
var entry pkg.ApkMetadata | ||
pkgFields := make(map[string]interface{}) | ||
func (f apkField) apply(p *pkg.ApkMetadata, ctx *apkFileParsingContext) { | ||
switch f.name { | ||
// APKINDEX field parsing | ||
|
||
// We want sane defaults for collections, i.e. an empty array instead of null. | ||
pkgFields["D"] = []string{} | ||
pkgFields["p"] = []string{} | ||
files := make([]pkg.ApkFileRecord, 0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These are now defaulted in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah good catch, there's a bit of overlap here. Maybe I should adjust Also, question (that maybe I knew the answer to at one point?): Why is it necessary to use empty slices (e.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ahh sorry - I added this comment for myself to track where this had been moved to keep track of the diff. I have no opinion either way on the overlap The presentation concern is valid, but I was not around when this was initially written. Another concern could be guarding against any kind of exceptions that may occur downstream for any functions that access these fields after assembly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adjusted for the overlap in f9cf2fc. The "why |
||
case "P": | ||
p.Package = f.value | ||
case "o": | ||
p.OriginPackage = f.value | ||
case "m": | ||
p.Maintainer = f.value | ||
case "V": | ||
p.Version = f.value | ||
case "L": | ||
p.License = f.value | ||
case "A": | ||
p.Architecture = f.value | ||
case "U": | ||
p.URL = f.value | ||
case "T": | ||
p.Description = f.value | ||
case "S": | ||
i, err := strconv.Atoi(f.value) | ||
if err != nil { | ||
log.Warnf("unable to parse value %q for field %q: %w", f.value, f.name, err) | ||
return | ||
} | ||
|
||
var fileRecord *pkg.ApkFileRecord | ||
lastFile := "/" | ||
p.Size = i | ||
case "I": | ||
i, err := strconv.Atoi(f.value) | ||
if err != nil { | ||
log.Warnf("unable to parse value %q for field %q: %w", f.value, f.name, err) | ||
return | ||
} | ||
|
||
scanner := bufio.NewScanner(reader) | ||
for scanner.Scan() { | ||
line := scanner.Text() | ||
fields := strings.SplitN(line, ":", 2) | ||
if len(fields) != 2 { | ||
continue | ||
p.InstalledSize = i | ||
case "D": | ||
deps := parseListValue(f.value) | ||
p.Dependencies = deps | ||
case "p": | ||
provides := parseListValue(f.value) | ||
p.Provides = provides | ||
case "C": | ||
p.Checksum = f.value | ||
case "c": | ||
p.GitCommit = f.value | ||
|
||
// File/directory field parsing: | ||
|
||
case "F": | ||
directory := path.Join("/", f.value) | ||
|
||
ctx.files = append(ctx.files, pkg.ApkFileRecord{Path: directory}) | ||
ctx.indexOfLatestDirectory = len(ctx.files) - 1 | ||
case "M": | ||
i := ctx.indexOfLatestDirectory | ||
latest := ctx.files[i] | ||
|
||
var ok bool | ||
latest.OwnerUID, latest.OwnerGID, latest.Permissions, ok = processFileInfo(f.value) | ||
if !ok { | ||
log.Warnf("unexpected value for APK ACL field %q: %q", f.name, f.value) | ||
return | ||
} | ||
|
||
key := fields[0] | ||
value := strings.TrimSpace(fields[1]) | ||
// save updated directory | ||
ctx.files[i] = latest | ||
case "R": | ||
var regularFile string | ||
|
||
switch key { | ||
case "D", "p": | ||
entries := strings.Split(value, " ") | ||
pkgFields[key] = entries | ||
case "F": | ||
currentFile := "/" + value | ||
dirIndex := ctx.indexOfLatestDirectory | ||
if dirIndex < 0 { | ||
regularFile = path.Join("/", f.value) | ||
} else { | ||
latestDirPath := ctx.files[dirIndex].Path | ||
regularFile = path.Join(latestDirPath, f.value) | ||
} | ||
|
||
newFileRecord := pkg.ApkFileRecord{ | ||
Path: currentFile, | ||
} | ||
files = append(files, newFileRecord) | ||
fileRecord = &files[len(files)-1] | ||
ctx.files = append(ctx.files, pkg.ApkFileRecord{Path: regularFile}) | ||
ctx.indexOfLatestRegularFile = len(ctx.files) - 1 | ||
case "a": | ||
i := ctx.indexOfLatestRegularFile | ||
latest := ctx.files[i] | ||
|
||
// future aux references are relative to previous "F" records | ||
lastFile = currentFile | ||
continue | ||
case "R": | ||
newFileRecord := pkg.ApkFileRecord{ | ||
Path: path.Join(lastFile, value), | ||
} | ||
files = append(files, newFileRecord) | ||
fileRecord = &files[len(files)-1] | ||
case "a", "M": | ||
ownershipFields := strings.Split(value, ":") | ||
if len(ownershipFields) < 3 { | ||
log.Warnf("unexpected APK ownership field: %q", value) | ||
continue | ||
} | ||
if fileRecord == nil { | ||
log.Warnf("ownership field with no parent record: %q", value) | ||
continue | ||
} | ||
fileRecord.OwnerUID = ownershipFields[0] | ||
fileRecord.OwnerGID = ownershipFields[1] | ||
fileRecord.Permissions = ownershipFields[2] | ||
// note: there are more optional fields available that we are not capturing, e.g.: | ||
// "0:0:755:Q1JaDEHQHBbizhEzoWK1YxuraNU/4=" | ||
case "Z": | ||
if fileRecord == nil { | ||
log.Warnf("checksum field with no parent record: %q", value) | ||
continue | ||
} | ||
fileRecord.Digest = processChecksum(value) | ||
case "I", "S": | ||
// coerce to integer | ||
iVal, err := strconv.Atoi(value) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to parse APK int: '%+v'", value) | ||
} | ||
pkgFields[key] = iVal | ||
default: | ||
pkgFields[key] = value | ||
var ok bool | ||
latest.OwnerUID, latest.OwnerGID, latest.Permissions, ok = processFileInfo(f.value) | ||
if !ok { | ||
log.Warnf("unexpected value for APK ACL field %q: %q", f.name, f.value) | ||
return | ||
} | ||
|
||
// save updated file | ||
ctx.files[i] = latest | ||
case "Z": | ||
i := ctx.indexOfLatestRegularFile | ||
latest := ctx.files[i] | ||
latest.Digest = processChecksum(f.value) | ||
|
||
// save updated file | ||
ctx.files[i] = latest | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we want to add a default case for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can! Is an extra/unknown field something we'd want to warn users about? (And if so, why? Just curious) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You might have more context on the why - How often if at all do new field names get added or removed here? If it's super stable and no additional information will be added in the future then it's probably not worth the default case. The why from my side is the proactive => "Hey! This new type of entry has been added and syft doesn't handle it" kind of message. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I gotcha, yeah. AFAIK the format of the installed DB is very stable. I believe most of the current efforts on So based on your thoughts, I'd suggest not handling extra fields explicitly right now, since I don't think that's very likely to happen? |
||
} | ||
|
||
func processFileInfo(v string) (uid, gid, perms string, ok bool) { | ||
ok = false | ||
|
||
fileInfo := strings.Split(v, ":") | ||
if len(fileInfo) < 3 { | ||
return | ||
} | ||
|
||
decoder, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ | ||
// By default, mapstructure compares field names in a *case-insensitive* manner. | ||
// That would be the wrong approach here, since these apk files use case | ||
// *sensitive* field names (e.g. 'P' vs. 'p'). | ||
MatchName: func(mapKey, fieldName string) bool { | ||
return mapKey == fieldName | ||
}, | ||
Result: &entry, | ||
}) | ||
if err != nil { | ||
return nil, err | ||
uid = fileInfo[0] | ||
gid = fileInfo[1] | ||
perms = fileInfo[2] | ||
|
||
// note: there are more optional fields available that we are not capturing, | ||
// e.g.: "0:0:755:Q1JaDEHQHBbizhEzoWK1YxuraNU/4=" | ||
|
||
ok = true | ||
return | ||
} | ||
|
||
// apkFileParsingContext helps keep track of what file data has been captured so far for the APK currently being parsed. | ||
type apkFileParsingContext struct { | ||
files []pkg.ApkFileRecord | ||
indexOfLatestDirectory int | ||
indexOfLatestRegularFile int | ||
} | ||
|
||
func newApkFileParsingContext() *apkFileParsingContext { | ||
return &apkFileParsingContext{ | ||
indexOfLatestDirectory: -1, // no directories yet | ||
indexOfLatestRegularFile: -1, // no regular files yet | ||
} | ||
} | ||
|
||
if err := decoder.Decode(pkgFields); err != nil { | ||
return nil, fmt.Errorf("unable to parse APK metadata: %w", err) | ||
// parseListValue parses a space-separated list from an apk entry field value. | ||
func parseListValue(value string) []string { | ||
items := strings.Split(value, " ") | ||
if len(items) >= 1 { | ||
return items | ||
} | ||
if entry.Package == "" { | ||
return nil, nil | ||
|
||
return nil | ||
} | ||
|
||
func nilFieldsToEmptySlice(p *pkg.ApkMetadata) { | ||
if p.Dependencies == nil { | ||
p.Dependencies = []string{} | ||
} | ||
|
||
entry.Files = files | ||
if p.Provides == nil { | ||
p.Provides = []string{} | ||
} | ||
|
||
return &entry, nil | ||
if p.Files == nil { | ||
p.Files = []pkg.ApkFileRecord{} | ||
} | ||
} | ||
|
||
func processChecksum(value string) *file.Digest { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Question: I might be reading the flow wrong here:
"If entry parsing In Progress (true)" then the entry is complete and we append?
Does the opposite sound better?
"If entry parsing is no longer in progress (false)" then currentEntry is complete and we append.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes (almost) — "If entry parsing has been in progress AND we've now encountered an empty line, then call the entry complete NOW".
We don't know that parsing has completed for a given package entry until we enter the conditional branch on line 57, where the conditions on both line 53 and line 56 are evaluated to
true
.The idea here is that each successive line of the file has a field that's applied to the same package entry unless the entry has been completely parsed, i.e. an empty line has been found. And only in that case would we start applying data to a new package entry (if there's more package data left in the file).
The
if
check on line 56 there is needed because if we encounter an empty line, but we hadn't been working on an entry, we shouldn't add anything to the package collection.Very unopinionated on the variable name, so if you have something that makes more sense here, I can go ahead and update it!