Skip to content

Commit

Permalink
Merge pull request #1062 from dcantah/coredump
Browse files Browse the repository at this point in the history
Add process dump functionality for WCOW/LCOW
  • Loading branch information
dcantah authored Sep 24, 2021
2 parents 50c48de + f964e28 commit 18e2356
Show file tree
Hide file tree
Showing 23 changed files with 576 additions and 28 deletions.
14 changes: 14 additions & 0 deletions cmd/gcs/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre
func main() {
startTime := time.Now()
logLevel := flag.String("loglevel", "debug", "Logging Level: debug, info, warning, error, fatal, panic.")
coreDumpLoc := flag.String("core-dump-location", "", "The location/format where process core dumps will be written to.")
kmsgLogLevel := flag.Uint("kmsgLogLevel", uint(kmsg.Warning), "Log all kmsg entries with a priority less than or equal to the supplied level.")
logFile := flag.String("logfile", "", "Logging Target: An optional file name/path. Omit for console output.")
logFormat := flag.String("log-format", "text", "Logging Format: text or json")
Expand Down Expand Up @@ -144,6 +145,19 @@ func main() {

logrus.Info("GCS started")

// Set the process core dump location. This will be global to all containers as it's a kernel configuration.
// If no path is specified core dumps will just be placed in the working directory of wherever the process
// was invoked to a file named "core".
if *coreDumpLoc != "" {
if err := ioutil.WriteFile(
"/proc/sys/kernel/core_pattern",
[]byte(*coreDumpLoc),
0644,
); err != nil {
logrus.WithError(err).Fatal("failed to set core dump location")
}
}

// Continuously log /dev/kmsg
go kmsg.ReadForever(kmsg.LogLevel(*kmsgLogLevel))

Expand Down
3 changes: 3 additions & 0 deletions internal/guest/runtime/hcsv2/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ func (c *Container) ExecProcess(ctx context.Context, process *oci.Process, conSe
return -1, err
}

// Add in the core rlimit specified on the container in case there was one set. This makes it so that execed processes can also generate
// core dumps.
process.Rlimits = c.spec.Process.Rlimits
p, err := c.container.ExecProcess(process, stdioSet)
if err != nil {
stdioSet.Close()
Expand Down
6 changes: 6 additions & 0 deletions internal/guest/runtime/hcsv2/sandbox_container.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,12 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (
}
}

if rlimCore := spec.Annotations["io.microsoft.lcow.rlimitcore"]; rlimCore != "" {
if err := setCoreRLimit(spec, rlimCore); err != nil {
return err
}
}

// TODO: JTERRY75 /dev/shm is not properly setup for LCOW I believe. CRI
// also has a concept of a sandbox/shm file when the IPC NamespaceMode !=
// NODE.
Expand Down
26 changes: 26 additions & 0 deletions internal/guest/runtime/hcsv2/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,32 @@ func setProcess(spec *oci.Spec) {
}
}

// setCoreRLimit sets the RLIMIT_CORE entry of `spec.Process.Rlimits` from `value`.
//
// `value` must be of the form "<soft>;<hard>", where both parts are base-10
// unsigned 64-bit integers. An error is returned if `value` does not contain
// exactly two ';' separated parts, if either part fails to parse, or if the
// soft limit is greater than the hard limit.
//
// If `spec.Process.Rlimits` already contains an RLIMIT_CORE entry it is
// overwritten in place rather than appended to, as the OCI runtime spec
// requires runtimes to reject rlimit lists with duplicate types.
func setCoreRLimit(spec *oci.Spec, value string) error {
	// Make sure `spec.Process` is populated before it is touched below.
	setProcess(spec)

	vals := strings.Split(value, ";")
	if len(vals) != 2 {
		return errors.New("wrong number of values supplied for rlimit core")
	}

	soft, err := strconv.ParseUint(vals[0], 10, 64)
	if err != nil {
		return errors.Wrap(err, "failed to parse soft core rlimit")
	}
	hard, err := strconv.ParseUint(vals[1], 10, 64)
	if err != nil {
		return errors.Wrap(err, "failed to parse hard core rlimit")
	}
	// A soft limit above the hard limit can never be applied; fail here with a
	// clear message instead of letting process creation fail later on.
	if soft > hard {
		return errors.New("soft core rlimit must not be greater than hard core rlimit")
	}

	coreLimit := oci.POSIXRlimit{
		Type: "RLIMIT_CORE",
		Soft: soft,
		Hard: hard,
	}

	// Overwrite an existing RLIMIT_CORE entry so the list never ends up with
	// two entries of the same type.
	for i := range spec.Process.Rlimits {
		if spec.Process.Rlimits[i].Type == coreLimit.Type {
			spec.Process.Rlimits[i] = coreLimit
			return nil
		}
	}

	spec.Process.Rlimits = append(spec.Process.Rlimits, coreLimit)
	return nil
}

// setUserStr sets `spec.Process` to the valid `userstr` based on the OCI Image Spec
// v1.0.0 `userstr`.
//
Expand Down
6 changes: 6 additions & 0 deletions internal/guest/runtime/hcsv2/workload_container.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,12 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci.
return err
}

if rlimCore := spec.Annotations["io.microsoft.lcow.rlimitcore"]; rlimCore != "" {
if err := setCoreRLimit(spec, rlimCore); err != nil {
return err
}
}

// Force the parent cgroup into our /containers root
spec.Linux.CgroupsPath = "/containers/" + id

Expand Down
65 changes: 65 additions & 0 deletions internal/hcsoci/hcsdoc_wcow.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package hcsoci

import (
"context"
"errors"
"fmt"
"path/filepath"
"regexp"
Expand Down Expand Up @@ -385,6 +386,51 @@ func createWindowsContainerDocument(ctx context.Context, coi *createOptionsInter
}
v2Container.AdditionalDeviceNamespace = extensions

// Process dump setup (if requested)
dumpPath := ""
if coi.HostingSystem != nil {
dumpPath = coi.HostingSystem.ProcessDumpLocation()
}

if specDumpPath, ok := coi.Spec.Annotations[oci.AnnotationContainerProcessDumpLocation]; ok {
// If a process dump path was specified at pod creation time for a hypervisor isolated pod, then
// use this value. If one was specified on the container creation document then override with this
// instead. Unlike Linux, Windows containers can set the dump path on a per container basis.
dumpPath = specDumpPath
}

if dumpPath != "" {
dumpType, err := parseDumpType(coi.Spec.Annotations)
if err != nil {
return nil, nil, err
}

// Setup WER registry keys for local process dump creation if specified.
// https://docs.microsoft.com/en-us/windows/win32/wer/collecting-user-mode-dumps
v2Container.RegistryChanges = &hcsschema.RegistryChanges{
AddValues: []hcsschema.RegistryValue{
{
Key: &hcsschema.RegistryKey{
Hive: "Software",
Name: "Microsoft\\Windows\\Windows Error Reporting\\LocalDumps",
},
Name: "DumpFolder",
StringValue: dumpPath,
Type_: "String",
},
{
Key: &hcsschema.RegistryKey{
Hive: "Software",
Name: "Microsoft\\Windows\\Windows Error Reporting\\LocalDumps",
},
Name: "DumpType",
DWordValue: dumpType,
Type_: "DWord",
},
},
}
}

return v1, v2Container, nil
}

Expand Down Expand Up @@ -415,3 +461,22 @@ func parseAssignedDevices(ctx context.Context, coi *createOptionsInternal, v2 *h
v2.AssignedDevices = v2AssignedDevices
return nil
}

// parseDumpType looks up the WCOW process dump type annotation and translates it into the DWORD value
// that the WER LocalDumps "DumpType" registry value expects.
//
// "mini" maps to 1 and "full" maps to 2. A missing or empty annotation defaults to a full dump. Any
// other value yields -1 and an error.
//
// See DumpType at https://docs.microsoft.com/en-us/windows/win32/wer/collecting-user-mode-dumps for the mappings
func parseDumpType(annotations map[string]string) (int32, error) {
	const (
		miniDump int32 = 1
		fullDump int32 = 2
	)

	switch requested := annotations[oci.AnnotationWCOWProcessDumpType]; requested {
	case "mini":
		return miniDump, nil
	case "", "full":
		// Nothing specified is treated the same as an explicit request for a full dump.
		return fullDump, nil
	}

	return -1, errors.New(`unknown dump type specified, valid values are "mini" or "full"`)
}
13 changes: 13 additions & 0 deletions internal/oci/annotations.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,4 +221,17 @@ const (

// AnnotationSecurityPolicy is used to specify a security policy for opengcs to enforce
AnnotationSecurityPolicy = "io.microsoft.virtualmachine.lcow.securitypolicy"
// AnnotationContainerProcessDumpLocation specifies a path inside of containers to save process dumps to. As
// the scratch space for a container is generally cleaned up after exit, this is best set to a volume mount of
// some kind (vhd, bind mount, fileshare mount etc.)
AnnotationContainerProcessDumpLocation = "io.microsoft.container.processdumplocation"

// AnnotationWCOWProcessDumpType specifies the type of dump to create when generating a local user mode
// process dump for Windows containers. The supported options are "mini" and "full".
// See DumpType: https://docs.microsoft.com/en-us/windows/win32/wer/collecting-user-mode-dumps
AnnotationWCOWProcessDumpType = "io.microsoft.wcow.processdumptype"

// AnnotationRLimitCore specifies the core rlimit value for a container, in the form "<soft>;<hard>"
// (two base-10 unsigned integers separated by a semicolon). This will need to be set in order to have
// core dumps generated for a given container.
AnnotationRLimitCore = "io.microsoft.lcow.rlimitcore"
)
3 changes: 2 additions & 1 deletion internal/oci/uvm.go
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) (
lopts.EnableScratchEncryption = parseAnnotationsBool(ctx, s.Annotations, AnnotationEncryptedScratchDisk, lopts.EnableScratchEncryption)
lopts.SecurityPolicy = parseAnnotationsString(s.Annotations, AnnotationSecurityPolicy, lopts.SecurityPolicy)
lopts.KernelBootOptions = parseAnnotationsString(s.Annotations, AnnotationKernelBootOptions, lopts.KernelBootOptions)

lopts.ProcessDumpLocation = parseAnnotationsString(s.Annotations, AnnotationContainerProcessDumpLocation, lopts.ProcessDumpLocation)
handleAnnotationPreferredRootFSType(ctx, s.Annotations, lopts)
handleAnnotationKernelDirectBoot(ctx, s.Annotations, lopts)

Expand All @@ -357,6 +357,7 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) (
wopts.CPUGroupID = parseAnnotationsString(s.Annotations, AnnotationCPUGroupID, wopts.CPUGroupID)
wopts.NetworkConfigProxy = parseAnnotationsString(s.Annotations, AnnotationNetworkConfigProxy, wopts.NetworkConfigProxy)
wopts.NoDirectMap = parseAnnotationsBool(ctx, s.Annotations, AnnotationVSMBNoDirectMap, wopts.NoDirectMap)
wopts.ProcessDumpLocation = parseAnnotationsString(s.Annotations, AnnotationContainerProcessDumpLocation, wopts.ProcessDumpLocation)
handleAnnotationFullyPhysicallyBacked(ctx, s.Annotations, wopts)
if err := handleCloneAnnotations(ctx, s.Annotations, wopts); err != nil {
return nil, err
Expand Down
11 changes: 11 additions & 0 deletions internal/uvm/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ type Options struct {
// that receives the UVMs set of NICs from this proxy instead of enumerating
// the endpoints locally.
NetworkConfigProxy string

// Sets the location for process dumps to be placed in. On Linux this is a kernel setting so it will be
// applied to all containers. On Windows it's configurable per container, but we can mimic this for
// Windows by just applying the location specified here per container.
ProcessDumpLocation string
}

// compares the create opts used during template creation with the create opts
Expand Down Expand Up @@ -347,6 +352,12 @@ func (uvm *UtilityVM) PhysicallyBacked() bool {
return uvm.physicallyBacked
}

// ProcessDumpLocation reports the process dump location recorded on this UVM, i.e. where dumps for
// containers running inside of it are expected to be written. The result is the empty string if no
// location was recorded.
func (uvm *UtilityVM) ProcessDumpLocation() string {
	return uvm.processDumpLocation
}

func (uvm *UtilityVM) normalizeMemorySize(ctx context.Context, requested uint64) uint64 {
actual := (requested + 1) &^ 1 // align up to an even number
if requested != actual {
Expand Down
4 changes: 4 additions & 0 deletions internal/uvm/create_lcow.go
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,10 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error

initArgs += " " + opts.ExecCommandLine

if opts.ProcessDumpLocation != "" {
initArgs += " -core-dump-location " + opts.ProcessDumpLocation
}

if vmDebugging {
// Launch a shell on the console.
initArgs = `sh -c "` + initArgs + ` & exec sh"`
Expand Down
53 changes: 41 additions & 12 deletions internal/uvm/create_wcow.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,25 +105,54 @@ func prepareConfigDoc(ctx context.Context, uvm *UtilityVM, opts *OptionsWCOW, uv
},
}

var registryChanges hcsschema.RegistryChanges
// We're getting asked to setup local dump collection for WCOW. We need to:
//
// 1. Turn off WER reporting, so we don't both upload the dump and save a local copy.
// 2. Set WerSvc to start when the UVM starts to work around a bug when generating dumps for certain exceptions.
// https://github.com/microsoft/Windows-Containers/issues/60#issuecomment-834633192
// This supposedly should be fixed soon but for now keep this until we know which container images
// (1809, 1903/9, 2004 etc.) this went out to.
if opts.ProcessDumpLocation != "" {
uvm.processDumpLocation = opts.ProcessDumpLocation
registryChanges.AddValues = append(registryChanges.AddValues,
hcsschema.RegistryValue{
Key: &hcsschema.RegistryKey{
Hive: "System",
Name: "ControlSet001\\Services\\WerSvc",
},
Name: "Start",
DWordValue: 2,
Type_: "DWord",
},
hcsschema.RegistryValue{
Key: &hcsschema.RegistryKey{
Hive: "Software",
Name: "Microsoft\\Windows\\Windows Error Reporting",
},
Name: "Disabled",
DWordValue: 1,
Type_: "DWord",
},
)
}

// Here for a temporary workaround until the need for setting this regkey is no more. To protect
// against any undesired behavior (such as some general networking scenarios ceasing to function)
// with a recent change to fix SMB share access in the UVM, this registry key will be checked to
// enable the change in question inside GNS.dll.
var registryChanges hcsschema.RegistryChanges
if !opts.DisableCompartmentNamespace {
registryChanges = hcsschema.RegistryChanges{
AddValues: []hcsschema.RegistryValue{
{
Key: &hcsschema.RegistryKey{
Hive: "System",
Name: "CurrentControlSet\\Services\\gns",
},
Name: "EnableCompartmentNamespace",
DWordValue: 1,
Type_: "DWord",
registryChanges.AddValues = append(registryChanges.AddValues,
hcsschema.RegistryValue{
Key: &hcsschema.RegistryKey{
Hive: "System",
Name: "CurrentControlSet\\Services\\gns",
},
Name: "EnableCompartmentNamespace",
DWordValue: 1,
Type_: "DWord",
},
}
)
}

processor := &hcsschema.Processor2{
Expand Down
4 changes: 4 additions & 0 deletions internal/uvm/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,13 @@ type UtilityVM struct {
// is true
TemplateID string

// Location that container process dumps will get written to.
processDumpLocation string

// The CreateOpts used to create this uvm. These can be either of type
// uvm.OptionsLCOW or uvm.OptionsWCOW
createOpts interface{}

// Network config proxy client. If nil then this wasn't requested and the
// uvms network will be configured locally.
ncProxyClient ncproxyttrpc.NetworkConfigProxyService
Expand Down
6 changes: 4 additions & 2 deletions test/cri-containerd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ const (
lcowRuntimeHandler = "runhcs-lcow"
imageLcowK8sPause = "k8s.gcr.io/pause:3.1"
imageLcowAlpine = "docker.io/library/alpine:latest"
imageLcowAlpineCoreDump = "cplatpublic.azurecr.io/stackoverflow-alpine:latest"
imageWindowsProcessDump = "cplatpublic.azurecr.io/crashdump:latest"
imageLcowCosmos = "cosmosarno/spark-master:2.4.1_2019-04-18_8e864ce"
imageJobContainerHNS = "cplatpublic.azurecr.io/jobcontainer_hns:latest"
imageJobContainerETW = "cplatpublic.azurecr.io/jobcontainer_etw:latest"
Expand Down Expand Up @@ -162,7 +164,7 @@ func getWindowsNanoserverImage(build uint16) string {
case osversion.V20H2:
return "mcr.microsoft.com/windows/nanoserver:2009"
default:
panic("unsupported build")
return "mcr.microsoft.com/windows/nanoserver:2009"
}
}

Expand All @@ -179,7 +181,7 @@ func getWindowsServerCoreImage(build uint16) string {
case osversion.V20H2:
return "mcr.microsoft.com/windows/servercore:2009"
default:
panic("unsupported build")
return "mcr.microsoft.com/windows/nanoserver:2009"
}
}

Expand Down
Loading

0 comments on commit 18e2356

Please sign in to comment.