diff --git a/docs/book/src/topics/troubleshooting.md b/docs/book/src/topics/troubleshooting.md
index e994ec8e245..6825d8488b9 100644
--- a/docs/book/src/topics/troubleshooting.md
+++ b/docs/book/src/topics/troubleshooting.md
@@ -155,10 +155,11 @@ ssh -J capi@${apiserver} capi@${node}
 
 ## Automated log collection
 
-As part of [CI](../../../../scripts/ci-e2e.sh) there is a [log collection script](../../../../hack/log/log-dump.sh) which you can also leverage to pull all the logs for machines which will dump logs to `${PWD}/_artifacts}` by default:
+As part of CI there is a [log collection tool](https://github.com/kubernetes-sigs/cluster-api-provider-azure/tree/main/test/logger.go) which you can also leverage to pull all of the machine logs; by default it dumps them to `${PWD}/_artifacts`.
+The following works if your kubeconfig points at the management cluster. See the tool for more settings.
 
 ```bash
-./hack/log/log-dump.sh
+go run -tags e2e ./test/logger.go --clustername capz-conf-j1t0qh --namespace default
 ```
 
-There are also some [provided scripts](../../../../hack/debugging/Readme.md) that can help automate a few common tasks.
+There are also some [provided scripts](https://github.com/kubernetes-sigs/cluster-api-provider-azure/tree/main/hack/debugging) that can help automate a few common tasks.
diff --git a/hack/log/log-dump-daemonset-windows.yaml b/hack/log/log-dump-daemonset-windows.yaml
deleted file mode 100644
index db23e9c4676..00000000000
--- a/hack/log/log-dump-daemonset-windows.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-apiVersion: apps/v1
-kind: DaemonSet
-metadata:
-  name: log-dump-node-windows
-spec:
-  selector:
-    matchLabels:
-      app: log-dump-node-windows
-  template:
-    metadata:
-      labels:
-        app:
-          log-dump-node-windows
-    spec:
-      securityContext:
-        windowsOptions:
-          runAsUserName: ContainerAdministrator
-      containers:
-      - name: log-dump-node-windows
-        image: mcr.microsoft.com/oss/kubernetes/pause:3.6
-        volumeMounts:
-        - name: varlog
-          mountPath: /var/log
-      nodeSelector:
-        kubernetes.io/os: windows
-      tolerations:
-      - effect: NoExecute
-        operator: Exists
-      - effect: NoSchedule
-        operator: Exists
-      - key: CriticalAddonsOnly
-        operator: Exists
-      volumes:
-      - name: varlog
-        hostPath:
-          path: /var/log
diff --git a/hack/log/log-dump-daemonset.yaml b/hack/log/log-dump-daemonset.yaml
deleted file mode 100644
index 23f95156bcf..00000000000
--- a/hack/log/log-dump-daemonset.yaml
+++ /dev/null
@@ -1,48 +0,0 @@
-apiVersion: apps/v1
-kind: DaemonSet
-metadata:
-  name: log-dump-node
-spec:
-  selector:
-    matchLabels:
-      app: log-dump-node
-  template:
-    metadata:
-      labels:
-        app: log-dump-node
-    spec:
-      containers:
-      - name: log-dump-node
-        image: fedora:30 # A base image that has 'journalctl' binary
-        args:
-        - sleep
-        - "3600"
-        volumeMounts:
-        - name: varlog
-          mountPath: /var/log
-        - name: runlog
-          mountPath: /run/log
-      nodeSelector:
-        kubernetes.io/os: linux
-      tolerations:
-      - effect: NoSchedule
-        key: node-role.kubernetes.io/master
-        operator: Equal
-        value: "true"
-      - effect: NoSchedule
-        key: node-role.kubernetes.io/control-plane
-        operator: Equal
-        value: "true"
-      - effect: NoExecute
-        operator: Exists
-      - effect: NoSchedule
-        operator: Exists
-      - key: CriticalAddonsOnly
-        operator: Exists
-      volumes:
-      - name: varlog
-        hostPath:
-          path: /var/log
-      - name: runlog
-        hostPath:
-          path: /run/log
diff --git a/hack/log/log-dump.sh b/hack/log/log-dump.sh
deleted file mode 100755
index 76092cf876e..00000000000
--- a/hack/log/log-dump.sh
+++ /dev/null
@@ -1,235 +0,0 @@
-#!/usr/bin/env 
bash - -# Copyright 2020 The Kubernetes Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -o errexit -set -o nounset -set -o pipefail - -# Install kubectl -REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/../.. -KUBECTL="${REPO_ROOT}/hack/tools/bin/kubectl" -cd "${REPO_ROOT}" && make "${KUBECTL##*/}" - -# shellcheck source=hack/ensure-kind.sh -source "${REPO_ROOT}/hack/ensure-kind.sh" - -export ARTIFACTS="${ARTIFACTS:-${PWD}/_artifacts}" -mkdir -p "${ARTIFACTS}/management-cluster" "${ARTIFACTS}/workload-cluster" - -export KUBECONFIG="${KUBECONFIG:-${PWD}/kubeconfig}" - -get_node_name() { - local -r pod_name="${1}" - # shellcheck disable=SC1083 - "${KUBECTL}" get pod "${pod_name}" -ojsonpath={.spec.nodeName} -} - -dump_mgmt_cluster_logs() { - # Assume the first kind cluster is the management cluster - local -r mgmt_cluster_name="$(kind get clusters | head -n 1)" - if [[ -z "${mgmt_cluster_name}" ]]; then - echo "No kind cluster is found" - return - fi - - kind get kubeconfig --name "${mgmt_cluster_name}" > "${PWD}/kind.kubeconfig" - local -r kubectl_kind="kubectl --kubeconfig=${PWD}/kind.kubeconfig" - - local -r resources=( - "clusters" - "azureclusters" - "machines" - "azuremachines" - "kubeadmconfigs" - "machinedeployments" - "azuremachinetemplates" - "kubeadmconfigtemplates" - "machinesets" - "kubeadmcontrolplanes" - "machinepools" - "azuremachinepools" - ) - mkdir -p "${ARTIFACTS}/management-cluster/resources" - for resource in "${resources[@]}"; do - ${kubectl_kind} get --all-namespaces "${resource}" -oyaml > "${ARTIFACTS}/management-cluster/resources/${resource}.log" || true - done - - { - echo "images in docker" - docker images - echo "images in bootstrap cluster using kubectl CLI" - (${kubectl_kind} get pods --all-namespaces -ojson \ - | jq --raw-output '.items[].spec.containers[].image' | sort) - echo "images in deployed cluster using kubectl CLI" - (${kubectl_kind} get pods --all-namespaces -ojson \ - | jq --raw-output '.items[].spec.containers[].image' | sort) - } > "${ARTIFACTS}/management-cluster/images.info" - - { - echo "kind cluster-info" - ${kubectl_kind} cluster-info dump - } > "${ARTIFACTS}/management-cluster/kind-cluster.info" - - kind export logs --name="${mgmt_cluster_name}" "${ARTIFACTS}/management-cluster" -} - -dump_workload_cluster_logs() { - echo "================ DUMPING LOGS FOR WORKLOAD CLUSTER (Linux) ==========" - dump_workload_cluster_logs_linux - - if [[ -z "${TEST_WINDOWS}" ]]; then - echo "TEST_WINDOWS envvar not set, skipping log collection for Windows nodes." 
- else - echo "================ DUMPING LOGS FOR WORKLOAD CLUSTER (Windows) ========" - dump_workload_cluster_logs_windows - fi - - dump_kube_system_pod_describes -} - -dump_kube_system_pod_describes() { - workload_cluster_dir="${ARTIFACTS}/workload-cluster" - echo "Collecting describe information of Pods in kube-system namespace" - "${KUBECTL}" get pod -n kube-system --no-headers=true | awk '{print $1}' | while read -r POD_NAME; do - NODE_NAME=$(kubectl get pod -n kube-system "${POD_NAME}" -o jsonpath='{.spec.nodeName}') - mkdir -p "${workload_cluster_dir}/${NODE_NAME}/pods/kube-system_${POD_NAME}" - kubectl describe pod -n kube-system "${POD_NAME}" > "${workload_cluster_dir}/${NODE_NAME}/pods/kube-system_${POD_NAME}/pod-describe.txt" & - done -} - -dump_workload_cluster_logs_linux() { - echo "Deploying log-dump-daemonset" - "${KUBECTL}" apply -f "${REPO_ROOT}/hack/log/log-dump-daemonset.yaml" - "${KUBECTL}" wait pod -l app=log-dump-node --for=condition=Ready --timeout=5m - - IFS=" " read -ra log_dump_pods <<< "$(kubectl get pod -l app=log-dump-node -ojsonpath='{.items[*].metadata.name}')" - local log_dump_commands=( - "journalctl --output=short-precise -u kubelet > kubelet.log" - "journalctl --output=short-precise -u containerd > containerd.log" - "journalctl --output=short-precise -k > kern.log" - "journalctl --output=short-precise > journal.log" - "cat /var/log/cloud-init.log > cloud-init.log" - "cat /var/log/cloud-init-output.log > cloud-init-output.log" - ) - - if [[ "$(uname)" == "Darwin" ]]; then - # tar on Mac OS does not support --wildcards flag - log_dump_commands+=( "tar -cf - var/log/pods --ignore-failed-read | tar xf - --strip-components=2 -C . '*kube-system*'" ) - else - log_dump_commands+=( "tar -cf - var/log/pods --ignore-failed-read | tar xf - --strip-components=2 -C . 
--wildcards '*kube-system*'" ) - fi - - node_names=() - for log_dump_pod in "${log_dump_pods[@]}"; do - local node_name - node_name="$(get_node_name "${log_dump_pod}")" - node_names+=("${node_name}") - - local log_dump_dir="${ARTIFACTS}/workload-cluster/${node_name}" - mkdir -p "${log_dump_dir}" - pushd "${log_dump_dir}" > /dev/null - for cmd in "${log_dump_commands[@]}"; do - bash -c "kubectl exec ${log_dump_pod} -- ${cmd}" & - done - popd > /dev/null - - echo "Exported logs for node \"${node_name}\"" - done - - # Wait for log-dumping commands running in the background to complete - wait - - # Rename Pods' directories: - # workload-cluster/node0/pods/ns0_pod0_xxx -> workload-cluster/node0/pods/ns0_pod0 - for node_name in "${node_names[@]}"; do - pushd "${ARTIFACTS}/workload-cluster/${node_name}/pods" > /dev/null - for dir in *; do - mv "${dir}" "${dir%_*}" - done - popd > /dev/null - done -} - -dump_workload_cluster_logs_windows() { - echo "Deploying log-dump-daemonset-windows" - "${KUBECTL}" apply -f "${REPO_ROOT}/hack/log/log-dump-daemonset-windows.yaml" - echo "Waiting for log-dump-daemonset-windows" - "${KUBECTL}" wait pod -l app=log-dump-node-windows --for=condition=Ready --timeout=5m - - IFS=" " read -ra log_dump_pods <<< "$(kubectl get pod -l app=log-dump-node-windows -ojsonpath='{.items[*].metadata.name}')" - - node_names=() - for log_dump_pod in "${log_dump_pods[@]}"; do - local node_name - node_name="$(get_node_name "${log_dump_pod}")" - echo "Getting logs for node ${node_name}" - node_names+=("${node_name}") - - local log_dump_dir="${ARTIFACTS}/workload-cluster/${node_name}" - mkdir -p "${log_dump_dir}" - - # make a new folder to copy logs to since files cannot be read to directly - "${KUBECTL}" exec "${log_dump_pod}" -- cmd.exe /c mkdir log - "${KUBECTL}" exec "${log_dump_pod}" -- cmd.exe /c xcopy /s c:\\var\\log\\kubelet c:\\log\\ - "${KUBECTL}" exec "${log_dump_pod}" -- cmd.exe /c xcopy /s c:\\var\\log\\pods c:\\log\\ - - # Get a list of all of the files to copy with dir - # /s - recurse - # /B - bare format (no heading info or summaries) - # /A-D - exclude directories - IFS=" " read -ra log_dump_files <<< "$(kubectl exec "${log_dump_pod}" -- cmd.exe /c dir /s /B /A-D log | tr '\n' ' ' | tr -d '\r' )" - echo "Collecting pod logs" - - for log_dump_file in "${log_dump_files[@]}"; do - echo " Getting logfile ${log_dump_file}" - # reverse slashes and remove c:\log\ from paths - fixed_dump_file_path="$(echo "${log_dump_file//\\//}" | cut -d "/" -f3-)" - dir="$(dirname "${fixed_dump_file_path}")" - file="$(basename "${fixed_dump_file_path}")" - mkdir -p "${log_dump_dir}/pods/${dir}" - if [[ "${file}" =~ "kubelet" ]]; then - "${KUBECTL}" exec "${log_dump_pod}" -- cmd.exe /c type "${log_dump_file}" > "${log_dump_dir}/${dir}/${file}" - else - "${KUBECTL}" exec "${log_dump_pod}" -- cmd.exe /c type "${log_dump_file}" > "${log_dump_dir}/pods/${dir}/${file}" - fi - done - - echo "Exported logs for node \"${node_name}\"" - done - - # Rename Pods' directories - for node_name in "${node_names[@]}"; do - pushd "${ARTIFACTS}/workload-cluster/${node_name}/pods" > /dev/null - for dir in *; do - mv "${dir}" "${dir%_*}" - done - popd > /dev/null - done -} - -cleanup() { - "${KUBECTL}" delete -f "${REPO_ROOT}/hack/log/log-dump-daemonset.yaml" || true - "${KUBECTL}" delete -f "${REPO_ROOT}/hack/log/log-dump-daemonset-windows.yaml" || true - # shellcheck source=hack/log/redact.sh - source "${REPO_ROOT}/hack/log/redact.sh" -} - -trap cleanup EXIT - -echo "================ DUMPING LOGS FOR MANAGEMENT 
CLUSTER ================"
-dump_mgmt_cluster_logs
-
-dump_workload_cluster_logs
diff --git a/scripts/ci-entrypoint.sh b/scripts/ci-entrypoint.sh
index faa615fb79f..7f66e7ffda5 100755
--- a/scripts/ci-entrypoint.sh
+++ b/scripts/ci-entrypoint.sh
@@ -139,8 +139,11 @@ cleanup() {
 }
 
 on_exit() {
+    # unset KUBECONFIG, which is currently pointing at the workload cluster;
+    # log collection needs to point at the management cluster (kind in this case)
     unset KUBECONFIG
-    "${REPO_ROOT}/hack/log/log-dump.sh" || true
+    go run -tags e2e "${REPO_ROOT}"/test/logger.go --clustername "${CLUSTER_NAME}" --namespace default
+    "${REPO_ROOT}/hack/log/redact.sh" || true
     # cleanup
     if [[ -z "${SKIP_CLEANUP:-}" ]]; then
         cleanup
diff --git a/test/e2e/azure_clusterproxy.go b/test/e2e/azure_clusterproxy.go
new file mode 100644
index 00000000000..e63c1f39b02
--- /dev/null
+++ b/test/e2e/azure_clusterproxy.go
@@ -0,0 +1,260 @@
+//go:build e2e
+// +build e2e
+
+/*
+Copyright 2020 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+	"bufio"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"path"
+	"path/filepath"
+	"strings"
+	"time"
+
+	aadpodv1 "github.com/Azure/aad-pod-identity/pkg/apis/aadpodidentity/v1"
+	"github.com/Azure/azure-sdk-for-go/profiles/2020-09-01/monitor/mgmt/insights"
+	"github.com/Azure/go-autorest/autorest/azure/auth"
+	"github.com/Azure/go-autorest/autorest/to"
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
+	expv1 "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
+	clusterv1exp "sigs.k8s.io/cluster-api/exp/api/v1beta1"
+	"sigs.k8s.io/cluster-api/test/framework"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+type (
+	AzureClusterProxy struct {
+		framework.ClusterProxy
+	}
+	// myEventData is used to be able to Marshal insights.EventData into JSON
+	// see https://github.com/Azure/azure-sdk-for-go/issues/8224#issuecomment-614777550
+	myEventData insights.EventData
+)
+
+func NewAzureClusterProxy(name string, kubeconfigPath string, options ...framework.Option) *AzureClusterProxy {
+	proxy, ok := framework.NewClusterProxy(name, kubeconfigPath, initScheme(), options...).(framework.ClusterProxy)
+	Expect(ok).To(BeTrue(), "framework.NewClusterProxy must return a framework.ClusterProxy")
+	return &AzureClusterProxy{
+		ClusterProxy: proxy,
+	}
+}
+
+func initScheme() *runtime.Scheme {
+	scheme := runtime.NewScheme()
+	framework.TryAddDefaultSchemes(scheme)
+	Expect(infrav1.AddToScheme(scheme)).To(Succeed())
+	Expect(expv1.AddToScheme(scheme)).To(Succeed())
+	Expect(clusterv1exp.AddToScheme(scheme)).To(Succeed())
+	// Add aadpodidentity v1 to the scheme.
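+	// These types are not registered by the default schemes, so they must be
+	// added explicitly for clients built from this scheme to read and write
+	// aad-pod-identity resources.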
+ aadPodIdentityGroupVersion := schema.GroupVersion{Group: aadpodv1.GroupName, Version: "v1"} + scheme.AddKnownTypes(aadPodIdentityGroupVersion, + &aadpodv1.AzureIdentity{}, + &aadpodv1.AzureIdentityList{}, + &aadpodv1.AzureIdentityBinding{}, + &aadpodv1.AzureIdentityBindingList{}, + &aadpodv1.AzureAssignedIdentity{}, + &aadpodv1.AzureAssignedIdentityList{}, + &aadpodv1.AzurePodIdentityException{}, + &aadpodv1.AzurePodIdentityExceptionList{}, + ) + metav1.AddToGroupVersion(scheme, aadPodIdentityGroupVersion) + return scheme +} + +func (acp *AzureClusterProxy) CollectWorkloadClusterLogs(ctx context.Context, namespace, name, outputPath string) { + Byf("Dumping workload cluster %s/%s logs", namespace, name) + acp.ClusterProxy.CollectWorkloadClusterLogs(ctx, namespace, name, outputPath) + + aboveMachinesPath := strings.Replace(outputPath, "/machines", "", 1) + + Byf("Dumping workload cluster %s/%s kube-system pod logs", namespace, name) + start := time.Now() + acp.collectPodLogs(ctx, namespace, name, aboveMachinesPath) + Byf("Fetching kube-system pod logs took %s", time.Since(start).String()) + + Byf("Dumping workload cluster %s/%s Azure activity log", namespace, name) + start = time.Now() + acp.collectActivityLogs(ctx, aboveMachinesPath) + Byf("Fetching activity logs took %s", time.Since(start).String()) +} + +func (acp *AzureClusterProxy) collectPodLogs(ctx context.Context, namespace string, name string, aboveMachinesPath string) { + workload := acp.GetWorkloadCluster(ctx, namespace, name) + pods := &corev1.PodList{} + Expect(workload.GetClient().List(ctx, pods, client.InNamespace(kubesystem))).To(Succeed()) + + events, err := workload.GetClientSet().CoreV1().Events(kubesystem).List(ctx, metav1.ListOptions{}) + eventMsgs := map[string]string{} + if err != nil { + Byf("failed to get events in kube-system namespace: %v", err) + } else { + for _, event := range events.Items { + if event.InvolvedObject.Kind == "Pod" { + if _, ok := eventMsgs[event.InvolvedObject.Name]; !ok { + eventMsgs[event.InvolvedObject.Name] = event.Message + } else { + eventMsgs[event.InvolvedObject.Name] += fmt.Sprintf("\n%s", event.Message) + } + } + } + } + + for _, pod := range pods.Items { + for _, container := range pod.Spec.Containers { + // Watch each container's logs in a goroutine so we can stream them all concurrently. 
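+			// The goroutines follow each stream until the pod goes away or the
+			// context is cancelled; GinkgoRecover keeps a panic in any single
+			// stream from taking down the whole collection.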
+			go func(pod corev1.Pod, container corev1.Container) {
+				defer GinkgoRecover()
+
+				Byf("Creating log watcher for controller %s/%s, container %s", kubesystem, pod.Name, container.Name)
+				logFile := path.Join(aboveMachinesPath, kubesystem, pod.Name, container.Name+".log")
+				if err := os.MkdirAll(filepath.Dir(logFile), 0755); err != nil {
+					// Failing to mkdir should not cause the test to fail
+					Byf("Error mkdir: %v", err)
+					return
+				}
+
+				f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+				if err != nil {
+					// Failing to fetch logs should not cause the test to fail
+					Byf("Error opening file to write pod logs: %v", err)
+					return
+				}
+				defer f.Close()
+
+				opts := &corev1.PodLogOptions{
+					Container: container.Name,
+					Follow:    true,
+				}
+
+				podLogs, err := workload.GetClientSet().CoreV1().Pods(kubesystem).GetLogs(pod.Name, opts).Stream(ctx)
+				if err != nil {
+					// Failing to stream logs should not cause the test to fail
+					Byf("Error starting logs stream for pod %s/%s, container %s: %v", kubesystem, pod.Name, container.Name, err)
+					return
+				}
+				defer podLogs.Close()
+
+				out := bufio.NewWriter(f)
+				defer out.Flush()
+				_, err = out.ReadFrom(podLogs)
+				if err != nil && err != io.ErrUnexpectedEOF {
+					// Failing to stream logs should not cause the test to fail
+					Byf("Got error while streaming logs for pod %s/%s, container %s: %v", kubesystem, pod.Name, container.Name, err)
+				}
+			}(pod, container)
+		}
+
+		go func(pod corev1.Pod) {
+			defer GinkgoRecover()
+
+			Byf("Collecting events for Pod %s/%s", kubesystem, pod.Name)
+			eventFile := path.Join(aboveMachinesPath, kubesystem, pod.Name, "pod-events.txt")
+			if err := os.MkdirAll(filepath.Dir(eventFile), 0755); err != nil {
+				// Failing to mkdir should not cause the test to fail
+				Byf("Error mkdir: %v", err)
+				return
+			}
+
+			f, err := os.OpenFile(eventFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+			if err != nil {
+				// Failing to open the file should not cause the test to fail
+				Byf("Error opening file to write Pod events: %v", err)
+				return
+			}
+			defer f.Close()
+
+			msg, ok := eventMsgs[pod.Name]
+			if !ok {
+				Byf("failed to find events of Pod %q", pod.Name)
+				return
+			}
+
+			out := bufio.NewWriter(f)
+			defer out.Flush()
+			_, err = out.WriteString(msg)
+			if err != nil && err != io.ErrUnexpectedEOF {
+				// Failing to collect event message should not cause the test to fail
+				Byf("failed to collect event message of pod %s/%s: %v", kubesystem, pod.Name, err)
+			}
+		}(pod)
+	}
+}
+
+func (acp *AzureClusterProxy) collectActivityLogs(ctx context.Context, aboveMachinesPath string) {
+	timeoutctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+	defer cancel()
+	settings, err := auth.GetSettingsFromEnvironment()
+	Expect(err).NotTo(HaveOccurred())
+	subscriptionID := settings.GetSubscriptionID()
+	authorizer, err := settings.GetAuthorizer()
+	Expect(err).NotTo(HaveOccurred())
+	activityLogsClient := insights.NewActivityLogsClient(subscriptionID)
+	activityLogsClient.Authorizer = authorizer
+
+	groupName := os.Getenv(AzureResourceGroup)
+	start := time.Now().Add(-2 * time.Hour).UTC().Format(time.RFC3339)
+	end := time.Now().UTC().Format(time.RFC3339)
+
+	itr, err := activityLogsClient.ListComplete(timeoutctx, fmt.Sprintf("eventTimestamp ge '%s' and eventTimestamp le '%s' and resourceGroupName eq '%s'", start, end, groupName), "")
+	if err != nil {
+		// Failing to fetch logs should not cause the test to fail
+		Byf("Error fetching activity logs for resource group %s: %v", groupName, err)
+		return
+	}
+
+	logFile := path.Join(aboveMachinesPath, 
activitylog, groupName+".log") + Expect(os.MkdirAll(filepath.Dir(logFile), 0755)).To(Succeed()) + + f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + // Failing to fetch logs should not cause the test to fail + Byf("Error opening file to write activity logs: %v", err) + return + } + defer f.Close() + out := bufio.NewWriter(f) + defer out.Flush() + + for ; itr.NotDone(); err = itr.NextWithContext(timeoutctx) { + if err != nil { + Byf("Got error while iterating over activity logs for resource group %s: %v", groupName, err) + return + } + event := itr.Value() + if to.String(event.Category.Value) != "Policy" { + b, err := json.MarshalIndent(myEventData(event), "", " ") + if err != nil { + Byf("Got error converting activity logs data to json: %v", err) + } + if _, err = out.WriteString(string(b) + "\n"); err != nil { + Byf("Got error while writing activity logs for resource group %s: %v", groupName, err) + } + } + } +} diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index b61c80bcc91..11756b1faab 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -20,230 +20,24 @@ limitations under the License. package e2e import ( - "bufio" "context" - "encoding/json" "flag" "fmt" - "io" "os" - "path" "path/filepath" "strings" "testing" - "time" - aadpodv1 "github.com/Azure/aad-pod-identity/pkg/apis/aadpodidentity/v1" - "github.com/Azure/azure-sdk-for-go/services/preview/monitor/mgmt/2019-06-01/insights" - "github.com/Azure/go-autorest/autorest/azure/auth" - "github.com/Azure/go-autorest/autorest/to" . "github.com/onsi/ginkgo" "github.com/onsi/ginkgo/config" "github.com/onsi/ginkgo/reporters" . "github.com/onsi/gomega" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" - expv1 "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1" - clusterv1exp "sigs.k8s.io/cluster-api/exp/api/v1beta1" capi_e2e "sigs.k8s.io/cluster-api/test/e2e" "sigs.k8s.io/cluster-api/test/framework" "sigs.k8s.io/cluster-api/test/framework/bootstrap" "sigs.k8s.io/cluster-api/test/framework/clusterctl" - "sigs.k8s.io/controller-runtime/pkg/client" ) -type ( - AzureClusterProxy struct { - framework.ClusterProxy - } - // myEventData is used to be able to Marshal insights.EventData into JSON - // see https://github.com/Azure/azure-sdk-for-go/issues/8224#issuecomment-614777550 - myEventData insights.EventData -) - -func NewAzureClusterProxy(name string, kubeconfigPath string, scheme *runtime.Scheme, options ...framework.Option) *AzureClusterProxy { - proxy, ok := framework.NewClusterProxy(name, kubeconfigPath, scheme, options...).(framework.ClusterProxy) - Expect(ok).To(BeTrue(), "framework.NewClusterProxy must implement capi_e2e.ClusterProxy") - return &AzureClusterProxy{ - ClusterProxy: proxy, - } -} - -func (acp *AzureClusterProxy) CollectWorkloadClusterLogs(ctx context.Context, namespace, name, outputPath string) { - Byf("Dumping workload cluster %s/%s logs", namespace, name) - acp.ClusterProxy.CollectWorkloadClusterLogs(ctx, namespace, name, outputPath) - - aboveMachinesPath := strings.Replace(outputPath, "/machines", "", 1) - - Byf("Dumping workload cluster %s/%s kube-system pod logs", namespace, name) - start := time.Now() - acp.collectPodLogs(ctx, namespace, name, aboveMachinesPath) - Byf("Fetching kube-system pod logs took %s", time.Since(start).String()) - - Byf("Dumping workload 
cluster %s/%s Azure activity log", namespace, name) - start = time.Now() - acp.collectActivityLogs(ctx, aboveMachinesPath) - Byf("Fetching activity logs took %s", time.Since(start).String()) -} - -func (acp *AzureClusterProxy) collectPodLogs(ctx context.Context, namespace string, name string, aboveMachinesPath string) { - workload := acp.GetWorkloadCluster(ctx, namespace, name) - pods := &corev1.PodList{} - Expect(workload.GetClient().List(ctx, pods, client.InNamespace(kubesystem))).To(Succeed()) - - events, err := workload.GetClientSet().CoreV1().Events(kubesystem).List(ctx, metav1.ListOptions{}) - eventMsgs := map[string]string{} - if err != nil { - Byf("failed to get events in kube-system namespace: %v", err) - } else { - for _, event := range events.Items { - if event.InvolvedObject.Kind == "Pod" { - if _, ok := eventMsgs[event.InvolvedObject.Name]; !ok { - eventMsgs[event.InvolvedObject.Name] = event.Message - } else { - eventMsgs[event.InvolvedObject.Name] += fmt.Sprintf("\n%s", event.Message) - } - } - } - } - - for _, pod := range pods.Items { - for _, container := range pod.Spec.Containers { - // Watch each container's logs in a goroutine so we can stream them all concurrently. - go func(pod corev1.Pod, container corev1.Container) { - defer GinkgoRecover() - - Byf("Creating log watcher for controller %s/%s, container %s", kubesystem, pod.Name, container.Name) - logFile := path.Join(aboveMachinesPath, kubesystem, pod.Name, container.Name+".log") - if os.MkdirAll(filepath.Dir(logFile), 0755); err != nil { - // Failing to mkdir should not cause the test to fail - Byf("Error mkdir: %v", err) - return - } - - f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - // Failing to fetch logs should not cause the test to fail - Byf("Error opening file to write pod logs: %v", err) - return - } - defer f.Close() - - opts := &corev1.PodLogOptions{ - Container: container.Name, - Follow: true, - } - - podLogs, err := workload.GetClientSet().CoreV1().Pods(kubesystem).GetLogs(pod.Name, opts).Stream(ctx) - if err != nil { - // Failing to stream logs should not cause the test to fail - Byf("Error starting logs stream for pod %s/%s, container %s: %v", kubesystem, pod.Name, container.Name, err) - return - } - defer podLogs.Close() - - out := bufio.NewWriter(f) - defer out.Flush() - _, err = out.ReadFrom(podLogs) - if err != nil && err != io.ErrUnexpectedEOF { - // Failing to stream logs should not cause the test to fail - Byf("Got error while streaming logs for pod %s/%s, container %s: %v", kubesystem, pod.Name, container.Name, err) - } - }(pod, container) - } - - go func(pod corev1.Pod) { - defer GinkgoRecover() - - Byf("Collecting events for Pod %s/%s", kubesystem, pod.Name) - eventFile := path.Join(aboveMachinesPath, kubesystem, pod.Name, "pod-events.txt") - if err := os.MkdirAll(filepath.Dir(eventFile), 0755); err != nil { - // Failing to mkdir should not cause the test to fail - Byf("Error mkdir: %v", err) - return - } - - f, err := os.OpenFile(eventFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - // Failing to open the file should not cause the test to fail - Byf("Error opening file to write Pod events: %v", err) - return - } - defer f.Close() - - msg, ok := eventMsgs[pod.Name] - if !ok { - Byf("failed to find events of Pod %q", pod.Name) - return - } - - out := bufio.NewWriter(f) - defer out.Flush() - _, err = out.WriteString(msg) - if err != nil && err != io.ErrUnexpectedEOF { - // Failing to collect event message should not cause the 
test to fail - Byf("failed to collect event message of pod %s/%s: %v", kubesystem, pod.Name, err) - } - }(pod) - } -} - -func (acp *AzureClusterProxy) collectActivityLogs(ctx context.Context, aboveMachinesPath string) { - timeoutctx, cancel := context.WithTimeout(ctx, 30*time.Second) - defer cancel() - settings, err := auth.GetSettingsFromEnvironment() - Expect(err).NotTo(HaveOccurred()) - subscriptionID := settings.GetSubscriptionID() - authorizer, err := settings.GetAuthorizer() - Expect(err).NotTo(HaveOccurred()) - activityLogsClient := insights.NewActivityLogsClient(subscriptionID) - activityLogsClient.Authorizer = authorizer - - groupName := os.Getenv(AzureResourceGroup) - start := time.Now().Add(-2 * time.Hour).UTC().Format(time.RFC3339) - end := time.Now().UTC().Format(time.RFC3339) - - itr, err := activityLogsClient.ListComplete(timeoutctx, fmt.Sprintf("eventTimestamp ge '%s' and eventTimestamp le '%s' and resourceGroupName eq '%s'", start, end, groupName), "") - if err != nil { - // Failing to fetch logs should not cause the test to fail - Byf("Error fetching activity logs for resource group %s: %v", groupName, err) - return - } - - logFile := path.Join(aboveMachinesPath, activitylog, groupName+".log") - Expect(os.MkdirAll(filepath.Dir(logFile), 0755)).To(Succeed()) - - f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - // Failing to fetch logs should not cause the test to fail - Byf("Error opening file to write activity logs: %v", err) - return - } - defer f.Close() - out := bufio.NewWriter(f) - defer out.Flush() - - for ; itr.NotDone(); err = itr.NextWithContext(timeoutctx) { - if err != nil { - Byf("Got error while iterating over activity logs for resource group %s: %v", groupName, err) - return - } - event := itr.Value() - if to.String(event.Category.Value) != "Policy" { - b, err := json.MarshalIndent(myEventData(event), "", " ") - if err != nil { - Byf("Got error converting activity logs data to json: %v", err) - } - if _, err = out.WriteString(string(b) + "\n"); err != nil { - Byf("Got error while writing activity logs for resource group %s: %v", groupName, err) - } - } - } -} - func init() { flag.StringVar(&configPath, "e2e.config", "", "path to the e2e config file") flag.StringVar(&artifactFolder, "e2e.artifacts-folder", "", "folder where e2e test artifact should be stored") @@ -270,9 +64,6 @@ var _ = SynchronizedBeforeSuite(func() []byte { Expect(configPath).To(BeAnExistingFile(), "Invalid test suite argument. e2e.config should be an existing file.") Expect(os.MkdirAll(artifactFolder, 0755)).To(Succeed(), "Invalid test suite argument. 
Can't create e2e.artifacts-folder %q", artifactFolder) - By("Initializing a runtime.Scheme with all the GVK relevant for this test") - scheme := initScheme() - Byf("Loading the e2e test configuration from %q", configPath) e2eConfig = loadE2EConfig(configPath) @@ -280,7 +71,7 @@ var _ = SynchronizedBeforeSuite(func() []byte { clusterctlConfigPath = createClusterctlLocalRepository(e2eConfig, filepath.Join(artifactFolder, "repository")) By("Setting up the bootstrap cluster") - bootstrapClusterProvider, bootstrapClusterProxy = setupBootstrapCluster(e2eConfig, scheme, useExistingCluster) + bootstrapClusterProvider, bootstrapClusterProxy = setupBootstrapCluster(e2eConfig, useExistingCluster) By("Initializing the bootstrap cluster") initBootstrapCluster(bootstrapClusterProxy, e2eConfig, clusterctlConfigPath, artifactFolder) @@ -305,8 +96,7 @@ var _ = SynchronizedBeforeSuite(func() []byte { kubeconfigPath := parts[3] e2eConfig = loadE2EConfig(configPath) - bootstrapClusterProxy = NewAzureClusterProxy("bootstrap", kubeconfigPath, initScheme(), - framework.WithMachineLogCollector(AzureLogCollector{})) + bootstrapClusterProxy = NewAzureClusterProxy("bootstrap", kubeconfigPath, framework.WithMachineLogCollector(AzureLogCollector{})) }) // Using a SynchronizedAfterSuite for controlling how to delete resources shared across ParallelNodes (~ginkgo threads). @@ -323,28 +113,6 @@ var _ = SynchronizedAfterSuite(func() { } }) -func initScheme() *runtime.Scheme { - scheme := runtime.NewScheme() - framework.TryAddDefaultSchemes(scheme) - Expect(infrav1.AddToScheme(scheme)).To(Succeed()) - Expect(expv1.AddToScheme(scheme)).To(Succeed()) - Expect(clusterv1exp.AddToScheme(scheme)).To(Succeed()) - // Add aadpodidentity v1 to the scheme. - aadPodIdentityGroupVersion := schema.GroupVersion{Group: aadpodv1.GroupName, Version: "v1"} - scheme.AddKnownTypes(aadPodIdentityGroupVersion, - &aadpodv1.AzureIdentity{}, - &aadpodv1.AzureIdentityList{}, - &aadpodv1.AzureIdentityBinding{}, - &aadpodv1.AzureIdentityBindingList{}, - &aadpodv1.AzureAssignedIdentity{}, - &aadpodv1.AzureAssignedIdentityList{}, - &aadpodv1.AzurePodIdentityException{}, - &aadpodv1.AzurePodIdentityExceptionList{}, - ) - metav1.AddToGroupVersion(scheme, aadPodIdentityGroupVersion) - return scheme -} - func loadE2EConfig(configPath string) *clusterctl.E2EConfig { config := clusterctl.LoadE2EConfig(context.TODO(), clusterctl.LoadE2EConfigInput{ConfigPath: configPath}) Expect(config).NotTo(BeNil(), "Failed to load E2E config from %s", configPath) @@ -371,7 +139,7 @@ func createClusterctlLocalRepository(config *clusterctl.E2EConfig, repositoryFol return clusterctlConfig } -func setupBootstrapCluster(config *clusterctl.E2EConfig, scheme *runtime.Scheme, useExistingCluster bool) (bootstrap.ClusterProvider, framework.ClusterProxy) { +func setupBootstrapCluster(config *clusterctl.E2EConfig, useExistingCluster bool) (bootstrap.ClusterProvider, framework.ClusterProxy) { var clusterProvider bootstrap.ClusterProvider kubeconfigPath := "" if !useExistingCluster { @@ -386,7 +154,7 @@ func setupBootstrapCluster(config *clusterctl.E2EConfig, scheme *runtime.Scheme, Expect(kubeconfigPath).To(BeAnExistingFile(), "Failed to get the kubeconfig file for the bootstrap cluster") } - clusterProxy := NewAzureClusterProxy("bootstrap", kubeconfigPath, scheme) + clusterProxy := NewAzureClusterProxy("bootstrap", kubeconfigPath) Expect(clusterProxy).NotTo(BeNil(), "Failed to get a bootstrap cluster proxy") return clusterProvider, clusterProxy diff --git a/test/logger.go 
b/test/logger.go
new file mode 100644
index 00000000000..ce79a8ffcc5
--- /dev/null
+++ b/test/logger.go
@@ -0,0 +1,115 @@
+//go:build e2e
+// +build e2e
+
+/*
+Copyright 2020 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"os"
+	"path"
+	"path/filepath"
+
+	. "github.com/onsi/gomega"
+
+	"sigs.k8s.io/cluster-api-provider-azure/test/e2e"
+	"sigs.k8s.io/cluster-api/test/framework"
+)
+
+// Fail is a minimal gomega fail handler that panics instead of aborting a
+// ginkgo suite, since this tool runs outside of ginkgo.
+func Fail(message string, callerSkip ...int) {
+	panic(message)
+}
+
+func main() {
+	// needed for ginkgo/gomega, which the capi test framework uses
+	RegisterFailHandler(Fail)
+
+	// required flags
+	kubeconfigPath := flag.String("kube-config", getKubeConfigPath(), "The kubeconfig for the management cluster")
+	clustername := flag.String("clustername", "", "workload cluster to collect logs for")
+
+	// optional flags with defaults
+	namespace := flag.String("namespace", "", "namespace on the management cluster to collect logs for")
+	artifactFolder := flag.String("artifacts-folder", getArtifactsFolder(), "folder to store cluster logs")
+	azureresourcegroup := flag.String("azure-rg", "", "azure resource group that contains the cluster")
+
+	flag.Parse()
+
+	// default the namespace to the cluster name, as the e2e tests do
+	if *namespace == "" {
+		namespace = clustername
+	}
+
+	// the resource group is required to collect the Azure activity logs;
+	// default it to the cluster name
+	if *azureresourcegroup == "" {
+		os.Setenv(e2e.AzureResourceGroup, *clustername)
+	} else {
+		os.Setenv(e2e.AzureResourceGroup, *azureresourcegroup)
+	}
+
+	fmt.Printf("Collecting logs for cluster %s in namespace %s and dumping logs to %s\n", *clustername, *namespace, *artifactFolder)
+	bootstrapClusterProxy := e2e.NewAzureClusterProxy("bootstrap", *kubeconfigPath, framework.WithMachineLogCollector(e2e.AzureLogCollector{}))
+
+	// Set up log paths
+	clusterLogPath := filepath.Join(*artifactFolder, "clusters", *clustername)
+	resourcesYaml := filepath.Join(*artifactFolder, "clusters", "bootstrap", "resources")
+	managementClusterLogPath := filepath.Join(*artifactFolder, "clusters", "bootstrap", "controllers")
+
+	collectManagementClusterLogs(bootstrapClusterProxy, managementClusterLogPath, namespace, resourcesYaml)
+	bootstrapClusterProxy.CollectWorkloadClusterLogs(context.TODO(), *namespace, *clustername, clusterLogPath)
+}
+
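+// collectManagementClusterLogs streams the logs of every controller Deployment
+// on the management cluster and dumps the Cluster API resources in the given
+// namespace as YAML.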
+func collectManagementClusterLogs(bootstrapClusterProxy *e2e.AzureClusterProxy, managementClusterLogPath string, namespace *string, resourcesPath string) {
+	controllersDeployments := framework.GetControllerDeployments(context.TODO(), framework.GetControllerDeploymentsInput{
+		Lister: bootstrapClusterProxy.GetClient(),
+	})
+	for _, deployment := range controllersDeployments {
+		framework.WatchDeploymentLogs(context.TODO(), framework.WatchDeploymentLogsInput{
+			GetLister:  bootstrapClusterProxy.GetClient(),
+			ClientSet:  bootstrapClusterProxy.GetClientSet(),
+			Deployment: deployment,
+			LogPath:    managementClusterLogPath,
+		})
+	}
+
+	framework.DumpAllResources(context.TODO(), framework.DumpAllResourcesInput{
+		Lister:    bootstrapClusterProxy.GetClient(),
+		Namespace: *namespace,
+		LogPath:   resourcesPath,
+	})
+}
+
+func getKubeConfigPath() string {
+	config := os.Getenv("KUBECONFIG")
+	if config == "" {
+		d, _ := os.UserHomeDir()
+		return path.Join(d, ".kube", "config")
+	}
+
+	return config
+}
+
+func getArtifactsFolder() string {
+	artifacts := os.Getenv("ARTIFACTS")
+	if artifacts == "" {
+		return "_artifacts"
+	}
+	return artifacts
+}
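For reference, a full invocation of the new tool against an existing management cluster might look like the following; the cluster name, resource group, and artifacts folder are illustrative:

```bash
# assumes the current kubeconfig context points at the management cluster
export KUBECONFIG="${HOME}/.kube/config"
# optional: override where logs are written (defaults to ${PWD}/_artifacts)
export ARTIFACTS="/tmp/capz-logs"
go run -tags e2e ./test/logger.go \
    --clustername my-cluster \
    --namespace default \
    --azure-rg my-cluster-rg
```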