diff --git a/docs/book/src/topics/troubleshooting.md b/docs/book/src/topics/troubleshooting.md index e994ec8e2456..230d1e14de92 100644 --- a/docs/book/src/topics/troubleshooting.md +++ b/docs/book/src/topics/troubleshooting.md @@ -155,10 +155,12 @@ ssh -J capi@${apiserver} capi@${node} ## Automated log collection -As part of [CI](../../../../scripts/ci-e2e.sh) there is a [log collection script](../../../../hack/log/log-dump.sh) which you can also leverage to pull all the logs for machines which will dump logs to `${PWD}/_artifacts}` by default: +As part of CI there is a [log collection tool](https://github.com/kubernetes-sigs/cluster-api-provider-azure/tree/main/test/logger.go) +which you can also leverage to pull all the logs for machines which will dump logs to `${PWD}/_artifacts}` by default. The following works +if your kubeconfig is configured with the management cluster. See the the tool for more settings. ```bash -./hack/log/log-dump.sh +go run -tags e2e ./test/logger.go --name capz-conf-j1t0qh --namespace default ``` -There are also some [provided scripts](../../../../hack/debugging/Readme.md) that can help automate a few common tasks. +There are also some [provided scripts](https://github.com/kubernetes-sigs/cluster-api-provider-azure/tree/main/hack/debugging) that can help automate a few common tasks. diff --git a/hack/log/log-dump-daemonset-windows.yaml b/hack/log/log-dump-daemonset-windows.yaml deleted file mode 100644 index db23e9c4676f..000000000000 --- a/hack/log/log-dump-daemonset-windows.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: log-dump-node-windows -spec: - selector: - matchLabels: - app: log-dump-node-windows - template: - metadata: - labels: - app: - log-dump-node-windows - spec: - securityContext: - windowsOptions: - runAsUserName: ContainerAdministrator - containers: - - name: log-dump-node-windows - image: mcr.microsoft.com/oss/kubernetes/pause:3.6 - volumeMounts: - - name: varlog - mountPath: /var/log - nodeSelector: - kubernetes.io/os: windows - tolerations: - - effect: NoExecute - operator: Exists - - effect: NoSchedule - operator: Exists - - key: CriticalAddonsOnly - operator: Exists - volumes: - - name: varlog - hostPath: - path: /var/log diff --git a/hack/log/log-dump-daemonset.yaml b/hack/log/log-dump-daemonset.yaml deleted file mode 100644 index 23f95156bcfa..000000000000 --- a/hack/log/log-dump-daemonset.yaml +++ /dev/null @@ -1,48 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: log-dump-node -spec: - selector: - matchLabels: - app: log-dump-node - template: - metadata: - labels: - app: log-dump-node - spec: - containers: - - name: log-dump-node - image: fedora:30 # A base image that has 'journalctl' binary - args: - - sleep - - "3600" - volumeMounts: - - name: varlog - mountPath: /var/log - - name: runlog - mountPath: /run/log - nodeSelector: - kubernetes.io/os: linux - tolerations: - - effect: NoSchedule - key: node-role.kubernetes.io/master - operator: Equal - value: "true" - - effect: NoSchedule - key: node-role.kubernetes.io/control-plane - operator: Equal - value: "true" - - effect: NoExecute - operator: Exists - - effect: NoSchedule - operator: Exists - - key: CriticalAddonsOnly - operator: Exists - volumes: - - name: varlog - hostPath: - path: /var/log - - name: runlog - hostPath: - path: /run/log diff --git a/hack/log/log-dump.sh b/hack/log/log-dump.sh deleted file mode 100755 index 76092cf876e0..000000000000 --- a/hack/log/log-dump.sh +++ /dev/null @@ -1,235 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2020 The Kubernetes Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -o errexit -set -o nounset -set -o pipefail - -# Install kubectl -REPO_ROOT=$(dirname "${BASH_SOURCE[0]}")/../.. -KUBECTL="${REPO_ROOT}/hack/tools/bin/kubectl" -cd "${REPO_ROOT}" && make "${KUBECTL##*/}" - -# shellcheck source=hack/ensure-kind.sh -source "${REPO_ROOT}/hack/ensure-kind.sh" - -export ARTIFACTS="${ARTIFACTS:-${PWD}/_artifacts}" -mkdir -p "${ARTIFACTS}/management-cluster" "${ARTIFACTS}/workload-cluster" - -export KUBECONFIG="${KUBECONFIG:-${PWD}/kubeconfig}" - -get_node_name() { - local -r pod_name="${1}" - # shellcheck disable=SC1083 - "${KUBECTL}" get pod "${pod_name}" -ojsonpath={.spec.nodeName} -} - -dump_mgmt_cluster_logs() { - # Assume the first kind cluster is the management cluster - local -r mgmt_cluster_name="$(kind get clusters | head -n 1)" - if [[ -z "${mgmt_cluster_name}" ]]; then - echo "No kind cluster is found" - return - fi - - kind get kubeconfig --name "${mgmt_cluster_name}" > "${PWD}/kind.kubeconfig" - local -r kubectl_kind="kubectl --kubeconfig=${PWD}/kind.kubeconfig" - - local -r resources=( - "clusters" - "azureclusters" - "machines" - "azuremachines" - "kubeadmconfigs" - "machinedeployments" - "azuremachinetemplates" - "kubeadmconfigtemplates" - "machinesets" - "kubeadmcontrolplanes" - "machinepools" - "azuremachinepools" - ) - mkdir -p "${ARTIFACTS}/management-cluster/resources" - for resource in "${resources[@]}"; do - ${kubectl_kind} get --all-namespaces "${resource}" -oyaml > "${ARTIFACTS}/management-cluster/resources/${resource}.log" || true - done - - { - echo "images in docker" - docker images - echo "images in bootstrap cluster using kubectl CLI" - (${kubectl_kind} get pods --all-namespaces -ojson \ - | jq --raw-output '.items[].spec.containers[].image' | sort) - echo "images in deployed cluster using kubectl CLI" - (${kubectl_kind} get pods --all-namespaces -ojson \ - | jq --raw-output '.items[].spec.containers[].image' | sort) - } > "${ARTIFACTS}/management-cluster/images.info" - - { - echo "kind cluster-info" - ${kubectl_kind} cluster-info dump - } > "${ARTIFACTS}/management-cluster/kind-cluster.info" - - kind export logs --name="${mgmt_cluster_name}" "${ARTIFACTS}/management-cluster" -} - -dump_workload_cluster_logs() { - echo "================ DUMPING LOGS FOR WORKLOAD CLUSTER (Linux) ==========" - dump_workload_cluster_logs_linux - - if [[ -z "${TEST_WINDOWS}" ]]; then - echo "TEST_WINDOWS envvar not set, skipping log collection for Windows nodes." - else - echo "================ DUMPING LOGS FOR WORKLOAD CLUSTER (Windows) ========" - dump_workload_cluster_logs_windows - fi - - dump_kube_system_pod_describes -} - -dump_kube_system_pod_describes() { - workload_cluster_dir="${ARTIFACTS}/workload-cluster" - echo "Collecting describe information of Pods in kube-system namespace" - "${KUBECTL}" get pod -n kube-system --no-headers=true | awk '{print $1}' | while read -r POD_NAME; do - NODE_NAME=$(kubectl get pod -n kube-system "${POD_NAME}" -o jsonpath='{.spec.nodeName}') - mkdir -p "${workload_cluster_dir}/${NODE_NAME}/pods/kube-system_${POD_NAME}" - kubectl describe pod -n kube-system "${POD_NAME}" > "${workload_cluster_dir}/${NODE_NAME}/pods/kube-system_${POD_NAME}/pod-describe.txt" & - done -} - -dump_workload_cluster_logs_linux() { - echo "Deploying log-dump-daemonset" - "${KUBECTL}" apply -f "${REPO_ROOT}/hack/log/log-dump-daemonset.yaml" - "${KUBECTL}" wait pod -l app=log-dump-node --for=condition=Ready --timeout=5m - - IFS=" " read -ra log_dump_pods <<< "$(kubectl get pod -l app=log-dump-node -ojsonpath='{.items[*].metadata.name}')" - local log_dump_commands=( - "journalctl --output=short-precise -u kubelet > kubelet.log" - "journalctl --output=short-precise -u containerd > containerd.log" - "journalctl --output=short-precise -k > kern.log" - "journalctl --output=short-precise > journal.log" - "cat /var/log/cloud-init.log > cloud-init.log" - "cat /var/log/cloud-init-output.log > cloud-init-output.log" - ) - - if [[ "$(uname)" == "Darwin" ]]; then - # tar on Mac OS does not support --wildcards flag - log_dump_commands+=( "tar -cf - var/log/pods --ignore-failed-read | tar xf - --strip-components=2 -C . '*kube-system*'" ) - else - log_dump_commands+=( "tar -cf - var/log/pods --ignore-failed-read | tar xf - --strip-components=2 -C . --wildcards '*kube-system*'" ) - fi - - node_names=() - for log_dump_pod in "${log_dump_pods[@]}"; do - local node_name - node_name="$(get_node_name "${log_dump_pod}")" - node_names+=("${node_name}") - - local log_dump_dir="${ARTIFACTS}/workload-cluster/${node_name}" - mkdir -p "${log_dump_dir}" - pushd "${log_dump_dir}" > /dev/null - for cmd in "${log_dump_commands[@]}"; do - bash -c "kubectl exec ${log_dump_pod} -- ${cmd}" & - done - popd > /dev/null - - echo "Exported logs for node \"${node_name}\"" - done - - # Wait for log-dumping commands running in the background to complete - wait - - # Rename Pods' directories: - # workload-cluster/node0/pods/ns0_pod0_xxx -> workload-cluster/node0/pods/ns0_pod0 - for node_name in "${node_names[@]}"; do - pushd "${ARTIFACTS}/workload-cluster/${node_name}/pods" > /dev/null - for dir in *; do - mv "${dir}" "${dir%_*}" - done - popd > /dev/null - done -} - -dump_workload_cluster_logs_windows() { - echo "Deploying log-dump-daemonset-windows" - "${KUBECTL}" apply -f "${REPO_ROOT}/hack/log/log-dump-daemonset-windows.yaml" - echo "Waiting for log-dump-daemonset-windows" - "${KUBECTL}" wait pod -l app=log-dump-node-windows --for=condition=Ready --timeout=5m - - IFS=" " read -ra log_dump_pods <<< "$(kubectl get pod -l app=log-dump-node-windows -ojsonpath='{.items[*].metadata.name}')" - - node_names=() - for log_dump_pod in "${log_dump_pods[@]}"; do - local node_name - node_name="$(get_node_name "${log_dump_pod}")" - echo "Getting logs for node ${node_name}" - node_names+=("${node_name}") - - local log_dump_dir="${ARTIFACTS}/workload-cluster/${node_name}" - mkdir -p "${log_dump_dir}" - - # make a new folder to copy logs to since files cannot be read to directly - "${KUBECTL}" exec "${log_dump_pod}" -- cmd.exe /c mkdir log - "${KUBECTL}" exec "${log_dump_pod}" -- cmd.exe /c xcopy /s c:\\var\\log\\kubelet c:\\log\\ - "${KUBECTL}" exec "${log_dump_pod}" -- cmd.exe /c xcopy /s c:\\var\\log\\pods c:\\log\\ - - # Get a list of all of the files to copy with dir - # /s - recurse - # /B - bare format (no heading info or summaries) - # /A-D - exclude directories - IFS=" " read -ra log_dump_files <<< "$(kubectl exec "${log_dump_pod}" -- cmd.exe /c dir /s /B /A-D log | tr '\n' ' ' | tr -d '\r' )" - echo "Collecting pod logs" - - for log_dump_file in "${log_dump_files[@]}"; do - echo " Getting logfile ${log_dump_file}" - # reverse slashes and remove c:\log\ from paths - fixed_dump_file_path="$(echo "${log_dump_file//\\//}" | cut -d "/" -f3-)" - dir="$(dirname "${fixed_dump_file_path}")" - file="$(basename "${fixed_dump_file_path}")" - mkdir -p "${log_dump_dir}/pods/${dir}" - if [[ "${file}" =~ "kubelet" ]]; then - "${KUBECTL}" exec "${log_dump_pod}" -- cmd.exe /c type "${log_dump_file}" > "${log_dump_dir}/${dir}/${file}" - else - "${KUBECTL}" exec "${log_dump_pod}" -- cmd.exe /c type "${log_dump_file}" > "${log_dump_dir}/pods/${dir}/${file}" - fi - done - - echo "Exported logs for node \"${node_name}\"" - done - - # Rename Pods' directories - for node_name in "${node_names[@]}"; do - pushd "${ARTIFACTS}/workload-cluster/${node_name}/pods" > /dev/null - for dir in *; do - mv "${dir}" "${dir%_*}" - done - popd > /dev/null - done -} - -cleanup() { - "${KUBECTL}" delete -f "${REPO_ROOT}/hack/log/log-dump-daemonset.yaml" || true - "${KUBECTL}" delete -f "${REPO_ROOT}/hack/log/log-dump-daemonset-windows.yaml" || true - # shellcheck source=hack/log/redact.sh - source "${REPO_ROOT}/hack/log/redact.sh" -} - -trap cleanup EXIT - -echo "================ DUMPING LOGS FOR MANAGEMENT CLUSTER ================" -dump_mgmt_cluster_logs - -dump_workload_cluster_logs diff --git a/scripts/ci-entrypoint.sh b/scripts/ci-entrypoint.sh index c422758b4ac8..0d35fa3080a7 100755 --- a/scripts/ci-entrypoint.sh +++ b/scripts/ci-entrypoint.sh @@ -153,8 +153,11 @@ on_exit() { "${KUBECTL}" get pods -A -owide || echo "Unable to get pods" fi + # unset kubeconfig which is currently pointing at workload cluster. + # we want to be pointing at the management cluster (kind in this case) unset KUBECONFIG - "${REPO_ROOT}/hack/log/log-dump.sh" || true + go run -tags e2e "${REPO_ROOT}"/test/logger.go --name "${CLUSTER_NAME}" --namespace default + "${REPO_ROOT}/hack/log/redact.sh" || true # cleanup if [[ -z "${SKIP_CLEANUP:-}" ]]; then cleanup diff --git a/test/e2e/azure_clusterproxy.go b/test/e2e/azure_clusterproxy.go new file mode 100644 index 000000000000..311c298301e4 --- /dev/null +++ b/test/e2e/azure_clusterproxy.go @@ -0,0 +1,268 @@ +//go:build e2e +// +build e2e + +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "bufio" + "context" + "encoding/json" + "fmt" + "io" + "os" + "path" + "path/filepath" + "strings" + "time" + + aadpodv1 "github.com/Azure/aad-pod-identity/pkg/apis/aadpodidentity/v1" + "github.com/Azure/azure-sdk-for-go/profiles/2020-09-01/monitor/mgmt/insights" + "github.com/Azure/go-autorest/autorest/azure/auth" + "github.com/Azure/go-autorest/autorest/to" + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" + expv1 "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1" + clusterv1exp "sigs.k8s.io/cluster-api/exp/api/v1beta1" + "sigs.k8s.io/cluster-api/test/framework" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +type ( + AzureClusterProxy struct { + framework.ClusterProxy + } + // myEventData is used to be able to Marshal insights.EventData into JSON + // see https://github.com/Azure/azure-sdk-for-go/issues/8224#issuecomment-614777550 + myEventData insights.EventData +) + +func NewAzureClusterProxy(name string, kubeconfigPath string, options ...framework.Option) *AzureClusterProxy { + proxy, ok := framework.NewClusterProxy(name, kubeconfigPath, initScheme(), options...).(framework.ClusterProxy) + Expect(ok).To(BeTrue(), "framework.NewClusterProxy must implement capi_e2e.ClusterProxy") + return &AzureClusterProxy{ + ClusterProxy: proxy, + } +} + +func initScheme() *runtime.Scheme { + scheme := runtime.NewScheme() + framework.TryAddDefaultSchemes(scheme) + Expect(infrav1.AddToScheme(scheme)).To(Succeed()) + Expect(expv1.AddToScheme(scheme)).To(Succeed()) + Expect(clusterv1exp.AddToScheme(scheme)).To(Succeed()) + // Add aadpodidentity v1 to the scheme. + aadPodIdentityGroupVersion := schema.GroupVersion{Group: aadpodv1.GroupName, Version: "v1"} + scheme.AddKnownTypes(aadPodIdentityGroupVersion, + &aadpodv1.AzureIdentity{}, + &aadpodv1.AzureIdentityList{}, + &aadpodv1.AzureIdentityBinding{}, + &aadpodv1.AzureIdentityBindingList{}, + &aadpodv1.AzureAssignedIdentity{}, + &aadpodv1.AzureAssignedIdentityList{}, + &aadpodv1.AzurePodIdentityException{}, + &aadpodv1.AzurePodIdentityExceptionList{}, + ) + metav1.AddToGroupVersion(scheme, aadPodIdentityGroupVersion) + return scheme +} + +func (acp *AzureClusterProxy) CollectWorkloadClusterLogs(ctx context.Context, namespace, name, outputPath string) { + Byf("Dumping workload cluster %s/%s logs", namespace, name) + acp.ClusterProxy.CollectWorkloadClusterLogs(ctx, namespace, name, outputPath) + + aboveMachinesPath := strings.Replace(outputPath, "/machines", "", 1) + + Byf("Dumping workload cluster %s/%s kube-system pod logs", namespace, name) + start := time.Now() + acp.collectPodLogs(ctx, namespace, name, aboveMachinesPath) + Byf("Fetching kube-system pod logs took %s", time.Since(start).String()) + + Byf("Dumping workload cluster %s/%s Azure activity log", namespace, name) + start = time.Now() + + acp.collectActivityLogs(ctx, namespace, name, aboveMachinesPath) + Byf("Fetching activity logs took %s", time.Since(start).String()) +} + +func (acp *AzureClusterProxy) collectPodLogs(ctx context.Context, namespace string, name string, aboveMachinesPath string) { + workload := acp.GetWorkloadCluster(ctx, namespace, name) + pods := &corev1.PodList{} + Expect(workload.GetClient().List(ctx, pods, client.InNamespace(kubesystem))).To(Succeed()) + + events, err := workload.GetClientSet().CoreV1().Events(kubesystem).List(ctx, metav1.ListOptions{}) + eventMsgs := map[string]string{} + if err != nil { + Byf("failed to get events in kube-system namespace: %v", err) + } else { + for _, event := range events.Items { + if event.InvolvedObject.Kind == "Pod" { + if _, ok := eventMsgs[event.InvolvedObject.Name]; !ok { + eventMsgs[event.InvolvedObject.Name] = event.Message + } else { + eventMsgs[event.InvolvedObject.Name] += fmt.Sprintf("\n%s", event.Message) + } + } + } + } + + for _, pod := range pods.Items { + for _, container := range pod.Spec.Containers { + // Watch each container's logs in a goroutine so we can stream them all concurrently. + go func(pod corev1.Pod, container corev1.Container) { + defer GinkgoRecover() + + Byf("Creating log watcher for controller %s/%s, container %s", kubesystem, pod.Name, container.Name) + logFile := path.Join(aboveMachinesPath, kubesystem, pod.Name, container.Name+".log") + if os.MkdirAll(filepath.Dir(logFile), 0755); err != nil { + // Failing to mkdir should not cause the test to fail + Byf("Error mkdir: %v", err) + return + } + + f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + // Failing to fetch logs should not cause the test to fail + Byf("Error opening file to write pod logs: %v", err) + return + } + defer f.Close() + + opts := &corev1.PodLogOptions{ + Container: container.Name, + Follow: true, + } + + podLogs, err := workload.GetClientSet().CoreV1().Pods(kubesystem).GetLogs(pod.Name, opts).Stream(ctx) + if err != nil { + // Failing to stream logs should not cause the test to fail + Byf("Error starting logs stream for pod %s/%s, container %s: %v", kubesystem, pod.Name, container.Name, err) + return + } + defer podLogs.Close() + + out := bufio.NewWriter(f) + defer out.Flush() + _, err = out.ReadFrom(podLogs) + if err != nil && err != io.ErrUnexpectedEOF { + // Failing to stream logs should not cause the test to fail + Byf("Got error while streaming logs for pod %s/%s, container %s: %v", kubesystem, pod.Name, container.Name, err) + } + }(pod, container) + } + + go func(pod corev1.Pod) { + defer GinkgoRecover() + + Byf("Collecting events for Pod %s/%s", kubesystem, pod.Name) + eventFile := path.Join(aboveMachinesPath, kubesystem, pod.Name, "pod-events.txt") + if err := os.MkdirAll(filepath.Dir(eventFile), 0755); err != nil { + // Failing to mkdir should not cause the test to fail + Byf("Error mkdir: %v", err) + return + } + + f, err := os.OpenFile(eventFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + // Failing to open the file should not cause the test to fail + Byf("Error opening file to write Pod events: %v", err) + return + } + defer f.Close() + + msg, ok := eventMsgs[pod.Name] + if !ok { + Byf("failed to find events of Pod %q", pod.Name) + return + } + + out := bufio.NewWriter(f) + defer out.Flush() + _, err = out.WriteString(msg) + if err != nil && err != io.ErrUnexpectedEOF { + // Failing to collect event message should not cause the test to fail + Byf("failed to collect event message of pod %s/%s: %v", kubesystem, pod.Name, err) + } + }(pod) + } +} + +func (acp *AzureClusterProxy) collectActivityLogs(ctx context.Context, namespace, name, aboveMachinesPath string) { + timeoutctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + settings, err := auth.GetSettingsFromEnvironment() + Expect(err).NotTo(HaveOccurred()) + subscriptionID := settings.GetSubscriptionID() + authorizer, err := settings.GetAuthorizer() + Expect(err).NotTo(HaveOccurred()) + activityLogsClient := insights.NewActivityLogsClient(subscriptionID) + activityLogsClient.Authorizer = authorizer + + workloadCluster, err := getAzureCluster(ctx, acp.GetClient(), namespace, name) + if err != nil { + // Failing to fetch logs should not cause the test to fail + Byf("Error fetching activity logs for cluster %s in namespace %s. Not able to find the workload cluster on the management cluster: %v", name, namespace, err) + return + } + groupName := workloadCluster.Spec.ResourceGroup + + start := time.Now().Add(-2 * time.Hour).UTC().Format(time.RFC3339) + end := time.Now().UTC().Format(time.RFC3339) + + itr, err := activityLogsClient.ListComplete(timeoutctx, fmt.Sprintf("eventTimestamp ge '%s' and eventTimestamp le '%s' and resourceGroupName eq '%s'", start, end, groupName), "") + if err != nil { + // Failing to fetch logs should not cause the test to fail + Byf("Error fetching activity logs for resource group %s: %v", groupName, err) + return + } + + logFile := path.Join(aboveMachinesPath, activitylog, groupName+".log") + Expect(os.MkdirAll(filepath.Dir(logFile), 0755)).To(Succeed()) + + f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + // Failing to fetch logs should not cause the test to fail + Byf("Error opening file to write activity logs: %v", err) + return + } + defer f.Close() + out := bufio.NewWriter(f) + defer out.Flush() + + for ; itr.NotDone(); err = itr.NextWithContext(timeoutctx) { + if err != nil { + Byf("Got error while iterating over activity logs for resource group %s: %v", groupName, err) + return + } + event := itr.Value() + if to.String(event.Category.Value) != "Policy" { + b, err := json.MarshalIndent(myEventData(event), "", " ") + if err != nil { + Byf("Got error converting activity logs data to json: %v", err) + } + if _, err = out.WriteString(string(b) + "\n"); err != nil { + Byf("Got error while writing activity logs for resource group %s: %v", groupName, err) + } + } + } +} diff --git a/test/e2e/azure_logcollector.go b/test/e2e/azure_logcollector.go index ee18cbc30748..d8cacc614571 100644 --- a/test/e2e/azure_logcollector.go +++ b/test/e2e/azure_logcollector.go @@ -176,6 +176,17 @@ func getHostname(m *clusterv1.Machine, isWindows bool) string { return hostname } +func getAzureCluster(ctx context.Context, managementClusterClient client.Client, namespace, name string) (*v1beta1.AzureCluster, error) { + key := client.ObjectKey{ + Namespace: namespace, + Name: name, + } + + azCluster := &v1beta1.AzureCluster{} + err := managementClusterClient.Get(context.TODO(), key, azCluster) + return azCluster, err +} + func getAzureMachine(ctx context.Context, managementClusterClient client.Client, m *clusterv1.Machine) (*v1beta1.AzureMachine, error) { key := client.ObjectKey{ Namespace: m.Spec.InfrastructureRef.Namespace, diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index b61c80bcc91a..11756b1faabe 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -20,230 +20,24 @@ limitations under the License. package e2e import ( - "bufio" "context" - "encoding/json" "flag" "fmt" - "io" "os" - "path" "path/filepath" "strings" "testing" - "time" - aadpodv1 "github.com/Azure/aad-pod-identity/pkg/apis/aadpodidentity/v1" - "github.com/Azure/azure-sdk-for-go/services/preview/monitor/mgmt/2019-06-01/insights" - "github.com/Azure/go-autorest/autorest/azure/auth" - "github.com/Azure/go-autorest/autorest/to" . "github.com/onsi/ginkgo" "github.com/onsi/ginkgo/config" "github.com/onsi/ginkgo/reporters" . "github.com/onsi/gomega" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" - expv1 "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1" - clusterv1exp "sigs.k8s.io/cluster-api/exp/api/v1beta1" capi_e2e "sigs.k8s.io/cluster-api/test/e2e" "sigs.k8s.io/cluster-api/test/framework" "sigs.k8s.io/cluster-api/test/framework/bootstrap" "sigs.k8s.io/cluster-api/test/framework/clusterctl" - "sigs.k8s.io/controller-runtime/pkg/client" ) -type ( - AzureClusterProxy struct { - framework.ClusterProxy - } - // myEventData is used to be able to Marshal insights.EventData into JSON - // see https://github.com/Azure/azure-sdk-for-go/issues/8224#issuecomment-614777550 - myEventData insights.EventData -) - -func NewAzureClusterProxy(name string, kubeconfigPath string, scheme *runtime.Scheme, options ...framework.Option) *AzureClusterProxy { - proxy, ok := framework.NewClusterProxy(name, kubeconfigPath, scheme, options...).(framework.ClusterProxy) - Expect(ok).To(BeTrue(), "framework.NewClusterProxy must implement capi_e2e.ClusterProxy") - return &AzureClusterProxy{ - ClusterProxy: proxy, - } -} - -func (acp *AzureClusterProxy) CollectWorkloadClusterLogs(ctx context.Context, namespace, name, outputPath string) { - Byf("Dumping workload cluster %s/%s logs", namespace, name) - acp.ClusterProxy.CollectWorkloadClusterLogs(ctx, namespace, name, outputPath) - - aboveMachinesPath := strings.Replace(outputPath, "/machines", "", 1) - - Byf("Dumping workload cluster %s/%s kube-system pod logs", namespace, name) - start := time.Now() - acp.collectPodLogs(ctx, namespace, name, aboveMachinesPath) - Byf("Fetching kube-system pod logs took %s", time.Since(start).String()) - - Byf("Dumping workload cluster %s/%s Azure activity log", namespace, name) - start = time.Now() - acp.collectActivityLogs(ctx, aboveMachinesPath) - Byf("Fetching activity logs took %s", time.Since(start).String()) -} - -func (acp *AzureClusterProxy) collectPodLogs(ctx context.Context, namespace string, name string, aboveMachinesPath string) { - workload := acp.GetWorkloadCluster(ctx, namespace, name) - pods := &corev1.PodList{} - Expect(workload.GetClient().List(ctx, pods, client.InNamespace(kubesystem))).To(Succeed()) - - events, err := workload.GetClientSet().CoreV1().Events(kubesystem).List(ctx, metav1.ListOptions{}) - eventMsgs := map[string]string{} - if err != nil { - Byf("failed to get events in kube-system namespace: %v", err) - } else { - for _, event := range events.Items { - if event.InvolvedObject.Kind == "Pod" { - if _, ok := eventMsgs[event.InvolvedObject.Name]; !ok { - eventMsgs[event.InvolvedObject.Name] = event.Message - } else { - eventMsgs[event.InvolvedObject.Name] += fmt.Sprintf("\n%s", event.Message) - } - } - } - } - - for _, pod := range pods.Items { - for _, container := range pod.Spec.Containers { - // Watch each container's logs in a goroutine so we can stream them all concurrently. - go func(pod corev1.Pod, container corev1.Container) { - defer GinkgoRecover() - - Byf("Creating log watcher for controller %s/%s, container %s", kubesystem, pod.Name, container.Name) - logFile := path.Join(aboveMachinesPath, kubesystem, pod.Name, container.Name+".log") - if os.MkdirAll(filepath.Dir(logFile), 0755); err != nil { - // Failing to mkdir should not cause the test to fail - Byf("Error mkdir: %v", err) - return - } - - f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - // Failing to fetch logs should not cause the test to fail - Byf("Error opening file to write pod logs: %v", err) - return - } - defer f.Close() - - opts := &corev1.PodLogOptions{ - Container: container.Name, - Follow: true, - } - - podLogs, err := workload.GetClientSet().CoreV1().Pods(kubesystem).GetLogs(pod.Name, opts).Stream(ctx) - if err != nil { - // Failing to stream logs should not cause the test to fail - Byf("Error starting logs stream for pod %s/%s, container %s: %v", kubesystem, pod.Name, container.Name, err) - return - } - defer podLogs.Close() - - out := bufio.NewWriter(f) - defer out.Flush() - _, err = out.ReadFrom(podLogs) - if err != nil && err != io.ErrUnexpectedEOF { - // Failing to stream logs should not cause the test to fail - Byf("Got error while streaming logs for pod %s/%s, container %s: %v", kubesystem, pod.Name, container.Name, err) - } - }(pod, container) - } - - go func(pod corev1.Pod) { - defer GinkgoRecover() - - Byf("Collecting events for Pod %s/%s", kubesystem, pod.Name) - eventFile := path.Join(aboveMachinesPath, kubesystem, pod.Name, "pod-events.txt") - if err := os.MkdirAll(filepath.Dir(eventFile), 0755); err != nil { - // Failing to mkdir should not cause the test to fail - Byf("Error mkdir: %v", err) - return - } - - f, err := os.OpenFile(eventFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - // Failing to open the file should not cause the test to fail - Byf("Error opening file to write Pod events: %v", err) - return - } - defer f.Close() - - msg, ok := eventMsgs[pod.Name] - if !ok { - Byf("failed to find events of Pod %q", pod.Name) - return - } - - out := bufio.NewWriter(f) - defer out.Flush() - _, err = out.WriteString(msg) - if err != nil && err != io.ErrUnexpectedEOF { - // Failing to collect event message should not cause the test to fail - Byf("failed to collect event message of pod %s/%s: %v", kubesystem, pod.Name, err) - } - }(pod) - } -} - -func (acp *AzureClusterProxy) collectActivityLogs(ctx context.Context, aboveMachinesPath string) { - timeoutctx, cancel := context.WithTimeout(ctx, 30*time.Second) - defer cancel() - settings, err := auth.GetSettingsFromEnvironment() - Expect(err).NotTo(HaveOccurred()) - subscriptionID := settings.GetSubscriptionID() - authorizer, err := settings.GetAuthorizer() - Expect(err).NotTo(HaveOccurred()) - activityLogsClient := insights.NewActivityLogsClient(subscriptionID) - activityLogsClient.Authorizer = authorizer - - groupName := os.Getenv(AzureResourceGroup) - start := time.Now().Add(-2 * time.Hour).UTC().Format(time.RFC3339) - end := time.Now().UTC().Format(time.RFC3339) - - itr, err := activityLogsClient.ListComplete(timeoutctx, fmt.Sprintf("eventTimestamp ge '%s' and eventTimestamp le '%s' and resourceGroupName eq '%s'", start, end, groupName), "") - if err != nil { - // Failing to fetch logs should not cause the test to fail - Byf("Error fetching activity logs for resource group %s: %v", groupName, err) - return - } - - logFile := path.Join(aboveMachinesPath, activitylog, groupName+".log") - Expect(os.MkdirAll(filepath.Dir(logFile), 0755)).To(Succeed()) - - f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - // Failing to fetch logs should not cause the test to fail - Byf("Error opening file to write activity logs: %v", err) - return - } - defer f.Close() - out := bufio.NewWriter(f) - defer out.Flush() - - for ; itr.NotDone(); err = itr.NextWithContext(timeoutctx) { - if err != nil { - Byf("Got error while iterating over activity logs for resource group %s: %v", groupName, err) - return - } - event := itr.Value() - if to.String(event.Category.Value) != "Policy" { - b, err := json.MarshalIndent(myEventData(event), "", " ") - if err != nil { - Byf("Got error converting activity logs data to json: %v", err) - } - if _, err = out.WriteString(string(b) + "\n"); err != nil { - Byf("Got error while writing activity logs for resource group %s: %v", groupName, err) - } - } - } -} - func init() { flag.StringVar(&configPath, "e2e.config", "", "path to the e2e config file") flag.StringVar(&artifactFolder, "e2e.artifacts-folder", "", "folder where e2e test artifact should be stored") @@ -270,9 +64,6 @@ var _ = SynchronizedBeforeSuite(func() []byte { Expect(configPath).To(BeAnExistingFile(), "Invalid test suite argument. e2e.config should be an existing file.") Expect(os.MkdirAll(artifactFolder, 0755)).To(Succeed(), "Invalid test suite argument. Can't create e2e.artifacts-folder %q", artifactFolder) - By("Initializing a runtime.Scheme with all the GVK relevant for this test") - scheme := initScheme() - Byf("Loading the e2e test configuration from %q", configPath) e2eConfig = loadE2EConfig(configPath) @@ -280,7 +71,7 @@ var _ = SynchronizedBeforeSuite(func() []byte { clusterctlConfigPath = createClusterctlLocalRepository(e2eConfig, filepath.Join(artifactFolder, "repository")) By("Setting up the bootstrap cluster") - bootstrapClusterProvider, bootstrapClusterProxy = setupBootstrapCluster(e2eConfig, scheme, useExistingCluster) + bootstrapClusterProvider, bootstrapClusterProxy = setupBootstrapCluster(e2eConfig, useExistingCluster) By("Initializing the bootstrap cluster") initBootstrapCluster(bootstrapClusterProxy, e2eConfig, clusterctlConfigPath, artifactFolder) @@ -305,8 +96,7 @@ var _ = SynchronizedBeforeSuite(func() []byte { kubeconfigPath := parts[3] e2eConfig = loadE2EConfig(configPath) - bootstrapClusterProxy = NewAzureClusterProxy("bootstrap", kubeconfigPath, initScheme(), - framework.WithMachineLogCollector(AzureLogCollector{})) + bootstrapClusterProxy = NewAzureClusterProxy("bootstrap", kubeconfigPath, framework.WithMachineLogCollector(AzureLogCollector{})) }) // Using a SynchronizedAfterSuite for controlling how to delete resources shared across ParallelNodes (~ginkgo threads). @@ -323,28 +113,6 @@ var _ = SynchronizedAfterSuite(func() { } }) -func initScheme() *runtime.Scheme { - scheme := runtime.NewScheme() - framework.TryAddDefaultSchemes(scheme) - Expect(infrav1.AddToScheme(scheme)).To(Succeed()) - Expect(expv1.AddToScheme(scheme)).To(Succeed()) - Expect(clusterv1exp.AddToScheme(scheme)).To(Succeed()) - // Add aadpodidentity v1 to the scheme. - aadPodIdentityGroupVersion := schema.GroupVersion{Group: aadpodv1.GroupName, Version: "v1"} - scheme.AddKnownTypes(aadPodIdentityGroupVersion, - &aadpodv1.AzureIdentity{}, - &aadpodv1.AzureIdentityList{}, - &aadpodv1.AzureIdentityBinding{}, - &aadpodv1.AzureIdentityBindingList{}, - &aadpodv1.AzureAssignedIdentity{}, - &aadpodv1.AzureAssignedIdentityList{}, - &aadpodv1.AzurePodIdentityException{}, - &aadpodv1.AzurePodIdentityExceptionList{}, - ) - metav1.AddToGroupVersion(scheme, aadPodIdentityGroupVersion) - return scheme -} - func loadE2EConfig(configPath string) *clusterctl.E2EConfig { config := clusterctl.LoadE2EConfig(context.TODO(), clusterctl.LoadE2EConfigInput{ConfigPath: configPath}) Expect(config).NotTo(BeNil(), "Failed to load E2E config from %s", configPath) @@ -371,7 +139,7 @@ func createClusterctlLocalRepository(config *clusterctl.E2EConfig, repositoryFol return clusterctlConfig } -func setupBootstrapCluster(config *clusterctl.E2EConfig, scheme *runtime.Scheme, useExistingCluster bool) (bootstrap.ClusterProvider, framework.ClusterProxy) { +func setupBootstrapCluster(config *clusterctl.E2EConfig, useExistingCluster bool) (bootstrap.ClusterProvider, framework.ClusterProxy) { var clusterProvider bootstrap.ClusterProvider kubeconfigPath := "" if !useExistingCluster { @@ -386,7 +154,7 @@ func setupBootstrapCluster(config *clusterctl.E2EConfig, scheme *runtime.Scheme, Expect(kubeconfigPath).To(BeAnExistingFile(), "Failed to get the kubeconfig file for the bootstrap cluster") } - clusterProxy := NewAzureClusterProxy("bootstrap", kubeconfigPath, scheme) + clusterProxy := NewAzureClusterProxy("bootstrap", kubeconfigPath) Expect(clusterProxy).NotTo(BeNil(), "Failed to get a bootstrap cluster proxy") return clusterProvider, clusterProxy diff --git a/test/logger.go b/test/logger.go new file mode 100644 index 000000000000..70d4fc9dc80d --- /dev/null +++ b/test/logger.go @@ -0,0 +1,114 @@ +//go:build e2e +// +build e2e + +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "context" + "flag" + "fmt" + "os" + "path" + "path/filepath" + + . "github.com/onsi/gomega" + + "sigs.k8s.io/cluster-api-provider-azure/test/e2e" + "sigs.k8s.io/cluster-api/test/framework" +) + +func Fail(message string, callerSkip ...int) { + panic(message) +} + +func main() { + // needed for ginkgo/gomega which is used by the capi test framework + RegisterFailHandler(Fail) + + // there is a package that is imported that has already defined kubeconfig + // this way we don't collide with any other packages as well + fs := flag.NewFlagSet("logger", flag.ExitOnError) + + // required flags + clustername := fs.String("name", "", "Name of the workload cluster to collect logs for") + + // optional flags that default + namespace := fs.String("namespace", "", "namot include the command name. Must be called after all flags in the FlagSet are defined and before flags are accessed by the program. The return value will be ErrHelp if -help or -h were set buespace on management cluster to collect logs for") + artifactFolder := fs.String("artifacts-folder", getArtifactsFolder(), "folder to store cluster logs") + kubeconfigPath := fs.String("kubeconfig", getKubeConfigPath(), "The kubeconfig for the management cluster") + + if err := fs.Parse(os.Args[1:]); err != nil { + fmt.Println("Unable to parse command flags") + os.Exit(1) + } + + // use the cluster name as the namespace which is default in e2e tests + if *namespace == "" { + namespace = clustername + } + + bootstrapClusterProxy := e2e.NewAzureClusterProxy("bootstrap", *kubeconfigPath, framework.WithMachineLogCollector(e2e.AzureLogCollector{})) + + // Set up log paths + clusterLogPath := filepath.Join(*artifactFolder, "clusters", *clustername) + resourcesYaml := filepath.Join(*artifactFolder, "clusters", "bootstrap", "resources") + managementClusterLogPath := filepath.Join(*artifactFolder, "clusters", "bootstrap", "controllers") + + fmt.Printf("Collecting logs for cluster %s in namespace %s and dumping logs to %s\n", *clustername, *namespace, *artifactFolder) + collectManagementClusterLogs(bootstrapClusterProxy, managementClusterLogPath, namespace, resourcesYaml) + bootstrapClusterProxy.CollectWorkloadClusterLogs(context.TODO(), *namespace, *clustername, clusterLogPath) +} + +func collectManagementClusterLogs(bootstrapClusterProxy *e2e.AzureClusterProxy, managementClusterLogPath string, namespace *string, workLoadClusterLogPath string) { + controllersDeployments := framework.GetControllerDeployments(context.TODO(), framework.GetControllerDeploymentsInput{ + Lister: bootstrapClusterProxy.GetClient(), + }) + for _, deployment := range controllersDeployments { + framework.WatchDeploymentLogs(context.TODO(), framework.WatchDeploymentLogsInput{ + GetLister: bootstrapClusterProxy.GetClient(), + ClientSet: bootstrapClusterProxy.GetClientSet(), + Deployment: deployment, + LogPath: managementClusterLogPath, + }) + } + + framework.DumpAllResources(context.TODO(), framework.DumpAllResourcesInput{ + Lister: bootstrapClusterProxy.GetClient(), + Namespace: *namespace, + LogPath: workLoadClusterLogPath, + }) +} + +func getKubeConfigPath() string { + config := os.Getenv("KUBECONFIG") + if config == "" { + d, _ := os.UserHomeDir() + return path.Join(d, ".kube", "config") + } + + return config +} + +func getArtifactsFolder() string { + artifacts := os.Getenv("ARTIFACTS") + if artifacts == "" { + return "_artifacts" + } + return artifacts +}