Skip to content

Commit

Permalink
Code review, gather logs from gcloud
Browse files Browse the repository at this point in the history
  • Loading branch information
sjberman committed Mar 11, 2024
1 parent 020a0d8 commit 116dc20
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 53 deletions.
2 changes: 1 addition & 1 deletion tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ run-tests-on-vm: ## Run the functional tests on a GCP VM

.PHONY: nfr-test
nfr-test: ## Run the NFR tests on a GCP VM
bash scripts/run-tests-gcp-vm.sh true
NFR=true bash scripts/run-tests-gcp-vm.sh

.PHONY: start-longevity-test
start-longevity-test: ## Start the longevity test to run for 4 days in GKE
Expand Down
8 changes: 6 additions & 2 deletions tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,13 +218,17 @@ To start the longevity test, set up your VM (`create-and-setup-vm`) and run
make start-longevity-test
```

<!-- -->
> Note: If you want to change the time period for which the test runs, update the `wrk` commands in `suite/scripts/longevity-wrk.sh` to the time period you want, and run `make sync-files-to-vm`.
<!-- -->
> Note: If you want to re-run the longevity test, you need to clear out the `cafe.example.com` entry from the `/etc/hosts` file on your VM.
You can verify that the test is working by checking the nginx logs to see traffic flowing, and by checking that the cronjob is running and redeploying apps.

To complete the longevity test and collect results, first visit the [GCP Monitoring Dashboards](https://console.cloud.google.com/monitoring/dashboards) page and select the `NGF Longevity Test` dashboard. Take PNG screenshots of each chart for the time period in which your test ran, and save those to be added to the results file.
After 4 days (96h), you can complete the longevity test and collect the results. To ensure that traffic has stopped flowing, SSH into the VM using `gcloud compute ssh` and run `ps aux | grep wrk` to verify that the `wrk` commands are no longer running. Then, visit the [GCP Monitoring Dashboards](https://console.cloud.google.com/monitoring/dashboards) page and select the `NGF Longevity Test` dashboard. Take PNG screenshots of each chart for the time period in which your test ran, and save those to be added to the results file.

Next, run:
Finally, run:

```makefile
make stop-longevity-test
Expand Down
3 changes: 2 additions & 1 deletion tests/scripts/create-gke-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ gcloud container clusters create ${GKE_CLUSTER_NAME} \
--enable-private-nodes \
--master-ipv4-cidr 172.16.${ip_random_digit}.32/28 \
--metadata=block-project-ssh-keys=TRUE \
--monitoring=SYSTEM,POD,DEPLOYMENT
--monitoring=SYSTEM,POD,DEPLOYMENT \
--logging=SYSTEM,WORKLOAD

# Add current IP to GKE master control node access, if this script is not invoked during a CI run.
if [ "${IS_CI}" = "false" ]; then
Expand Down
26 changes: 24 additions & 2 deletions tests/scripts/run-tests-gcp-vm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

NFR=${1:-false}

source scripts/vars.env

SCRIPT=run-tests.sh
Expand All @@ -21,3 +19,27 @@ gcloud compute ssh --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} username@
if [ "${NFR}" = "true" ]; then
gcloud compute scp --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} --recurse username@${RESOURCE_NAME}:~/nginx-gateway-fabric/tests/results .
fi

## If tearing down the longevity test, we need to collect logs from gcloud and add to the results.
##
## Reads:   STOP_LONGEVITY, NGF_VERSION, TAG, SCRIPT_DIR, GKE_PROJECT, RESOURCE_NAME
## Appends: an "Error Logs" section to results/longevity/<version>/<version>.md
if [ "${STOP_LONGEVITY}" = "true" ]; then
    ## Use NGF_VERSION when it is set and non-empty, otherwise fall back to TAG.
    version="${NGF_VERSION:-${TAG}}"

    results="${SCRIPT_DIR}/../results/longevity/${version}/${version}.md"
    printf '\n## Error Logs\n\n' >> "${results}"

    ## ngf error logs (the same query is run twice because an entry carries either a text or a JSON payload)
    ngfErrText=$(gcloud logging read --project="${GKE_PROJECT}" 'resource.labels.cluster_name='"${RESOURCE_NAME}"' AND resource.type=k8s_container AND resource.labels.container_name=nginx-gateway AND severity=ERROR AND SEARCH("error")' --format "value(textPayload)")
    ngfErrJSON=$(gcloud logging read --project="${GKE_PROJECT}" 'resource.labels.cluster_name='"${RESOURCE_NAME}"' AND resource.type=k8s_container AND resource.labels.container_name=nginx-gateway AND severity=ERROR AND SEARCH("error")' --format "value(jsonPayload)")
    ## Log text is passed as a printf ARGUMENT, never as part of the format string:
    ## log lines routinely contain '%' and backslashes, which printf would otherwise interpret.
    printf '### nginx-gateway\n%s\n%s\n\n' "${ngfErrText}" "${ngfErrJSON}" >> "${results}"

    ## nginx error logs
    ## NOTE: in the Logging query language OR binds tighter than AND, so the three SEARCH terms
    ## form one group that is ANDed with the preceding restrictions.
    ngxErr=$(gcloud logging read --project="${GKE_PROJECT}" 'resource.labels.cluster_name='"${RESOURCE_NAME}"' AND resource.type=k8s_container AND resource.labels.container_name=nginx AND severity=ERROR AND SEARCH("`[warn]`") OR SEARCH("`[error]`") OR SEARCH("`[emerg]`")' --format "value(textPayload)")
    printf '### nginx\n%s\n\n' "${ngxErr}" >> "${results}"

    ## nginx non-200 responses (also filter out 499 since wrk cancels connections)
    ngxNon200=$(gcloud logging read --project="${GKE_PROJECT}" 'resource.labels.cluster_name='"${RESOURCE_NAME}"' AND resource.type=k8s_container AND resource.labels.container_name=nginx AND "GET" "HTTP/1.1" -"200" -"499" -"client prematurely closed connection"' --format "value(textPayload)")
    printf '%s\n\n' "${ngxNon200}" >> "${results}"
fi
54 changes: 7 additions & 47 deletions tests/suite/longevity_test.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
package suite

import (
"bufio"
"bytes"
"context"
"fmt"
"io"
"os"
"path/filepath"
"strings"
Expand Down Expand Up @@ -81,57 +77,21 @@ var _ = Describe("Longevity", Label("longevity-setup", "longevity-teardown"), fu
Expect(err).ToNot(HaveOccurred())

Expect(framework.WriteContent(resultsFile, "\n## Traffic\n"))
writeTrafficResults(resultsFile, homeDir, "coffee.txt", "HTTP")
writeTrafficResults(resultsFile, homeDir, "tea.txt", "HTTPS")

// gather any error logs
names, err := framework.GetReadyNGFPodNames(k8sClient, ngfNamespace, releaseName, timeoutConfig.GetTimeout)
Expect(err).ToNot(HaveOccurred())

Expect(framework.WriteContent(resultsFile, "\n## Error Logs\n"))
writeErrorLogs(resultsFile, names[0], "nginx-gateway")
writeErrorLogs(resultsFile, names[0], "nginx")
Expect(writeTrafficResults(resultsFile, homeDir, "coffee.txt", "HTTP")).To(Succeed())
Expect(writeTrafficResults(resultsFile, homeDir, "tea.txt", "HTTPS")).To(Succeed())

Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed())
Expect(resourceManager.Delete([]client.Object{ns})).To(Succeed())
})
})

func writeTrafficResults(resultsFile *os.File, homeDir, filename, testname string) {
func writeTrafficResults(resultsFile *os.File, homeDir, filename, testname string) error {
file := fmt.Sprintf("%s/%s", homeDir, filename)
content, err := os.ReadFile(file)
Expect(err).ToNot(HaveOccurred())

formattedContent := fmt.Sprintf("%s:\n\n```text\n%s```\n", testname, string(content))
Expect(framework.WriteContent(resultsFile, formattedContent)).To(Succeed())
}

func writeErrorLogs(resultsFile *os.File, pod, container string) {
logReq := clientGoClient.CoreV1().Pods(ngfNamespace).GetLogs(pod, &core.PodLogOptions{Container: container})

ctx, cancel := context.WithTimeout(context.Background(), timeoutConfig.GetTimeout)
defer cancel()

logs, err := logReq.Stream(ctx)
Expect(err).ToNot(HaveOccurred())
defer logs.Close()

buf := new(bytes.Buffer)
_, err = io.Copy(buf, logs)
Expect(err).ToNot(HaveOccurred())

Expect(framework.WriteContent(resultsFile, fmt.Sprintf("\n### %s\n", container)))

scanner := bufio.NewScanner(strings.NewReader(buf.String()))
for scanner.Scan() {
line := scanner.Text()
if isError(line) {
Expect(framework.WriteContent(resultsFile, line)).To(Succeed())
}
if err != nil {
return err
}
Expect(scanner.Err()).ToNot(HaveOccurred())
}

func isError(line string) bool {
return strings.Contains(line, "error") || strings.Contains(line, "warn") || strings.Contains(line, "emerg")
formattedContent := fmt.Sprintf("%s:\n\n```text\n%s```\n", testname, string(content))
return framework.WriteContent(resultsFile, formattedContent)
}

0 comments on commit 116dc20

Please sign in to comment.