From d38a90df2b68daf7b96f1300e2fca308dc5498cc Mon Sep 17 00:00:00 2001 From: David Arthur Date: Sun, 6 Oct 2024 02:09:24 -0400 Subject: [PATCH] KAFKA-17672 Run quarantined tests separately (#17329) Introduce new quarantinedTest that excludes tests tagged with "flaky". Also introduce two new build parameters "maxQuarantineTestRetries" and "maxQuarantineTestRetryFailures". Reviewers: Chia-Ping Tsai --- .github/workflows/build.yml | 9 +-- .github/workflows/deflake.yml | 4 +- README.md | 24 ++++--- build.gradle | 63 ++++++++++++++++++- .../OffsetsApiIntegrationTest.java | 2 + .../transaction/ProducerIdManagerTest.scala | 2 + .../apache/kafka/common/test/api/Flaky.java | 40 ++++++++++++ 7 files changed, 130 insertions(+), 14 deletions(-) create mode 100644 test-common/test-common-api/src/main/java/org/apache/kafka/common/test/api/Flaky.java diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2dc3b5c6793f3..3b0feba61583b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -130,18 +130,19 @@ jobs: timeout ${TIMEOUT_MINUTES}m ./gradlew --build-cache --continue --no-scan \ -PtestLoggingEvents=started,passed,skipped,failed \ -PmaxParallelForks=2 \ - -PmaxTestRetries=1 -PmaxTestRetryFailures=10 \ + -PmaxTestRetries=1 -PmaxTestRetryFailures=3 \ + -PmaxQuarantineTestRetries=3 -PmaxQuarantineTestRetryFailures=0 \ -PcommitId=xxxxxxxxxxxxxxxx \ - test + quarantinedTest test exitcode="$?" echo "exitcode=$exitcode" >> $GITHUB_OUTPUT - - name: Archive JUnit reports + - name: Archive JUnit HTML reports uses: actions/upload-artifact@v4 id: junit-upload-artifact with: name: junit-reports-${{ matrix.java }} path: | - **/build/reports/tests/test/* + **/build/reports/tests/* compression-level: 9 if-no-files-found: ignore - name: Archive Thread Dumps diff --git a/.github/workflows/deflake.yml b/.github/workflows/deflake.yml index d41cacdf96cc1..85e4ce959f0c7 100644 --- a/.github/workflows/deflake.yml +++ b/.github/workflows/deflake.yml @@ -60,7 +60,9 @@ jobs: -PtestLoggingEvents=started,passed,skipped,failed \ -PignoreFailures=true -PmaxParallelForks=2 \ -Pkafka.cluster.test.repeat=${{ inputs.test-repeat }} \ - ${{ inputs.test-module }}:test --tests ${{ inputs.test-pattern }} + -PmaxTestRetries=${{ inputs.test-repeat }} -PmaxTestRetryFailures=0 \ + -PmaxQuarantineTestRetries=${{ inputs.test-repeat }} -PmaxQuarantineTestRetryFailures=0 \ + ${{ inputs.test-module }}:test ${{ inputs.test-module }}:quarantinedTest --tests ${{ inputs.test-pattern }} exitcode="$?" echo "exitcode=$exitcode" >> $GITHUB_OUTPUT - name: Archive JUnit reports diff --git a/README.md b/README.md index 7bacfee75d825..6e90cac5c520c 100644 --- a/README.md +++ b/README.md @@ -34,14 +34,16 @@ Follow instructions in https://kafka.apache.org/quickstart ./gradlew docsJar # builds both (if applicable) javadoc and scaladoc jars for each module ### Run unit/integration tests ### - ./gradlew test # runs both unit and integration tests + ./gradlew test # runs both unit and integration tests ./gradlew unitTest ./gradlew integrationTest + ./gradlew quarantinedTest # runs the quarantined tests + ### Force re-running tests without code change ### - ./gradlew test --rerun - ./gradlew unitTest --rerun - ./gradlew integrationTest --rerun + ./gradlew test --rerun-tasks + ./gradlew unitTest --rerun-tasks + ./gradlew integrationTest --rerun-tasks ### Running a particular unit/integration test ### ./gradlew clients:test --tests RequestResponseTest @@ -64,11 +66,17 @@ to `log4j.logger.org.apache.kafka=INFO` and then run: And you should see `INFO` level logs in the file under the `clients/build/test-results/test` directory. ### Specifying test retries ### -By default, each failed test is retried once up to a maximum of five retries per test run. Tests are retried at the end of the test task. Adjust these parameters in the following way: +By default, each failed test is retried once up to a maximum of three total retries per test run. +Tests are retried at the end of the test task. Adjust these parameters in the following way: - ./gradlew test -PmaxTestRetries=1 -PmaxTestRetryFailures=5 - -See [Test Retry Gradle Plugin](https://github.com/gradle/test-retry-gradle-plugin) for more details. + ./gradlew test -PmaxTestRetries=1 -PmaxTestRetryFailures=3 + +Additionally, quarantined tests are automatically retried three times up to a total of +20 retries per run. This is controlled by similar parameters. + + ./gradlew test -PmaxQuarantineTestRetries=3 -PmaxQuarantineTestRetryFailures=20 + +See [Test Retry Gradle Plugin](https://github.com/gradle/test-retry-gradle-plugin) for and [build.yml](.github/workflows/build.yml) more details. ### Generating test coverage reports ### Generate coverage reports for the whole project: diff --git a/build.gradle b/build.gradle index ce47cc538d6cc..ec55a370a7bd8 100644 --- a/build.gradle +++ b/build.gradle @@ -85,6 +85,9 @@ ext { userMaxTestRetries = project.hasProperty('maxTestRetries') ? maxTestRetries.toInteger() : 0 userMaxTestRetryFailures = project.hasProperty('maxTestRetryFailures') ? maxTestRetryFailures.toInteger() : 0 + userMaxQuarantineTestRetries = project.hasProperty('maxQuarantineTestRetries') ? maxQuarantineTestRetries.toInteger() : 0 + userMaxQuarantineTestRetryFailures = project.hasProperty('maxQuarantineTestRetryFailures') ? maxQuarantineTestRetryFailures.toInteger() : 0 + skipSigning = project.hasProperty('skipSigning') && skipSigning.toBoolean() shouldSign = !skipSigning && !version.endsWith("SNAPSHOT") @@ -501,6 +504,7 @@ subprojects { useJUnitPlatform { includeEngines 'junit-jupiter' + excludeTags 'flaky' } develocity { @@ -524,7 +528,7 @@ subprojects { if (ext.isGithubActions) { def dest = rootProject.layout.buildDirectory.dir("junit-xml/${project.name}").get().asFile println "Copy JUnit XML for ${project.name} to $dest" - ant.copy(todir: "$dest") { + ant.copy(todir: "$dest/test") { ant.fileset(dir: "${test.reports.junitXml.entryPoint}") } @@ -537,6 +541,62 @@ subprojects { } } + task quarantinedTest(type: Test, dependsOn: compileJava) { + ext { + isGithubActions = System.getenv('GITHUB_ACTIONS') != null + } + + // Disable caching and up-to-date for this task. We always want quarantined tests + // to run and never want to cache their results. Since we do this, we can avoid + // explicitly failing the build like we do in "test" with ext.hadFailure. + outputs.upToDateWhen { false } + outputs.cacheIf { false } + + maxParallelForks = maxTestForks + ignoreFailures = userIgnoreFailures + + maxHeapSize = defaultMaxHeapSize + jvmArgs = defaultJvmArgs + + // KAFKA-17433 Used by deflake.yml github action to repeat individual tests + systemProperty("kafka.cluster.test.repeat", project.findProperty("kafka.cluster.test.repeat")) + + testLogging { + events = userTestLoggingEvents ?: testLoggingEvents + showStandardStreams = userShowStandardStreams ?: testShowStandardStreams + exceptionFormat = testExceptionFormat + displayGranularity = 0 + } + logTestStdout.rehydrate(delegate, owner, this)() + + useJUnitPlatform { + includeEngines 'junit-jupiter' + includeTags 'flaky' + } + + develocity { + testRetry { + maxRetries = userMaxQuarantineTestRetries + maxFailures = userMaxQuarantineTestRetryFailures + } + } + + // This closure will copy JUnit XML files out of the sub-project's build directory and into + // a top-level build/junit-xml directory. This is necessary to avoid reporting on tests which + // were not run, but instead were restored via FROM-CACHE. See KAFKA-17479 for more details. + doLast { + if (ext.isGithubActions) { + def dest = rootProject.layout.buildDirectory.dir("junit-xml/${project.name}").get().asFile + println "Copy JUnit XML for ${project.name} to $dest" + ant.copy(todir: "$dest/quarantinedTest", failonerror: "false") { + ant.fileset(dir: "${quarantinedTest.reports.junitXml.entryPoint}") { + ant.include(name: "**/*.xml") + } + } + } + } + } + task integrationTest(type: Test, dependsOn: compileJava) { maxParallelForks = maxTestForks ignoreFailures = userIgnoreFailures @@ -3507,6 +3567,7 @@ project(':connect:runtime') { testImplementation project(':storage') testImplementation project(':connect:test-plugins') testImplementation project(':server-common').sourceSets.test.output + testImplementation project(':test-common:test-common-api') testImplementation libs.junitJupiter testImplementation libs.mockitoCore diff --git a/connect/runtime/src/test/java/org/apache/kafka/connect/integration/OffsetsApiIntegrationTest.java b/connect/runtime/src/test/java/org/apache/kafka/connect/integration/OffsetsApiIntegrationTest.java index 0ac514039f084..0f013129c7514 100644 --- a/connect/runtime/src/test/java/org/apache/kafka/connect/integration/OffsetsApiIntegrationTest.java +++ b/connect/runtime/src/test/java/org/apache/kafka/connect/integration/OffsetsApiIntegrationTest.java @@ -19,6 +19,7 @@ import org.apache.kafka.clients.CommonClientConfigs; import org.apache.kafka.clients.admin.Admin; import org.apache.kafka.clients.admin.ConsumerGroupListing; +import org.apache.kafka.common.test.api.Flaky; import org.apache.kafka.connect.runtime.ConnectorConfig; import org.apache.kafka.connect.runtime.SourceConnectorConfig; import org.apache.kafka.connect.runtime.distributed.DistributedConfig; @@ -331,6 +332,7 @@ public void testAlterSinkConnectorOffsets() throws Exception { alterAndVerifySinkConnectorOffsets(baseSinkConnectorConfigs(), connect.kafka()); } + @Flaky("KAFKA-15914") @Test public void testAlterSinkConnectorOffsetsOverriddenConsumerGroupId() throws Exception { Map connectorConfigs = baseSinkConnectorConfigs(); diff --git a/core/src/test/scala/unit/kafka/coordinator/transaction/ProducerIdManagerTest.scala b/core/src/test/scala/unit/kafka/coordinator/transaction/ProducerIdManagerTest.scala index eef0b31e415a3..f74d0d55c90a9 100644 --- a/core/src/test/scala/unit/kafka/coordinator/transaction/ProducerIdManagerTest.scala +++ b/core/src/test/scala/unit/kafka/coordinator/transaction/ProducerIdManagerTest.scala @@ -24,6 +24,7 @@ import org.apache.kafka.common.errors.CoordinatorLoadInProgressException import org.apache.kafka.common.message.AllocateProducerIdsResponseData import org.apache.kafka.common.protocol.Errors import org.apache.kafka.common.requests.AllocateProducerIdsResponse +import org.apache.kafka.common.test.api.Flaky import org.apache.kafka.common.utils.{MockTime, Time} import org.apache.kafka.server.NodeToControllerChannelManager import org.apache.kafka.server.common.ProducerIdsBlock @@ -40,6 +41,7 @@ import java.util.concurrent.{ConcurrentLinkedQueue, CountDownLatch, Executors, T import scala.collection.mutable import scala.util.{Failure, Success, Try} +@Flaky("KAFKA-17654") class ProducerIdManagerTest { var brokerToController: NodeToControllerChannelManager = mock(classOf[NodeToControllerChannelManager]) diff --git a/test-common/test-common-api/src/main/java/org/apache/kafka/common/test/api/Flaky.java b/test-common/test-common-api/src/main/java/org/apache/kafka/common/test/api/Flaky.java new file mode 100644 index 0000000000000..c277ad5b8dc4c --- /dev/null +++ b/test-common/test-common-api/src/main/java/org/apache/kafka/common/test/api/Flaky.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.kafka.common.test.api; + +import org.junit.jupiter.api.Tag; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Target({ElementType.TYPE, ElementType.METHOD}) +@Retention(RetentionPolicy.RUNTIME) +@Tag("flaky") +public @interface Flaky { + /** + * Required reference to a KAFKA Jira ticket. + */ + String value(); + + /** + * Optional comment describing the reason for quarantined. + */ + String comment() default ""; +}