Replace the usage of tianon/true images with docker compose up --wait #1720

Closed
15 changes: 5 additions & 10 deletions internal/compose/compose.go
@@ -47,7 +47,6 @@ type Project struct {
name string
composeFilePaths []string

dockerComposeV1 bool
dockerComposeStandalone bool
disableANSI bool
disablePullProgressInformation bool
@@ -199,20 +198,19 @@ func NewProject(name string, paths ...string) (*Project, error) {
// Passing a nil context here because we are on initialization.
ver, err := c.dockerComposeVersion(context.Background())
if err != nil {
logger.Errorf("Unable to determine Docker Compose version: %v. Defaulting to 1.x", err)
c.dockerComposeV1 = true
logger.Errorf("Unable to determine Docker Compose version: %v. Defaulting to 2.x", err)
return &c, nil
}

versionMessage := fmt.Sprintf("Determined Docker Compose version: %v", ver)
logger.Debug(versionMessage)

if ver.Major() == 1 {
versionMessage = fmt.Sprintf("%s, the tool will use Compose V1", versionMessage)
c.dockerComposeV1 = true
return nil, fmt.Errorf("the docker compose version %v is not supported", ver)
}
logger.Debug(versionMessage)

v, ok = os.LookupEnv(DisableVerboseOutputComposeEnv)
if !c.dockerComposeV1 && ok && strings.ToLower(v) != "false" {
if ok && strings.ToLower(v) != "false" {
if c.composeVersion.LessThan(semver.MustParse("2.19.0")) {
c.disableANSI = true
} else {
@@ -543,8 +541,5 @@ func (p *Project) dockerComposeVersion(ctx context.Context) (*semver.Version, er

// ContainerName returns the container name for the service.
func (p *Project) ContainerName(serviceName string) string {
if p.dockerComposeV1 {
return fmt.Sprintf("%s_%s_1", p.name, serviceName)
}
return fmt.Sprintf("%s-%s-1", p.name, serviceName)
}
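
For readers skimming this diff: the version handling above now boils down to parsing the output of docker compose version --short with semver and rejecting Compose V1 outright. Below is a minimal, self-contained sketch of that gate. It is not the actual NewProject code; the helper name, the use of the Masterminds semver library (suggested by the MustParse/LessThan calls in the diff), and the simplified ANSI handling are assumptions.

package main

import (
	"fmt"
	"os/exec"
	"strings"

	"github.com/Masterminds/semver/v3"
)

// checkComposeVersion mirrors the gate in the diff above: parse the Compose
// version, reject V1, and report whether ANSI output should be disabled for
// versions older than 2.19.0. Hypothetical helper, not the NewProject code.
func checkComposeVersion() (*semver.Version, bool, error) {
	out, err := exec.Command("docker", "compose", "version", "--short").Output()
	if err != nil {
		return nil, false, fmt.Errorf("running docker compose version: %w", err)
	}
	raw := strings.TrimSpace(string(out))
	ver, err := semver.NewVersion(raw)
	if err != nil {
		return nil, false, fmt.Errorf("parsing version %q: %w", raw, err)
	}
	if ver.Major() == 1 {
		return nil, false, fmt.Errorf("the docker compose version %v is not supported", ver)
	}
	disableANSI := ver.LessThan(semver.MustParse("2.19.0"))
	return ver, disableANSI, nil
}

func main() {
	ver, disableANSI, err := checkComposeVersion()
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Printf("Determined Docker Compose version: %s (disable ANSI: %v)\n", ver, disableANSI)
}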
42 changes: 0 additions & 42 deletions internal/stack/_static/docker-compose-stack.yml.tmpl
@@ -20,13 +20,6 @@ services:
ports:
- "127.0.0.1:9200:9200"

elasticsearch_is_ready:
image: tianon/true
platform: linux/amd64
depends_on:
elasticsearch:
condition: service_healthy

kibana:
image: "${KIBANA_IMAGE_REF}"
depends_on:
@@ -50,13 +43,6 @@ services:
ports:
- "127.0.0.1:5601:5601"

kibana_is_ready:
image: tianon/true
platform: linux/amd64
depends_on:
kibana:
condition: service_healthy

package-registry:
build:
context: ../../../
@@ -83,13 +69,6 @@ services:
- "127.0.0.1:8080:8080"
- "127.0.0.1:9000:9000"

package-registry_is_ready:
image: tianon/true
platform: linux/amd64
depends_on:
package-registry:
condition: service_healthy

fleet-server:
image: "${ELASTIC_AGENT_IMAGE_REF}"
depends_on:
@@ -124,13 +103,6 @@
- "127.0.0.1:8200:8200"
{{ end }}

fleet-server_is_ready:
image: tianon/true
platform: linux/amd64
depends_on:
fleet-server:
condition: service_healthy

elastic-agent:
image: "${ELASTIC_AGENT_IMAGE_REF}"
depends_on:
@@ -154,13 +126,6 @@
source: ../../../tmp/service_logs/
target: /run/service_logs/

elastic-agent_is_ready:
image: tianon/true
platform: linux/amd64
depends_on:
elastic-agent:
condition: service_healthy

{{ $logstash_enabled := fact "logstash_enabled" }}
{{ if eq $logstash_enabled "true" }}
logstash:
@@ -187,11 +152,4 @@ services:
- ELASTIC_USER=elastic
- ELASTIC_PASSWORD=changeme
- ELASTIC_HOSTS=https://127.0.0.1:9200

logstash_is_ready:
image: tianon/true
platform: linux/amd64
depends_on:
logstash:
condition: service_healthy
{{ end }}
14 changes: 0 additions & 14 deletions internal/stack/_static/serverless-docker-compose.yml.tmpl
@@ -20,13 +20,6 @@ services:
target: /run/service_logs/
- "../certs/ca-cert.pem:/etc/ssl/certs/elastic-package.pem"

elastic-agent_is_ready:
image: tianon/true
platform: linux/amd64
depends_on:
elastic-agent:
condition: service_healthy

{{ $logstash_enabled := fact "logstash_enabled" }}
{{ if eq $logstash_enabled "true" }}
logstash:
@@ -50,11 +43,4 @@ services:
- ELASTIC_USER={{ fact "username" }}
- ELASTIC_PASSWORD={{ fact "password" }}
- ELASTIC_HOSTS={{ fact "elasticsearch_host" }}

logstash_is_ready:
image: tianon/true
platform: linux/amd64
depends_on:
logstash:
condition: service_healthy
{{ end }}
20 changes: 4 additions & 16 deletions internal/stack/compose.go
@@ -69,7 +69,7 @@ func dockerComposeBuild(ctx context.Context, options Options) error {
withEnv(stackVariantAsEnv(options.StackVersion)).
withEnvs(options.Profile.ComposeEnvVars()).
build(),
Services: withIsReadyServices(withDependentServices(options.Services)),
Services: withDependentServices(options.Services),
}

if err := c.Build(ctx, opts); err != nil {
@@ -95,7 +95,7 @@ func dockerComposePull(ctx context.Context, options Options) error {
withEnv(stackVariantAsEnv(options.StackVersion)).
withEnvs(options.Profile.ComposeEnvVars()).
build(),
Services: withIsReadyServices(withDependentServices(options.Services)),
Services: withDependentServices(options.Services),
}

if err := c.Pull(ctx, opts); err != nil {
@@ -112,7 +112,7 @@ func dockerComposeUp(ctx context.Context, options Options) error {

var args []string
if options.DaemonMode {
args = append(args, "-d")
args = append(args, "-d", "--wait", "--wait-timeout", fmt.Sprintf("%d", 600))
Contributor:

Testing on the main branch without using detach mode (-d), if the elastic-agent container fails... docker-compose up is retried:

dependency failed to start: container elastic-package-stack-elastic-agent-1 exited (1)
2024/03/13 19:13:05 DEBUG output command: /usr/bin/docker ps -a --filter label=com.docker.compose.project=elastic-package-stack --format {{.ID}}
2024/03/13 19:13:05 DEBUG output command: /usr/bin/docker inspect d5df6764fb51 09f479d5637e 644fbbaa8557 341a68995249 831674f8c23f 920f6d104b80 25de6ef50f35 8cd0417a9622 afa7b068ae1b 44f10e51526b
Elastic Agent failed to start, trying again in 10s.
2024/03/13 19:13:16 DEBUG running command: /usr/bin/docker compose version --short
2024/03/13 19:13:16 DEBUG Determined Docker Compose version: 2.24.6
2024/03/13 19:13:16 DEBUG running command: /usr/bin/docker compose -f /home/mariorodriguez/.elastic-package/profiles/default/stack/snapshot.yml -p elastic-package-stack up
[+] Running 9/0

With this change, it looks like elastic-package does not get the error (elastic-agent failed to start) and cannot retry the docker-compose up, leaving the scenario with the elastic agent in an exited status. It looks like the _is_ready containers help in this case, though I'm not sure how.

Contributor (Author):

@mrodm this is weird, as the e2e tests in CI have caught a case where this happens. Can you help with an example that hopefully reproduces what you see?

Contributor:

> this is weird, as the e2e tests in CI have caught a case where this happens.

Steps in CI use detached mode (-d):

elastic-package stack up -d -v

> Can you help with an example that hopefully reproduces what you see?

Sure! Here I was referring to the case where that flag is not used:

elastic-package stack up -v

Not all runs of this command fail with this, so it needs to be repeated until that error is hit.

After some retries running the above command, the elastic-agent-1 container could not start and elastic-package did not try to restart the container as it would have before:

elastic-agent-1     | {"log.level":"info","@timestamp":"2024-03-26T11:51:50.649Z","log.origin":{"file.name":"cmd/enroll_cmd.go","file.line":505},"message":"1st enrollment attempt failed, retrying for 10m0s, every 1m0s enrolling to URL: https://fleet-server:8220/","ecs.version":"1.6.0"}
elastic-agent-1     | Error: fail to enroll: fail to execute request to fleet-server: EOF
elastic-agent-1     | For help, please see our troubleshooting guide at https://www.elastic.co/guide/en/fleet/8.12/fleet-troubleshooting.html
elastic-agent-1     | Error: enrollment failed: exit status 1
elastic-agent-1     | For help, please see our troubleshooting guide at https://www.elastic.co/guide/en/fleet/8.12/fleet-troubleshooting.html
elastic-agent-1 exited with code 1

And the status of the cluster:

 $ elastic-package stack status
Status of Elastic stack services:
╭──────────────────┬─────────┬───────────────────╮
│ SERVICE          │ VERSION │ STATUS            │
├──────────────────┼─────────┼───────────────────┤
│ elastic-agent    │ 8.12.2  │ exited (1)        │
│ elasticsearch    │ 8.12.2  │ running (healthy) │
│ fleet-server     │ 8.12.2  │ running (healthy) │
│ kibana           │ 8.12.2  │ running (healthy) │
│ package-registry │ latest  │ running (healthy) │
╰──────────────────┴─────────┴───────────────────╯

With the same options, running the latest published version, it does retry:

elastic-agent-1              | {"log.level":"info","@timestamp":"2024-03-26T12:01:12.540Z","log.origin":{"file.name":"cmd/enroll_cmd.go","file.line":505},"message":"1st enrollment attempt failed, retrying for 10m0s, every 1m0s enrolling to URL: https://fleet-server:8220/","ecs.version":"1.6.0"}
elastic-agent-1              | Error: fail to enroll: fail to execute request to fleet-server: dial tcp 192.168.192.6:8220: connect: connection refused
elastic-agent-1              | For help, please see our troubleshooting guide at https://www.elastic.co/guide/en/fleet/8.12/fleet-troubleshooting.html
elastic-agent-1              | Error: enrollment failed: exit status 1
elastic-agent-1              | For help, please see our troubleshooting guide at https://www.elastic.co/guide/en/fleet/8.12/fleet-troubleshooting.html
elastic-agent-1 exited with code 1
dependency failed to start: container elastic-package-stack-elastic-agent-1 exited (1)
2024/03/26 13:01:12 DEBUG output command: /usr/bin/docker ps -a --filter label=com.docker.compose.project=elastic-package-stack --format {{.ID}}
2024/03/26 13:01:12 DEBUG output command: /usr/bin/docker inspect 0ebea2626c5e b5873a01c18c 94ec7181f20c 0449f8bf38ef a8c2d3eaf656 cc5633104400 1916c6b5f1dc 25baea050703 b1af9e72d4f2 2f268ae81ff3
Elastic Agent failed to start, trying again in 10s.
2024/03/26 13:01:22 DEBUG running command: /usr/bin/docker compose version --short
2024/03/26 13:01:22 DEBUG Determined Docker Compose version: 2.25.0
2024/03/26 13:01:22 DEBUG running command: /usr/bin/docker compose -f /home/mariorodriguez/.elastic-package/profiles/default/stack/snapshot.yml -p elastic-package-stack up
WARN[0000] /home/mariorodriguez/.elastic-package/profiles/default/stack/snapshot.yml: `version` is obsolete 
[+] Running 9/0
 ✔ Container elastic-package-stack-package-registry-1           Running                                                                           0.0s 
 ✔ Container elastic-package-stack-elasticsearch-1              Running                                                                           0.0s 
 ✔ Container elastic-package-stack-package-registry_is_ready-1  Created                                                                           0.0s 
 ✔ Container elastic-package-stack-elasticsearch_is_ready-1     Created                                                                           0.0s 
 ✔ Container elastic-package-stack-kibana-1                     Running                                                                           0.0s 
 ✔ Container elastic-package-stack-kibana_is_ready-1            Created                                                                           0.0s 
 ✔ Container elastic-package-stack-fleet-server-1               Running                                                                           0.0s 
 ✔ Container elastic-package-stack-fleet-server_is_ready-1      Created                                                                           0.0s 
 ✔ Container elastic-package-stack-elastic-agent-1              Created                                                                           0.0s 
Attaching to elastic-agent-1, elastic-agent_is_ready-1, elasticsearch-1, elasticsearch_is_ready-1, fleet-server-1, fleet-server_is_ready-1, kibana-1, kibana_is_ready-1, package-registry-1, package-registry_is_ready-1

Contributor (Author):

that's odd 🥲 hmmm I see, I think this is what happens: with the *_is_ready services in place we have dependencies on the actual services, so when elastic-agent fails the respective _is_ready service can't start because its dependency failed completely, and thus docker compose up returns an error in the sense of "sorry, I couldn't bring up all the services". However, when we remove the *_is_ready services this isn't triggered, as nothing depends on elastic-agent, and thus up considers that everything has been brought up and the user will have to deal with any errors visible in the logs... I don't have an immediate fix for that
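
Purely to illustrate the gap being discussed (this is not something the PR implements, nor a proposal from the author): after an attached up returns without an error, a post-hoc check over the project's containers could still surface the exited agent, reusing the label-based docker ps query that already appears in the debug logs above. A minimal Go sketch, where every name is an assumption:

// Illustrative sketch only — not part of this PR and not the author's proposal.
// After an attached `docker compose up` returns without an error, a check like
// this could detect services that exited anyway, using the same project-label
// query that appears in the debug logs above.
package stackcheck

import (
	"fmt"
	"os/exec"
	"strings"
)

// ExitedContainers lists containers of the given compose project whose status
// reports "Exited", e.g. "elastic-package-stack-elastic-agent-1: Exited (1) ...".
func ExitedContainers(project string) ([]string, error) {
	out, err := exec.Command("docker", "ps", "-a",
		"--filter", "label=com.docker.compose.project="+project,
		"--format", "{{.Names}}: {{.Status}}").Output()
	if err != nil {
		return nil, fmt.Errorf("listing containers for project %s: %w", project, err)
	}
	var exited []string
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		if strings.Contains(line, "Exited") {
			exited = append(exited, line)
		}
	}
	return exited, nil
}

// A caller could then feed this into the existing "failed to start, trying
// again in 10s" retry path instead of relying on `up` itself returning an error.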

}

appConfig, err := install.Configuration()
@@ -127,7 +127,7 @@ func dockerComposeUp(ctx context.Context, options Options) error {
withEnvs(options.Profile.ComposeEnvVars()).
build(),
ExtraArgs: args,
Services: withIsReadyServices(withDependentServices(options.Services)),
Services: withDependentServices(options.Services),
}

if err := c.Up(ctx, opts); err != nil {
@@ -171,18 +171,6 @@ func withDependentServices(services []string) []string {
return services
}

func withIsReadyServices(services []string) []string {
if len(services) == 0 {
return services // load all defined services
}

var allServices []string
for _, aService := range services {
allServices = append(allServices, aService, fmt.Sprintf("%s_%s", aService, readyServicesSuffix))
Contributor:

If docker-compose V1 no longer needs to be supported and all the _is_ready containers can be removed, it looks like the readyServicesSuffix constant could be deleted as well.

If that is the case, the code of the Status function in internal/stack/status.go could also be simplified.

Contributor:

Those container definitions are part of the services required for testing (system tests).

Regarding the status command, it should not be affected, since that command is intended to show information only about the containers related to the Stack (Kibana, Package Registry, Elasticsearch, etc.). It uses the docker-compose project from the profile:

containerIDs, err := docker.ContainerIDsWithLabel(projectLabelDockerCompose, DockerComposeProjectName(options.Profile))

About the test packages, I'm not sure what the best option would be.
Currently, the servicedeployer runs in detached mode, using the -d docker-compose flag:

ExtraArgs: []string{"--build", "-d"},

ExtraArgs: []string{"--build", "-d"},

ExtraArgs: []string{"--build", "-d"},

About the test packages, if servicedeployer is not updated with the new flags, those containers should be kept. It also runs an explicit method to wait for the containers to be ready/healthy:

err = p.WaitForHealthy(ctx, opts)

err = p.WaitForHealthy(ctx, opts)

@jsoriano Should servicedeployer be updated too (Up options) with these new flags? Or should we keep the current implementation? As they are running with -d, it looks safe.

About the test package, it could be removed... but in the integrations repository they would keep using that container. Probably the tianon/true container could be kept, to be sure that it is also tested with that. WDYT?
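
For context on the WaitForHealthy calls mentioned above: conceptually they amount to polling container status until everything reports healthy, as opposed to delegating the wait to docker compose up --wait. A rough, self-contained sketch of such a loop follows; it is not the elastic-package implementation, and the poll interval, label filter, and "(healthy)" string match are assumptions.

// Rough sketch of what a WaitForHealthy-style loop boils down to, shown here
// only to contrast it with delegating the wait to `docker compose up --wait`.
// It presumes every service defines a healthcheck.
package main

import (
	"context"
	"fmt"
	"os/exec"
	"strings"
	"time"
)

func waitForHealthy(ctx context.Context, project string) error {
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()
	for {
		out, err := exec.CommandContext(ctx, "docker", "ps",
			"--filter", "label=com.docker.compose.project="+project,
			"--format", "{{.Names}}: {{.Status}}").Output()
		if err != nil {
			return fmt.Errorf("listing containers: %w", err)
		}
		lines := strings.Split(strings.TrimSpace(string(out)), "\n")
		healthy := len(lines) > 0 && lines[0] != ""
		for _, line := range lines {
			if !strings.Contains(line, "(healthy)") {
				healthy = false
				break
			}
		}
		if healthy {
			return nil
		}
		select {
		case <-ctx.Done():
			return fmt.Errorf("waiting for healthy containers: %w", ctx.Err())
		case <-ticker.C:
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
	defer cancel()
	if err := waitForHealthy(ctx, "elastic-package-stack"); err != nil {
		fmt.Println("error:", err)
	}
}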

}
return allServices
}

func dockerComposeStatus(ctx context.Context, options Options) ([]ServiceStatus, error) {
var services []ServiceStatus
// query directly to docker to avoid load environment variables (e.g. STACK_VERSION_VARIANT) and profiles
2 changes: 1 addition & 1 deletion internal/stack/serverless.go
@@ -336,7 +336,7 @@ func (sp *serverlessProvider) startLocalServices(ctx context.Context, options Op
}

if options.DaemonMode {
opts.ExtraArgs = append(opts.ExtraArgs, "-d")
opts.ExtraArgs = append(opts.ExtraArgs, "-d", "--wait", "--wait-timeout", fmt.Sprintf("%d", 600))
}
if err := project.Up(ctx, opts); err != nil {
// At least starting on 8.6.0, fleet-server may be reconfigured or
Expand Down