diff --git a/nix/workbench/backend/nomad.nix b/nix/workbench/backend/nomad.nix
index d44ca3e3a0a..d856818fc52 100644
--- a/nix/workbench/backend/nomad.nix
+++ b/nix/workbench/backend/nomad.nix
@@ -44,6 +44,7 @@ let
       pkgs lib stateDir
       basePort
       extraBackendConfig;
+      unixHttpServerPort = "/tmp/supervisor.sock";
     };

   nomadConf = import ./nomad-conf.nix
diff --git a/nix/workbench/backend/nomad.sh b/nix/workbench/backend/nomad.sh
index 2b37957eafe..8de6890901f 100644
--- a/nix/workbench/backend/nomad.sh
+++ b/nix/workbench/backend/nomad.sh
@@ -43,7 +43,7 @@ case "$op" in
         # inside the container I get (from journald):
         # Nov 02 11:44:36 hostname cluster-18f3852f-e067-6394-8159-66a7b8da2ecc[1088457]: Error: Cannot open an HTTP server: socket.error reported -2
         # Nov 02 11:44:36 hostname cluster-18f3852f-e067-6394-8159-66a7b8da2ecc[1088457]: For help, use /nix/store/izqhlj5i1x9ldyn43d02kcy4mafmj3ci-python3.9-supervisor-4.2.4/bin/supervisord -h
-        setenvjqstr 'supervisord_url' "http://127.0.0.1:9001"
+        setenvjqstr 'supervisord_url' "unix:///tmp/supervisor.sock"
         # Look up `cluster` OCI image's name and tag (also Nix profile).
         setenvjqstr 'oci_image_name' ${WB_OCI_IMAGE_NAME:-$(cat "$profile_dir/clusterImageName")}
         setenvjqstr 'oci_image_tag' ${WB_OCI_IMAGE_TAG:-$(cat "$profile_dir/clusterImageTag")}
@@ -169,6 +169,7 @@ case "$op" in
       # constraints, resource exhaustion, etc), then the exit code will be 2.
       # Any other errors, including client connection issues or internal
       # errors, are indicated by exit code 1.
+      # FIXME: Timeout for "Deployment "XXX" in progress..."
       nomad job run -verbose "$dir/nomad/job-cluster.hcl"
       # Assuming that `nomad` placement is enough wait.
       local nomad_alloc_id=$(nomad job allocs -json cluster | jq -r '.[0].ID')
@@ -180,9 +181,9 @@ case "$op" in
       local container_supervisord_conf=$(envjqr 'container_supervisord_conf')
       msg "Supervisor status inside container ..."
       # Print the command used for debugging purposes.
-      msg "'nomad alloc --task cluster exec --task cluster \"$nomad_alloc_id\" \"$container_supervisor_nix\"/bin/supervisorctl --serverurl \"$supervisord_url\" --configuration \"$container_supervisord_conf\" status'"
+      msg "'nomad alloc exec --task node-0 \"$nomad_alloc_id\" \"$container_supervisor_nix\"/bin/supervisorctl --serverurl \"$supervisord_url\" --configuration \"$container_supervisord_conf\" status'"
       # Execute the actual command.
-      nomad alloc exec --task cluster "$nomad_alloc_id" "$container_supervisor_nix"/bin/supervisorctl --serverurl "$supervisord_url" --configuration "$container_supervisord_conf" status || true
+      nomad alloc exec --task node-0 "$nomad_alloc_id" "$container_supervisor_nix"/bin/supervisorctl --serverurl "$supervisord_url" --configuration "$container_supervisord_conf" status || true
     ;;

     describe-run )
@@ -198,7 +199,7 @@ case "$op" in
       local dir=${1:?$usage}; shift
       local service=${1:?$usage}; shift

-      backend_nomad nomad-alloc-exec-supervisorctl "$dir" cluster start "$service"
+      backend_nomad nomad-alloc-exec-supervisorctl "$dir" "$service" start "$service"
     ;;

     # Nomad-specific
@@ -207,7 +208,7 @@ case "$op" in
       local dir=${1:?$usage}; shift
       local service=${1:?$usage}; shift

-      backend_nomad nomad-alloc-exec-supervisorctl "$dir" cluster stop "$service"
+      backend_nomad nomad-alloc-exec-supervisorctl "$dir" "$service" stop "$service"
     ;;

     # Nomad-specific
@@ -216,7 +217,7 @@ case "$op" in
       local dir=${1:?$usage}; shift
       local service=${1:?$usage}; shift

-      backend_nomad nomad-alloc-exec-supervisorctl "$dir" cluster status "$service" > /dev/null && true
+      backend_nomad nomad-alloc-exec-supervisorctl "$dir" "$service" status "$service" > /dev/null && true
     ;;

     # Nomad-specific
@@ -388,11 +389,12 @@ case "$op" in
       local usage="USAGE: wb nomad $op RUN-DIR"
       local dir=${1:?$usage}; shift

-      local nomad_alloc_id=$(envjqr 'nomad_alloc_id')
-      local supervisord_url=$(envjqr 'supervisord_url')
-      local container_supervisor_nix=$(envjqr 'container_supervisor_nix')
-      local container_supervisord_conf=$(envjqr 'container_supervisord_conf')
-      nomad alloc exec --task cluster "$nomad_alloc_id" "$container_supervisor_nix"/bin/supervisorctl --serverurl "$supervisord_url" --configuration "$container_supervisord_conf" stop all || true > /dev/null
+      backend_nomad nomad-alloc-exec-supervisorctl "$dir" generator stop all
+      backend_nomad nomad-alloc-exec-supervisorctl "$dir" tracer stop all
+      for node in $(jq_tolist 'keys' "$dir"/node-specs.json)
+      do
+        backend_nomad nomad-alloc-exec-supervisorctl "$dir" "$node" stop all
+      done

       nomad job stop -global -no-shutdown-delay -purge -yes cluster
       local nomad_pid=$(envjqr 'nomad_pid')
@@ -477,6 +479,10 @@ nomad_create_folders_and_config() {
     volumes {
       enabled = true
     }
+    recover_stopped = false
+    gc {
+      container = false
+    }
   }
 }
 EOF
@@ -583,16 +589,30 @@ job "cluster" {
     mode = "host"
   }
 EOF
-  local task_stanza_name="cluster"
-  local task_stanza_file="$dir/nomad/job-cluster-task-$task_stanza_name.hcl"
-  nomad_create_task_stanza "$task_stanza_file" "$task_stanza_name" "$podman_volumes"
-cat "$task_stanza_file" >> "$dir/nomad/job-cluster.hcl"
-
-  local task_stanza_name="cluster2"
-  local task_stanza_file="$dir/nomad/job-cluster-task-$task_stanza_name.hcl"
-  nomad_create_task_stanza "$task_stanza_file" "$task_stanza_name" "$podman_volumes"
+  # Cluster
+#  local task_stanza_name_c="cluster"
+#  local task_stanza_file_c="$dir/nomad/job-cluster-task-$task_stanza_name_c.hcl"
+#  nomad_create_task_stanza "$task_stanza_file_c" "$task_stanza_name_c" "$podman_volumes"
+#cat "$task_stanza_file_c" >> "$dir/nomad/job-cluster.hcl"
+  # Nodes
+  for node in $(jq_tolist 'keys' "$dir"/node-specs.json)
+  do
+    local task_stanza_name="$node"
+    local task_stanza_file="$dir/nomad/job-cluster-task-$task_stanza_name.hcl"
+    nomad_create_task_stanza "$task_stanza_file" "$task_stanza_name" "$podman_volumes"
 cat "$task_stanza_file" >> "$dir/nomad/job-cluster.hcl"
-
+  done
+  # Tracer
+  local task_stanza_name_t="tracer"
+  local task_stanza_file_t="$dir/nomad/job-cluster-task-$task_stanza_name_t.hcl"
+  nomad_create_task_stanza "$task_stanza_file_t" "$task_stanza_name_t" "$podman_volumes"
+cat "$task_stanza_file_t" >> "$dir/nomad/job-cluster.hcl"
+  # Generator
+  local task_stanza_name_g="generator"
+  local task_stanza_file_g="$dir/nomad/job-cluster-task-$task_stanza_name_g.hcl"
+  nomad_create_task_stanza "$task_stanza_file_g" "$task_stanza_name_g" "$podman_volumes"
+cat "$task_stanza_file_g" >> "$dir/nomad/job-cluster.hcl"
+  # The end.
   cat >> "$dir/nomad/job-cluster.hcl" <<- EOF
 }
 }
@@ -622,12 +642,13 @@ task "$name" {
       #logging = {
       #  driver = "nomad"
       #}
+      hostname = "$name"
+      network_mode = "host"
       tmpfs = [
         "/tmp"
       ]
       volumes = ${podman_volumes}
       working_dir = "${container_workdir}"
-      hostname = "$name"
     }
     env = {
       SUPERVISOR_NIX = "${container_supervisor_nix}"
diff --git a/nix/workbench/backend/supervisor-conf.nix b/nix/workbench/backend/supervisor-conf.nix
index 7ad0b53dec4..9b03b38880e 100644
--- a/nix/workbench/backend/supervisor-conf.nix
+++ b/nix/workbench/backend/supervisor-conf.nix
@@ -3,6 +3,8 @@
 , stateDir
 , basePort
 , node-services
+, unixHttpServerPort ? null
+, inetHttpServerPort ? null
 ## Last-moment overrides:
 , extraBackendConfig
 }:
@@ -23,14 +25,24 @@ let
       strip_ansi = true;
     };
     supervisorctl = {};
-    inet_http_server = {
-      port = "127.0.0.1:9001";
-    };
     "rpcinterface:supervisor" = {
       "supervisor.rpcinterface_factory" = "supervisor.rpcinterface:make_main_rpcinterface";
     };
   }
   //
+  lib.attrsets.optionalAttrs (unixHttpServerPort != null) {
+    unix_http_server = {
+      file = unixHttpServerPort;
+      chmod = "0777";
+    };
+  }
+  //
+  lib.attrsets.optionalAttrs (inetHttpServerPort != null) {
+    inet_http_server = {
+      port = inetHttpServerPort;
+    };
+  }
+  //
   listToAttrs
     (mapAttrsToList (_: nodeSvcSupervisorProgram) node-services)
   //
diff --git a/nix/workbench/backend/supervisor.nix b/nix/workbench/backend/supervisor.nix
index 0ca15e32467..dae55e0fc3d 100644
--- a/nix/workbench/backend/supervisor.nix
+++ b/nix/workbench/backend/supervisor.nix
@@ -55,6 +55,7 @@ let
       pkgs lib stateDir
       basePort
       extraBackendConfig;
+      inetHttpServerPort = "127.0.0.1:9001";
     };
 }
 ''
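
Note on the supervisord listener change: supervisor-conf.nix now takes two optional arguments, and each backend passes exactly one of them (nomad.nix passes unixHttpServerPort, supervisor.nix passes inetHttpServerPort), so each generated supervisord.conf ends up with a single HTTP-server section. A sketch of the INI these attrsets are expected to render to (the section and option names are standard supervisord settings; the permissive chmod carried over from the diff is what lets a "nomad alloc exec" session running as another user reach the socket):

    ; Nomad backend: supervisord listens on a unix socket inside the task.
    [unix_http_server]
    file = /tmp/supervisor.sock
    chmod = 0777

    ; Local supervisor backend: the previous TCP listener, unchanged.
    [inet_http_server]
    port = 127.0.0.1:9001

supervisorctl then reaches one or the other via "--serverurl unix:///tmp/supervisor.sock" or "--serverurl http://127.0.0.1:9001", matching the supervisord_url value that nomad.sh sets above.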