workbench: make nomad backend use multiple containers
fmaste committed Nov 24, 2022
1 parent 92ee9eb commit ffaa6d1
Showing 4 changed files with 59 additions and 24 deletions.
1 change: 1 addition & 0 deletions nix/workbench/backend/nomad.nix
@@ -44,6 +44,7 @@ let
pkgs lib stateDir
basePort
extraBackendConfig;
unixHttpServerPort = "/tmp/supervisor.sock";
};
nomadConf =
import ./nomad-conf.nix
63 changes: 42 additions & 21 deletions nix/workbench/backend/nomad.sh
@@ -43,7 +43,7 @@ case "$op" in
# inside the container I get (from journald):
# Nov 02 11:44:36 hostname cluster-18f3852f-e067-6394-8159-66a7b8da2ecc[1088457]: Error: Cannot open an HTTP server: socket.error reported -2
# Nov 02 11:44:36 hostname cluster-18f3852f-e067-6394-8159-66a7b8da2ecc[1088457]: For help, use /nix/store/izqhlj5i1x9ldyn43d02kcy4mafmj3ci-python3.9-supervisor-4.2.4/bin/supervisord -h
setenvjqstr 'supervisord_url' "http://127.0.0.1:9001"
setenvjqstr 'supervisord_url' "unix:///tmp/supervisor.sock"
# Look up `cluster` OCI image's name and tag (also Nix profile).
setenvjqstr 'oci_image_name' ${WB_OCI_IMAGE_NAME:-$(cat "$profile_dir/clusterImageName")}
setenvjqstr 'oci_image_tag' ${WB_OCI_IMAGE_TAG:-$(cat "$profile_dir/clusterImageTag")}
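
The move from `http://127.0.0.1:9001` to `unix:///tmp/supervisor.sock` sidesteps the `socket.error reported -2` failure quoted above: supervisord now listens on a filesystem socket instead of binding an inet listener inside the container. A minimal sketch of the client side, assuming the socket path set in `nomad.nix` (the `--configuration` path here is illustrative, not a real store path):

```bash
# Query supervisord over the Unix socket instead of TCP. /tmp/supervisor.sock
# is the unixHttpServerPort value from nomad.nix; the config path is made up.
supervisorctl \
  --serverurl unix:///tmp/supervisor.sock \
  --configuration /nix/store/.../supervisord.conf \
  status
```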
@@ -169,6 +169,7 @@ case "$op" in
# constraints, resource exhaustion, etc), then the exit code will be 2.
# Any other errors, including client connection issues or internal
# errors, are indicated by exit code 1.
# FIXME: Timeout for "Deployment "XXX" in progress..."
nomad job run -verbose "$dir/nomad/job-cluster.hcl"
# Assuming that waiting for `nomad` job placement is enough.
local nomad_alloc_id=$(nomad job allocs -json cluster | jq -r '.[0].ID')
@@ -180,9 +181,9 @@ case "$op" in
local container_supervisord_conf=$(envjqr 'container_supervisord_conf')
msg "Supervisor status inside container ..."
# Print the command used for debugging purposes.
msg "'nomad alloc --task cluster exec --task cluster \"$nomad_alloc_id\" \"$container_supervisor_nix\"/bin/supervisorctl --serverurl \"$supervisord_url\" --configuration \"$container_supervisord_conf\" status'"
msg "'nomad alloc exec --task node-0 \"$nomad_alloc_id\" \"$container_supervisor_nix\"/bin/supervisorctl --serverurl \"$supervisord_url\" --configuration \"$container_supervisord_conf\" status'"
# Execute the actual command.
nomad alloc exec --task cluster "$nomad_alloc_id" "$container_supervisor_nix"/bin/supervisorctl --serverurl "$supervisord_url" --configuration "$container_supervisord_conf" status || true
nomad alloc exec --task node-0 "$nomad_alloc_id" "$container_supervisor_nix"/bin/supervisorctl --serverurl "$supervisord_url" --configuration "$container_supervisord_conf" status || true
;;
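
With one task per service, every supervisorctl invocation has to name the Nomad task it targets. A hypothetical wrapper for the pattern repeated throughout this file (`supervisorctl_in_task` and its locals are illustrative names, not part of the workbench):

```bash
# Hypothetical helper: run a supervisorctl subcommand inside the Nomad task
# hosting a given service, mirroring the `nomad alloc exec` calls above.
supervisorctl_in_task() {
  local alloc_id=$1 task=$2; shift 2
  nomad alloc exec --task "$task" "$alloc_id" \
    "$container_supervisor_nix"/bin/supervisorctl \
      --serverurl "$supervisord_url" \
      --configuration "$container_supervisord_conf" \
      "$@"
}
# Usage: supervisorctl_in_task "$nomad_alloc_id" node-0 status
```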

describe-run )
@@ -198,7 +199,7 @@ case "$op" in
local dir=${1:?$usage}; shift
local service=${1:?$usage}; shift

backend_nomad nomad-alloc-exec-supervisorctl "$dir" cluster start "$service"
backend_nomad nomad-alloc-exec-supervisorctl "$dir" "$service" start "$service"
;;

# Nomad-specific
@@ -207,7 +208,7 @@ case "$op" in
local dir=${1:?$usage}; shift
local service=${1:?$usage}; shift

backend_nomad nomad-alloc-exec-supervisorctl "$dir" cluster stop "$service"
backend_nomad nomad-alloc-exec-supervisorctl "$dir" "$service" stop "$service"
;;

# Nomad-specific
@@ -216,7 +217,7 @@ case "$op" in
local dir=${1:?$usage}; shift
local service=${1:?$usage}; shift

backend_nomad nomad-alloc-exec-supervisorctl "$dir" cluster status "$service" > /dev/null && true
backend_nomad nomad-alloc-exec-supervisorctl "$dir" "$service" status "$service" > /dev/null && true
;;

# Nomad-specific
@@ -388,11 +389,12 @@
local usage="USAGE: wb nomad $op RUN-DIR"
local dir=${1:?$usage}; shift

local nomad_alloc_id=$(envjqr 'nomad_alloc_id')
local supervisord_url=$(envjqr 'supervisord_url')
local container_supervisor_nix=$(envjqr 'container_supervisor_nix')
local container_supervisord_conf=$(envjqr 'container_supervisord_conf')
nomad alloc exec --task cluster "$nomad_alloc_id" "$container_supervisor_nix"/bin/supervisorctl --serverurl "$supervisord_url" --configuration "$container_supervisord_conf" stop all || true > /dev/null
backend_nomad nomad-alloc-exec-supervisorctl "$dir" generator stop all
backend_nomad nomad-alloc-exec-supervisorctl "$dir" tracer stop all
for node in $(jq_tolist 'keys' "$dir"/node-specs.json)
do
backend_nomad nomad-alloc-exec-supervisorctl "$dir" "$node" stop all
done
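
The node list driving this loop comes from the keys of `node-specs.json`. A sketch of what `jq_tolist 'keys'` boils down to (the actual workbench helper may differ in detail):

```bash
# Render the JSON object's keys as shell words, e.g. "node-0 node-1 node-2".
jq --raw-output 'keys | join(" ")' "$dir"/node-specs.json
```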

nomad job stop -global -no-shutdown-delay -purge -yes cluster
local nomad_pid=$(envjqr 'nomad_pid')
@@ -477,6 +479,10 @@ nomad_create_folders_and_config() {
volumes {
enabled = true
}
recover_stopped = false
gc {
container = false
}
}
}
EOF
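These `nomad-driver-podman` plugin options appear intended to keep containers around rather than have the driver manage their lifecycle: `gc { container = false }` stops the driver from removing containers when tasks exit, and `recover_stopped = false` stops it from reviving previously stopped containers on client restart. Consult the driver's documentation for the exact semantics.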
@@ -583,16 +589,30 @@ job "cluster" {
mode = "host"
}
EOF
local task_stanza_name="cluster"
local task_stanza_file="$dir/nomad/job-cluster-task-$task_stanza_name.hcl"
nomad_create_task_stanza "$task_stanza_file" "$task_stanza_name" "$podman_volumes"
cat "$task_stanza_file" >> "$dir/nomad/job-cluster.hcl"

local task_stanza_name="cluster2"
local task_stanza_file="$dir/nomad/job-cluster-task-$task_stanza_name.hcl"
nomad_create_task_stanza "$task_stanza_file" "$task_stanza_name" "$podman_volumes"
# Cluster
# local task_stanza_name_c="cluster"
# local task_stanza_file_c="$dir/nomad/job-cluster-task-$task_stanza_name_c.hcl"
# nomad_create_task_stanza "$task_stanza_file_c" "$task_stanza_name_c" "$podman_volumes"
#cat "$task_stanza_file_c" >> "$dir/nomad/job-cluster.hcl"
# Nodes
for node in $(jq_tolist 'keys' "$dir"/node-specs.json)
do
local task_stanza_name="$node"
local task_stanza_file="$dir/nomad/job-cluster-task-$task_stanza_name.hcl"
nomad_create_task_stanza "$task_stanza_file" "$task_stanza_name" "$podman_volumes"
cat "$task_stanza_file" >> "$dir/nomad/job-cluster.hcl"

done
# Tracer
local task_stanza_name_t="tracer"
local task_stanza_file_t="$dir/nomad/job-cluster-task-$task_stanza_name_t.hcl"
nomad_create_task_stanza "$task_stanza_file_t" "$task_stanza_name_t" "$podman_volumes"
cat "$task_stanza_file_t" >> "$dir/nomad/job-cluster.hcl"
# Generator
local task_stanza_name_g="generator"
local task_stanza_file_g="$dir/nomad/job-cluster-task-$task_stanza_name_g.hcl"
nomad_create_task_stanza "$task_stanza_file_g" "$task_stanza_name_g" "$podman_volumes"
cat "$task_stanza_file_g" >> "$dir/nomad/job-cluster.hcl"
# The end.
cat >> "$dir/nomad/job-cluster.hcl" <<- EOF
}
}
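
Taken together, the loop plus the tracer and generator stanzas assemble a single job whose group carries one podman task per service. Roughly (a sketch of the generated HCL with group name and task bodies elided, not verbatim output):

```bash
# Rough shape of $dir/nomad/job-cluster.hcl after assembly (sketch only).
: <<'SKETCH'
job "cluster" {
  group "..." {
    task "node-0"    { driver = "podman" ... }
    task "node-1"    { driver = "podman" ... }
    task "tracer"    { driver = "podman" ... }
    task "generator" { driver = "podman" ... }
  }
}
SKETCH
```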
@@ -622,12 +642,13 @@ task "$name" {
#logging = {
# driver = "nomad"
#}
hostname = "$name"
network_mode = "host"
tmpfs = [
"/tmp"
]
volumes = ${podman_volumes}
working_dir = "${container_workdir}"
hostname = "$name"
}
env = {
SUPERVISOR_NIX = "${container_supervisor_nix}"
18 changes: 15 additions & 3 deletions nix/workbench/backend/supervisor-conf.nix
@@ -3,6 +3,8 @@
, stateDir
, basePort
, node-services
, unixHttpServerPort ? null
, inetHttpServerPort ? null
## Last-moment overrides:
, extraBackendConfig
}:
@@ -23,14 +25,24 @@ let
strip_ansi = true;
};
supervisorctl = {};
inet_http_server = {
port = "127.0.0.1:9001";
};
"rpcinterface:supervisor" = {
"supervisor.rpcinterface_factory" = "supervisor.rpcinterface:make_main_rpcinterface";
};
}
//
lib.attrsets.optionalAttrs (unixHttpServerPort != null) {
unix_http_server = {
file = unixHttpServerPort;
chmod = "0777";
};
}
//
lib.attrsets.optionalAttrs (inetHttpServerPort != null) {
inet_http_server = {
port = inetHttpServerPort;
};
}
//
listToAttrs
(mapAttrsToList (_: nodeSvcSupervisorProgram) node-services)
//
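
The two `optionalAttrs` blocks make the HTTP-server sections mutually optional: each backend passes exactly one of them. Assuming supervisord's standard INI rendering, the generated fragments would look like (a sketch; the exact output of the Nix generator may differ in formatting):

```bash
# Expected supervisord config fragments for the two backends.
cat <<'INI'
; nomad backend: unixHttpServerPort = "/tmp/supervisor.sock"
[unix_http_server]
file = /tmp/supervisor.sock
chmod = 0777

; supervisor backend: inetHttpServerPort = "127.0.0.1:9001"
[inet_http_server]
port = 127.0.0.1:9001
INI
```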
1 change: 1 addition & 0 deletions nix/workbench/backend/supervisor.nix
@@ -55,6 +55,7 @@ let
pkgs lib stateDir
basePort
extraBackendConfig;
inetHttpServerPort = "127.0.0.1:9001";
};
}
''
