Skip to content

Commit

Permalink
add settings for working with cluster
Browse files Browse the repository at this point in the history
  • Loading branch information
ksugar committed May 30, 2023
1 parent 2de50cc commit c7155cc
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 8 deletions.
25 changes: 18 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: help rebuild build launch bash bashroot notebook warmup test apptainer-build apptainer-launch apptainer-shell apptainer-stop
.PHONY: help rebuild build launch bash bashroot notebook warmup test apptainer-build apptainer-launch apptainer-shell apptainer-create-rabbitmq-user apptainer-stop

help:
@cat Makefile
Expand All @@ -8,7 +8,10 @@ ELEPHANT_WORKSPACE?=${PWD}/workspace
ELEPHANT_IMAGE_NAME?=elephant-server:0.5.0
ELEPHANT_NVIDIA_GID?=$$(ls -n /dev/nvidia0 2>/dev/null | awk '{print $$4}')
ELEPHANT_DOCKER?=docker
ELEPHANT_RABBITMQ_NODENAME?=rabbit@localhost
ELEPHANT_RABBITMQ_NODE_PORT?=5672
ELEPHANT_REDIS_PORT?=6379
ELEPHANT_BATCH_ID?=

rebuild:
@IMAGEID=$$($(ELEPHANT_DOCKER) images -q $(ELEPHANT_IMAGE_NAME)); \
Expand Down Expand Up @@ -40,7 +43,8 @@ warmup:

launch: warmup
$(ELEPHANT_DOCKER) run -it --rm $(GPU_ARG) --shm-size=8g -v $(ELEPHANT_WORKSPACE):/workspace -p 8080:80 -p 5672:5672 \
-e LOCAL_UID=$(shell id -u) -e LOCAL_GID=$(shell id -g) -e NVIDIA_GID=$(ELEPHANT_NVIDIA_GID) -e ELEPHANT_REDIS_PORT=$(ELEPHANT_REDIS_PORT)\
-e LOCAL_UID=$(shell id -u) -e LOCAL_GID=$(shell id -g) -e NVIDIA_GID=$(ELEPHANT_NVIDIA_GID) \
-e RABBITMQ_NODENAME=$(ELEPHANT_RABBITMQ_NODENAME) -e RABBITMQ_NODE_PORT=$(ELEPHANT_RABBITMQ_NODE_PORT) -e ELEPHANT_REDIS_PORT=$(ELEPHANT_REDIS_PORT) \
$(ELEPHANT_IMAGE_NAME)

bash: warmup
Expand All @@ -66,15 +70,22 @@ apptainer-build:
apptainer run --fakeroot --bind $(HOME):/root elephant.sif

apptainer-launch:
apptainer instance start --nv --bind $(HOME),$(HOME)/.elephant_binds/var/lib:/var/lib,$(HOME)/.elephant_binds/var/log:/var/log,$(HOME)/.elephant_binds/var/run:/var/run,$(HOME)/.elephant_binds/etc/nginx:/etc/nginx,$(HOME)/.elephant_binds/etc/rabbitmq:/etc/rabbitmq,$(ELEPHANT_WORKSPACE):/workspace elephant.sif elephant
apptainer instance start --nv --bind $(HOME),$(HOME)/.elephant_binds/var/lib:/var/lib,$(HOME)/.elephant_binds/var/log:/var/log,$(HOME)/.elephant_binds/var/run:/var/run,$(HOME)/.elephant_binds/etc/nginx:/etc/nginx,$(ELEPHANT_WORKSPACE):/workspace elephant.sif elephant$(ELEPHANT_BATCH_ID)
if [ $(ELEPHANT_GPU) = all ]; then \
apptainer exec --env ELEPHANT_REDIS_PORT=$(ELEPHANT_REDIS_PORT) instance://elephant /start.sh; \
apptainer exec --env RABBITMQ_NODENAME=$(ELEPHANT_RABBITMQ_NODENAME),RABBITMQ_NODE_PORT=$(ELEPHANT_RABBITMQ_NODE_PORT),ELEPHANT_REDIS_PORT=$(ELEPHANT_REDIS_PORT) instance://elephant$(ELEPHANT_BATCH_ID) /start.sh; \
else \
apptainer exec --env CUDA_VISIBLE_DEVICES=$(ELEPHANT_GPU),ELEPHANT_REDIS_PORT=$(ELEPHANT_REDIS_PORT) instance://elephant /start.sh; \
fi
apptainer exec --env CUDA_VISIBLE_DEVICES=$(ELEPHANT_GPU),RABBITMQ_NODENAME=$(ELEPHANT_RABBITMQ_NODENAME),RABBITMQ_NODE_PORT=$(ELEPHANT_RABBITMQ_NODE_PORT),ELEPHANT_REDIS_PORT=$(ELEPHANT_REDIS_PORT) instance://elephant$(ELEPHANT_BATCH_ID) /start.sh; \
fi

apptainer-shell:
apptainer shell --env ELEPHANT_REDIS_PORT=$(ELEPHANT_REDIS_PORT) --bind $(HOME),$(HOME)/.elephant_binds/var/lib:/var/lib,$(HOME)/.elephant_binds/var/log:/var/log,$(HOME)/.elephant_binds/var/run:/var/run,$(HOME)/.elephant_binds/etc/nginx:/etc/nginx,$(HOME)/.elephant_binds/etc/rabbitmq:/etc/rabbitmq elephant.sif
if [ $(ELEPHANT_GPU) = all ]; then \
apptainer shell --fakeroot --nv --env RABBITMQ_NODENAME=$(ELEPHANT_RABBITMQ_NODENAME) --bind $(HOME),$(HOME)/.elephant_binds/var/lib:/var/lib,$(HOME)/.elephant_binds/var/log:/var/log,$(HOME)/.elephant_binds/var/run:/var/run,$(HOME)/.elephant_binds/etc/nginx:/etc/nginx elephant.sif; \
else \
apptainer shell --fakeroot --nv --env CUDA_VISIBLE_DEVICES=$(ELEPHANT_GPU) --bind $(HOME),$(HOME)/.elephant_binds/var/lib:/var/lib,$(HOME)/.elephant_binds/var/log:/var/log,$(HOME)/.elephant_binds/var/run:/var/run,$(HOME)/.elephant_binds/etc/nginx:/etc/nginx elephant.sif; \
fi

# Run docker/create_rabbitmq_user.sh inside elephant.sif under --fakeroot,
# with RABBITMQ_NODENAME taken from ELEPHANT_RABBITMQ_NODENAME and the
# host-side ~/.elephant_binds/var/{lib,log,run} directories bound over the
# container's /var/lib, /var/log and /var/run.
apptainer-create-rabbitmq-user:
	apptainer exec \
		--fakeroot \
		--env RABBITMQ_NODENAME=$(ELEPHANT_RABBITMQ_NODENAME) \
		--bind $(HOME),$(HOME)/.elephant_binds/var/lib:/var/lib,$(HOME)/.elephant_binds/var/log:/var/log,$(HOME)/.elephant_binds/var/run:/var/run \
		elephant.sif docker/create_rabbitmq_user.sh

# Stop the Apptainer instance started by `apptainer-launch`.
# The instance name carries the same optional ELEPHANT_BATCH_ID suffix that
# `apptainer-launch` uses, so the instance belonging to this batch is the one
# stopped. With the default (empty) ELEPHANT_BATCH_ID this is still `elephant`.
apptainer-stop:
	apptainer instance stop elephant$(ELEPHANT_BATCH_ID)
6 changes: 6 additions & 0 deletions docker/create_rabbitmq_user.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Create the RabbitMQ account used by the server: start the broker, add the
# account, tag it as administrator, grant it full configure/write/read
# permissions on the "/" vhost, then stop the broker again.

rabbitmq_service=/etc/init.d/rabbitmq-server
account=user
secret=user

"$rabbitmq_service" start

# stderr is discarded here; presumably to keep re-runs quiet when the account
# already exists -- confirm.
rabbitmqctl add_user "$account" "$secret" 2>/dev/null
rabbitmqctl set_user_tags "$account" administrator
rabbitmqctl set_permissions -p / "$account" ".*" ".*" ".*"

"$rabbitmq_service" stop
3 changes: 3 additions & 0 deletions docker/supervisord.conf
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
; Control socket for supervisord. The socket file name embeds the value of the
; ELEPHANT_BATCH_ID environment variable via the %(ENV_...)s expansion.
; NOTE(review): this expansion needs ELEPHANT_BATCH_ID to be present in
; supervisord's environment (possibly empty) -- confirm the launch commands
; export it.
[unix_http_server]
file=/var/run/supervisor%(ENV_ELEPHANT_BATCH_ID)s.sock

[supervisord]
nodaemon=true

Expand Down
4 changes: 3 additions & 1 deletion elephant-core/elephant/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@

RUN_ON_FLASK = "RUN_ON_FLASK" in os.environ

RABBITMQ_NODE_PORT = int(os.environ.get("RABBITMQ_NODE_PORT", 5672))


class RabbitMQHandler(logging.StreamHandler):
"""
Expand Down Expand Up @@ -68,7 +70,7 @@ def logger():
def publish_mq(queue, body):
if RUN_ON_FLASK:
with pika.BlockingConnection(pika.ConnectionParameters(
host='localhost', heartbeat=0)) as connection:
host='localhost', port=RABBITMQ_NODE_PORT, heartbeat=0)) as connection:
connection.channel().queue_declare(queue=queue)
connection.channel().basic_publish(exchange='',
routing_key=queue,
Expand Down

0 comments on commit c7155cc

Please sign in to comment.