diff --git a/README.md b/README.md
index 11d2fb2..2c707af 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,9 @@ and can be used to install any pre-requisites.
     {{mapred_local_dirs}}
     {{spark_local_dirs}}
     {{default_spark_mem}}
+    {{spark_worker_instances}}
+    {{spark_worker_cores}}
+    {{spark_master_opts}}
 
 You can add new variables by modifying `deploy_templates.py`
 
diff --git a/deploy_templates.py b/deploy_templates.py
index a2840f7..99b5568 100755
--- a/deploy_templates.py
+++ b/deploy_templates.py
@@ -12,6 +12,7 @@
 # Find system memory in KB and compute Spark's default limit from that
 mem_command = "cat /proc/meminfo | grep MemTotal | awk '{print $2}'"
+cpu_command = "nproc"
 
 master_ram_kb = int(
     os.popen(mem_command).read().strip())
 
@@ -20,8 +21,14 @@
 slave_mem_command = "ssh -t -o StrictHostKeyChecking=no %s %s" %\
     (first_slave, mem_command)
+
+slave_cpu_command = "ssh -t -o StrictHostKeyChecking=no %s %s" %\
+    (first_slave, cpu_command)
+
 slave_ram_kb = int(os.popen(slave_mem_command).read().strip())
 
+slave_cpus = int(os.popen(slave_cpu_command).read().strip())
+
 system_ram_kb = min(slave_ram_kb, master_ram_kb)
 
 system_ram_mb = system_ram_kb / 1024
@@ -42,6 +49,10 @@
 # Make tachyon_mb as spark_mb for now.
 tachyon_mb = spark_mb
 
+worker_instances = int(os.getenv("SPARK_WORKER_INSTANCES", 1))
+# Distribute cpu cores equally among worker instances
+worker_cores = max(slave_cpus / worker_instances, 1)
+
 template_vars = {
   "master_list": os.getenv("MASTERS"),
   "active_master": os.getenv("MASTERS").split("\n")[0],
@@ -50,6 +61,9 @@
   "mapred_local_dirs": os.getenv("MAPRED_LOCAL_DIRS"),
   "spark_local_dirs": os.getenv("SPARK_LOCAL_DIRS"),
   "default_spark_mem": "%dm" % spark_mb,
+  "spark_worker_instances": "%d" % worker_instances,
+  "spark_worker_cores": "%d" % worker_cores,
+  "spark_master_opts": os.getenv("SPARK_MASTER_OPTS"),
   "spark_version": os.getenv("SPARK_VERSION"),
   "shark_version": os.getenv("SHARK_VERSION"),
   "hadoop_major_version": os.getenv("HADOOP_MAJOR_VERSION"),
diff --git a/templates/root/spark/conf/spark-env.sh b/templates/root/spark/conf/spark-env.sh
index d5077ea..d5ecc89 100755
--- a/templates/root/spark/conf/spark-env.sh
+++ b/templates/root/spark/conf/spark-env.sh
@@ -20,12 +20,17 @@
 export SPARK_MEM={{default_spark_mem}}
 SPARK_JAVA_OPTS+=" -Dspark.local.dir={{spark_local_dirs}}"
 export SPARK_JAVA_OPTS
 
+export SPARK_MASTER_OPTS="{{spark_master_opts}}"
+
 export HADOOP_HOME="/root/ephemeral-hdfs"
 export SPARK_LIBRARY_PATH="/root/ephemeral-hdfs/lib/native/"
 export SPARK_MASTER_IP={{active_master}}
 export MASTER=`cat /root/spark-ec2/cluster-url`
 export SPARK_CLASSPATH=$SPARK_CLASSPATH":/root/ephemeral-hdfs/conf"
 
+export SPARK_WORKER_INSTANCES={{spark_worker_instances}}
+export SPARK_WORKER_CORES={{spark_worker_cores}}
+
 # Bind Spark's web UIs to this machine's public EC2 hostname:
 export SPARK_PUBLIC_DNS=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname`
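
Notes on the change (the sketches below are illustrative, not part of the patch):

The slave's core count is probed the same way its memory already is: a one-off command runs over `ssh` against the first slave and its stdout is parsed. A minimal standalone version of that pattern, assuming passwordless SSH to a reachable host (the hostname is hypothetical):

```python
import os

first_slave = "ec2-198-51-100-7.compute-1.amazonaws.com"  # hypothetical slave host
cpu_command = "nproc"

# -t forces a tty; StrictHostKeyChecking=no skips the interactive host-key
# prompt on first contact, matching the existing memory probe.
slave_cpu_command = "ssh -t -o StrictHostKeyChecking=no %s %s" % (first_slave, cpu_command)
slave_cpus = int(os.popen(slave_cpu_command).read().strip())
print(slave_cpus)
```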
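The core split `worker_cores = max(slave_cpus / worker_instances, 1)` relies on Python 2's truncating integer division: leftover cores are dropped rather than spread unevenly, and the `max(..., 1)` guard keeps every worker at one core or more even when instances outnumber CPUs. A small sketch of the same arithmetic, written with `//` so it behaves identically on Python 3 (the inputs are made up):

```python
def cores_per_worker(slave_cpus, worker_instances):
    # Floor division mirrors Python 2's `/` on ints; remainder cores go unused.
    return max(slave_cpus // worker_instances, 1)

print(cores_per_worker(8, 3))  # -> 2: three workers with 2 cores each, 2 cores idle
print(cores_per_worker(2, 4))  # -> 1: the guard prevents a zero-core worker
```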
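The `os.getenv("SPARK_WORKER_INSTANCES", 1)` lookup works because `int()` accepts both outcomes: the integer default `1` when the variable is unset, and the string value when it is set. A quick illustration (values are made up):

```python
import os

os.environ.pop("SPARK_WORKER_INSTANCES", None)      # unset: the int default 1 flows through
print(int(os.getenv("SPARK_WORKER_INSTANCES", 1)))  # -> 1

os.environ["SPARK_WORKER_INSTANCES"] = "4"          # set: the string "4" is parsed
print(int(os.getenv("SPARK_WORKER_INSTANCES", 1)))  # -> 4
```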