EleutherAI · Quentin-Anthony · Feb 23, 2024 · Jan 12, 2024 · Jan 13, 2024 · Jan 13, 2024
@@ -4,12 +4,12 @@ on: [pull_request]
 
 jobs:
   pre-commit:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: 3.10
           cache: "pip"
           cache-dependency-path: "**/requirements*.txt"
       # Need the right version of clang-format
@@ -24,7 +24,7 @@ jobs:
         uses: docker/build-push-action@v2
 
   update-documentation:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v3
         with:

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM nvidia/cuda:11.7.1-devel-ubuntu20.04
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
 
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -21,20 +21,20 @@ LABEL org.opencontainers.image.version = "2.0"
 LABEL org.opencontainers.image.authors = "[email protected]"
 LABEL org.opencontainers.image.source = "https://www.github.com/eleutherai/gpt-neox"
 LABEL org.opencontainers.image.licenses = " Apache-2.0"
-LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:11.7.1-devel-ubuntu20.04"
+LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:12.1.1-devel-ubuntu22.04"
 
 #### System package (uses default Python 3 version in Ubuntu 20.04)
 RUN apt-get update -y && \
     apt-get install -y \
-    git python3.9 python3-dev libpython3-dev python3-pip sudo pdsh \
-    htop llvm-9-dev tmux zstd software-properties-common build-essential autotools-dev \
+    git python3-dev libpython3-dev python3-pip sudo pdsh \
+    htop tmux zstd software-properties-common build-essential autotools-dev \
     nfs-common pdsh cmake g++ gcc curl wget vim less unzip htop iftop iotop ca-certificates ssh \
     rsync iputils-ping net-tools libcupti-dev libmlx4-1 infiniband-diags ibutils ibverbs-utils \
     rdmacm-utils perftest rdma-core nano && \
     update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
-    pip install --upgrade pip && \
-    pip install gpustat
+    python -m pip install --upgrade pip && \
+    python -m pip install gpustat
 
 ### SSH
 RUN mkdir /var/run/sshd && \
@@ -88,24 +88,31 @@ RUN mkdir -p /home/mchorse/.ssh /job && \
     echo 'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH' >> /home/mchorse/.bashrc
 
 #### Python packages
-RUN pip install torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu117 && pip cache purge
-COPY requirements/requirements.txt .
-COPY requirements/requirements-wandb.txt .
-COPY requirements/requirements-onebitadam.txt .
-COPY requirements/requirements-sparseattention.txt .
-COPY requirements/requirements-flashattention.txt .
-RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt
-RUN pip install -r requirements-sparseattention.txt
-RUN pip install -r requirements-flashattention.txt
-RUN pip install -r requirements-wandb.txt
-RUN pip install protobuf==3.20.*
-RUN pip cache purge
+RUN python -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+COPY requirements/* ./
+RUN python -m pip install --no-cache-dir -r requirements.txt && pip install -r requirements-onebitadam.txt
+RUN python -m pip install -r requirements-sparseattention.txt
+RUN python -m pip install -r requirements-flashattention.txt
+RUN python -m pip install -r requirements-wandb.txt
+RUN python -m pip install protobuf==3.20.*
+RUN python -m pip cache purge
 
 ## Install APEX
-RUN pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597
+# Detect the architecture and install Apex accordingly
+RUN ARCH=$(uname -m) && \
+    if [ "$ARCH" = "x86_64" ]; then \
+        wget https://github.com/segyges/not-nvidia-apex/releases/download/jan-2024/apex-0.1-cp310-cp310-linux_x86_64.zip && \
+        unzip ./apex-0.1-cp310-cp310-linux_x86_64.zip && \
+        python -m pip install ./apex-0.1-cp310-cp310-linux_x86_64.whl; \
+    else \
+    # Install Apex directly from source for other architectures
+        python -m pip install -r requirements-apex-pip.txt && \
+        python -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings --global-option=--cpp_ext --config-settings --global-option=--cuda_ext git+https://github.com/NVIDIA/apex.git@141bbf1cf362d4ca4d94f4284393e91dda5105a5; \
+    fi
 
 COPY megatron/fused_kernels/ megatron/fused_kernels
-RUN python megatron/fused_kernels/setup.py install
+WORKDIR /megatron/fused_kernels
+RUN python setup.py install
 
 # Clear staging
 RUN mkdir -p /tmp && chmod 0777 /tmp

@@ -976,7 +976,7 @@ Text Generation arguments
 
 - **prompt_end**: str
 
-    Default = 
+    Default =
 
 
     a single prompt's end. Defaults to newline
@@ -1018,7 +1018,7 @@ Text Generation arguments
 
 - **eval_results_prefix**: str
 
-    Default = 
+    Default =
 
     prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json
 
@@ -1770,7 +1770,7 @@ Args for deepspeed config
 
     Default = None
 
-    
+
 
 
 
@@ -2070,4 +2070,3 @@ Args for deepspeed runner (deepspeed.launcher.runner).
     Default = None
 
     Adds a `--account` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard.
-
@@ -1,7 +1,7 @@
 version: '3'
 services:
   gpt-neox:
-    command: nvidia-smi -q --loop=10
+    command: nvidia-smi dmon
     image: leogao2/gpt-neox:main
     shm_size: 1g
     ulimits:

@@ -1,7 +1,7 @@
 version: '3'
 services:
   gpt-neox:
-    command: nvidia-smi -q --loop=10
+    command: nvidia-smi dmon
     image: gpt-neox
     build:
       context: .

@@ -0,0 +1 @@
+pip==23.3.2
@@ -4,4 +4,5 @@ pre-commit>=2.17.0
 pytest>=6.2.3
 pytest-cov>=2.11.1
 pytest-forked>=1.3.0
+pytest-html==4.1.1
 pytest-xdist
@@ -1 +1 @@
-triton==2.0.0.dev20221202
+triton==2.1.0
@@ -1,12 +1,13 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 
 import pytest
-from tools.datasets import preprocess_data
-import train
+
+import eval
 import generate
-import evaluate
+import train
 from megatron.neox_arguments import NeoXArgs
-from tests.common import simulate_deepy_env, save_random_model
+from tests.common import save_random_model, simulate_deepy_env
+from tools.datasets import preprocess_data
 
 
 @pytest.fixture(
@@ -84,7 +85,7 @@ def test_evaluate(monkeypatch, tmpdir, tmp_path):
         "eval_tasks": ["lambada"],  # ["lambada", "hellaswag", "piqa", "sciq"],
         "eval_results_prefix": sample_output_file,
     }
-    evaluate.main(input_args=deepspeed_main_args, overwrite_values=evaluate_args)
+    eval.main(input_args=deepspeed_main_args, overwrite_values=evaluate_args)
 
 
 def test_finetuning(monkeypatch, tmpdir, tmp_path):