This is the docker image I am using for Axolotl R&D. You may need to tweak TORCH_CUDA_ARCH_LIST to include your GPU architecture. There are several improvments over the official axolotl image including:
- Non-root user
- Cleaner dependency management
- Better caching at build time
- Highly parameterized build, useful for testing new Python dependencies / versions
- Final stage uses NVIDIA's runtime container, which (hypothetically) should be smaller
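If you are not sure which compute capabilities to list, one quick way to check is to ask the driver directly. This is just a sketch and assumes a reasonably recent nvidia-smi (older drivers may not support the compute_cap query field):
# Print the name and compute capability of each visible GPU, e.g. "8.6" for an RTX 3090
nvidia-smi --query-gpu=name,compute_cap --format=csv
Use the reported values, separated by semicolons, for TORCH_CUDA_ARCH_LIST (and APEX_CUDA_ARCH_LIST) in the Makefile further down.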
The Dockerfile below was shamelessly taken from here:
###############################################################################
# base-builder
###############################################################################
ARG CONTAINER_CUDA_VERSION
ARG CONTAINER_CUDNN_VERSION
ARG CONTAINER_UBUNTU_VERSION
# nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
FROM nvidia/cuda:${CONTAINER_CUDA_VERSION}-cudnn${CONTAINER_CUDNN_VERSION}-devel-ubuntu${CONTAINER_UBUNTU_VERSION} as base
ARG PYTORCH_VERSION
ARG PYTORCH_REPO
ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
# Install OS dependencies
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update && \
apt-get install -y \
software-properties-common \
git \
build-essential \
ninja-build \
libaio-dev \
pip && \
ln -s /usr/bin/python3 /usr/bin/python
# This doesn't seem to help with anything...
# Note the build is HIGHLY PARALLELIZED, takes forever, and currently fails
# RUN apt-get install cmake && \
# git clone --branch v2.0.0 --recursive https://github.com/pytorch/pytorch && \
# cd pytorch && \
# pip install -r requirements.txt && \
# MAX_JOBS=32 python setup.py bdist_wheel
# The dependencies built in the later stages require PyTorch to already be installed
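# (packaging is installed here as well because several of the setup.py builds below use it to check the installed torch / CUDA versions)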
RUN --mount=type=cache,target=/root/.cache \
pip install --extra-index-url ${PYTORCH_REPO} -U \
"torch==${PYTORCH_VERSION}" \
packaging
###############################################################################
# builder-deepspeed
###############################################################################
FROM base as builder-deepspeed
ARG DEEPSPEED_VERSION
ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
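# DS_BUILD_OPS=1 pre-compiles the DeepSpeed C++/CUDA ops into the wheel instead of JIT-compiling
# them at runtime; DS_BUILD_SPARSE_ATTN=0 skips the sparse attention op.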
RUN git clone --branch v${DEEPSPEED_VERSION} --depth 1 https://github.com/microsoft/DeepSpeed.git && \
cd DeepSpeed && \
DS_BUILD_SPARSE_ATTN=0 DS_BUILD_OPS=1 python setup.py bdist_wheel
###############################################################################
# builder-bitsandbytes
###############################################################################
FROM base as builder-bitsandbytes
ARG BITSANDBYTES_VERSION
ARG BITSANDBYTES_CUDA_VERSION
ARG BITSANDBYTES_MAKE_TARGET
ARG MAX_CONCURRENCY
ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
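# The make target (e.g. cuda11x) selects which libbitsandbytes_*.so variant gets built;
# CUDA_VERSION should match the toolkit in the base image.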
RUN git clone --branch ${BITSANDBYTES_VERSION} --depth 1 https://github.com/TimDettmers/bitsandbytes.git && \
cd bitsandbytes && \
CUDA_VERSION=${BITSANDBYTES_CUDA_VERSION} make -j ${MAX_CONCURRENCY} ${BITSANDBYTES_MAKE_TARGET} && \
python setup.py bdist_wheel
###############################################################################
# builder-apex
###############################################################################
# Note that this takes forever...
FROM base as builder-apex
ARG APEX_VERSION
ARG MAX_CONCURRENCY
ARG APEX_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=${APEX_CUDA_ARCH_LIST}
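# --cpp_ext / --cuda_ext build Apex's fused C++/CUDA kernels (fused optimizers, fused layer norm, etc.)
# rather than falling back to the pure-Python implementations.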
RUN python -m pip uninstall -y apex && \
git clone --branch ${APEX_VERSION} --depth 1 https://github.com/NVIDIA/apex && \
cd apex && \
MAX_JOBS=${MAX_CONCURRENCY} python setup.py bdist_wheel --cpp_ext --cuda_ext
###############################################################################
# builder-flash-attn
###############################################################################
# Note that this takes forever...
FROM base as builder-flash-attn
ARG FLASH_ATTN_VERSION
ARG MAX_CONCURRENCY
ARG APEX_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=${APEX_CUDA_ARCH_LIST}
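# MAX_JOBS caps the number of parallel nvcc jobs; the flash-attention build is memory hungry,
# so keep this low if the build machine has limited RAM.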
RUN git clone --branch v${FLASH_ATTN_VERSION} --depth 1 https://github.com/HazyResearch/flash-attention.git && \
cd flash-attention && \
MAX_JOBS=${MAX_CONCURRENCY} python setup.py bdist_wheel
###############################################################################
# main
###############################################################################
FROM nvidia/cuda:${CONTAINER_CUDA_VERSION}-cudnn${CONTAINER_CUDNN_VERSION}-runtime-ubuntu${CONTAINER_UBUNTU_VERSION}
# Standard labels
LABEL maintainer="The Objective Dad <theobjectivedad@gmail.com>"
LABEL version="1.0.0"
LABEL description="LLM training environment optimized for Axolotl"
LABEL url="https://www.theobjectivedad.com"
ARG HF_ACCELERATE_VERSION
ARG HF_TRANSFORMERS_VERSION
ARG HF_OPTIMUM_VERSION
ARG HF_PEFT_VERSION
ARG PYTORCH_VERSION
ARG PYTORCH_REPO
# Install runtime OS tools
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update && \
apt-get install -y \
git \
git-lfs \
libaio1 \
pip && \
ln -s /usr/bin/python3 /usr/bin/python
# Install pre-built wheels
ARG WHL_HOME=/tmp/wheels
RUN mkdir ${WHL_HOME}
COPY --from=builder-apex /apex/dist/*.whl ${WHL_HOME}/
COPY --from=builder-flash-attn /flash-attention/dist/*.whl ${WHL_HOME}/
COPY --from=builder-bitsandbytes /bitsandbytes/dist/*.whl ${WHL_HOME}/
COPY --from=builder-deepspeed /DeepSpeed/dist/*.whl ${WHL_HOME}/
# This is stupid but the apex setup.py version is hardcoded to 0.1 so we need to force-reinstall,
# see: https://github.com/NVIDIA/apex/blob/0da3ffb92ee6fbe5336602f0e3989db1cd16f880/setup.py#L797
RUN pip install --force-reinstall $(find ${WHL_HOME} -name "apex-*.whl" -printf "%p ")
# Install extra Python packages. Note that we intentionally lock in the versions of all the
# custom-built libraries; apex is again missing here because its version is hardcoded in setup.py.
# See the note above.
RUN pip install --extra-index-url ${PYTORCH_REPO} \
"torch==${PYTORCH_VERSION}" \
"peft@git+https://github.com/huggingface/peft.git@main" \
"accelerate@git+https://github.com/huggingface/accelerate.git@main" \
"transformers@git+https://github.com/huggingface/transformers.git@main" \
"optimum==${HF_OPTIMUM_VERSION}" \
addict \
fire \
PyYAML==6.0 \
datasets \
sentencepiece \
wandb \
einops \
xformers \
bert-score==0.3.13 \
evaluate==0.4.0 \
rouge-score==0.1.2 \
scipy \
scikit-learn==1.2.2 \
$(find ${WHL_HOME} -name "*.whl" -printf "file://%p ")
ARG USERNAME
ARG USER_ID
ARG GROUP_ID
# Add container user & environment
RUN groupadd -g ${GROUP_ID} ${USERNAME} && \
useradd -u ${USER_ID} -g ${GROUP_ID} -ms /bin/bash ${USERNAME} && \
echo 'export PS1="\h:\W $ "' >> /home/${USERNAME}/.bashrc
# Install Axolotl scripts
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git /opt/axolotl
ENV PYTHONPATH=/opt/axolotl/src:${PYTHONPATH}
ENV WORKSPACE=/workspace
RUN mkdir ${WORKSPACE} && chown ${USER_ID}:${GROUP_ID} ${WORKSPACE}
USER ${USERNAME}
# Configure git
RUN git lfs install --skip-repo && \
git config --global credential.helper store
WORKDIR ${WORKSPACE}
ENTRYPOINT ["/bin/bash"]
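After the image builds, a quick sanity check is to confirm that the custom-built wheels actually import inside the container. This is a minimal sketch that assumes the default image tag from the Makefile below and a working NVIDIA container runtime on the host:
# The entrypoint is /bin/bash, so -c '...' is handed straight to the shell
docker run --rm --gpus=all quay.io/theobjectivedad/axolotl-main:latest -c \
  'python -c "import torch, apex, flash_attn, bitsandbytes, deepspeed; print(torch.cuda.is_available())"'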
This is the Makefile I am using to build:
###############################################################################
# Configuration: Training environment build arguments
###############################################################################
USER_ID:=$(shell id -u)
GROUP_ID:=$(shell id -g)
USERNAME:=developer
CONTAINER_CUDA_VERSION:="11.8.0"
CONTAINER_CUDNN_VERSION:="8"
CONTAINER_UBUNTU_VERSION:="22.04"
PYTORCH_CUDA:="cu118"
PYTORCH_VERSION:="2.0.0"
# PYTORCH_REPO:="https://download.pytorch.org/whl/nightly/$(PYTORCH_CUDA)"
PYTORCH_REPO:="https://download.pytorch.org/whl/$(PYTORCH_CUDA)"
TORCHVISION_VERSION:=""
TORCHAUDIO_VERSION:=""
TORCH_CUDA_ARCH_LIST:="8.0;8.6;8.7"
DEEPSPEED_VERSION:="0.9.5"
BITSANDBYTES_VERSION:="0.40.0"
BITSANDBYTES_CUDA_VERSION:="118"
BITSANDBYTES_MAKE_TARGET:="cuda11x"
# Note that Apex 23.05 doesn't recognize compute capability 8.7; this will likely change in a
# future version, but for now the Apex arch list needs to be its own variable.
APEX_VERSION:="23.05"
APEX_CUDA_ARCH_LIST:="8.0;8.6"
FLASH_ATTN_VERSION:=1.0.8
HF_ACCELERATE_VERSION:="0.20.3"
HF_TRANSFORMERS_VERSION:="4.30.2"
HF_PEFT_VERSION:="0.3.0"
HF_OPTIMUM_VERSION:="1.9.1"
MAX_CONCURRENCY:=4
###############################################################################
# Configuration: Image / repository
###############################################################################
BUILD_REPO_NAME:=quay.io
BUILD_REPO_USER:=theobjectivedad
BUILD_IMAGE_NAME:=axolotl-main
BUILD_TAG_NAME:=latest
BUILD_FULL_NAME:=$(BUILD_REPO_NAME)/$(BUILD_REPO_USER)/$(BUILD_IMAGE_NAME):$(BUILD_TAG_NAME)
###############################################################################
# Build
###############################################################################
clean:
@docker rmi $(BUILD_FULL_NAME)
login:
@docker login --username=$(BUILD_REPO_USER) --password-stdin $(BUILD_REPO_NAME)
build:
@docker build \
--progress=plain \
--build-arg=USER_ID=$(USER_ID) \
--build-arg=GROUP_ID=$(GROUP_ID) \
--build-arg=USERNAME=$(USERNAME) \
--build-arg=CONTAINER_CUDA_VERSION=$(CONTAINER_CUDA_VERSION) \
--build-arg=CONTAINER_CUDNN_VERSION=$(CONTAINER_CUDNN_VERSION) \
--build-arg=CONTAINER_UBUNTU_VERSION=$(CONTAINER_UBUNTU_VERSION) \
--build-arg=PYTORCH_VERSION=$(PYTORCH_VERSION) \
--build-arg=PYTORCH_REPO=$(PYTORCH_REPO) \
--build-arg=TORCH_CUDA_ARCH_LIST=$(TORCH_CUDA_ARCH_LIST) \
--build-arg=DEEPSPEED_VERSION=$(DEEPSPEED_VERSION) \
--build-arg=BITSANDBYTES_VERSION=$(BITSANDBYTES_VERSION) \
--build-arg=BITSANDBYTES_CUDA_VERSION=$(BITSANDBYTES_CUDA_VERSION) \
--build-arg=BITSANDBYTES_MAKE_TARGET=$(BITSANDBYTES_MAKE_TARGET) \
--build-arg=APEX_VERSION=$(APEX_VERSION) \
--build-arg=APEX_CUDA_ARCH_LIST=$(APEX_CUDA_ARCH_LIST) \
--build-arg=FLASH_ATTN_VERSION=$(FLASH_ATTN_VERSION) \
--build-arg=HF_ACCELERATE_VERSION=$(HF_ACCELERATE_VERSION) \
--build-arg=HF_TRANSFORMERS_VERSION=$(HF_TRANSFORMERS_VERSION) \
--build-arg=HF_OPTIMUM_VERSION=$(HF_OPTIMUM_VERSION) \
--build-arg=HF_PEFT_VERSION=$(HF_PEFT_VERSION) \
--build-arg=MAX_CONCURRENCY=$(MAX_CONCURRENCY) \
-t $(BUILD_FULL_NAME) .
push: build
@docker push $(BUILD_FULL_NAME)
# TODO: you will likely need to change the paths...
run:
docker run --gpus='all' -it --rm \
--volume=$(WORKSPACE_HOST_PATH):/workspace \
--volume=$(MODELS_HOST_PATH):/models \
--volume=$(DATA_HOST_PATH):/data \
--volume=$(WORK_HOST_PATH):/work \
--volume=$(WORKSPACE_HOST_PATH)/extern/axolotl:/opt/axolotl \
--env-file=$(CURDIR)/.env \
--entrypoint=accelerate \
$(BUILD_FULL_NAME) \
launch \
--config_file /work/accelerate/basic.yaml \
/opt/axolotl/scripts/finetune.py \
/work/atheos/config.yaml \
--inference
.PHONY: clean build login push run
Once the files are in place, just run make build to create the image or make run to start a container.
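Note that the run target references a few host-path variables (WORKSPACE_HOST_PATH, MODELS_HOST_PATH, DATA_HOST_PATH, WORK_HOST_PATH) that are not defined in the Makefile above, plus a .env file, which is a convenient place for secrets such as HUGGING_FACE_HUB_TOKEN or WANDB_API_KEY if you use those services. One way to supply the paths is to override them on the make command line; the paths below are just placeholders for your own layout:
make run \
  WORKSPACE_HOST_PATH=$HOME/workspace \
  MODELS_HOST_PATH=$HOME/models \
  DATA_HOST_PATH=$HOME/data \
  WORK_HOST_PATH=$HOME/work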