|
# Clone repositories |
|
FROM docker.io/bitnami/git:2.39.0@sha256:8802a1053f0a75c948da43c0d04e591b500381447745f0b5f75d3cf85509626c AS git_base

# Global git settings shared by every *_src stage: turn off the detached-HEAD
# advice (all clones check out FETCH_HEAD) and name the initial branch "main".
RUN git config --global advice.detachedHead false \
    && git config --global init.defaultBranch main

# xz-utils provides the xz decompressor that `tar -xJ` needs for .tar.xz archives.
# apt-get is used instead of apt because apt's command-line interface is not
# stable for scripts; autoremove gets -y so it can never stop at a prompt.
RUN --mount=type=cache,target=/var/cache/apt \
    export DEBIAN_FRONTEND=noninteractive \
    && apt-get update \
    && apt-get install -y --no-install-recommends xz-utils \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
|
|
|
# mimalloc sources, pinned to one commit and fetched shallowly:
# https://github.com/microsoft/mimalloc/commit/dd7348066fe40e8bf372fa4e9538910a5e24a75f
FROM git_base AS mimalloc_src
WORKDIR /mimalloc
RUN git init \
    && git remote add origin "https://github.com/microsoft/mimalloc" \
    && git fetch --depth=1 origin dd7348066fe40e8bf372fa4e9538910a5e24a75f \
    && git checkout FETCH_HEAD
|
|
|
# mold sources, pinned to one commit and fetched shallowly:
# https://github.com/rui314/mold/commit/ad0b6d0ac6a9b269935c3fbf4dae2815395431a4
FROM git_base AS mold_src
WORKDIR /mold
RUN git init \
    && git remote add origin "https://github.com/rui314/mold" \
    && git fetch --depth=1 origin ad0b6d0ac6a9b269935c3fbf4dae2815395431a4 \
    && git checkout FETCH_HEAD
|
|
|
# CPython sources, pinned to one commit and fetched shallowly:
# https://github.com/python/cpython/commit/5aa8b9e70c44862cf3f600bdc329a20790b67056
FROM git_base AS cpython_src
WORKDIR /cpython
# NOTE: 3.11 adoption is blocked by PyTorch because it uses the old opcodes.
RUN git init \
    && git remote add origin "https://github.com/python/cpython" \
    && git fetch --depth=1 origin 5aa8b9e70c44862cf3f600bdc329a20790b67056 \
    && git checkout FETCH_HEAD
|
|
|
# pillow-simd sources, pinned to one commit and fetched shallowly:
# https://github.com/uploadcare/pillow-simd/commit/58acec3312fb8671c9d84829197e1c8150085589
FROM git_base AS pillow-simd_src
WORKDIR /pillow-simd
RUN git init \
    && git remote add origin "https://github.com/uploadcare/pillow-simd" \
    && git fetch --depth=1 origin 58acec3312fb8671c9d84829197e1c8150085589 \
    && git checkout FETCH_HEAD
|
|
|
# MAGMA sources, pinned to one commit and fetched shallowly:
# https://bitbucket.org/icl/magma/commits/0c7321435fe81527f41bad708659f94630a3625f
FROM git_base AS magma_src
WORKDIR /magma
RUN git init \
    && git remote add origin "https://bitbucket.org/icl/magma" \
    && git fetch --depth=1 origin 0c7321435fe81527f41bad708659f94630a3625f \
    && git checkout FETCH_HEAD

# Strip the test targets with a local patch; some of them don't play well with LTO.
COPY ./magma_remove_tests.diff .
RUN git apply ./magma_remove_tests.diff
|
|
|
# PyTorch sources (including submodules), pinned to one commit:
# https://github.com/pytorch/pytorch/commit/ce50a8de7535b5e359f7ed7ead4285414a966d3f
FROM git_base AS torch_src
WORKDIR /torch
RUN git init \
    && git remote add origin "https://github.com/pytorch/pytorch" \
    && git fetch --recurse-submodules=yes --jobs=16 --depth=1 \
        origin ce50a8de7535b5e359f7ed7ead4285414a966d3f \
    && git checkout FETCH_HEAD \
    # PyTorch vendors its third-party dependencies as (nested) submodules;
    # initialise all of them, shallowly and in parallel.
    && git submodule update --init --recursive --jobs=16 --depth=1

# Downgrade -Werror=cast* to a plain warning; the flag is very sensitive on Clang.
RUN sed -i -e 's/Werror=cast/Wcast/g' CMakeLists.txt
|
|
|
# torchvision sources, pinned to one commit and fetched shallowly:
# https://github.com/pytorch/vision/commit/90cfb10dc49187842247d3bffb25a06af0b1e826
FROM git_base AS torchvision_src
WORKDIR /torchvision
RUN git init \
    && git remote add origin "https://github.com/pytorch/vision" \
    && git fetch --depth=1 origin 90cfb10dc49187842247d3bffb25a06af0b1e826 \
    && git checkout FETCH_HEAD
|
|
|
# Clone triton and pre-download the pybind11 and LLVM archives.
# https://github.com/openai/triton/commit/0f5c6e619c35d22507f6202600a78f5781495496
FROM git_base AS triton_src

# The archives below are streamed from curl into tar. With pipefail (and
# curl -f, which turns HTTP errors into a non-zero exit) a failed download
# fails the step directly instead of relying on tar to reject the bad input.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Download pybind11
WORKDIR /root/.triton/pybind11
RUN curl -fsSL "https://github.com/pybind/pybind11/archive/refs/tags/v2.10.0.tar.gz" \
    | tar -xz

# Download llvm (an .tar.xz archive; needs xz-utils from git_base)
WORKDIR /root/.triton/llvm
RUN curl -fsSL "https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.0/clang+llvm-14.0.0-x86_64-linux-gnu-ubuntu-18.04.tar.xz" \
    | tar -xJ

# Shallow fetch of the pinned commit, matching the other *_src stages.
WORKDIR /triton
RUN git init \
    && git remote add origin "https://github.com/openai/triton" \
    && git fetch --depth=1 origin 0f5c6e619c35d22507f6202600a78f5781495496 \
    && git checkout FETCH_HEAD
|
|
|
|
|
|
|
FROM nvcr.io/nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04@sha256:0e9ecf52bdde401829b39b5fd38093d4254bf1b24aae9bf644d212fba347458b AS os_base

# bash is required by the ${var//pattern/replacement} expansions used in the
# build stages; pipefail makes a pipeline fail when any command in it fails,
# not only the last one.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Base build dependencies.
# apt-get replaces apt, whose command-line interface is not stable for scripts.
# `--with-new-pkgs` keeps the behaviour of `apt upgrade`, which may install new
# packages to satisfy dependencies. autoremove gets -y so it cannot stop at a
# confirmation prompt.
RUN --mount=type=cache,target=/var/cache/apt \
    export DEBIAN_FRONTEND=noninteractive \
    # Release the hold on libcudnn8 so the upgrade below is allowed to touch it.
    && apt-mark unhold libcudnn8 \
    && apt-get update \
    && apt-get upgrade -y --with-new-pkgs \
    && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        curl \
        git \
        lcov \
        libbz2-dev \
        libffi-dev \
        libgdbm-compat-dev \
        libgdbm-dev \
        libjpeg-turbo8-dev \
        liblzma-dev \
        libncurses5-dev \
        libnuma-dev \
        libopenblas-dev \
        libpng-dev \
        libprotobuf-dev \
        libreadline6-dev \
        libsqlite3-dev \
        libssl-dev \
        lzma-dev \
        ninja-build \
        numactl \
        pkg-config \
        protobuf-compiler \
        tk-dev \
        uuid-dev \
        wget \
        zlib1g-dev \
        zstd \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
|
|
|
# Register the third-party apt repositories (Intel oneAPI, LLVM, Kitware).
# Each signing key is downloaded, de-armored into /usr/share/keyrings and
# referenced from its sources.list.d entry via `signed-by`.
# pipefail is enabled so that a failed key download fails the step instead of
# being hidden behind gpg's exit status.
RUN set -o pipefail \
    # Add Intel OneAPI repository
    && wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" \
        | gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg \
    && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
        > /etc/apt/sources.list.d/oneAPI.list \
    # Add LLVM repository (fetched over https like the other two)
    && wget -qO- "https://apt.llvm.org/llvm-snapshot.gpg.key" \
        | gpg --dearmor -o /usr/share/keyrings/llvm-snapshot-keyring.gpg \
    && echo "deb [signed-by=/usr/share/keyrings/llvm-snapshot-keyring.gpg] https://apt.llvm.org/jammy/ llvm-toolchain-jammy main" \
        > /etc/apt/sources.list.d/llvm.list \
    # Add Kitware (CMake) repository
    && wget -qO- "https://apt.kitware.com/keys/kitware-archive-latest.asc" \
        | gpg --dearmor -o /usr/share/keyrings/kitware-archive-keyring.gpg \
    && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main" \
        > /etc/apt/sources.list.d/kitware.list
|
|
|
# Install the toolchain from the third-party repositories: the distribution
# cmake is purged first and then reinstalled from the repositories now
# configured, alongside clang/lld/lldb 16, LLVM OpenMP and Intel MKL/OpenMP.
# apt-get replaces apt (unstable CLI for scripts) and autoremove gets -y so it
# cannot stop at a confirmation prompt.
RUN --mount=type=cache,target=/var/cache/apt \
    export DEBIAN_FRONTEND=noninteractive \
    && apt-get update \
    && apt-get purge --auto-remove -y cmake \
    && apt-get install -y --no-install-recommends \
        clang-16 \
        cmake \
        intel-oneapi-mkl-devel-2023.0.0 \
        intel-oneapi-openmp-2023.0.0 \
        intel-oneapi-runtime-openmp \
        libomp-16-dev \
        libomp5-16 \
        lld-16 \
        lldb-16 \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
|
|
|
# Point every OpenMP runtime name inside the LLVM 16 lib directory
# (libiomp5.so, libomp.so, libomp.so.5) at Intel's libiomp5.so.
RUN intel_omp="/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin/libiomp5.so" \
    && for alias in libiomp5.so libomp.so libomp.so.5; do \
        ln -sf "$intel_omp" "/usr/lib/llvm-16/lib/$alias" || exit 1; \
    done
|
|
|
# Environment shared by every build stage derived from os_base.

# LLVM 16 binaries first on PATH; UTF-8 locale.
ENV PATH="/usr/lib/llvm-16/bin:$PATH" \
    LANG="C.UTF-8" \
    LC_ALL="C.UTF-8"

# Compiler, linker and binutils replacements from LLVM.
ENV CC="clang" \
    CXX="clang++" \
    LD="ld.lld" \
    AR="llvm-ar" \
    AS="llvm-as" \
    NM="llvm-nm" \
    OBJCOPY="llvm-objcopy" \
    OBJDUMP="llvm-objdump" \
    RANLIB="llvm-ranlib" \
    READELF="llvm-readelf" \
    STRIP="llvm-strip"

# CUDA-related environment variables
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH" \
    CUDA_HOME="/usr/local/cuda" \
    CUDA_MODULE_LOADING="LAZY" \
    CUDA_USE_STATIC_CUDA_RUNTIME="ON" \
    CUDAHOSTCXX="clang++"

# Intel MKL location
ENV MKLROOT="/opt/intel/oneapi/mkl/latest"

# Flag sets that the build stages combine into COMMON_FLAGS / LINK_FLAGS.
ENV NO_WARN_FLAGS="-Wno-deprecated -Wno-unused-command-line-argument" \
    LTO_FLAGS="-flto=thin -fsplit-lto-unit -fwhole-program-vtables" \
    OPT_FLAGS="-O3 -march=native --pipe -falign-functions=32 -fno-semantic-interposition -fsplit-machine-functions -fslp-vectorize -ffunction-sections -fdata-sections -fforce-emit-vtables -fstrict-vtable-pointers -fno-plt -fno-common" \
    LINK_FLAGS="-Wl,-O2 -Wl,-znow -Wl,--sort-common -Wl,--gc-sections -Wl,--hash-style=gnu"
|
|
|
# Make the LLVM 16 toolchain the system default through update-alternatives.
RUN update-alternatives --install /usr/bin/cc cc /usr/bin/clang-16 100 \
    && update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang++-16 100 \
    # NOTE: lld only gets priority 75 because mold is registered later at 100.
    && update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-16 75 \
    # The binutils replacements all follow the llvm-<tool>-16 naming scheme.
    && for tool in ar as nm objcopy objdump ranlib readelf strip; do \
        update-alternatives --install "/usr/bin/${tool}" "${tool}" "/usr/bin/llvm-${tool}-16" 100 || exit 1; \
    done
|
|
|
# TODO: Experimental CUDA setting |
|
# --default-stream=per-thread: |
|
# Use a separate stream for each thread. Default is legacy. |
|
# ENV CUDAFLAGS="$CUDAFLAGS --default-stream=per-thread" |
|
|
|
# Build and install mimalloc as a shared library and preload it everywhere.
FROM os_base AS mimalloc_base
WORKDIR /mimalloc
# The sources are bind-mounted read-write from mimalloc_src for this step only;
# writes to the mount are discarded, so neither the sources nor the build
# directory end up in an image layer (a COPY followed by a later `rm -rf`
# would keep them in the COPY layer).
RUN --mount=type=bind,from=mimalloc_src,source=/mimalloc,target=/mimalloc,rw \
    export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    # mold is not built yet at this point, so link with lld.
    && export LINK_FLAGS="-fuse-ld=lld $LINK_FLAGS $COMMON_FLAGS" \
    && cmake -S. -Bbuild -GNinja \
        # Base CMake settings
        -DBUILD_SHARED_LIBS="ON" \
        -DCMAKE_BUILD_TYPE="Release" \
        -DCMAKE_INSTALL_PREFIX="/usr/local" \
        -DCMAKE_VERBOSE_MAKEFILE="OFF" \
        # Set C standard and flags
        -DCMAKE_C_STANDARD="17" \
        -DCMAKE_C_STANDARD_REQUIRED="OFF" \
        -DCMAKE_C_EXTENSIONS="OFF" \
        -DCMAKE_C_FLAGS="$COMMON_FLAGS" \
        # Set C++ standard and flags
        -DCMAKE_CXX_STANDARD="20" \
        -DCMAKE_CXX_STANDARD_REQUIRED="OFF" \
        -DCMAKE_CXX_EXTENSIONS="OFF" \
        -DCMAKE_CXX_FLAGS="$COMMON_FLAGS" \
        # Set linker flags
        -DCMAKE_EXE_LINKER_FLAGS="$LINK_FLAGS" \
        -DCMAKE_MODULE_LINKER_FLAGS="$LINK_FLAGS" \
        -DCMAKE_SHARED_LINKER_FLAGS="$LINK_FLAGS" \
        # LTO policies
        -DCMAKE_POLICY_DEFAULT_CMP0069="NEW" \
        -DCMAKE_POLICY_DEFAULT_CMP0105="NEW" \
        -DCMAKE_POLICY_DEFAULT_CMP0138="NEW" \
        # LTO
        -DCMAKE_INTERPROCEDURAL_OPTIMIZATION="ON" \
        -DCMAKE_POSITION_INDEPENDENT_CODE="OFF" \
        # Mimalloc-specific flags
        -DMI_BUILD_OBJECT="OFF" \
        -DMI_BUILD_SHARED="ON" \
        -DMI_BUILD_STATIC="OFF" \
        -DMI_BUILD_TESTS="OFF" \
        -DMI_OVERRIDE="ON" \
    && cmake --build build --target install

# Preload mimalloc into every process started from here on.
ENV LD_PRELOAD="/usr/local/lib/libmimalloc.so:$LD_PRELOAD"
WORKDIR /
|
|
|
# Build and install the mold linker, then make it the default `ld`.
FROM mimalloc_base AS mold_base
WORKDIR /mold
# The sources are bind-mounted read-write from mold_src for this step only;
# writes to the mount are discarded, so neither the sources nor the build
# directory end up in an image layer.
RUN --mount=type=bind,from=mold_src,source=/mold,target=/mold,rw \
    export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    # mold cannot link itself yet, so this build still uses lld.
    && export LINK_FLAGS="-fuse-ld=lld $LINK_FLAGS $COMMON_FLAGS" \
    && cmake -S. -Bbuild -GNinja \
        # Base CMake settings
        -DBUILD_SHARED_LIBS="OFF" \
        -DCMAKE_BUILD_TYPE="Release" \
        -DCMAKE_INSTALL_PREFIX="/usr/local" \
        -DCMAKE_VERBOSE_MAKEFILE="OFF" \
        # Set C standard and flags.
        # Mold uses GNU extensions, so we need to use gnu17 and gnu++20.
        -DCMAKE_C_STANDARD="17" \
        -DCMAKE_C_STANDARD_REQUIRED="ON" \
        -DCMAKE_C_EXTENSIONS="ON" \
        -DCMAKE_C_FLAGS="$COMMON_FLAGS" \
        # Set C++ standard and flags (GNU extensions enabled, see above)
        -DCMAKE_CXX_STANDARD="20" \
        -DCMAKE_CXX_STANDARD_REQUIRED="ON" \
        -DCMAKE_CXX_EXTENSIONS="ON" \
        -DCMAKE_CXX_FLAGS="$COMMON_FLAGS" \
        # Set linker flags
        -DCMAKE_EXE_LINKER_FLAGS="$LINK_FLAGS" \
        -DCMAKE_MODULE_LINKER_FLAGS="$LINK_FLAGS" \
        -DCMAKE_SHARED_LINKER_FLAGS="$LINK_FLAGS" \
        # LTO policies
        -DCMAKE_POLICY_DEFAULT_CMP0069="NEW" \
        -DCMAKE_POLICY_DEFAULT_CMP0105="NEW" \
        -DCMAKE_POLICY_DEFAULT_CMP0138="NEW" \
        # LTO
        -DCMAKE_INTERPROCEDURAL_OPTIMIZATION="ON" \
        -DCMAKE_POSITION_INDEPENDENT_CODE="OFF" \
        # Mold-specific flags
        -DMOLD_LTO="ON" \
        -DMOLD_USE_MIMALLOC="ON" \
        -DMOLD_USE_SYSTEM_MIMALLOC="ON" \
        -DZSTD_LEGACY_SUPPORT="OFF" \
    && cmake --build build --target install

# Set the linker to mold (priority 100 outranks the lld alternative at 75)
WORKDIR /
ENV LD="ld.mold"
RUN update-alternatives --install /usr/bin/ld ld /usr/local/bin/mold 100
|
|
|
# Build and install CPython with PGO (--enable-optimizations) and ThinLTO.
FROM mold_base AS cpython_base
WORKDIR /cpython
# The sources are bind-mounted read-write from cpython_src for this step only;
# writes to the mount are discarded, so neither the sources nor the build
# artefacts end up in an image layer.
RUN --mount=type=bind,from=cpython_src,source=/cpython,target=/cpython,rw \
    export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    && export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $COMMON_FLAGS" \
    && export CFLAGS="-std=c17 $COMMON_FLAGS" \
    && export CXXFLAGS="-std=c++20 $COMMON_FLAGS" \
    && export LDFLAGS="$LINK_FLAGS" \
    && ./configure \
        --enable-ipv6=yes \
        --enable-optimizations \
        --with-computed-gotos \
        --with-lto=thin \
        --with-pymalloc \
        --with-system-expat \
        --with-ensurepip=upgrade \
        ax_cv_c_float_words_bigendian=no \
    # A bare `make -j` places no limit on parallel jobs; cap it at the CPU count.
    && make -j"$(nproc)" \
    && make install

# Set the python3 and pip3 symlinks
WORKDIR /
RUN ln -sf /usr/local/bin/python3 /usr/local/bin/python \
    && ln -sf /usr/local/bin/pip3 /usr/local/bin/pip
|
|
|
# Pinned Python packages needed by the builds that follow.
FROM cpython_base AS pip_requirements_install
RUN pip install \
        --verbose \
        --no-build-isolation \
        --no-cache-dir \
        # TODO: Since not compiling numpy from source, maybe we don't need Cython?
        Cython==0.29.33 \
        # NOTE: Need Jinja2 for some templating done by torch.compile
        Jinja2==3.1.2 \
        networkx==2.8.8 \
        # NOTE: numpy is a hassle to compile with LTO, so we use the precompiled version
        numpy==1.21.4 \
        packaging==22.0 \
        pyyaml==6.0 \
        setuptools==65.6.3 \
        sympy==1.11.1 \
        typing_extensions==4.4.0 \
        wheel==0.38.4
|
|
|
# Build and install pillow-simd
FROM pip_requirements_install AS pillow_install
WORKDIR /pillow-simd
# The sources are bind-mounted read-write from pillow-simd_src for this step
# only; writes to the mount are discarded, so neither the sources nor the build
# artefacts end up in an image layer.
RUN --mount=type=bind,from=pillow-simd_src,source=/pillow-simd,target=/pillow-simd,rw \
    export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    && export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $COMMON_FLAGS" \
    && export CFLAGS="-std=c17 $COMMON_FLAGS" \
    && export CXXFLAGS="-std=c++20 $COMMON_FLAGS" \
    && export LDFLAGS="$LINK_FLAGS" \
    && pip install . \
        --verbose \
        --no-build-isolation \
        --no-cache-dir

WORKDIR /
|
|
|
# Build and install magma
# TODO: Try with CUDA 12 because our version of Magma includes this PR:
# https://bitbucket.org/icl/magma/pull-requests/30/new-gemm-kernel-without-texture-memory
FROM pillow_install AS magma_install
WORKDIR /magma
# The (patched) sources are bind-mounted read-write from magma_src for this
# step only; writes to the mount are discarded, so neither the sources, the
# generated files nor the build directory end up in an image layer.
RUN --mount=type=bind,from=magma_src,source=/magma,target=/magma,rw \
    export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    # XLINKER_FLAGS is LINK_FLAGS with the "-Wl," prefixes removed and the
    # remaining options comma-joined, for use behind nvcc's -Xlinker=.
    && export XLINKER_FLAGS="${LINK_FLAGS//-Wl,/}" \
    && export XLINKER_FLAGS="${XLINKER_FLAGS// /,}" \
    && export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $COMMON_FLAGS" \
    # Must set --allow-unsupported-compiler to use newer versions of clang.
    # The CMAKE_CUDA_FLAGS variable is used by CMake to check the compiler version.
    # It's okay that we don't specify all of $CUDA_NVCC_FLAGS here, because
    # these are only used for the detection of the compiler version.
    # NOTE: We must also specify --std, otherwise CMake seems to ignore the
    # --allow-unsupported-compiler flag!
    && export CUDA_FLAGS="--std=c++17 --allow-unsupported-compiler -O3 --threads=0 --extra-device-vectorization" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xfatbin=--compress-all" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xcompiler=${COMMON_FLAGS// /,}" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xlinker=$XLINKER_FLAGS" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xnvlink=--use-host-info" \
    # Generate the sources with MAGMA's own makefile before configuring CMake.
    # The job count is capped at the CPU count (a bare -j is unbounded); the
    # output is silenced, the exit status is still checked.
    && echo -e "BACKEND=cuda\nFORT=false\nGPU_TARGET=Ampere" > make.inc \
    && make generate -j"$(nproc)" &> /dev/null \
    && cmake -S. -Bbuild -GNinja \
        # Base CMake settings
        -DBUILD_SHARED_LIBS="OFF" \
        -DCMAKE_BUILD_TYPE="Release" \
        -DCMAKE_INSTALL_PREFIX="/usr/local" \
        -DCMAKE_VERBOSE_MAKEFILE="OFF" \
        # Set C standard and flags
        -DCMAKE_C_STANDARD="17" \
        -DCMAKE_C_STANDARD_REQUIRED="OFF" \
        -DCMAKE_C_EXTENSIONS="ON" \
        -DCMAKE_C_FLAGS="$COMMON_FLAGS" \
        # Set C++ standard and flags
        -DCMAKE_CXX_STANDARD="17" \
        -DCMAKE_CXX_STANDARD_REQUIRED="OFF" \
        -DCMAKE_CXX_EXTENSIONS="ON" \
        -DCMAKE_CXX_FLAGS="$COMMON_FLAGS" \
        # Set linker flags
        -DCMAKE_EXE_LINKER_FLAGS="$LINK_FLAGS" \
        -DCMAKE_MODULE_LINKER_FLAGS="$LINK_FLAGS" \
        -DCMAKE_SHARED_LINKER_FLAGS="$LINK_FLAGS" \
        # LTO policies
        -DCMAKE_POLICY_DEFAULT_CMP0069="NEW" \
        -DCMAKE_POLICY_DEFAULT_CMP0105="NEW" \
        -DCMAKE_POLICY_DEFAULT_CMP0138="NEW" \
        # LTO
        -DCMAKE_INTERPROCEDURAL_OPTIMIZATION="ON" \
        # Must use PIC to link against PyTorch (which always uses PIC)
        -DCMAKE_POSITION_INDEPENDENT_CODE="ON" \
        # CUDA LTO
        # NOTE: Make sure RESOLVE_DEVICE_SYMBOLS is OFF!
        # https://gitlab.kitware.com/cmake/cmake/-/issues/22225
        -DCMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS="OFF" \
        -DCMAKE_CUDA_SEPARABLE_COMPILATION="ON" \
        -DCMAKE_CUDA_ARCHITECTURES="89-real" \
        -DCMAKE_CUDA_HOST_COMPILER="$CXX" \
        -DCMAKE_CUDA_STANDARD="17" \
        -DCMAKE_CUDA_FLAGS="$CUDA_FLAGS" \
        # Magma-specific flags
        -DGPU_TARGET="Ampere" \
        -DLAPACK_LIBRARIES="${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a;${MKLROOT}/lib/intel64/libmkl_blacs_intelmpi_ilp64.a;-lm;-ldl;-lpthread" \
        -DMAGMA_ENABLE_CUDA="ON" \
        -DUSE_FORTRAN="OFF" \
    && cmake --build build --target install

WORKDIR /
|
|
|
# Build and install pytorch
# TODO:
# - Can we get a performance improvement by updating the libraries in third_party?
# - CUB and NNPack are both fairly outdated
FROM magma_install AS torch_install
WORKDIR /torch
# The sources are bind-mounted read-write from torch_src for this step only;
# writes to the mount are discarded. A COPY of the tree would stay in its own
# image layer even if the RUN deleted /torch afterwards.
# PyTorch's build is configured through the environment variables exported
# below before setup.py is invoked.
RUN --mount=type=bind,from=torch_src,source=/torch,target=/torch,rw \
    # TODO: Building with -fforce-emit-vtables causes missing symbols when importing torch.
    export OPT_FLAGS="${OPT_FLAGS// -fforce-emit-vtables/}" \
    # TODO: Why do we need -fPIC specified for libnnpack?
    && export COMMON_FLAGS="$OPT_FLAGS $NO_WARN_FLAGS -fPIC" \
    # TODO: Why do we need -fPIC and -shared specified for libnnpack/libpytorch_cpu?
    # NOTE: We can pass LTO_FLAGS to the linker so we can use the LTO objects from Magma, but we
    # can't use them in COMMON_FLAGS because we can't compile torch with LTO.
    && export XLINKER_FLAGS="${LINK_FLAGS//-Wl,/} $LTO_FLAGS -fPIC -shared" \
    && export XLINKER_FLAGS="${XLINKER_FLAGS// /,}" \
    && export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $LTO_FLAGS $COMMON_FLAGS" \
    # NOTE: Redefining the standard causes CMake to pick up these flags, so we must.
    && export CUDA_FLAGS="--std=c++17 --allow-unsupported-compiler -O3 --threads=0 --extra-device-vectorization" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xfatbin=--compress-all" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xcompiler=${COMMON_FLAGS// /,}" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xlinker=$XLINKER_FLAGS" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xnvlink=--use-host-info" \
    # Base CMake settings
    && export BUILD_SHARED_LIBS="ON" \
    && export CMAKE_BUILD_TYPE="Release" \
    && export CMAKE_VERBOSE_MAKEFILE="ON" \
    # Set C standard and flags
    && export CMAKE_C_STANDARD="17" \
    && export CMAKE_C_STANDARD_REQUIRED="ON" \
    && export CMAKE_C_EXTENSIONS="ON" \
    && export CMAKE_C_FLAGS="$COMMON_FLAGS" \
    # Set C++ standard and flags
    && export CMAKE_CXX_STANDARD="17" \
    && export CMAKE_CXX_STANDARD_REQUIRED="ON" \
    && export CMAKE_CXX_EXTENSIONS="ON" \
    && export CMAKE_CXX_FLAGS="$COMMON_FLAGS" \
    # Set linker flags
    && export CMAKE_EXE_LINKER_FLAGS="$LINK_FLAGS" \
    && export CMAKE_MODULE_LINKER_FLAGS="$LINK_FLAGS" \
    && export CMAKE_SHARED_LINKER_FLAGS="$LINK_FLAGS" \
    # LTO policies
    && export CMAKE_POLICY_DEFAULT_CMP0069="NEW" \
    && export CMAKE_POLICY_DEFAULT_CMP0105="NEW" \
    && export CMAKE_POLICY_DEFAULT_CMP0138="NEW" \
    # LTO
    && export CMAKE_INTERPROCEDURAL_OPTIMIZATION="OFF" \
    # PyTorch always uses fPIC, so don't confuse CMake.
    && export CMAKE_POSITION_INDEPENDENT_CODE="ON" \
    # CUDA LTO
    # NOTE: Make sure RESOLVE_DEVICE_SYMBOLS is OFF!
    # https://gitlab.kitware.com/cmake/cmake/-/issues/22225
    && export CMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS="OFF" \
    && export CMAKE_CUDA_SEPARABLE_COMPILATION="OFF" \
    && export CUDA_SEPARABLE_COMPILATION="OFF" \
    && export CMAKE_CUDA_ARCHITECTURES="89-real" \
    && export CMAKE_CUDA_FLAGS="$CUDA_FLAGS" \
    && export CMAKE_CUDA_HOST_COMPILER="$CXX" \
    && export CMAKE_CUDA_STANDARD="17" \
    # Torch-specific flags
    && export ATEN_NO_TEST="ON" \
    && export ATEN_STATIC_CUDA="ON" \
    && export BLAS="MKL" \
    && export BUILD_CUSTOM_PROTOBUF="OFF" \
    && export BUILD_TEST="OFF" \
    && export INSTALL_TEST="OFF" \
    && export INTEL_MKL_DIR="$MKLROOT" \
    && export MAGMA_INCLUDE_DIR="/usr/local/include" \
    && export NCCL_ROOT="/usr" \
    && export Protobuf_USE_STATIC_LIBS="ON" \
    && export TH_BINARY_BUILD="ON" \
    && export TORCH_ALLOW_TF32_CUBLAS_OVERRIDE="1" \
    && export TORCH_CUDA_ARCH_LIST="8.9" \
    && export TORCH_NVCC_FLAGS="$CUDA_FLAGS" \
    && export USE_CUDA_STATIC_LINK="ON" \
    && export USE_CUDNN="ON" \
    && export USE_CUPTI_SO="OFF" \
    && export USE_EXPERIMENTAL_CUDNN_V8_API="ON" \
    && export USE_GLOO="OFF" \
    && export USE_KINETO="ON" \
    && export USE_MKLDNN="OFF" \
    && export USE_NCCL="ON" \
    && export USE_STATIC_CUDNN="OFF" \
    && export USE_STATIC_MKL="ON" \
    && export USE_STATIC_NCCL="ON" \
    && export USE_SYSTEM_NCCL="ON" \
    # Build and install torch: run the CMake configure step on its own first,
    # then the actual build and install.
    && python setup.py build --cmake-only \
    && python setup.py install

WORKDIR /
|
|
|
# NOTE: Torchvision is incredibly sensitive to spaces in NVCC's flags. If there are any leading or
# trailing spaces, it will fail to build. If there are any double spaces, it will fail to build.
# Build and install torchvision
FROM torch_install AS torchvision_install
WORKDIR /torchvision
# The sources are bind-mounted read-write from torchvision_src for this step
# only; writes to the mount are discarded. A COPY of the tree would stay in
# its own image layer even if the RUN deleted /torchvision afterwards.
RUN --mount=type=bind,from=torchvision_src,source=/torchvision,target=/torchvision,rw \
    # TODO: Torchvision doesn't seem to use the CMAKE variables we set in the environment.
    export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    # XLINKER_FLAGS is LINK_FLAGS with the "-Wl," prefixes removed and the
    # remaining options comma-joined, for use behind nvcc's -Xlinker=.
    && export XLINKER_FLAGS="${LINK_FLAGS//-Wl,/}" \
    && export XLINKER_FLAGS="${XLINKER_FLAGS// /,}" \
    && export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $COMMON_FLAGS" \
    # TODO: Can we compile the LTO objects from Torch and then link them here?
    # NOTE: Redefining the standard causes CMake to pick up these flags, so we must.
    && export CUDA_FLAGS="--std=c++17 --allow-unsupported-compiler -gencode=arch=compute_89,code=sm_89 -O3 --threads=0 --extra-device-vectorization" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xfatbin=--compress-all" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xcompiler=${COMMON_FLAGS// /,}" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xlinker=$XLINKER_FLAGS" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xnvlink=--use-host-info" \
    # Set C standard and flags
    && export CFLAGS="-std=c17 $COMMON_FLAGS" \
    # Set C++ standard and flags
    && export CXXFLAGS="-std=c++17 $COMMON_FLAGS" \
    # Set linker flags. LDFLAGS must be exported: a plain assignment is only
    # a shell variable and is never passed on to `python setup.py`.
    && export LDFLAGS="$LINK_FLAGS" \
    # Torchvision settings
    && export DEBUG="0" \
    && export FORCE_CUDA="1" \
    && export NVCC_FLAGS="$CUDA_FLAGS" \
    && export TORCH_CUDA_ARCH_LIST="8.9" \
    && export TORCHVISION_USE_FFMPEG="0" \
    && export TORCHVISION_USE_JPEG="1" \
    && export TORCHVISION_USE_NVJPEG="0" \
    && export TORCHVISION_USE_PNG="1" \
    && export TORCHVISION_USE_VIDEO_CODEC="0" \
    && python setup.py install

WORKDIR /
|
|
|
# Build and install triton
FROM torchvision_install AS triton_install
COPY --from=triton_src /triton /triton
# Bring over the pybind11 and LLVM archives that triton_src unpacked under
# /root/.triton; without this copy those downloads are never used by any stage.
# NOTE(review): presumably triton's setup.py looks for them under
# $HOME/.triton/{pybind11,llvm} — confirm against python/setup.py at the
# pinned commit.
COPY --from=triton_src /root/.triton /root/.triton

# TODO: Unable to link with mold for some reason. Use lld instead. Report to mold maintainer.
WORKDIR /triton/python
RUN export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    && export LINK_FLAGS="-fuse-ld=lld $LINK_FLAGS $COMMON_FLAGS" \
    && export CFLAGS="-std=gnu17 $COMMON_FLAGS" \
    && export CXXFLAGS="-std=gnu++17 $COMMON_FLAGS" \
    && export LDFLAGS="$LINK_FLAGS" \
    # Must be editable install to allow for cyclic imports
    && pip install -e . \
        --verbose \
        --no-build-isolation \
        --no-cache-dir

# NOTE: We cannot delete triton because we did an editable install and we need to do that to allow
# for cyclic imports.

WORKDIR /
CMD ["/bin/bash"]