Dockerfile for building Mimalloc, Mold, CPython, Magma, PyTorch, Torchvision, and Triton from source

README

Building and Using

NOTE: No warranty of any kind is provided; I'm not liable for anything that occurs as a result of you using this, etc., etc.

The build may take a while! On my i9-13900K it takes about 30 minutes.

Make sure Dockerfile and magma_remove_tests.diff are in the same directory. Build with

sudo docker buildx build . --progress=plain --tag=temp:latest

and run with

sudo docker run --gpus=all --ipc=host -it temp:latest

optionally adding Docker's --mount argument as you wish, as shown below.
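For example, to bind-mount the current directory into the container (a hypothetical invocation; adjust the source and target paths to taste):

sudo docker run --gpus=all --ipc=host -it --mount type=bind,source="$PWD",target=/workspace temp:latest

Once inside the container, a quick sanity check that the GPU build works:

python -c "import torch; print(torch.__version__, torch.cuda.is_available())"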

Dockerfile

# Clone repositories
FROM docker.io/bitnami/git:2.39.0@sha256:8802a1053f0a75c948da43c0d04e591b500381447745f0b5f75d3cf85509626c as git_base
# Basic git configuration
RUN git config --global advice.detachedHead false \
&& git config --global init.defaultBranch main
# Install xz-utils for decompressing tarballs
RUN --mount=type=cache,target=/var/cache/apt \
export DEBIAN_FRONTEND=noninteractive \
&& apt update \
&& apt install -y --no-install-recommends xz-utils \
&& rm -rf /var/lib/apt/lists/* \
&& apt clean \
&& apt autoremove
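# NOTE: Each *_src stage below pins an exact commit with the same pattern:
# `git init` + `git fetch origin <sha> --depth=1` + `git checkout FETCH_HEAD`.
# Unlike `git clone --branch`, this can fetch a single arbitrary commit without history.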
# Clone mimalloc
# https://github.com/microsoft/mimalloc/commit/dd7348066fe40e8bf372fa4e9538910a5e24a75f
FROM git_base as mimalloc_src
WORKDIR /mimalloc
RUN git init \
&& git remote add origin "https://github.com/microsoft/mimalloc" \
&& git fetch origin dd7348066fe40e8bf372fa4e9538910a5e24a75f \
--depth=1 \
&& git checkout FETCH_HEAD
# Clone mold
# https://github.com/rui314/mold/commit/ad0b6d0ac6a9b269935c3fbf4dae2815395431a4
FROM git_base as mold_src
WORKDIR /mold
RUN git init \
&& git remote add origin "https://github.com/rui314/mold" \
&& git fetch origin ad0b6d0ac6a9b269935c3fbf4dae2815395431a4 \
--depth=1 \
&& git checkout FETCH_HEAD
# Clone cpython
# https://github.com/python/cpython/commit/5aa8b9e70c44862cf3f600bdc329a20790b67056
FROM git_base as cpython_src
WORKDIR /cpython
# NOTE: Python 3.11 adoption is blocked by PyTorch, which still relies on the pre-3.11 bytecode opcodes.
RUN git init \
&& git remote add origin "https://github.com/python/cpython" \
&& git fetch origin 5aa8b9e70c44862cf3f600bdc329a20790b67056 \
--depth=1 \
&& git checkout FETCH_HEAD
# Clone pillow-simd
# https://github.com/uploadcare/pillow-simd/commit/58acec3312fb8671c9d84829197e1c8150085589
FROM git_base as pillow-simd_src
WORKDIR /pillow-simd
RUN git init \
&& git remote add origin "https://github.com/uploadcare/pillow-simd" \
&& git fetch origin 58acec3312fb8671c9d84829197e1c8150085589 \
--depth=1 \
&& git checkout FETCH_HEAD
# Clone magma
# https://bitbucket.org/icl/magma/commits/0c7321435fe81527f41bad708659f94630a3625f
FROM git_base as magma_src
WORKDIR /magma
RUN git init \
&& git remote add origin "https://bitbucket.org/icl/magma" \
&& git fetch origin 0c7321435fe81527f41bad708659f94630a3625f \
--depth=1 \
&& git checkout FETCH_HEAD
# Patch to remove the test suites; some of them don't play well with LTO.
COPY ./magma_remove_tests.diff .
RUN git apply ./magma_remove_tests.diff
# Clone pytorch
# https://github.com/pytorch/pytorch/commit/ce50a8de7535b5e359f7ed7ead4285414a966d3f
FROM git_base as torch_src
WORKDIR /torch
RUN git init \
&& git remote add origin "https://github.com/pytorch/pytorch" \
&& git fetch origin ce50a8de7535b5e359f7ed7ead4285414a966d3f \
--recurse-submodules=yes \
--jobs=16 \
--depth=1 \
&& git checkout FETCH_HEAD \
# PyTorch has submodules we need to fetch.
&& git submodule update \
--init \
--recursive \
--jobs=16 \
--depth=1
# Patch to downgrade cast-related diagnostics, which Clang is very sensitive to, from errors to warnings.
RUN sed -i -e 's/Werror=cast/Wcast/g' CMakeLists.txt
# Clone torchvision
# https://github.com/pytorch/vision/commit/90cfb10dc49187842247d3bffb25a06af0b1e826
FROM git_base as torchvision_src
WORKDIR /torchvision
RUN git init \
&& git remote add origin "https://github.com/pytorch/vision" \
&& git fetch origin 90cfb10dc49187842247d3bffb25a06af0b1e826 \
--depth=1 \
&& git checkout FETCH_HEAD
# Clone triton
# https://github.com/openai/triton/commit/0f5c6e619c35d22507f6202600a78f5781495496
FROM git_base as triton_src
# Download pybind11
WORKDIR /root/.triton/pybind11
RUN curl -sL "https://github.com/pybind/pybind11/archive/refs/tags/v2.10.0.tar.gz" \
| tar -xz
# Download llvm
WORKDIR /root/.triton/llvm
RUN curl -sL "https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.0/clang+llvm-14.0.0-x86_64-linux-gnu-ubuntu-18.04.tar.xz" \
| tar -xJ
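# NOTE: As of the pinned commit, Triton's setup.py caches its LLVM and pybind11
# dependencies under ~/.triton, so we stage them there ahead of time so the build
# does not re-download them.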
WORKDIR /triton
RUN git init \
&& git remote add origin "https://github.com/openai/triton" \
&& git fetch origin 0f5c6e619c35d22507f6202600a78f5781495496 \
&& git checkout FETCH_HEAD
FROM nvcr.io/nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04@sha256:0e9ecf52bdde401829b39b5fd38093d4254bf1b24aae9bf644d212fba347458b as os_base
SHELL ["/bin/bash", "-c"]
RUN --mount=type=cache,target=/var/cache/apt \
export DEBIAN_FRONTEND=noninteractive \
&& apt-mark unhold libcudnn8 \
&& apt update \
&& apt upgrade -y \
&& apt install -y --no-install-recommends \
build-essential \
cmake \
curl \
git \
lcov \
libbz2-dev \
libffi-dev \
libgdbm-compat-dev \
libgdbm-dev \
libjpeg-turbo8-dev \
liblzma-dev \
libncurses5-dev \
libnuma-dev \
libopenblas-dev \
libpng-dev \
libprotobuf-dev \
libreadline6-dev \
libsqlite3-dev \
libssl-dev \
lzma-dev \
ninja-build \
numactl \
pkg-config \
protobuf-compiler \
tk-dev \
uuid-dev \
wget \
zlib1g-dev \
zstd \
&& rm -rf /var/lib/apt/lists/* \
&& apt clean \
&& apt autoremove
RUN \
# Add Intel OneAPI repository
wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" \
| gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg \
&& echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
| tee /etc/apt/sources.list.d/oneAPI.list > /dev/null \
# Add LLVM repository
&& wget -qO- "https://apt.llvm.org/llvm-snapshot.gpg.key" \
| gpg --dearmor -o /usr/share/keyrings/llvm-snapshot-keyring.gpg \
&& echo "deb [signed-by=/usr/share/keyrings/llvm-snapshot-keyring.gpg] http://apt.llvm.org/jammy/ llvm-toolchain-jammy main" \
| tee /etc/apt/sources.list.d/llvm.list > /dev/null \
# Add Kitware (CMake) repository
&& wget -qO- "https://apt.kitware.com/keys/kitware-archive-latest.asc" \
| gpg --dearmor -o /usr/share/keyrings/kitware-archive-keyring.gpg \
&& echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main" \
| tee /etc/apt/sources.list.d/kitware.list > /dev/null
RUN --mount=type=cache,target=/var/cache/apt \
export DEBIAN_FRONTEND=noninteractive \
&& apt update \
&& apt remove --purge --auto-remove cmake -y \
&& apt install -y --no-install-recommends \
clang-16 \
cmake \
intel-oneapi-mkl-devel-2023.0.0 \
intel-oneapi-openmp-2023.0.0 \
intel-oneapi-runtime-openmp \
libomp-16-dev \
libomp5-16 \
lld-16 \
lldb-16 \
&& rm -rf /var/lib/apt/lists/* \
&& apt clean \
&& apt autoremove
# Create symbolic links so that libiomp5.so, libomp.so, and libomp.so.5 under LLVM's lib directory all resolve to Intel's OpenMP runtime
RUN ln -sf /opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin/libiomp5.so \
/usr/lib/llvm-16/lib/libiomp5.so \
&& ln -sf /opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin/libiomp5.so \
/usr/lib/llvm-16/lib/libomp.so \
&& ln -sf /opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin/libiomp5.so \
/usr/lib/llvm-16/lib/libomp.so.5
# Update environment variables
ENV PATH="/usr/lib/llvm-16/bin:$PATH" \
LANG="C.UTF-8" \
LC_ALL="C.UTF-8" \
CC="clang" \
CXX="clang++" \
LD="ld.lld" \
# CUDA-related environment variables
LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH" \
CUDA_HOME="/usr/local/cuda" \
CUDA_MODULE_LOADING="LAZY" \
CUDA_USE_STATIC_CUDA_RUNTIME="ON" \
CUDAHOSTCXX="clang++" \
AR="llvm-ar" \
AS="llvm-as" \
NM="llvm-nm" \
OBJCOPY="llvm-objcopy" \
OBJDUMP="llvm-objdump" \
RANLIB="llvm-ranlib" \
READELF="llvm-readelf" \
STRIP="llvm-strip" \
# Intel MKL location
MKLROOT="/opt/intel/oneapi/mkl/latest" \
# Useful flags
NO_WARN_FLAGS="-Wno-deprecated -Wno-unused-command-line-argument" \
LTO_FLAGS="-flto=thin -fsplit-lto-unit -fwhole-program-vtables" \
OPT_FLAGS="-O3 -march=native --pipe -falign-functions=32 -fno-semantic-interposition -fsplit-machine-functions -fslp-vectorize -ffunction-sections -fdata-sections -fforce-emit-vtables -fstrict-vtable-pointers -fno-plt -fno-common" \
LINK_FLAGS="-Wl,-O2 -Wl,-znow -Wl,--sort-common -Wl,--gc-sections -Wl,--hash-style=gnu"
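# NOTE: OPT_FLAGS bakes in -march=native, so every artifact built below is tuned to
# the build host's CPU and may not run on older or different microarchitectures.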
# Register clang as the default compiler
RUN update-alternatives --install /usr/bin/cc cc /usr/bin/clang-16 100 \
&& update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang++-16 100 \
# NOTE: We will replace this with mold later, so we don't set the priority to 100
&& update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-16 75 \
&& update-alternatives --install /usr/bin/ar ar /usr/bin/llvm-ar-16 100 \
&& update-alternatives --install /usr/bin/as as /usr/bin/llvm-as-16 100 \
&& update-alternatives --install /usr/bin/nm nm /usr/bin/llvm-nm-16 100 \
&& update-alternatives --install /usr/bin/objcopy objcopy /usr/bin/llvm-objcopy-16 100 \
&& update-alternatives --install /usr/bin/objdump objdump /usr/bin/llvm-objdump-16 100 \
&& update-alternatives --install /usr/bin/ranlib ranlib /usr/bin/llvm-ranlib-16 100 \
&& update-alternatives --install /usr/bin/readelf readelf /usr/bin/llvm-readelf-16 100 \
&& update-alternatives --install /usr/bin/strip strip /usr/bin/llvm-strip-16 100
# TODO: Experimental CUDA setting
# --default-stream=per-thread:
# Use a separate stream for each thread. Default is legacy.
# ENV CUDAFLAGS="$CUDAFLAGS --default-stream=per-thread"
FROM os_base as mimalloc_base
COPY --from=mimalloc_src /mimalloc /mimalloc
WORKDIR /mimalloc
RUN export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
&& export LINK_FLAGS="-fuse-ld=lld $LINK_FLAGS $COMMON_FLAGS" \
&& cmake -S. -Bbuild -GNinja \
# Base CMake settings
-DBUILD_SHARED_LIBS="ON" \
-DCMAKE_BUILD_TYPE="Release" \
-DCMAKE_INSTALL_PREFIX="/usr/local" \
-DCMAKE_VERBOSE_MAKEFILE="OFF" \
# Set C standard and flags
-DCMAKE_C_STANDARD="17" \
-DCMAKE_C_STANDARD_REQUIRED="OFF" \
-DCMAKE_C_EXTENSIONS="OFF" \
-DCMAKE_C_FLAGS="$COMMON_FLAGS" \
# Set C++ standard and flags
-DCMAKE_CXX_STANDARD="20" \
-DCMAKE_CXX_STANDARD_REQUIRED="OFF" \
-DCMAKE_CXX_EXTENSIONS="OFF" \
-DCMAKE_CXX_FLAGS="$COMMON_FLAGS" \
# Set linker flags
-DCMAKE_EXE_LINKER_FLAGS="$LINK_FLAGS" \
-DCMAKE_MODULE_LINKER_FLAGS="$LINK_FLAGS" \
-DCMAKE_SHARED_LINKER_FLAGS="$LINK_FLAGS" \
# LTO policies
-DCMAKE_POLICY_DEFAULT_CMP0069="NEW" \
-DCMAKE_POLICY_DEFAULT_CMP0105="NEW" \
-DCMAKE_POLICY_DEFAULT_CMP0138="NEW" \
# LTO
-DCMAKE_INTERPROCEDURAL_OPTIMIZATION="ON" \
-DCMAKE_POSITION_INDEPENDENT_CODE="OFF" \
# Mimalloc-specific flags
-DMI_BUILD_OBJECT="OFF" \
-DMI_BUILD_SHARED="ON" \
-DMI_BUILD_STATIC="OFF" \
-DMI_BUILD_TESTS="OFF" \
-DMI_OVERRIDE="ON" \
&& cmake --build build --target install
ENV LD_PRELOAD="/usr/local/lib/libmimalloc.so:$LD_PRELOAD"
WORKDIR /
RUN rm -rf /mimalloc
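# Optional sanity check (hypothetical, not part of the original build): with
# MI_OVERRIDE and LD_PRELOAD in place, mimalloc prints a banner in verbose mode.
# RUN MIMALLOC_VERBOSE=1 ls > /dev/null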
FROM mimalloc_base as mold_base
COPY --from=mold_src /mold /mold
WORKDIR /mold
RUN export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
&& export LINK_FLAGS="-fuse-ld=lld $LINK_FLAGS $COMMON_FLAGS" \
&& cmake -S. -Bbuild -GNinja \
# Base CMake settings
-DBUILD_SHARED_LIBS="OFF" \
-DCMAKE_BUILD_TYPE="Release" \
-DCMAKE_INSTALL_PREFIX="/usr/local" \
-DCMAKE_VERBOSE_MAKEFILE="OFF" \
# Set C standard and flags
-DCMAKE_C_STANDARD="17" \
-DCMAKE_C_STANDARD_REQUIRED="ON" \
# Mold uses GNU extensions, so we need to use gnu17 and gnu++20.
-DCMAKE_C_EXTENSIONS="ON" \
-DCMAKE_C_FLAGS="$COMMON_FLAGS" \
# Set C++ standard and flags
-DCMAKE_CXX_STANDARD="20" \
-DCMAKE_CXX_STANDARD_REQUIRED="ON" \
# Mold uses GNU extensions, so we need to use gnu17 and gnu++20.
-DCMAKE_CXX_EXTENSIONS="ON" \
-DCMAKE_CXX_FLAGS="$COMMON_FLAGS" \
# Set linker flags
-DCMAKE_EXE_LINKER_FLAGS="$LINK_FLAGS" \
-DCMAKE_MODULE_LINKER_FLAGS="$LINK_FLAGS" \
-DCMAKE_SHARED_LINKER_FLAGS="$LINK_FLAGS" \
# LTO policies
-DCMAKE_POLICY_DEFAULT_CMP0069="NEW" \
-DCMAKE_POLICY_DEFAULT_CMP0105="NEW" \
-DCMAKE_POLICY_DEFAULT_CMP0138="NEW" \
# LTO
-DCMAKE_INTERPROCEDURAL_OPTIMIZATION="ON" \
-DCMAKE_POSITION_INDEPENDENT_CODE="OFF" \
# Mold-specific flags
-DMOLD_LTO="ON" \
-DMOLD_USE_MIMALLOC="ON" \
-DMOLD_USE_SYSTEM_MIMALLOC="ON" \
-DZSTD_LEGACY_SUPPORT="OFF" \
&& cmake --build build --target install
# Set the linker to mold
WORKDIR /
ENV LD="ld.mold"
RUN update-alternatives --install /usr/bin/ld ld /usr/local/bin/mold 100 \
&& rm -rf /mold
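# Optional sanity check (hypothetical): `ld --version` should now report mold
# rather than lld, since mold's update-alternatives priority is higher.
# RUN ld --version | head -n1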
FROM mold_base as cpython_base
COPY --from=cpython_src /cpython /cpython
WORKDIR /cpython
RUN export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
&& export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $COMMON_FLAGS" \
&& export CFLAGS="-std=c17 $COMMON_FLAGS" \
&& export CXXFLAGS="-std=c++20 $COMMON_FLAGS" \
&& export LDFLAGS="$LINK_FLAGS" \
&& ./configure \
--enable-ipv6=yes \
--enable-optimizations \
--with-computed-gotos \
--with-lto=thin \
--with-pymalloc \
--with-system-expat \
--with-ensurepip=upgrade \
ax_cv_c_float_words_bigendian=no \
&& make -j \
&& make install
# Set the python3 and pip3 symlinks
WORKDIR /
RUN ln -sf /usr/local/bin/python3 /usr/local/bin/python \
&& ln -sf /usr/local/bin/pip3 /usr/local/bin/pip \
&& rm -rf /cpython
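# Optional sanity check (hypothetical): confirm the freshly built interpreter and
# pip resolve through the new symlinks.
# RUN python --version && pip --version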
FROM cpython_base as pip_requirements_install
RUN pip install \
# TODO: Since we're not compiling numpy from source, maybe we don't need Cython?
Cython==0.29.33 \
# NOTE: Need Jinja2 for some templating done by torch.compile
Jinja2==3.1.2 \
networkx==2.8.8 \
# NOTE: numpy is a hassle to compile with LTO, so we use the precompiled version
numpy==1.21.4 \
packaging==22.0 \
pyyaml==6.0 \
setuptools==65.6.3 \
sympy==1.11.1 \
typing_extensions==4.4.0 \
wheel==0.38.4 \
--verbose \
--no-build-isolation \
--no-cache-dir
# Build and install pillow-simd
FROM pip_requirements_install as pillow_install
COPY --from=pillow-simd_src /pillow-simd /pillow-simd
WORKDIR /pillow-simd
RUN export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
&& export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $COMMON_FLAGS" \
&& export CFLAGS="-std=c17 $COMMON_FLAGS" \
&& export CXXFLAGS="-std=c++20 $COMMON_FLAGS" \
&& export LDFLAGS="$LINK_FLAGS" \
&& pip install . \
--verbose \
--no-build-isolation \
--no-cache-dir
WORKDIR /
RUN rm -rf /pillow-simd
# Build and install magma
# TODO: Try with CUDA 12 because our version of Magma includes this PR:
# https://bitbucket.org/icl/magma/pull-requests/30/new-gemm-kernel-without-texture-memory
FROM pillow_install as magma_install
COPY --from=magma_src /magma /magma
WORKDIR /magma
RUN export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
&& export XLINKER_FLAGS="${LINK_FLAGS//-Wl,/}" \
&& export XLINKER_FLAGS="${XLINKER_FLAGS// /,}" \
&& export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $COMMON_FLAGS" \
# Must set --allow-unsupported-compiler to use newer versions of clang.
# The CMAKE_CUDA_FLAGS variable is used by CMake to check the compiler version.
# It's okay that we don't specify all of $CUDA_FLAGS here, because these flags
# are only used for the detection of the compiler version.
# NOTE: We must also specify --std, otherwise CMake seems to ignore the
# --allow-unsupported-compiler flag!
&& export CUDA_FLAGS="--std=c++17 --allow-unsupported-compiler -O3 --threads=0 --extra-device-vectorization" \
&& export CUDA_FLAGS="$CUDA_FLAGS -Xfatbin=--compress-all" \
&& export CUDA_FLAGS="$CUDA_FLAGS -Xcompiler=${COMMON_FLAGS// /,}" \
&& export CUDA_FLAGS="$CUDA_FLAGS -Xlinker=$XLINKER_FLAGS" \
&& export CUDA_FLAGS="$CUDA_FLAGS -Xnvlink=--use-host-info" \
&& echo -e "BACKEND=cuda\nFORT=false\nGPU_TARGET=Ampere" > make.inc \
&& make generate -j &> /dev/null \
&& cmake -S. -Bbuild -GNinja \
# Base CMake settings
-DBUILD_SHARED_LIBS="OFF" \
-DCMAKE_BUILD_TYPE="Release" \
-DCMAKE_INSTALL_PREFIX="/usr/local" \
-DCMAKE_VERBOSE_MAKEFILE="OFF" \
# Set C standard and flags
-DCMAKE_C_STANDARD="17" \
-DCMAKE_C_STANDARD_REQUIRED="OFF" \
-DCMAKE_C_EXTENSIONS="ON" \
-DCMAKE_C_FLAGS="$COMMON_FLAGS" \
# Set C++ standard and flags
-DCMAKE_CXX_STANDARD="17" \
-DCMAKE_CXX_STANDARD_REQUIRED="OFF" \
-DCMAKE_CXX_EXTENSIONS="ON" \
-DCMAKE_CXX_FLAGS="$COMMON_FLAGS" \
# Set linker flags
-DCMAKE_EXE_LINKER_FLAGS="$LINK_FLAGS" \
-DCMAKE_MODULE_LINKER_FLAGS="$LINK_FLAGS" \
-DCMAKE_SHARED_LINKER_FLAGS="$LINK_FLAGS" \
# LTO policies
-DCMAKE_POLICY_DEFAULT_CMP0069="NEW" \
-DCMAKE_POLICY_DEFAULT_CMP0105="NEW" \
-DCMAKE_POLICY_DEFAULT_CMP0138="NEW" \
# LTO
-DCMAKE_INTERPROCEDURAL_OPTIMIZATION="ON" \
# Must use PIC to link against PyTorch (which always uses PIC)
-DCMAKE_POSITION_INDEPENDENT_CODE="ON" \
# CUDA LTO
# NOTE: Make sure RESOLVE_DEVICE_SYMBOLS is OFF!
# https://gitlab.kitware.com/cmake/cmake/-/issues/22225
-DCMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS="OFF" \
-DCMAKE_CUDA_SEPARABLE_COMPILATION="ON" \
-DCMAKE_CUDA_ARCHITECTURES="89-real" \
-DCMAKE_CUDA_HOST_COMPILER="$CXX" \
-DCMAKE_CUDA_STANDARD="17" \
-DCMAKE_CUDA_FLAGS="$CUDA_FLAGS" \
# Magma-specific flags
-DGPU_TARGET="Ampere" \
-DLAPACK_LIBRARIES="${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a;${MKLROOT}/lib/intel64/libmkl_blacs_intelmpi_ilp64.a;-lm;-ldl;-lpthread" \
-DMAGMA_ENABLE_CUDA="ON" \
-DUSE_FORTRAN="OFF" \
&& cmake --build build --target install
WORKDIR /
RUN rm -rf /magma
# Build and install pytorch
# TODO:
# - Can we get a performance improvement by updating the libraries in third_party?
# - CUB and NNPack are both fairly outdated
FROM magma_install as torch_install
COPY --from=torch_src /torch /torch
WORKDIR /torch
RUN \
# TODO: Building with -fforce-emit-vtables causes missing symbols when importing torch.
export OPT_FLAGS="${OPT_FLAGS// -fforce-emit-vtables/}" \
# TODO: Why do we need -fPIC specified for libnnpack?
&& export COMMON_FLAGS="$OPT_FLAGS $NO_WARN_FLAGS -fPIC" \
# TODO: Why do we need -fPIC and -shared specified for libnnpack/libpytorch_cpu?
# NOTE: We can pass LTO_FLAGS to the linker so we can use the LTO objects from Magma, but we
# can't use them in COMMON_FLAGS because we can't compile torch with LTO.
&& export XLINKER_FLAGS="${LINK_FLAGS//-Wl,/} $LTO_FLAGS -fPIC -shared" \
&& export XLINKER_FLAGS="${XLINKER_FLAGS// /,}" \
&& export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $LTO_FLAGS $COMMON_FLAGS" \
# NOTE: Redefining the standard causes CMake to pick up these flags, so we must redefine it.
&& export CUDA_FLAGS="--std=c++17 --allow-unsupported-compiler -O3 --threads=0 --extra-device-vectorization" \
&& export CUDA_FLAGS="$CUDA_FLAGS -Xfatbin=--compress-all" \
&& export CUDA_FLAGS="$CUDA_FLAGS -Xcompiler=${COMMON_FLAGS// /,}" \
&& export CUDA_FLAGS="$CUDA_FLAGS -Xlinker=$XLINKER_FLAGS" \
&& export CUDA_FLAGS="$CUDA_FLAGS -Xnvlink=--use-host-info" \
# Base CMake settings
&& export BUILD_SHARED_LIBS="ON" \
&& export CMAKE_BUILD_TYPE="Release" \
&& export CMAKE_VERBOSE_MAKEFILE="ON" \
# Set C standard and flags
&& export CMAKE_C_STANDARD="17" \
&& export CMAKE_C_STANDARD_REQUIRED="ON" \
&& export CMAKE_C_EXTENSIONS="ON" \
&& export CMAKE_C_FLAGS="$COMMON_FLAGS" \
# Set C++ standard and flags
&& export CMAKE_CXX_STANDARD="17" \
&& export CMAKE_CXX_STANDARD_REQUIRED="ON" \
&& export CMAKE_CXX_EXTENSIONS="ON" \
&& export CMAKE_CXX_FLAGS="$COMMON_FLAGS" \
# Set linker flags
&& export CMAKE_EXE_LINKER_FLAGS="$LINK_FLAGS" \
&& export CMAKE_MODULE_LINKER_FLAGS="$LINK_FLAGS" \
&& export CMAKE_SHARED_LINKER_FLAGS="$LINK_FLAGS" \
# LTO policies
&& export CMAKE_POLICY_DEFAULT_CMP0069="NEW" \
&& export CMAKE_POLICY_DEFAULT_CMP0105="NEW" \
&& export CMAKE_POLICY_DEFAULT_CMP0138="NEW" \
# LTO
&& export CMAKE_INTERPROCEDURAL_OPTIMIZATION="OFF" \
# PyTorch always uses -fPIC, so don't confuse CMake.
&& export CMAKE_POSITION_INDEPENDENT_CODE="ON" \
# CUDA LTO
# NOTE: Make sure RESOLVE_DEVICE_SYMBOLS is OFF!
# https://gitlab.kitware.com/cmake/cmake/-/issues/22225
&& export CMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS="OFF" \
&& export CMAKE_CUDA_SEPARABLE_COMPILATION="OFF" \
&& export CUDA_SEPARABLE_COMPILATION="OFF" \
&& export CMAKE_CUDA_ARCHITECTURES="89-real" \
&& export CMAKE_CUDA_FLAGS="$CUDA_FLAGS" \
&& export CMAKE_CUDA_HOST_COMPILER="$CXX" \
&& export CMAKE_CUDA_STANDARD="17" \
# Torch-specific flags
&& export ATEN_NO_TEST="ON" \
&& export ATEN_STATIC_CUDA="ON" \
&& export BLAS="MKL" \
&& export BUILD_CUSTOM_PROTOBUF="OFF" \
&& export BUILD_TEST="OFF" \
&& export INSTALL_TEST="OFF" \
&& export INTEL_MKL_DIR="$MKLROOT" \
&& export MAGMA_INCLUDE_DIR="/usr/local/include" \
&& export NCCL_ROOT="/usr" \
&& export Protobuf_USE_STATIC_LIBS="ON" \
&& export TH_BINARY_BUILD="ON" \
&& export TORCH_ALLOW_TF32_CUBLAS_OVERRIDE="1" \
&& export TORCH_CUDA_ARCH_LIST="8.9" \
&& export TORCH_NVCC_FLAGS="$CUDA_FLAGS" \
&& export USE_CUDA_STATIC_LINK="ON" \
&& export USE_CUDNN="ON" \
&& export USE_CUPTI_SO="OFF" \
&& export USE_EXPERIMENTAL_CUDNN_V8_API="ON" \
&& export USE_GLOO="OFF" \
&& export USE_KINETO="ON" \
&& export USE_MKLDNN="OFF" \
&& export USE_NCCL="ON" \
&& export USE_STATIC_CUDNN="OFF" \
&& export USE_STATIC_MKL="ON" \
&& export USE_STATIC_NCCL="ON" \
&& export USE_SYSTEM_NCCL="ON" \
# Build and install torch
&& python setup.py build --cmake-only \
&& python setup.py install \
&& rm -rf /torch
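# Optional sanity check (hypothetical): the import should succeed at build time,
# though torch.cuda.is_available() reports False until a GPU is attached at runtime.
# RUN cd / && python -c "import torch; print(torch.__version__)"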
# NOTE: Torchvision is incredibly sensitive to spaces in NVCC's flags: any leading,
# trailing, or doubled spaces will cause the build to fail.
# Build and install torchvision
FROM torch_install as torchvision_install
COPY --from=torchvision_src /torchvision /torchvision
WORKDIR /torchvision
RUN \
# TODO: Torchvision doesn't seem to use the CMAKE variables we set in the environment.
export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
&& export XLINKER_FLAGS="${LINK_FLAGS//-Wl,/}" \
&& export XLINKER_FLAGS="${XLINKER_FLAGS// /,}" \
&& export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $COMMON_FLAGS" \
# TODO: Can we compile the LTO objects from Torch and then link them here?
# NOTE: Redefining the standard causes CMake to pick up these flags, so we must redefine it.
&& export CUDA_FLAGS="--std=c++17 --allow-unsupported-compiler -gencode=arch=compute_89,code=sm_89 -O3 --threads=0 --extra-device-vectorization" \
&& export CUDA_FLAGS="$CUDA_FLAGS -Xfatbin=--compress-all" \
&& export CUDA_FLAGS="$CUDA_FLAGS -Xcompiler=${COMMON_FLAGS// /,}" \
&& export CUDA_FLAGS="$CUDA_FLAGS -Xlinker=$XLINKER_FLAGS" \
&& export CUDA_FLAGS="$CUDA_FLAGS -Xnvlink=--use-host-info" \
# Set C standard and flags
&& export CFLAGS="-std=c17 $COMMON_FLAGS" \
# Set C++ standard and flags
&& export CXXFLAGS="-std=c++17 $COMMON_FLAGS" \
# Set linker flags
&& export LDFLAGS="$LINK_FLAGS" \
# Torchvision settings
&& export DEBUG="0" \
&& export FORCE_CUDA="1" \
&& export NVCC_FLAGS="$CUDA_FLAGS" \
&& export TORCH_CUDA_ARCH_LIST="8.9" \
&& export TORCHVISION_USE_FFMPEG="0" \
&& export TORCHVISION_USE_JPEG="1" \
&& export TORCHVISION_USE_NVJPEG="0" \
&& export TORCHVISION_USE_PNG="1" \
&& export TORCHVISION_USE_VIDEO_CODEC="0" \
&& python setup.py install \
&& rm -rf /torchvision
# Build and install triton
FROM torchvision_install as triton_install
COPY --from=triton_src /triton /triton
# TODO: Unable to link with mold for some reason. Use lld instead. Report to mold maintainer.
WORKDIR /triton
RUN export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
&& export LINK_FLAGS="-fuse-ld=lld $LINK_FLAGS $COMMON_FLAGS" \
&& export CFLAGS="-std=gnu17 $COMMON_FLAGS" \
&& export CXXFLAGS="-std=gnu++17 $COMMON_FLAGS" \
&& export LDFLAGS="$LINK_FLAGS" \
&& cd python \
# Must be an editable install to allow for cyclic imports
&& pip install -e . \
--verbose \
--no-build-isolation \
--no-cache-dir
# NOTE: We cannot delete /triton because the editable install references the source
# tree in place, which is what allows the cyclic imports to work.
WORKDIR /
CMD ["/bin/bash"]
magma_remove_tests.diff

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b6370da..56392d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -616,43 +616,6 @@ endif()
add_custom_target( lib DEPENDS magma )
-# ----------------------------------------
-# compile lapacktest library
-# If use fortran, compile only Fortran files, not magma_[sdcz]_no_fortran.cpp
-# else, compile only C++ files, not Fortran files
-if (USE_FORTRAN)
- foreach( filename ${liblapacktest_all} )
- if (filename MATCHES "\\.(f|f90|F90)$")
- list( APPEND liblapacktest_all_f ${filename} )
- endif()
- endforeach()
- add_library( lapacktest ${liblapacktest_all_f} )
-else()
- # alternatively, use only C/C++/CUDA files, including magma_[sdcz]_no_fortran.cpp
- foreach( filename ${liblapacktest_all} )
- if (filename MATCHES "\\.(c|cu|cpp)$")
- list( APPEND liblapacktest_all_cpp ${filename} )
- endif()
- endforeach()
- add_library( lapacktest ${liblapacktest_all_cpp} )
-endif()
-target_link_libraries( lapacktest
- ${blas_fix}
- ${LAPACK_LIBRARIES}
-)
-
-
-# ----------------------------------------
-# compile tester library
-add_library( tester ${libtest_all} )
-target_link_libraries( tester
- magma
- lapacktest
- ${blas_fix}
- ${LAPACK_LIBRARIES}
-)
-
-
# ----------------------------------------
# compile MAGMA sparse library
@@ -664,7 +627,6 @@ else()
include_directories( sparse_hip/include )
include_directories( sparse_hip/control )
endif()
-include_directories( testing )
if (MAGMA_ENABLE_CUDA)
add_library( magma_sparse ${libsparse_all} )
@@ -692,54 +654,7 @@ endif()
add_custom_target( sparse-lib DEPENDS magma_sparse )
-# ----------------------------------------
-# compile each tester
-
-# save testers to testing/
-# save tester lib files to testing_lib/ to avoid cluttering lib/
-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY testing )
-set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY testing_lib )
-set( CMAKE_LIBRARY_OUTPUT_DIRECTORY testing_lib )
-
-# skip Fortran testers, which require an extra file from CUDA
-foreach( filename ${testing_all} )
- if (filename MATCHES "\\.(c|cu|cpp)$")
- list( APPEND testing_all_cpp ${filename} )
- endif()
-endforeach()
-foreach( TEST ${testing_all_cpp} )
- string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
- string( REGEX REPLACE "testing/" "" EXE ${EXE} )
- #message( "${TEST} --> ${EXE}" )
- add_executable( ${EXE} ${TEST} )
- target_link_libraries( ${EXE} tester lapacktest magma )
- list( APPEND testing ${EXE} )
-endforeach()
-add_custom_target( testing DEPENDS ${testing} )
-
-
-# ----------------------------------------
-# compile each sparse tester
-
-if (MAGMA_ENABLE_CUDA)
- set(SPARSE_TEST_DIR "sparse/testing")
-else()
- set(SPARSE_TEST_DIR "sparse_hip/testing")
-endif()
-
-
-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${SPARSE_TEST_DIR}" )
cmake_policy( SET CMP0037 OLD)
-foreach( TEST ${sparse_testing_all} )
- string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
- string( REGEX REPLACE "${SPARSE_TEST_DIR}/" "" EXE ${EXE} )
- #message( "${TEST} --> ${EXE}" )
- add_executable( ${EXE} ${TEST} )
- target_link_libraries( ${EXE} magma_sparse magma )
- list( APPEND sparse-testing ${EXE} )
-endforeach()
-add_custom_target( sparse-testing DEPENDS ${sparse-testing} )
-
# ----------------------------------------
# what to install