ConnorBaker/Dockerfile

## README.md

      
    Raw
  

              README.md
            
          
    README

Building and Using

NOTE: No warranty of any kind provided, I'm not liable for anything that occurs as a result of you using this, etc., etc.
The build may take a while! On my i9 13900k I'm seeing it take about 30m or so.
Make sure Dockerfile and magma_remove_tests.diff are in the same directory. Build with
sudo docker buildx build . --progress=plain --tag=temp:latest
and run with
sudo docker run --gpus=all --ipc=host -it temp:latest
optionally using Docker's mount argument as you wish.

  
## Dockerfile
# Clone repositories
# Base stage for every *_src stage: a digest-pinned git image with a small
# amount of git configuration and xz-utils added.
FROM docker.io/bitnami/git:2.39.0@sha256:8802a1053f0a75c948da43c0d04e591b500381447745f0b5f75d3cf85509626c AS git_base

# Basic git configuration
RUN git config --global advice.detachedHead false \
    && git config --global init.defaultBranch main

# Install xz-utils for decompressing tarballs.
# apt-get is used rather than apt because apt's command-line interface is not
# intended for scripts, and autoremove gets -y so it can never stop the build
# at a confirmation prompt.
RUN --mount=type=cache,target=/var/cache/apt \
    export DEBIAN_FRONTEND=noninteractive \
    && apt-get update \
    && apt-get install -y --no-install-recommends xz-utils \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# mimalloc sources, pinned to a single commit:
# https://github.com/microsoft/mimalloc/commit/dd7348066fe40e8bf372fa4e9538910a5e24a75f
FROM git_base AS mimalloc_src
WORKDIR /mimalloc
# Fetch only the pinned commit (depth 1) into a fresh repository and check it out.
RUN git init \
    && git remote add origin "https://github.com/microsoft/mimalloc" \
    && git fetch --depth=1 origin dd7348066fe40e8bf372fa4e9538910a5e24a75f \
    && git checkout FETCH_HEAD

# mold sources, pinned to a single commit:
# https://github.com/rui314/mold/commit/ad0b6d0ac6a9b269935c3fbf4dae2815395431a4
FROM git_base AS mold_src
WORKDIR /mold
# Fetch only the pinned commit (depth 1) into a fresh repository and check it out.
RUN git init \
    && git remote add origin "https://github.com/rui314/mold" \
    && git fetch --depth=1 origin ad0b6d0ac6a9b269935c3fbf4dae2815395431a4 \
    && git checkout FETCH_HEAD

# CPython sources, pinned to a single commit:
# https://github.com/python/cpython/commit/5aa8b9e70c44862cf3f600bdc329a20790b67056
# NOTE: 3.11 adoption is blocked by PyTorch because it uses the old opcodes.
FROM git_base AS cpython_src
WORKDIR /cpython
# Fetch only the pinned commit (depth 1) into a fresh repository and check it out.
RUN git init \
    && git remote add origin "https://github.com/python/cpython" \
    && git fetch --depth=1 origin 5aa8b9e70c44862cf3f600bdc329a20790b67056 \
    && git checkout FETCH_HEAD

# pillow-simd sources, pinned to a single commit:
# https://github.com/uploadcare/pillow-simd/commit/58acec3312fb8671c9d84829197e1c8150085589
FROM git_base AS pillow-simd_src
WORKDIR /pillow-simd
# Fetch only the pinned commit (depth 1) into a fresh repository and check it out.
RUN git init \
    && git remote add origin "https://github.com/uploadcare/pillow-simd" \
    && git fetch --depth=1 origin 58acec3312fb8671c9d84829197e1c8150085589 \
    && git checkout FETCH_HEAD

# MAGMA sources, pinned to a single commit and patched:
# https://bitbucket.org/icl/magma/commits/0c7321435fe81527f41bad708659f94630a3625f
FROM git_base AS magma_src
WORKDIR /magma
# Fetch only the pinned commit (depth 1) into a fresh repository and check it out.
RUN git init \
    && git remote add origin "https://bitbucket.org/icl/magma" \
    && git fetch --depth=1 origin 0c7321435fe81527f41bad708659f94630a3625f \
    && git checkout FETCH_HEAD

# Apply the patch that strips the test targets out of the build; some of the
# tests don't play well with LTO. The diff must sit next to the Dockerfile in
# the build context.
COPY ./magma_remove_tests.diff .
RUN git apply ./magma_remove_tests.diff

# PyTorch sources (including submodules), pinned to a single commit:
# https://github.com/pytorch/pytorch/commit/ce50a8de7535b5e359f7ed7ead4285414a966d3f
FROM git_base AS torch_src
WORKDIR /torch
# Shallow-fetch the pinned commit and check it out, then pull in the
# submodules PyTorch needs, also shallow and with 16 parallel jobs.
RUN git init \
    && git remote add origin "https://github.com/pytorch/pytorch" \
    && git fetch origin ce50a8de7535b5e359f7ed7ead4285414a966d3f \
        --recurse-submodules=yes --jobs=16 --depth=1 \
    && git checkout FETCH_HEAD \
    && git submodule update --init --recursive --jobs=16 --depth=1

# Clang is very sensitive about the cast diagnostic, so demote it from an
# error to a warning in the top-level CMakeLists.txt.
RUN sed -i -e 's/Werror=cast/Wcast/g' CMakeLists.txt

# torchvision sources, pinned to a single commit:
# https://github.com/pytorch/vision/commit/90cfb10dc49187842247d3bffb25a06af0b1e826
FROM git_base AS torchvision_src
WORKDIR /torchvision
# Fetch only the pinned commit (depth 1) into a fresh repository and check it out.
RUN git init \
    && git remote add origin "https://github.com/pytorch/vision" \
    && git fetch --depth=1 origin 90cfb10dc49187842247d3bffb25a06af0b1e826 \
    && git checkout FETCH_HEAD

# Clone triton
# https://github.com/openai/triton/commit/0f5c6e619c35d22507f6202600a78f5781495496
FROM git_base AS triton_src
# The downloads below pipe curl into tar. With pipefail the step fails when
# curl fails, rather than relying on tar to notice the truncated input.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Download pybind11
# -f makes curl exit non-zero on an HTTP error instead of handing an error
# page to tar; -S keeps the error message visible even though -s is set.
WORKDIR /root/.triton/pybind11
RUN curl -fsSL "https://github.com/pybind/pybind11/archive/refs/tags/v2.10.0.tar.gz" \
        | tar -xz

# Download llvm (a .tar.xz archive, unpacked with the xz-utils from git_base)
WORKDIR /root/.triton/llvm
RUN curl -fsSL "https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.0/clang+llvm-14.0.0-x86_64-linux-gnu-ubuntu-18.04.tar.xz" \
        | tar -xJ

# Fetch only the pinned commit, shallow like every other pinned fetch in this
# file, and check it out.
WORKDIR /triton
RUN git init \
    && git remote add origin "https://github.com/openai/triton" \
    && git fetch origin 0f5c6e619c35d22507f6202600a78f5781495496 \
        --depth=1 \
    && git checkout FETCH_HEAD


# Build/runtime base: digest-pinned CUDA 11.8 + cuDNN 8 devel image on
# Ubuntu 22.04. All RUN steps from here on use bash, which later stages rely
# on for ${var//pattern/replacement} expansions.
FROM nvcr.io/nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04@sha256:0e9ecf52bdde401829b39b5fd38093d4254bf1b24aae9bf644d212fba347458b AS os_base
SHELL ["/bin/bash", "-c"]

# Upgrade the OS packages (libcudnn8 is un-held first so it is upgraded too)
# and install the build dependencies.
# apt-get is used rather than apt because apt's command-line interface is not
# intended for scripts; --with-new-pkgs keeps the semantics of `apt upgrade`.
# autoremove gets -y so it can never stop the build at a confirmation prompt.
RUN --mount=type=cache,target=/var/cache/apt \
    export DEBIAN_FRONTEND=noninteractive \
    && apt-mark unhold libcudnn8 \
    && apt-get update \
    && apt-get upgrade -y --with-new-pkgs \
    && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        curl \
        git \
        lcov \
        libbz2-dev \
        libffi-dev \
        libgdbm-compat-dev \
        libgdbm-dev \
        libjpeg-turbo8-dev \
        liblzma-dev \
        libncurses5-dev \
        libnuma-dev \
        libopenblas-dev \
        libpng-dev \
        libprotobuf-dev \
        libreadline6-dev \
        libsqlite3-dev \
        libssl-dev \
        lzma-dev \
        ninja-build \
        numactl \
        pkg-config \
        protobuf-compiler \
        tk-dev \
        uuid-dev \
        wget \
        zlib1g-dev \
        zstd \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Register the third-party apt repositories (Intel oneAPI, LLVM, Kitware).
# Each signing key is downloaded, de-armored into /usr/share/keyrings and
# referenced from its sources.list.d entry via signed-by.
RUN \
    # Every step below is a pipeline; without pipefail a failed key download
    # would be hidden behind the exit status of the last command in the pipe.
    set -o pipefail \
    # Add Intel OneAPI repository
    && wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" \
        | gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg \
    && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
        | tee /etc/apt/sources.list.d/oneAPI.list > /dev/null \
    # Add LLVM repository
    # The versioned llvm-toolchain-jammy-16 suite is used so the *-16 packages
    # installed afterwards keep resolving; the unversioned suite follows the
    # current development snapshot. HTTPS matches the other two repositories.
    && wget -qO- "https://apt.llvm.org/llvm-snapshot.gpg.key" \
        | gpg --dearmor -o /usr/share/keyrings/llvm-snapshot-keyring.gpg \
    && echo "deb [signed-by=/usr/share/keyrings/llvm-snapshot-keyring.gpg] https://apt.llvm.org/jammy/ llvm-toolchain-jammy-16 main" \
        | tee /etc/apt/sources.list.d/llvm.list > /dev/null \
    # Add Kitware (CMake) repository
    && wget -qO- "https://apt.kitware.com/keys/kitware-archive-latest.asc" \
        | gpg --dearmor -o /usr/share/keyrings/kitware-archive-keyring.gpg \
    && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main" \
        | tee /etc/apt/sources.list.d/kitware.list > /dev/null

# Install the toolchain from the repositories registered above: the
# distribution cmake is purged and cmake is installed again now that the
# Kitware repository is available, together with clang/lld/lldb 16, the LLVM
# OpenMP packages and Intel MKL / OpenMP 2023.0.0.
# apt-get is used rather than apt because apt's command-line interface is not
# intended for scripts, and autoremove gets -y so it can never stop the build
# at a confirmation prompt.
RUN --mount=type=cache,target=/var/cache/apt \
    export DEBIAN_FRONTEND=noninteractive \
    && apt-get update \
    && apt-get remove --purge --auto-remove -y cmake \
    && apt-get install -y --no-install-recommends \
        clang-16 \
        cmake \
        intel-oneapi-mkl-devel-2023.0.0 \
        intel-oneapi-openmp-2023.0.0 \
        intel-oneapi-runtime-openmp \
        libomp-16-dev \
        libomp5-16 \
        lld-16 \
        lldb-16 \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Point libiomp5.so, libomp.so and libomp.so.5 in the LLVM 16 library
# directory at Intel's libiomp5.so, so all three names resolve to the same
# backing shared library.
RUN intel_omp="/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin/libiomp5.so" \
    && for alias in libiomp5.so libomp.so libomp.so.5; do \
        ln -sf "$intel_omp" "/usr/lib/llvm-16/lib/$alias" || exit 1; \
    done

# Update environment variables
# These persist into every later stage and into the final image. The *_FLAGS
# variables at the end are not read by any tool directly; the build steps
# below combine them into CFLAGS/CXXFLAGS/LDFLAGS and CMake flag settings.
# NOTE: Groups are separated by comment lines rather than blank lines because
#       empty continuation lines inside an instruction are deprecated.
ENV PATH="/usr/lib/llvm-16/bin:$PATH" \
    # Locale
    LANG="C.UTF-8" \
    LC_ALL="C.UTF-8" \
    # Default compiler and linker
    CC="clang" \
    CXX="clang++" \
    LD="ld.lld" \
    # CUDA-related environment variables
    LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH" \
    CUDA_HOME="/usr/local/cuda" \
    CUDA_MODULE_LOADING="LAZY" \
    CUDA_USE_STATIC_CUDA_RUNTIME="ON" \
    CUDAHOSTCXX="clang++" \
    # LLVM binutils replacements
    AR="llvm-ar" \
    AS="llvm-as" \
    NM="llvm-nm" \
    OBJCOPY="llvm-objcopy" \
    OBJDUMP="llvm-objdump" \
    RANLIB="llvm-ranlib" \
    READELF="llvm-readelf" \
    STRIP="llvm-strip" \
    # Intel MKL location
    MKLROOT="/opt/intel/oneapi/mkl/latest" \
    # Useful flags
    NO_WARN_FLAGS="-Wno-deprecated -Wno-unused-command-line-argument" \
    LTO_FLAGS="-flto=thin -fsplit-lto-unit -fwhole-program-vtables" \
    OPT_FLAGS="-O3 -march=native --pipe -falign-functions=32 -fno-semantic-interposition -fsplit-machine-functions -fslp-vectorize -ffunction-sections -fdata-sections -fforce-emit-vtables -fstrict-vtable-pointers -fno-plt -fno-common" \
    LINK_FLAGS="-Wl,-O2 -Wl,-znow -Wl,--sort-common -Wl,--gc-sections -Wl,--hash-style=gnu"

# Register clang as the default compiler and the LLVM tools as the default
# binutils through update-alternatives.
# NOTE: No blank lines inside the command; empty continuation lines inside an
#       instruction are deprecated.
RUN update-alternatives --install /usr/bin/cc cc /usr/bin/clang-16 100 \
    && update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang++-16 100 \
    # NOTE: We will replace this with mold so we don't set priority to 100
    && update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-16 75 \
    && update-alternatives --install /usr/bin/ar ar /usr/bin/llvm-ar-16 100 \
    && update-alternatives --install /usr/bin/as as /usr/bin/llvm-as-16 100 \
    && update-alternatives --install /usr/bin/nm nm /usr/bin/llvm-nm-16 100 \
    && update-alternatives --install /usr/bin/objcopy objcopy /usr/bin/llvm-objcopy-16 100 \
    && update-alternatives --install /usr/bin/objdump objdump /usr/bin/llvm-objdump-16 100 \
    && update-alternatives --install /usr/bin/ranlib ranlib /usr/bin/llvm-ranlib-16 100 \
    && update-alternatives --install /usr/bin/readelf readelf /usr/bin/llvm-readelf-16 100 \
    && update-alternatives --install /usr/bin/strip strip /usr/bin/llvm-strip-16 100

# TODO: Experimental CUDA setting
# --default-stream=per-thread:
#   Use a separate stream for each thread. Default is legacy.
# ENV CUDAFLAGS="$CUDAFLAGS --default-stream=per-thread"

# Build and install mimalloc as a shared library into /usr/local, then
# preload it for everything that runs afterwards.
FROM os_base AS mimalloc_base
WORKDIR /mimalloc
# The sources are bind-mounted (writable) from mimalloc_src instead of being
# copied: files added by COPY stay in the image even when a later RUN deletes
# them, whereas writes to a bind mount are discarded when the step ends. Only
# what the install target places under /usr/local is kept.
RUN --mount=type=bind,from=mimalloc_src,source=/mimalloc,target=/mimalloc,rw \
    export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    && export LINK_FLAGS="-fuse-ld=lld $LINK_FLAGS $COMMON_FLAGS" \
    && cmake -S. -Bbuild -GNinja \
        # Base CMake settings
        -DBUILD_SHARED_LIBS="ON" \
        -DCMAKE_BUILD_TYPE="Release" \
        -DCMAKE_INSTALL_PREFIX="/usr/local" \
        -DCMAKE_VERBOSE_MAKEFILE="OFF" \
        # Set C standard and flags
        -DCMAKE_C_STANDARD="17" \
        -DCMAKE_C_STANDARD_REQUIRED="OFF" \
        -DCMAKE_C_EXTENSIONS="OFF" \
        -DCMAKE_C_FLAGS="$COMMON_FLAGS" \
        # Set C++ standard and flags
        -DCMAKE_CXX_STANDARD="20" \
        -DCMAKE_CXX_STANDARD_REQUIRED="OFF" \
        -DCMAKE_CXX_EXTENSIONS="OFF" \
        -DCMAKE_CXX_FLAGS="$COMMON_FLAGS" \
        # Set linker flags
        -DCMAKE_EXE_LINKER_FLAGS="$LINK_FLAGS" \
        -DCMAKE_MODULE_LINKER_FLAGS="$LINK_FLAGS" \
        -DCMAKE_SHARED_LINKER_FLAGS="$LINK_FLAGS" \
        # LTO policies
        -DCMAKE_POLICY_DEFAULT_CMP0069="NEW" \
        -DCMAKE_POLICY_DEFAULT_CMP0105="NEW" \
        -DCMAKE_POLICY_DEFAULT_CMP0138="NEW" \
        # LTO
        -DCMAKE_INTERPROCEDURAL_OPTIMIZATION="ON" \
        -DCMAKE_POSITION_INDEPENDENT_CODE="OFF" \
        # Mimalloc-specific flags
        -DMI_BUILD_OBJECT="OFF" \
        -DMI_BUILD_SHARED="ON" \
        -DMI_BUILD_STATIC="OFF" \
        -DMI_BUILD_TESTS="OFF" \
        -DMI_OVERRIDE="ON" \
    && cmake --build build --target install

# Preload mimalloc for every process started from here on.
ENV LD_PRELOAD="/usr/local/lib/libmimalloc.so:$LD_PRELOAD"
WORKDIR /
# Remove the empty directory WORKDIR created as the mount point.
RUN rmdir /mimalloc

# Build and install the mold linker (linked with lld, against the system
# mimalloc), then register it as the default ld.
FROM mimalloc_base AS mold_base
WORKDIR /mold
# The sources are bind-mounted (writable) from mold_src instead of being
# copied: files added by COPY stay in the image even when a later RUN deletes
# them, whereas writes to a bind mount are discarded when the step ends. Only
# what the install target places under /usr/local is kept.
RUN --mount=type=bind,from=mold_src,source=/mold,target=/mold,rw \
    export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    && export LINK_FLAGS="-fuse-ld=lld $LINK_FLAGS $COMMON_FLAGS" \
    && cmake -S. -Bbuild -GNinja \
        # Base CMake settings
        -DBUILD_SHARED_LIBS="OFF" \
        -DCMAKE_BUILD_TYPE="Release" \
        -DCMAKE_INSTALL_PREFIX="/usr/local" \
        -DCMAKE_VERBOSE_MAKEFILE="OFF" \
        # Set C standard and flags
        -DCMAKE_C_STANDARD="17" \
        -DCMAKE_C_STANDARD_REQUIRED="ON" \
        # Mold uses GNU extensions, so we need to use gnu17 and gnu++20.
        -DCMAKE_C_EXTENSIONS="ON" \
        -DCMAKE_C_FLAGS="$COMMON_FLAGS" \
        # Set C++ standard and flags
        -DCMAKE_CXX_STANDARD="20" \
        -DCMAKE_CXX_STANDARD_REQUIRED="ON" \
        # Mold uses GNU extensions, so we need to use gnu17 and gnu++20.
        -DCMAKE_CXX_EXTENSIONS="ON" \
        -DCMAKE_CXX_FLAGS="$COMMON_FLAGS" \
        # Set linker flags
        -DCMAKE_EXE_LINKER_FLAGS="$LINK_FLAGS" \
        -DCMAKE_MODULE_LINKER_FLAGS="$LINK_FLAGS" \
        -DCMAKE_SHARED_LINKER_FLAGS="$LINK_FLAGS" \
        # LTO policies
        -DCMAKE_POLICY_DEFAULT_CMP0069="NEW" \
        -DCMAKE_POLICY_DEFAULT_CMP0105="NEW" \
        -DCMAKE_POLICY_DEFAULT_CMP0138="NEW" \
        # LTO
        -DCMAKE_INTERPROCEDURAL_OPTIMIZATION="ON" \
        -DCMAKE_POSITION_INDEPENDENT_CODE="OFF" \
        # Mold-specific flags
        -DMOLD_LTO="ON" \
        -DMOLD_USE_MIMALLOC="ON" \
        -DMOLD_USE_SYSTEM_MIMALLOC="ON" \
        -DZSTD_LEGACY_SUPPORT="OFF" \
    && cmake --build build --target install

# Set the linker to mold (priority 100 outranks the ld.lld alternative
# registered at 75) and remove the empty directory WORKDIR created as the
# mount point.
WORKDIR /
ENV LD="ld.mold"
RUN update-alternatives --install /usr/bin/ld ld /usr/local/bin/mold 100 \
    && rmdir /mold

# Build and install CPython from source with optimizations and ThinLTO,
# linked with mold.
FROM mold_base AS cpython_base
WORKDIR /cpython
# The sources are bind-mounted (writable) from cpython_src instead of being
# copied: files added by COPY stay in the image even when a later RUN deletes
# them, whereas writes to a bind mount are discarded when the step ends. Only
# what `make install` places outside /cpython is kept.
RUN --mount=type=bind,from=cpython_src,source=/cpython,target=/cpython,rw \
    export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    && export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $COMMON_FLAGS" \
    && export CFLAGS="-std=c17 $COMMON_FLAGS" \
    && export CXXFLAGS="-std=c++20 $COMMON_FLAGS" \
    && export LDFLAGS="$LINK_FLAGS" \
    && ./configure \
        --enable-ipv6=yes \
        --enable-optimizations \
        --with-computed-gotos \
        --with-lto=thin \
        --with-pymalloc \
        --with-system-expat \
        --with-ensurepip=upgrade \
        ax_cv_c_float_words_bigendian=no \
    # A bare `make -j` places no limit on the number of parallel jobs; cap it
    # at the number of available CPUs so the build cannot exhaust memory.
    && make -j"$(nproc)" \
    && make install

# Set the python3 and pip3 symlinks, and remove the empty directory WORKDIR
# created as the mount point.
WORKDIR /
RUN ln -sf /usr/local/bin/python3 /usr/local/bin/python \
    && ln -sf /usr/local/bin/pip3 /usr/local/bin/pip \
    && rmdir /cpython

# Install the pinned Python packages needed by the builds that follow.
FROM cpython_base AS pip_requirements_install
# Notes on individual pins:
# - Cython:  TODO: Since not compiling numpy from source, maybe we don't need Cython?
# - Jinja2:  needed for some templating done by torch.compile.
# - numpy:   a hassle to compile with LTO, so the precompiled version is used.
RUN pip install \
    --verbose \
    --no-build-isolation \
    --no-cache-dir \
    Cython==0.29.33 \
    Jinja2==3.1.2 \
    networkx==2.8.8 \
    numpy==1.21.4 \
    packaging==22.0 \
    pyyaml==6.0 \
    setuptools==65.6.3 \
    sympy==1.11.1 \
    typing_extensions==4.4.0 \
    wheel==0.38.4

# Build and install pillow-simd
FROM pip_requirements_install AS pillow_install
WORKDIR /pillow-simd
# The sources are bind-mounted (writable) from pillow-simd_src instead of
# being copied: files added by COPY stay in the image even when a later RUN
# deletes them, whereas writes to a bind mount are discarded when the step
# ends. Only what pip installs outside /pillow-simd is kept.
RUN --mount=type=bind,from=pillow-simd_src,source=/pillow-simd,target=/pillow-simd,rw \
    export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    && export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $COMMON_FLAGS" \
    && export CFLAGS="-std=c17 $COMMON_FLAGS" \
    && export CXXFLAGS="-std=c++20 $COMMON_FLAGS" \
    && export LDFLAGS="$LINK_FLAGS" \
    && pip install . \
        --verbose \
        --no-build-isolation \
        --no-cache-dir

WORKDIR /
# Remove the empty directory WORKDIR created as the mount point.
RUN rmdir /pillow-simd

# Build and install magma
# TODO: Try with CUDA 12 because our version of Magma includes this PR:
#       https://bitbucket.org/icl/magma/pull-requests/30/new-gemm-kernel-without-texture-memory
FROM pillow_install AS magma_install
WORKDIR /magma
# The sources are bind-mounted (writable) from magma_src instead of being
# copied: files added by COPY stay in the image even when a later RUN deletes
# them, whereas writes to a bind mount (including the files produced by
# `make generate` and the build directory) are discarded when the step ends.
# Only what the install target places under /usr/local is kept.
RUN --mount=type=bind,from=magma_src,source=/magma,target=/magma,rw \
    export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    # Turn "-Wl,a -Wl,b" into "a,b" so it can be handed to nvcc via -Xlinker.
    && export XLINKER_FLAGS="${LINK_FLAGS//-Wl,/}" \
    && export XLINKER_FLAGS="${XLINKER_FLAGS// /,}" \
    && export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $COMMON_FLAGS" \
    # Must set --allow-unsupported-compiler to use newer versions of clang.
    # The CMAKE_CUDA_FLAGS variable is used by CMake to check the compiler version.
    # It's okay that we don't specify all of $CUDA_NVCC_FLAGS here, because
    # these are only used for the detection of the compiler version.
    # NOTE: We must also specify --std, otherwise CMake seems to ignore the
    #       --allow-unsupported-compiler flag!
    && export CUDA_FLAGS="--std=c++17 --allow-unsupported-compiler -O3 --threads=0 --extra-device-vectorization" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xfatbin=--compress-all" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xcompiler=${COMMON_FLAGS// /,}" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xlinker=$XLINKER_FLAGS" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xnvlink=--use-host-info" \
    # Write make.inc and run the `generate` target before configuring with CMake.
    && echo -e "BACKEND=cuda\nFORT=false\nGPU_TARGET=Ampere" > make.inc \
    && make generate -j &> /dev/null \
    && cmake -S. -Bbuild -GNinja \
        # Base CMake settings
        -DBUILD_SHARED_LIBS="OFF" \
        -DCMAKE_BUILD_TYPE="Release" \
        -DCMAKE_INSTALL_PREFIX="/usr/local" \
        -DCMAKE_VERBOSE_MAKEFILE="OFF" \
        # Set C standard and flags
        -DCMAKE_C_STANDARD="17" \
        -DCMAKE_C_STANDARD_REQUIRED="OFF" \
        -DCMAKE_C_EXTENSIONS="ON" \
        -DCMAKE_C_FLAGS="$COMMON_FLAGS" \
        # Set C++ standard and flags
        -DCMAKE_CXX_STANDARD="17" \
        -DCMAKE_CXX_STANDARD_REQUIRED="OFF" \
        -DCMAKE_CXX_EXTENSIONS="ON" \
        -DCMAKE_CXX_FLAGS="$COMMON_FLAGS" \
        # Set linker flags
        -DCMAKE_EXE_LINKER_FLAGS="$LINK_FLAGS" \
        -DCMAKE_MODULE_LINKER_FLAGS="$LINK_FLAGS" \
        -DCMAKE_SHARED_LINKER_FLAGS="$LINK_FLAGS" \
        # LTO policies
        -DCMAKE_POLICY_DEFAULT_CMP0069="NEW" \
        -DCMAKE_POLICY_DEFAULT_CMP0105="NEW" \
        -DCMAKE_POLICY_DEFAULT_CMP0138="NEW" \
        # LTO
        -DCMAKE_INTERPROCEDURAL_OPTIMIZATION="ON" \
        # Must use PIC to link against PyTorch (which always uses PIC)
        -DCMAKE_POSITION_INDEPENDENT_CODE="ON" \
        # CUDA LTO
        # NOTE: Make sure RESOLVE_DEVICE_SYMBOLS is OFF!
        #       https://gitlab.kitware.com/cmake/cmake/-/issues/22225
        -DCMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS="OFF" \
        -DCMAKE_CUDA_SEPARABLE_COMPILATION="ON" \
        -DCMAKE_CUDA_ARCHITECTURES="89-real" \
        -DCMAKE_CUDA_HOST_COMPILER="$CXX" \
        -DCMAKE_CUDA_STANDARD="17" \
        -DCMAKE_CUDA_FLAGS="$CUDA_FLAGS" \
        # Magma-specific flags
        -DGPU_TARGET="Ampere" \
        -DLAPACK_LIBRARIES="${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a;${MKLROOT}/lib/intel64/libmkl_blacs_intelmpi_ilp64.a;-lm;-ldl;-lpthread" \
        -DMAGMA_ENABLE_CUDA="ON" \
        -DUSE_FORTRAN="OFF" \
    && cmake --build build --target install

WORKDIR /
# Remove the empty directory WORKDIR created as the mount point.
RUN rmdir /magma

# Build and install pytorch
# TODO:
# - Can we get a performance improvement by updating the libraries in third_party?
#   - CUB and NNPack are both fairly outdated
FROM magma_install AS torch_install
WORKDIR /torch

# The sources are bind-mounted (writable) from torch_src instead of being
# copied: files added by COPY stay in the image even when they are deleted
# afterwards, whereas writes to a bind mount are discarded when the step ends.
# Only what `setup.py install` places outside /torch is kept.
# All build configuration is passed to setup.py through exported environment
# variables.
RUN --mount=type=bind,from=torch_src,source=/torch,target=/torch,rw \
    # TODO: Building with -fforce-emit-vtables causes missing symbols when importing torch.
    export OPT_FLAGS="${OPT_FLAGS// -fforce-emit-vtables/}" \
    # TODO: Why do we need -fPIC specified for libnnpack?
    && export COMMON_FLAGS="$OPT_FLAGS $NO_WARN_FLAGS -fPIC" \
    # TODO: Why do we need -fPIC and -shared specified for libnnpack/libpytorch_cpu?
    # NOTE: We can pass LTO_FLAGS to the linker so we can use the LTO objects from Magma, but we
    #       can't use them in COMMON_FLAGS because we can't compile torch with LTO.
    && export XLINKER_FLAGS="${LINK_FLAGS//-Wl,/} $LTO_FLAGS -fPIC -shared" \
    && export XLINKER_FLAGS="${XLINKER_FLAGS// /,}" \
    && export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $LTO_FLAGS $COMMON_FLAGS" \
    # NOTE: Redefining the standard causes CMake to pick up these flags, so we must.
    && export CUDA_FLAGS="--std=c++17 --allow-unsupported-compiler -O3 --threads=0 --extra-device-vectorization" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xfatbin=--compress-all" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xcompiler=${COMMON_FLAGS// /,}" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xlinker=$XLINKER_FLAGS" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xnvlink=--use-host-info" \
    # Base CMake settings
    && export BUILD_SHARED_LIBS="ON" \
    && export CMAKE_BUILD_TYPE="Release" \
    && export CMAKE_VERBOSE_MAKEFILE="ON" \
    # Set C standard and flags
    && export CMAKE_C_STANDARD="17" \
    && export CMAKE_C_STANDARD_REQUIRED="ON" \
    && export CMAKE_C_EXTENSIONS="ON" \
    && export CMAKE_C_FLAGS="$COMMON_FLAGS" \
    # Set C++ standard and flags
    && export CMAKE_CXX_STANDARD="17" \
    && export CMAKE_CXX_STANDARD_REQUIRED="ON" \
    && export CMAKE_CXX_EXTENSIONS="ON" \
    && export CMAKE_CXX_FLAGS="$COMMON_FLAGS" \
    # Set linker flags
    && export CMAKE_EXE_LINKER_FLAGS="$LINK_FLAGS" \
    && export CMAKE_MODULE_LINKER_FLAGS="$LINK_FLAGS" \
    && export CMAKE_SHARED_LINKER_FLAGS="$LINK_FLAGS" \
    # LTO policies
    && export CMAKE_POLICY_DEFAULT_CMP0069="NEW" \
    && export CMAKE_POLICY_DEFAULT_CMP0105="NEW" \
    && export CMAKE_POLICY_DEFAULT_CMP0138="NEW" \
    # LTO
    && export CMAKE_INTERPROCEDURAL_OPTIMIZATION="OFF" \
    # PyTorch always uses fPIC, so don't confuse CMake.
    && export CMAKE_POSITION_INDEPENDENT_CODE="ON" \
    # CUDA LTO
    # NOTE: Make sure RESOLVE_DEVICE_SYMBOLS is OFF!
    #       https://gitlab.kitware.com/cmake/cmake/-/issues/22225
    && export CMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS="OFF" \
    && export CMAKE_CUDA_SEPARABLE_COMPILATION="OFF" \
    && export CUDA_SEPARABLE_COMPILATION="OFF" \
    # CUDA architecture, flags, host compiler and standard
    && export CMAKE_CUDA_ARCHITECTURES="89-real" \
    && export CMAKE_CUDA_FLAGS="$CUDA_FLAGS" \
    && export CMAKE_CUDA_HOST_COMPILER="$CXX" \
    && export CMAKE_CUDA_STANDARD="17" \
    # Torch-specific flags
    && export ATEN_NO_TEST="ON" \
    && export ATEN_STATIC_CUDA="ON" \
    && export BLAS="MKL" \
    && export BUILD_CUSTOM_PROTOBUF="OFF" \
    && export BUILD_TEST="OFF" \
    && export INSTALL_TEST="OFF" \
    && export INTEL_MKL_DIR="$MKLROOT" \
    && export MAGMA_INCLUDE_DIR="/usr/local/include" \
    && export NCCL_ROOT="/usr" \
    && export Protobuf_USE_STATIC_LIBS="ON" \
    && export TH_BINARY_BUILD="ON" \
    && export TORCH_ALLOW_TF32_CUBLAS_OVERRIDE="1" \
    && export TORCH_CUDA_ARCH_LIST="8.9" \
    && export TORCH_NVCC_FLAGS="$CUDA_FLAGS" \
    && export USE_CUDA_STATIC_LINK="ON" \
    && export USE_CUDNN="ON" \
    && export USE_CUPTI_SO="OFF" \
    && export USE_EXPERIMENTAL_CUDNN_V8_API="ON" \
    && export USE_GLOO="OFF" \
    && export USE_KINETO="ON" \
    && export USE_MKLDNN="OFF" \
    && export USE_NCCL="ON" \
    && export USE_STATIC_CUDNN="OFF" \
    && export USE_STATIC_MKL="ON" \
    && export USE_STATIC_NCCL="ON" \
    && export USE_SYSTEM_NCCL="ON" \
    # Build and install torch
    && python setup.py build --cmake-only \
    && python setup.py install

WORKDIR /
# Remove the empty directory WORKDIR created as the mount point.
RUN rmdir /torch

# NOTE: Torchvision is incredibly sensitive to spaces in NVCC's flags. If there are any leading or
# trailing spaces, it will fail to build. If there are any double spaces, it will fail to build.
# Build and install torchvision
FROM torch_install AS torchvision_install
WORKDIR /torchvision
# The sources are bind-mounted (writable) from torchvision_src instead of
# being copied: files added by COPY stay in the image even when they are
# deleted afterwards, whereas writes to a bind mount are discarded when the
# step ends. Only what `setup.py install` places outside /torchvision is kept.
RUN --mount=type=bind,from=torchvision_src,source=/torchvision,target=/torchvision,rw \
    # TODO: Torchvision doesn't seem to use the CMAKE variables we set in the environment.
    export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    # Turn "-Wl,a -Wl,b" into "a,b" so it can be handed to nvcc via -Xlinker.
    && export XLINKER_FLAGS="${LINK_FLAGS//-Wl,/}" \
    && export XLINKER_FLAGS="${XLINKER_FLAGS// /,}" \
    && export LINK_FLAGS="-fuse-ld=mold $LINK_FLAGS $COMMON_FLAGS" \
    # TODO: Can we compile the LTO objects from Torch and then link them here?
    # NOTE: Redefining the standard causes CMake to pick up these flags, so we must.
    && export CUDA_FLAGS="--std=c++17 --allow-unsupported-compiler -gencode=arch=compute_89,code=sm_89 -O3 --threads=0 --extra-device-vectorization" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xfatbin=--compress-all" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xcompiler=${COMMON_FLAGS// /,}" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xlinker=$XLINKER_FLAGS" \
    && export CUDA_FLAGS="$CUDA_FLAGS -Xnvlink=--use-host-info" \
    # Set C standard and flags
    && export CFLAGS="-std=c17 $COMMON_FLAGS" \
    # Set C++ standard and flags
    && export CXXFLAGS="-std=c++17 $COMMON_FLAGS" \
    # Set linker flags. This must be exported like CFLAGS/CXXFLAGS: a plain
    # assignment stays local to this shell and never reaches setup.py.
    && export LDFLAGS="$LINK_FLAGS" \
    # Torchvision settings
    && export DEBUG="0" \
    && export FORCE_CUDA="1" \
    && export NVCC_FLAGS="$CUDA_FLAGS" \
    && export TORCH_CUDA_ARCH_LIST="8.9" \
    && export TORCHVISION_USE_FFMPEG="0" \
    && export TORCHVISION_USE_JPEG="1" \
    && export TORCHVISION_USE_NVJPEG="0" \
    && export TORCHVISION_USE_PNG="1" \
    && export TORCHVISION_USE_VIDEO_CODEC="0" \
    && python setup.py install

WORKDIR /
# Remove the empty directory WORKDIR created as the mount point.
RUN rmdir /torchvision

# Build and install triton
FROM torchvision_install AS triton_install
# The sources are copied (not bind-mounted) because the editable install below
# keeps referring to /triton at runtime.
COPY --from=triton_src /triton /triton
# Bring along the pybind11 and LLVM archives that triton_src unpacked under
# /root/.triton; without this copy those downloads never reach this stage.
# NOTE(review): presumably the Triton build looks for them there instead of
#               downloading them again; confirm against python/setup.py.
COPY --from=triton_src /root/.triton /root/.triton

# TODO: Unable to link with mold for some reason. Use lld instead. Report to mold maintainer.
# WORKDIR points straight at the python package directory rather than using
# `cd` inside the RUN step.
WORKDIR /triton/python
RUN export COMMON_FLAGS="$OPT_FLAGS $LTO_FLAGS $NO_WARN_FLAGS" \
    && export LINK_FLAGS="-fuse-ld=lld $LINK_FLAGS $COMMON_FLAGS" \
    && export CFLAGS="-std=gnu17 $COMMON_FLAGS" \
    && export CXXFLAGS="-std=gnu++17 $COMMON_FLAGS" \
    && export LDFLAGS="$LINK_FLAGS" \
    # Must be editable install to allow for cyclic imports
    && pip install -e . \
        --verbose \
        --no-build-isolation \
        --no-cache-dir

# NOTE: We cannot delete triton because we did an editable install and we need to do that to allow
#       for cyclic imports.


WORKDIR /
CMD ["/bin/bash"]

## magma_remove_tests.diff
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b6370da..56392d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -616,43 +616,6 @@ endif()
 add_custom_target( lib DEPENDS magma )


-# ----------------------------------------
-# compile lapacktest library
-# If use fortran, compile only Fortran files, not magma_[sdcz]_no_fortran.cpp
-# else,           compile only C++     files, not Fortran files
-if (USE_FORTRAN)
-    foreach( filename ${liblapacktest_all} )
-        if (filename MATCHES "\\.(f|f90|F90)$")
-            list( APPEND liblapacktest_all_f ${filename} )
-        endif()
-    endforeach()
-    add_library( lapacktest ${liblapacktest_all_f} )
-else()
-    # alternatively, use only C/C++/CUDA files, including magma_[sdcz]_no_fortran.cpp
-    foreach( filename ${liblapacktest_all} )
-        if (filename MATCHES "\\.(c|cu|cpp)$")
-            list( APPEND liblapacktest_all_cpp ${filename} )
-        endif()
-    endforeach()
-    add_library( lapacktest ${liblapacktest_all_cpp} )
-endif()
-target_link_libraries( lapacktest
-    ${blas_fix}
-    ${LAPACK_LIBRARIES}
-)
-
-
-# ----------------------------------------
-# compile tester library
-add_library( tester ${libtest_all} )
-target_link_libraries( tester
-    magma
-    lapacktest
-    ${blas_fix}
-    ${LAPACK_LIBRARIES}
-)
-
-
 # ----------------------------------------
 # compile MAGMA sparse library

@@ -664,7 +627,6 @@ else()
   include_directories( sparse_hip/include )
   include_directories( sparse_hip/control )
 endif()
-include_directories( testing )

 if (MAGMA_ENABLE_CUDA)
   add_library( magma_sparse ${libsparse_all} )
@@ -692,54 +654,7 @@ endif()
 add_custom_target( sparse-lib DEPENDS magma_sparse )


-# ----------------------------------------
-# compile each tester
-
-# save testers to testing/
-# save tester lib files to testing_lib/ to avoid cluttering lib/
-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY testing )
-set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY testing_lib )
-set( CMAKE_LIBRARY_OUTPUT_DIRECTORY testing_lib )
-
-# skip Fortran testers, which require an extra file from CUDA
-foreach( filename ${testing_all} )
-    if (filename MATCHES "\\.(c|cu|cpp)$")
-        list( APPEND testing_all_cpp ${filename} )
-    endif()
-endforeach()
-foreach( TEST ${testing_all_cpp} )
-    string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
-    string( REGEX REPLACE "testing/" "" EXE ${EXE} )
-    #message( "${TEST} --> ${EXE}" )
-    add_executable( ${EXE} ${TEST} )
-    target_link_libraries( ${EXE} tester lapacktest magma )
-    list( APPEND testing ${EXE} )
-endforeach()
-add_custom_target( testing DEPENDS ${testing} )
-
-
-# ----------------------------------------
-# compile each sparse tester
-
-if (MAGMA_ENABLE_CUDA)
-  set(SPARSE_TEST_DIR "sparse/testing")
-else()
-  set(SPARSE_TEST_DIR "sparse_hip/testing")
-endif()
-
-
-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${SPARSE_TEST_DIR}" )
 cmake_policy( SET CMP0037 OLD)
-foreach( TEST ${sparse_testing_all} )
-    string( REGEX REPLACE "\\.(cpp|f90|F90)"     "" EXE ${TEST} )
-    string( REGEX REPLACE "${SPARSE_TEST_DIR}/" "" EXE ${EXE} )
-    #message( "${TEST} --> ${EXE}" )
-    add_executable( ${EXE} ${TEST} )
-    target_link_libraries( ${EXE} magma_sparse magma )
-    list( APPEND sparse-testing ${EXE} )
-endforeach()
-add_custom_target( sparse-testing DEPENDS ${sparse-testing} )
-

 # ----------------------------------------
 # what to install
	diff --git a/CMakeLists.txt b/CMakeLists.txt
	index b6370da..56392d9 100644
	--- a/CMakeLists.txt
	+++ b/CMakeLists.txt
	@@ -616,43 +616,6 @@ endif()
	add_custom_target( lib DEPENDS magma )


	-# ----------------------------------------
	-# compile lapacktest library
	-# If use fortran, compile only Fortran files, not magma_[sdcz]_no_fortran.cpp
	-# else, compile only C++ files, not Fortran files
	-if (USE_FORTRAN)
	- foreach( filename ${liblapacktest_all} )
	- if (filename MATCHES "\\.(f\|f90\|F90)$")
	- list( APPEND liblapacktest_all_f ${filename} )
	- endif()
	- endforeach()
	- add_library( lapacktest ${liblapacktest_all_f} )
	-else()
	- # alternatively, use only C/C++/CUDA files, including magma_[sdcz]_no_fortran.cpp
	- foreach( filename ${liblapacktest_all} )
	- if (filename MATCHES "\\.(c\|cu\|cpp)$")
	- list( APPEND liblapacktest_all_cpp ${filename} )
	- endif()
	- endforeach()
	- add_library( lapacktest ${liblapacktest_all_cpp} )
	-endif()
	-target_link_libraries( lapacktest
	- ${blas_fix}
	- ${LAPACK_LIBRARIES}
	-)
	-
	-
	-# ----------------------------------------
	-# compile tester library
	-add_library( tester ${libtest_all} )
	-target_link_libraries( tester
	- magma
	- lapacktest
	- ${blas_fix}
	- ${LAPACK_LIBRARIES}
	-)
	-
	-
	# ----------------------------------------
	# compile MAGMA sparse library

	@@ -664,7 +627,6 @@ else()
	include_directories( sparse_hip/include )
	include_directories( sparse_hip/control )
	endif()
	-include_directories( testing )

	if (MAGMA_ENABLE_CUDA)
	add_library( magma_sparse ${libsparse_all} )
	@@ -692,54 +654,7 @@ endif()
	add_custom_target( sparse-lib DEPENDS magma_sparse )


	-# ----------------------------------------
	-# compile each tester
	-
	-# save testers to testing/
	-# save tester lib files to testing_lib/ to avoid cluttering lib/
	-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY testing )
	-set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY testing_lib )
	-set( CMAKE_LIBRARY_OUTPUT_DIRECTORY testing_lib )
	-
	-# skip Fortran testers, which require an extra file from CUDA
	-foreach( filename ${testing_all} )
	- if (filename MATCHES "\\.(c\|cu\|cpp)$")
	- list( APPEND testing_all_cpp ${filename} )
	- endif()
	-endforeach()
	-foreach( TEST ${testing_all_cpp} )
	- string( REGEX REPLACE "\\.(cpp\|f90\|F90)" "" EXE ${TEST} )
	- string( REGEX REPLACE "testing/" "" EXE ${EXE} )
	- #message( "${TEST} --> ${EXE}" )
	- add_executable( ${EXE} ${TEST} )
	- target_link_libraries( ${EXE} tester lapacktest magma )
	- list( APPEND testing ${EXE} )
	-endforeach()
	-add_custom_target( testing DEPENDS ${testing} )
	-
	-
	-# ----------------------------------------
	-# compile each sparse tester
	-
	-if (MAGMA_ENABLE_CUDA)
	- set(SPARSE_TEST_DIR "sparse/testing")
	-else()
	- set(SPARSE_TEST_DIR "sparse_hip/testing")
	-endif()
	-
	-
	-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${SPARSE_TEST_DIR}" )
	cmake_policy( SET CMP0037 OLD)
	-foreach( TEST ${sparse_testing_all} )
	- string( REGEX REPLACE "\\.(cpp\|f90\|F90)" "" EXE ${TEST} )
	- string( REGEX REPLACE "${SPARSE_TEST_DIR}/" "" EXE ${EXE} )
	- #message( "${TEST} --> ${EXE}" )
	- add_executable( ${EXE} ${TEST} )
	- target_link_libraries( ${EXE} magma_sparse magma )
	- list( APPEND sparse-testing ${EXE} )
	-endforeach()
	-add_custom_target( sparse-testing DEPENDS ${sparse-testing} )
	-

	# ----------------------------------------
	# what to install
No results found