@WizardlyBump17
Created February 27, 2026 00:59
ipex-llm
ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION
RUN apt update \
&& apt install -y software-properties-common wget \
&& add-apt-repository -y ppa:deadsnakes/ppa \
&& apt install -y python3.11
RUN ln -sT /usr/bin/python3.11 /usr/bin/python \
&& wget https://bootstrap.pypa.io/get-pip.py \
&& python3.11 get-pip.py
WORKDIR /llama.cpp/
RUN pip install --pre ipex-llm[cpp] \
&& init-llama-cpp
#from here downwards, this is just a utility that applies the runtime configuration from https://github.com/intel/ipex-llm/blob/main/docs/mddocs/Quickstart/llama_cpp_quickstart.md#runtime-configuration
#and a helper to invoke the binaries more easily
ARG ONEAPI_DEVICE_SELECTOR=level_zero:0
RUN printf '%s\n' \
    '#!/bin/bash' \
    'export SYCL_CACHE_PERSISTENT=1' \
    'export ZES_ENABLE_SYSMAN=1' \
    'export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1' \
    "export ONEAPI_DEVICE_SELECTOR=$ONEAPI_DEVICE_SELECTOR" \
    '/llama.cpp/"$@"' > /llama.cpp/oneapi-setup.sh \
&& chmod +x /llama.cpp/oneapi-setup.sh
ENTRYPOINT ["/llama.cpp/oneapi-setup.sh"]
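A quick way to sanity-check the wrapper script's logic outside the image is to rebuild it in a temp directory with a stub binary standing in for llama-server (the paths and stub here are hypothetical; the real image writes to /llama.cpp/ and exports the same variables):

```shell
# Build the same wrapper in a temp dir; printf avoids echo's
# shell-dependent handling of \n escapes.
dir=$(mktemp -d)
printf '%s\n' \
  '#!/bin/bash' \
  'export SYCL_CACHE_PERSISTENT=1' \
  'export ZES_ENABLE_SYSMAN=1' \
  'export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1' \
  'export ONEAPI_DEVICE_SELECTOR=level_zero:0' \
  "$dir"'/"$@"' > "$dir/oneapi-setup.sh"
chmod +x "$dir/oneapi-setup.sh"

# Stub "llama-server" that just prints one of the exported variables.
printf '%s\n' '#!/bin/sh' 'echo "$ONEAPI_DEVICE_SELECTOR"' > "$dir/llama-server"
chmod +x "$dir/llama-server"

"$dir/oneapi-setup.sh" llama-server   # prints: level_zero:0
```

The unquoted first positional parameter becomes the binary name appended to the prefix, and the remaining arguments pass through untouched, which is why `oneapi-setup.sh llama-server --port 8080` resolves to `/llama.cpp/llama-server --port 8080` in the real image.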
#example usage of the container
services:
  llama.cpp-ipex-qwen2.5-coder-14b:
    image: "localhost/llama.cpp-ipex"
    command: "llama-server --host 0.0.0.0 --port 8080 --model /models/Qwen2.5-Coder-14B-Instruct-Q4_K_M.gguf --n-gpu-layers 99 --ctx-size 16384 --batch-size 1024 --parallel 1 --jinja"
    volumes:
      - "/home/davi/AI/models/:/models/"
    ports:
      - "8084:8080"
    devices:
      - "/dev/dri/renderD128"
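Once the stack is up (`docker compose up -d`), llama-server exposes an OpenAI-compatible HTTP API on the mapped host port. A minimal smoke test, assuming the `8084:8080` mapping above (a single-model server ignores the model name field, so it can be omitted):

```shell
# Readiness probe: llama-server answers {"status":"ok"} once the model is loaded.
curl http://localhost:8084/health

# Chat completion against llama.cpp's OpenAI-compatible endpoint.
curl http://localhost:8084/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "messages": [
          {"role": "user", "content": "Write a hello world in C."}
        ],
        "max_tokens": 128
      }'
```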