sudo apt install apt-transport-https ca-certificates curl software-properties-common gnupg lsb-release
sudo mkdir -p /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu sb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt update
apt list --upgradable
sudo apt-get install docker-ce containerd.io docker-compose-plugin
sudo apt autoremove
apt-cache madison docker-ce | awk '{ print $3 }'
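If you want to pin one of the versions listed above rather than take the latest, the standard apt `pkg=version` syntax works; the version string below is illustrative, not a recommendation.
# Optional: install a specific version (version string is illustrative).
VERSION_STRING=5:26.1.0-1~ubuntu.22.04~jammy
sudo apt-get install docker-ce=$VERSION_STRING docker-ce-cli=$VERSION_STRING containerd.io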
- Test time
sudo docker run hello-world
sudo groupadd docker
sudo usermod -aG docker usernamehere
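Group membership only takes effect in a new session; one way to pick it up in place and confirm sudo is no longer needed (usernamehere stays the placeholder from above):
# Refresh group membership without logging out, then verify non-root access.
newgrp docker
docker run hello-world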
- Install dependencies
# Dependencies
distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
&& curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
- Install nvidia-docker2
sudo apt update && sudo apt install -y nvidia-docker2
- Test it
# The test image below is from the author's setup; any locally available CUDA-enabled image works.
docker run --rm --gpus all -it stable-diffusion-server nvidia-smi
sudo nvidia-ctk runtime configure --runtime=docker
INFO[0000] Config file does not exist; using empty config
INFO[0000] Wrote updated config to /etc/docker/daemon.json
INFO[0000] It is recommended that docker daemon be restarted.
sudo systemctl restart docker
cat /etc/docker/daemon.json
{
    "runtimes": {
        "nvidia": {
            "args": [],
            "path": "nvidia-container-runtime"
        }
    }
}
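With the runtime registered, a quick end-to-end check; the image tag here is an assumption, any CUDA base image you have pulled will do:
# Should print the same nvidia-smi table as on the host.
docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi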
- Clone the TensorRT-LLM repository
git clone -b v0.8.0 https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
- Download Llama 3 into the same folder
git clone https://huggingface.co/NousResearch/Meta-Llama-3-8B-Instruct
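If the cloned folder only contains small pointer files instead of the multi-GB weights, git-lfs was missing at clone time; a sketch of the usual fix:
# Install git-lfs and fetch the actual weight files.
sudo apt install git-lfs
git lfs install
cd Meta-Llama-3-8B-Instruct && git lfs pull && cd ..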
- Run the docker container and install everything needed to compile the model in the next step
# Obtain and start the basic docker image environment.
docker run --rm --runtime=nvidia --gpus all --volume ${PWD}:/TensorRT-LLM --entrypoint /bin/bash -it --workdir /TensorRT-LLM nvidia/cuda:12.1.0-devel-ubuntu22.04
# Install dependencies, TensorRT-LLM requires Python 3.10
apt-get update && apt-get -y install python3.10 python3-pip openmpi-bin libopenmpi-dev
# Install the stable version (corresponding to the cloned branch) of TensorRT-LLM.
pip3 install tensorrt_llm==0.8.0 -U --extra-index-url https://pypi.nvidia.com
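A quick import check (still inside the container) catches a broken install before the longer compile step:
# Should print 0.8.0 and raise no CUDA/MPI import errors.
python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"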
- Compile the model to TensorRT (from the docker container)
python3 examples/llama/convert_checkpoint.py --model_dir ./Meta-Llama-3-8B-Instruct \
--output_dir ./tllm_checkpoint_1gpu_bf16 \
--dtype bfloat16
trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_bf16 \
--output_dir ./tmp/llama/8B/trt_engines/bf16/1-gpu \
--gpt_attention_plugin bfloat16 \
--gemm_plugin bfloat16
- Test the model (from the container)
python3 examples/run.py --engine_dir=./tmp/llama/8B/trt_engines/bf16/1-gpu --max_output_len 100 --tokenizer_dir ./Meta-Llama-3-8B-Instruct --input_text "How do I count to nine in French?"
- Set up the backend (OUTSIDE THE CONTAINER)
# Back to the base working directory (home)
cd
git clone -b v0.8.0 https://github.com/triton-inference-server/tensorrtllm_backend.git
cd tensorrtllm_backend
cp ../TensorRT-LLM/tmp/llama/8B/trt_engines/bf16/1-gpu/* all_models/inflight_batcher_llm/tensorrt_llm/1/
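At this point the model repository should contain the five models shipped with the v0.8.0 backend, with the engine landing under tensorrt_llm/1/; a quick look to confirm:
# The copied engine files should show up here.
ls all_models/inflight_batcher_llm/tensorrt_llm/1/
ls all_models/inflight_batcher_llm/
# Expected: ensemble  preprocessing  postprocessing  tensorrt_llm  tensorrt_llm_bls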
- Create configuration
# Set the tokenizer_dir and engine_dir paths
export HF_LLAMA_MODEL=TensorRT-LLM/Meta-Llama-3-8B-Instruct
export ENGINE_PATH=tensorrtllm_backend/all_models/inflight_batcher_llm/tensorrt_llm/1
python3 tools/fill_template.py -i all_models/inflight_batcher_llm/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},tokenizer_type:auto,triton_max_batch_size:64,preprocessing_instance_count:1
python3 tools/fill_template.py -i all_models/inflight_batcher_llm/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},tokenizer_type:auto,triton_max_batch_size:64,postprocessing_instance_count:1
python3 tools/fill_template.py -i all_models/inflight_batcher_llm/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False
python3 tools/fill_template.py -i all_models/inflight_batcher_llm/ensemble/config.pbtxt triton_max_batch_size:64
python3 tools/fill_template.py -i all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0
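fill_template.py edits the config.pbtxt files in place; a quick grep confirms the substitution took (to the best of my knowledge, the v0.8.0 backend writes engine_dir into the gpt_model_path parameter):
# The engine path from ${ENGINE_PATH} should appear in the string_value.
grep -A 2 "gpt_model_path" all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt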
- Run Triton Inference Server (OUTSIDE THE CONTAINER)
# Change back to the base working directory (the one containing TensorRT-LLM and tensorrtllm_backend).
cd
docker run -it --rm --gpus all --network host --shm-size=1g \
-v $(pwd):/workspace \
--workdir /workspace \
nvcr.io/nvidia/tritonserver:24.03-trtllm-python-py3
# Log in to huggingface-cli to get tokenizer (optional)
huggingface-cli login --token *****
# Install python dependencies
pip install sentencepiece protobuf
- Launch Triton Inference Server
python3 tensorrtllm_backend/scripts/launch_triton_server.py --model_repo tensorrtllm_backend/all_models/inflight_batcher_llm --world_size 1
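Once the server logs show the models as READY, the standard Triton health endpoint can confirm it from another shell:
# Returns HTTP 200 when all models are loaded.
curl -sf localhost:8000/v2/health/ready && echo "server ready"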
- Test the inference server
curl -X POST localhost:8000/v2/models/ensemble/generate -d \
'{
"text_input": "How do I count to nine in French?",
"parameters": {
"max_tokens": 100,
"bad_words":[""],
"stop_words":[""]
}
}'
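The response comes back as a JSON object; if jq is installed, the completion can be pulled out directly (text_output is, as far as I know, the field name this backend version's generate endpoint returns):
# Extract just the generated text from the response (requires jq).
curl -s -X POST localhost:8000/v2/models/ensemble/generate -d \
    '{"text_input": "How do I count to nine in French?", "parameters": {"max_tokens": 100, "bad_words": [""], "stop_words": [""]}}' \
    | jq -r .text_output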
- https://developer.nvidia.com/tensorrt#inference
- https://developer.nvidia.com/blog/turbocharging-meta-llama-3-performance-with-nvidia-tensorrt-llm-and-nvidia-triton-inference-server/
- https://huggingface.co/NousResearch/Meta-Llama-3-8B-Instruct/tree/main