# This vLLM Dockerfile is used to construct an image that can build and run vLLM on x86 CPU platforms.
# Stage cpu-test-1: base image with the system toolchain, Python build
# requirements, Intel runtime libraries and a statically built oneDNN.
FROM ubuntu:22.04 AS cpu-test-1

# Route C++ compilation through ccache; CCACHE_DIR matches the cache mount
# target used by the oneDNN build step below.
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache

# System packages: compiler toolchain (gcc-12/g++-12), Python, tcmalloc, NUMA
# support, and the media/GL libraries (ffmpeg, libsm6, libxext6, libgl1).
# gcc-12 is then registered as the default gcc, with g++-12 as its slave.
RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y \
    && apt-get install -y \
        ccache \
        curl \
        ffmpeg \
        g++-12 \
        gcc-12 \
        git \
        libgl1 \
        libnuma-dev \
        libsm6 \
        libtcmalloc-minimal4 \
        libxext6 \
        numactl \
        python3 \
        python3-pip \
        vim \
        wget \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 \
        --slave /usr/bin/g++ g++ /usr/bin/g++-12

# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install intel-openmp

# Preload tcmalloc and the Intel OpenMP runtime into every process.
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"

# Disable core dumps in interactive bash sessions.
RUN echo 'ulimit -c 0' >> ~/.bashrc

# Intel Extension for PyTorch, pinned to a specific cp310 development wheel.
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl

WORKDIR /workspace

# Let pip resolve packages from the PyTorch CPU wheel index as well.
ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu

# Build-time Python requirements; the file is bind-mounted from the build
# context so it is not copied into an image layer.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
    pip install --upgrade pip && \
    pip install -r requirements-build.txt

# install oneDNN
# Only the tip of the release branch is needed, so clone shallowly.
RUN git clone --depth 1 -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git

# Configure a static, inference-only oneDNN restricted to the MATMUL
# primitive (docs, examples, tests and graph component disabled), then build
# and install it. The ccache directory is mounted to speed up rebuilds.
RUN --mount=type=cache,target=/root/.cache/ccache \
    cmake -B ./oneDNN/build -S ./oneDNN -G Ninja \
        -DONEDNN_LIBRARY_TYPE=STATIC \
        -DONEDNN_BUILD_DOC=OFF \
        -DONEDNN_BUILD_EXAMPLES=OFF \
        -DONEDNN_BUILD_TESTS=OFF \
        -DONEDNN_BUILD_GRAPH=OFF \
        -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
        -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
    cmake --build ./oneDNN/build --target install --config Release

# Stage build: installs the CPU runtime requirements, builds the vLLM wheel
# from the build context, installs it, and sets the API server entrypoint.
FROM cpu-test-1 AS build

WORKDIR /workspace/vllm

# Install runtime requirements before copying the source tree so this layer
# stays cached when only source files change. requirements-common.txt is
# mounted alongside requirements-cpu.txt (presumably because the latter
# includes it -- confirm against the requirements files).
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
    pip install -v -r requirements-cpu.txt

COPY ./ ./

# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

# Build the CPU wheel, install it, and remove the dist directory in the same
# layer so the wheel file is not kept in the image. .git is bind-mounted for
# the duration of the build only.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
    pip install dist/*.whl && \
    rm -rf dist

WORKDIR /workspace/

# Expose tests, examples and benchmarks from the source tree as symlinks
# directly under /workspace.
RUN ln -s /workspace/vllm/tests && \
    ln -s /workspace/vllm/examples && \
    ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]