# vllm/Dockerfile.cpu

# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.

# Base stage: Ubuntu 22.04 with the compiler toolchain, Python and the shared
# libraries that the rest of this file builds on.
FROM ubuntu:22.04 AS cpu-test-1

# ccache keeps its cache here; later RUN steps attach this same path as a
# BuildKit cache mount so compiled objects survive across builds.
ENV CCACHE_DIR=/root/.cache/ccache

# CMake reads this variable to prefix C++ compiler invocations with ccache.
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache

# Install every OS package in a single apt transaction (one package per line,
# sorted, for readable diffs), make gcc-12/g++-12 the default gcc/g++, and
# remove the package index in the same layer so it is not stored in the image.
RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y \
    && apt-get install -y \
        ccache \
        curl \
        ffmpeg \
        g++-12 \
        gcc-12 \
        git \
        libgl1 \
        libnuma-dev \
        libsm6 \
        libtcmalloc-minimal4 \
        libxext6 \
        numactl \
        python3 \
        python3-pip \
        vim \
        wget \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
    && rm -rf /var/lib/apt/lists/*

# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install intel-openmp

# Preload tcmalloc (from the libtcmalloc-minimal4 apt package) and libiomp5.so
# (presumably placed in /usr/local/lib by the intel-openmp wheel) into every
# process started in this image, including the remaining build steps.
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"

# Disable core dumps for bash sessions that source ~/.bashrc.
RUN echo 'ulimit -c 0' >> ~/.bashrc

# Install a prebuilt Intel Extension for PyTorch wheel (CPython 3.10, x86_64).
# The pip cache mount matches the other pip steps in this file, so the wheel
# download is reused across rebuilds instead of being fetched every time.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl

# Everything from here on runs relative to /workspace.
WORKDIR /workspace

# Let pip also resolve packages from the PyTorch CPU wheel index.
ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu

# Upgrade pip, then install the build-time requirements. The requirements file
# is bind-mounted from the build context instead of being copied into a layer.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
    pip install --upgrade pip \
    && pip install -r requirements-build.txt

# Install oneDNN: fetch the rls-v3.5 branch and build it as a static library.
# Only the branch tip is built, so a shallow clone avoids downloading history.
RUN git clone --depth 1 -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git

# Configure with docs, examples, tests and the graph component switched off and
# only the inference workload / matmul primitive enabled, then build and run
# the install target (no install prefix is given, so CMake's default is used).
# Compilation goes through ccache, backed by the cache mount at CCACHE_DIR.
# NOTE(review): Ninja is a single-config generator, so `--config Release` is
# likely a no-op here; the build type comes from oneDNN's own default - confirm.
RUN --mount=type=cache,target=/root/.cache/ccache \
    cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
    -DONEDNN_BUILD_DOC=OFF \
    -DONEDNN_BUILD_EXAMPLES=OFF \
    -DONEDNN_BUILD_TESTS=OFF \
    -DONEDNN_BUILD_GRAPH=OFF \
    -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
    -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
    cmake --build ./oneDNN/build --target install --config Release

# Build stage: starts from the toolchain image and adds vLLM's CPU
# requirements and the source tree.
FROM cpu-test-1 AS build

WORKDIR /workspace/vllm

# The requirements files are bind-mounted so dependency installation is cached
# independently of the source copy below. requirements-common.txt is mounted
# as well; presumably requirements-cpu.txt references it.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,source=requirements-cpu.txt,target=requirements-cpu.txt \
    pip install -v -r requirements-cpu.txt

# Copy the whole build context into /workspace/vllm.
COPY . .

# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
# The build argument is re-exported as an environment variable so the build
# command below can see it; it is empty when the argument is not supplied.
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

# Build the vLLM wheel for the CPU target, install it, then delete dist/ in the
# same layer so the wheel file is not kept in the image. pip and ccache caches
# are reused across builds through cache mounts.
# NOTE(review): .git is bind-mounted from the build context, presumably so the
# build can derive the package version from repository metadata - confirm.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
    pip install dist/*.whl && \
    rm -rf dist

# Step back up to /workspace for the symlinks and the default working directory.
WORKDIR /workspace/

# Expose the tests, examples and benchmarks directories of the source tree as
# symlinks directly under /workspace.
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

# Default process: run the vllm.entrypoints.openai.api_server module. Exec form
# is used, so arguments given to `docker run` are appended to this command.
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
[Hardware][Intel] Add CPU inference backend (#3634) Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Yuan Zhou <yuan.zhou@intel.com> 2024-04-02 13:07:30 +08:00			`# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.`

[CI/BUILD] enable intel queue for longer CPU tests (#4113) 2024-06-04 01:39:50 +08:00			`FROM ubuntu:22.04 AS cpu-test-1`
[Hardware][Intel] Add CPU inference backend (#3634) Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Yuan Zhou <yuan.zhou@intel.com> 2024-04-02 13:07:30 +08:00
[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-12 00:46:46 +08:00			`ENV CCACHE_DIR=/root/.cache/ccache`

			`ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache`

[CI/Build] Dockerfile.cpu improvements (#7298) 2024-08-09 03:24:52 +08:00			`RUN --mount=type=cache,target=/var/cache/apt \`
			`apt-get update -y \`
			`&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \`
[model] Support for Llava-Next-Video model (#7559) Co-authored-by: Roger Wang <ywang@roblox.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> 2024-09-11 13:21:36 +08:00			`&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \`
[Hardware][Intel] Add CPU inference backend (#3634) Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Yuan Zhou <yuan.zhou@intel.com> 2024-04-02 13:07:30 +08:00			`&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12`

[Hardware][Intel CPU] Adding intel openmp tunings in Docker file (#6008) Signed-off-by: Yuan Zhou <yuan.zhou@intel.com> 2024-07-05 06:22:12 +08:00			`# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html`
			`# intel-openmp provides additional performance improvement vs. openmp`
			`# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.`
[CI/Build] Dockerfile.cpu improvements (#7298) 2024-08-09 03:24:52 +08:00			`RUN --mount=type=cache,target=/root/.cache/pip \`
			`pip install intel-openmp`
[Hardware][Intel CPU] Adding intel openmp tunings in Docker file (#6008) Signed-off-by: Yuan Zhou <yuan.zhou@intel.com> 2024-07-05 06:22:12 +08:00
[Misc] Update dockerfile for CPU to cover protobuf installation (#7182) 2024-08-16 01:03:01 +08:00			`ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"`
[Hardware][Intel CPU] Adding intel openmp tunings in Docker file (#6008) Signed-off-by: Yuan Zhou <yuan.zhou@intel.com> 2024-07-05 06:22:12 +08:00
[Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-27 04:50:10 +08:00			`RUN echo 'ulimit -c 0' >> ~/.bashrc`
[Hardware][Intel] Optimize CPU backend and add more performance tips (#4971) Co-authored-by: Jianan Gu <jianan.gu@intel.com> 2024-06-14 00:33:14 +08:00
[Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-27 04:50:10 +08:00			`RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl`
[Hardware][Intel] Optimize CPU backend and add more performance tips (#4971) Co-authored-by: Jianan Gu <jianan.gu@intel.com> 2024-06-14 00:33:14 +08:00
[CI/Build] fix Dockerfile.cpu on podman (#8540) 2024-09-18 10:49:53 +08:00			`WORKDIR /workspace`

[CI/Build] Dockerfile.cpu improvements (#7298) 2024-08-09 03:24:52 +08:00			`ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu`
			`RUN --mount=type=cache,target=/root/.cache/pip \`
			`--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \`
			`pip install --upgrade pip && \`
			`pip install -r requirements-build.txt`
[Hardware][Intel] Add CPU inference backend (#3634) Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Yuan Zhou <yuan.zhou@intel.com> 2024-04-02 13:07:30 +08:00
[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-12 00:46:46 +08:00			`# install oneDNN`
			`RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git`

			`RUN --mount=type=cache,target=/root/.cache/ccache \`
			`cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \`
			`-DONEDNN_BUILD_DOC=OFF \`
			`-DONEDNN_BUILD_EXAMPLES=OFF \`
			`-DONEDNN_BUILD_TESTS=OFF \`
			`-DONEDNN_BUILD_GRAPH=OFF \`
			`-DONEDNN_ENABLE_WORKLOAD=INFERENCE \`
			`-DONEDNN_ENABLE_PRIMITIVE=MATMUL && \`
			`cmake --build ./oneDNN/build --target install --config Release`

[CI/BUILD] enable intel queue for longer CPU tests (#4113) 2024-06-04 01:39:50 +08:00			`FROM cpu-test-1 AS build`

[Hardware][Intel] Add CPU inference backend (#3634) Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Yuan Zhou <yuan.zhou@intel.com> 2024-04-02 13:07:30 +08:00			`WORKDIR /workspace/vllm`

[CI/Build] Dockerfile.cpu improvements (#7298) 2024-08-09 03:24:52 +08:00			`RUN --mount=type=cache,target=/root/.cache/pip \`
			`--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \`
			`--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \`
			`pip install -v -r requirements-cpu.txt`

			`COPY ./ ./`
[Hardware][Intel] Add CPU inference backend (#3634) Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Yuan Zhou <yuan.zhou@intel.com> 2024-04-02 13:07:30 +08:00
[CI/BUILD] Support non-AVX512 vLLM building and testing (#5574) 2024-06-18 02:36:10 +08:00			`# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...`
			`ARG VLLM_CPU_DISABLE_AVX512`
			`ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}`

[CI/Build] Dockerfile.cpu improvements (#7298) 2024-08-09 03:24:52 +08:00			`RUN --mount=type=cache,target=/root/.cache/pip \`
			`--mount=type=cache,target=/root/.cache/ccache \`
[CI/Build] use setuptools-scm to set __version__ (#4738) Co-authored-by: youkaichao <youkaichao@126.com> 2024-09-24 00:44:26 +08:00			`--mount=type=bind,source=.git,target=.git \`
[CI/Build] Dockerfile.cpu improvements (#7298) 2024-08-09 03:24:52 +08:00			`VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \`
[CI/Build] use setuptools-scm to set __version__ (#4738) Co-authored-by: youkaichao <youkaichao@126.com> 2024-09-24 00:44:26 +08:00			`pip install dist/*.whl && \`
			`rm -rf dist`
[Hardware][Intel] Add CPU inference backend (#3634) Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Yuan Zhou <yuan.zhou@intel.com> 2024-04-02 13:07:30 +08:00
[Bugfix] Update Dockerfile.cpu to fix NameError: name 'vllm_ops' is not defined (#5009) 2024-05-24 00:08:58 +08:00			`WORKDIR /workspace/`

[Hardware][Intel] Optimize CPU backend and add more performance tips (#4971) Co-authored-by: Jianan Gu <jianan.gu@intel.com> 2024-06-14 00:33:14 +08:00			`RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks`
[CI/BUILD] enable intel queue for longer CPU tests (#4113) 2024-06-04 01:39:50 +08:00
[Hardware][Intel CPU] Adding intel openmp tunings in Docker file (#6008) Signed-off-by: Yuan Zhou <yuan.zhou@intel.com> 2024-07-05 06:22:12 +08:00			`ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]`