
Commit 2a6a29f

Support vLLM XFT LLM microservice (#174)
* Support vLLM XFT serving
  Signed-off-by: lvliang-intel <liang1.lv@intel.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  For more information, see https://pre-commit.ci
* fix access vllm issue
  Signed-off-by: lvliang-intel <liang1.lv@intel.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  For more information, see https://pre-commit.ci
* add permission for run.sh
  Signed-off-by: lvliang-intel <liang1.lv@intel.com>
* add readme
  Signed-off-by: lvliang-intel <liang1.lv@intel.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  For more information, see https://pre-commit.ci
* fix proxy issue
  Signed-off-by: lvliang-intel <liang1.lv@intel.com>

---------

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent b516758 commit 2a6a29f

5 files changed

Lines changed: 230 additions & 0 deletions


comps/llms/text-generation/vllm-xft/README.md
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
vLLM-xFT is a fork of vLLM that integrates the xFasterTransformer backend while maintaining compatibility with most of official vLLM's features.
For details on using vllm-xft, refer to [xFasterTransformer/vllm-xft](https://github.com/intel/xFasterTransformer/blob/main/serving/vllm-xft.md).

# 🚀 Start Microservice with Docker

## 1. Build Docker Image

```bash
cd ../../../
docker build -t opea/llm-vllm-xft:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm-xft/docker/Dockerfile .
```

## 2. Run Docker with CLI

```bash
docker run -it -p 9000:9000 -v /home/sdp/Qwen2-7B-Instruct/:/Qwen2-7B-Instruct/ -e vLLM_LLM_ENDPOINT="http://localhost:18688" -e HF_DATASET_DIR="/Qwen2-7B-Instruct/" -e OUTPUT_DIR="./output" -e TOKEN_PATH="/Qwen2-7B-Instruct/" -e https_proxy=$https_proxy -e http_proxy=$http_proxy --ipc=host opea/llm-vllm-xft:latest
```
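
The host path `/home/sdp/Qwen2-7B-Instruct/` in the command above is only an example; mount the directory that holds the model you want to serve. The environment variables are read by `run.sh` and `llm.py` inside the container:

- `HF_DATASET_DIR`: path to the original Hugging Face checkpoint that `run.sh` converts to xFasterTransformer format at startup.
- `OUTPUT_DIR`: directory where the converted model is written; it is passed to vLLM as `--model`.
- `TOKEN_PATH`: tokenizer path, passed to vLLM as `--tokenizer`.
- `vLLM_LLM_ENDPOINT`: address of the vLLM OpenAI-compatible server that the microservice wrapper queries (defaults to `http://localhost:18688`).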

# 🚀 3. Consume LLM Service

## 3.1 Check Service Status

```bash
curl http://${your_ip}:9000/v1/health_check \
  -X GET \
  -H 'Content-Type: application/json'
```

## 3.2 Consume LLM Service

You can adjust model parameters such as `max_new_tokens` and `streaming` to fit your needs.

The `streaming` parameter determines the format of the data returned by the API: with `streaming=false` the service returns a complete text string, and with `streaming=true` it returns a text stream.

```bash
# non-streaming mode
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
  -H 'Content-Type: application/json'

# streaming mode
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
  -H 'Content-Type: application/json'
```
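
For programmatic access, the same endpoint can be called from Python. The following is a minimal client sketch (illustrative, not part of this commit) that assumes the service is reachable at `localhost:9000`; the streaming branch reads the `data: ...` lines emitted by the microservice and stops at `data: [DONE]`.

```python
# Minimal client sketch (assumes the microservice listens on localhost:9000).
import requests

url = "http://localhost:9000/v1/chat/completions"
payload = {"query": "What is Deep Learning?", "max_new_tokens": 17, "streaming": False}

# Non-streaming: the service returns a JSON document containing the generated text.
response = requests.post(url, json=payload)
print(response.json())

# Streaming: the service emits server-sent events ("data: ..." lines, terminated by "data: [DONE]").
payload["streaming"] = True
with requests.post(url, json=payload, stream=True) as response:
    for line in response.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        chunk = line[len("data: "):]
        if chunk == "[DONE]":
            break
        print(chunk)
```
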
comps/llms/text-generation/vllm-xft/docker/Dockerfile
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM ubuntu:22.04

ARG TAG=main

RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends \
    gcc-12 \
    g++-12 \
    make \
    wget \
    libnuma-dev \
    numactl \
    git \
    pkg-config \
    software-properties-common \
    zlib1g-dev \
    libssl-dev \
    libffi-dev \
    libbz2-dev \
    libsqlite3-dev \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 60 \
    && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 60 \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*

# Install Python 3.8 from source
WORKDIR /tmp
RUN wget -q https://www.python.org/ftp/python/3.8.10/Python-3.8.10.tgz \
    && tar -xzvf Python-3.8.10.tgz
WORKDIR /tmp/Python-3.8.10
RUN ./configure --prefix=/usr/bin/python3.8 --enable-optimizations \
    && make -j \
    && make install \
    && update-alternatives --install /usr/bin/python python /usr/bin/python3.8/bin/python3.8 60 \
    && update-alternatives --install /usr/bin/pip pip /usr/bin/python3.8/bin/pip3 60 \
    && python -m pip install --no-cache-dir --upgrade pip setuptools \
    && pip install --no-cache-dir wheel \
    && rm -rf /tmp/* \
    && echo "export PATH=/usr/bin/python3.8:\$PATH" >> ~/.bashrc

RUN pip install --no-cache-dir torch==2.3.0+cpu --index-url https://download.pytorch.org/whl/cpu
RUN pip install --no-cache-dir cmake==3.26.1 transformers==4.41.2 sentencepiece==0.1.99 accelerate==0.23.0 protobuf tiktoken transformers-stream-generator einops \
    && ln -s /usr/bin/python3.8/lib/python3.8/site-packages/cmake/data/bin/cmake /usr/bin/cmake

# Install oneCCL
RUN git clone https://github.com/oneapi-src/oneCCL.git /tmp/oneCCL
WORKDIR /tmp/oneCCL
RUN git checkout 2021.10 \
    && sed -i 's/cpu_gpu_dpcpp/./g' cmake/templates/oneCCLConfig.cmake.in \
    && mkdir build
WORKDIR /tmp/oneCCL/build
RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local/oneCCL \
    && make -j install

RUN echo "source /usr/local/oneCCL/env/setvars.sh" >> ~/.bashrc

WORKDIR /root/
RUN rm -rf /tmp/oneCCL

# Build and install xFasterTransformer from source at the requested TAG
RUN git clone https://github.com/intel/xFasterTransformer.git

SHELL ["/bin/bash", "-c"]
WORKDIR /root/xFasterTransformer
RUN git checkout ${TAG} \
    && export "LD_LIBRARY_PATH=/usr/local/mklml_lnx_2019.0.5.20190502/lib:$LD_LIBRARY_PATH" \
    && export "PATH=/usr/bin/python3.8:$PATH" \
    && echo "source /usr/local/oneCCL/env/setvars.sh" >> ~/.bash_profile \
    && source ~/.bash_profile \
    && python setup.py build \
    && python setup.py egg_info bdist_wheel --verbose \
    && pip install --no-cache-dir dist/*

RUN mkdir -p /usr/local/xft/lib \
    && cp /root/xFasterTransformer/build/libxfastertransformer.so /usr/local/xft/lib \
    && cp /root/xFasterTransformer/build/libxft_comm_helper.so /usr/local/xft/lib \
    && cp -r /root/xFasterTransformer/include /usr/local/xft/ \
    && mkdir -p /usr/local/include/xft/ \
    && ln -s /usr/local/xft/include /usr/local/include/xft/include

RUN echo "export \$(python -c 'import xfastertransformer as xft; print(xft.get_env())')" >> ~/.bashrc

# Set up the OPEA LLM microservice wrapper
COPY comps /root/comps

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /root/comps/llms/text-generation/vllm-xft/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/root

RUN chmod +x /root/comps/llms/text-generation/vllm-xft/run.sh

WORKDIR /root/comps/llms/text-generation/vllm-xft/

ENTRYPOINT ["/root/comps/llms/text-generation/vllm-xft/run.sh"]
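
The `TAG` build argument (default `main`) selects which xFasterTransformer branch or tag is checked out and built inside the image; it can be overridden with `--build-arg TAG=<branch-or-tag>` when building. At container start, the `ENTRYPOINT` script `run.sh` converts the mounted model and launches both the vLLM backend and the microservice wrapper.
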
comps/llms/text-generation/vllm-xft/llm.py
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

from fastapi.responses import StreamingResponse
from langchain_community.llms import VLLMOpenAI
from langsmith import traceable

from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice


@register_microservice(
    name="opea_service@llm_vllm_xft",
    service_type=ServiceType.LLM,
    endpoint="/v1/chat/completions",
    host="0.0.0.0",
    port=9000,
)
@traceable(run_type="llm")
def llm_generate(input: LLMParamsDoc):
    llm_endpoint = os.getenv("vLLM_LLM_ENDPOINT", "http://localhost:18688")
    llm = VLLMOpenAI(
        openai_api_key="EMPTY",
        openai_api_base=llm_endpoint + "/v1",
        max_tokens=input.max_new_tokens,
        model_name="xft",
        top_p=input.top_p,
        temperature=input.temperature,
        presence_penalty=input.repetition_penalty,
        streaming=input.streaming,
    )

    if input.streaming:

        def stream_generator():
            chat_response = ""
            for text in llm.stream(input.query):
                chat_response += text
                chunk_repr = repr(text.encode("utf-8"))
                print(f"[llm - chat_stream] chunk:{chunk_repr}")
                yield f"data: {chunk_repr}\n\n"
            print(f"[llm - chat_stream] stream response: {chat_response}")
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_generator(), media_type="text/event-stream")
    else:
        response = llm.invoke(input.query)
        return GeneratedDoc(text=response, prompt=input.query)


if __name__ == "__main__":
    opea_microservices["opea_service@llm_vllm_xft"].start()
comps/llms/text-generation/vllm-xft/requirements.txt
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
docarray[full]
fastapi
langchain==0.1.16
langsmith
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
shortuuid
vllm-xft
comps/llms/text-generation/vllm-xft/run.sh
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
#!/bin/sh

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# convert the model to xFasterTransformer format
python -c 'import os; import xfastertransformer as xft; xft.Qwen2Convert().convert(os.environ["HF_DATASET_DIR"], os.environ["OUTPUT_DIR"])'

unset http_proxy

# serve the converted model with vLLM (OpenAI-compatible API on port 18688)
python -m vllm.entrypoints.openai.api_server \
    --model ${OUTPUT_DIR} \
    --tokenizer ${TOKEN_PATH} \
    --dtype bf16 \
    --kv-cache-dtype fp16 \
    --served-model-name xft \
    --host localhost \
    --port 18688 \
    --trust-remote-code &

# run the LLM microservice wrapper
python llm.py
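
For reference, the backend launched above serves vLLM's OpenAI-compatible API on port 18688 inside the container, and `llm.py` reaches it through LangChain's `VLLMOpenAI` wrapper. A minimal sketch of that interaction (illustrative prompt and settings, run inside the container):

```python
# Sketch: query the vLLM-xFT backend started by run.sh directly, the same way llm.py does.
from langchain_community.llms import VLLMOpenAI

llm = VLLMOpenAI(
    openai_api_key="EMPTY",  # placeholder key, as in llm.py
    openai_api_base="http://localhost:18688/v1",
    model_name="xft",  # matches --served-model-name in run.sh
    max_tokens=32,
)
print(llm.invoke("What is Deep Learning?"))
```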
