
Commit 5aba3b2

Support Long context for DocSum (#981)
* docsum four
* support 4 modes for docsum
* fix
* fix bug
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* refine for docsum tgi
* add docsum for ut and vllm
* fix bug
* fix bug
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* fix ut bug
* fix ut bug
* set default value

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent f3aaaeb commit 5aba3b2

10 files changed: +439 additions, -76 deletions

comps/__init__.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -38,6 +38,7 @@
     PIIResponseDoc,
     Audio2text,
     DocSumDoc,
+    DocSumLLMParams,
 )

 # Constants
```

comps/cores/proto/docarray.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -212,6 +212,12 @@ def chat_template_must_contain_variables(cls, v):
         return v


+class DocSumLLMParams(LLMParamsDoc):
+    summary_type: str = "stuff"  # can be "truncate", "map_reduce", "refine"
+    chunk_size: int = -1
+    chunk_overlap: int = -1
+
+
 class LLMParams(BaseDoc):
     model: Optional[str] = None
     max_tokens: int = 1024
```
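The semantics of the new fields can be sketched standalone (a minimal dataclass mirroring the diff; `DocSumLLMParamsSketch` is a hypothetical stand-in — the real `DocSumLLMParams` extends `LLMParamsDoc`, a docarray model carrying the usual LLM parameters as well):

```python
from dataclasses import dataclass


@dataclass
class DocSumLLMParamsSketch:
    # Mirrors the fields added in DocSumLLMParams above.
    summary_type: str = "stuff"  # or "truncate", "map_reduce", "refine"
    chunk_size: int = -1         # -1 -> service computes a default from the token limits
    chunk_overlap: int = -1      # -1 -> service uses int(0.1 * chunk_size)


# Callers only override what they need; -1 sentinels keep the service in charge of defaults.
params = DocSumLLMParamsSketch(summary_type="map_reduce", chunk_size=2000)
print(params)
```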

comps/llms/summarization/tgi/langchain/README.md

Lines changed: 61 additions & 2 deletions

````diff
@@ -48,8 +48,12 @@ In order to start TGI and LLM services, you need to setup the following environm
 export HF_TOKEN=${your_hf_api_token}
 export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
 export LLM_MODEL_ID=${your_hf_llm_model}
+export MAX_INPUT_TOKENS=2048
+export MAX_TOTAL_TOKENS=4096
 ```

+Please make sure MAX_TOTAL_TOKENS is larger than (MAX_INPUT_TOKENS + max_new_tokens + 50); 50 tokens are reserved for the prompt.
+
 ### 2.2 Build Docker Image

 ```bash
@@ -67,7 +71,7 @@ You can choose one as needed.
 ### 2.3 Run Docker with CLI (Option A)

 ```bash
-docker run -d --name="llm-docsum-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN opea/llm-docsum-tgi:latest
+docker run -d --name="llm-docsum-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN -e MAX_INPUT_TOKENS=${MAX_INPUT_TOKENS} -e MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS} opea/llm-docsum-tgi:latest
 ```

 ### 2.4 Run Docker with Docker Compose (Option B)
@@ -88,6 +92,18 @@ curl http://${your_ip}:9000/v1/health_check\

 ### 3.2 Consume LLM Service

+In the DocSum microservice, in addition to the basic LLM parameters, several optimization parameters are supported:
+
+- "language": specify the language; can be "auto", "en", "zh"; default is "auto"
+
+If you need to handle long context, select a suitable summary type; see section 3.2.2 for details.
+
+- "summary_type": can be "stuff", "truncate", "map_reduce", "refine"; default is "stuff"
+- "chunk_size": max token length for each chunk; the default value depends on "summary_type"
+- "chunk_overlap": overlap token length between chunks; default is 0.1\*chunk_size
+
+#### 3.2.1 Basic usage
+
 ```bash
 # Enable streaming to receive a streaming response. By default, this is set to True.
 curl http://${your_ip}:9000/v1/chat/docsum \
@@ -101,9 +117,52 @@ curl http://${your_ip}:9000/v1/chat/docsum \
   -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' \
   -H 'Content-Type: application/json'

-# Use Chinese mode. By default, language is set to "en"
+# Use Chinese mode
 curl http://${your_ip}:9000/v1/chat/docsum \
   -X POST \
   -d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' \
   -H 'Content-Type: application/json'
 ```
+
+#### 3.2.2 Long context summarization with "summary_type"
+
+"summary_type" is "stuff" by default, which lets the LLM generate the summary from the complete input text. In this case, please set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` carefully according to your model and device memory; otherwise long inputs may exceed the LLM context limit and raise an error.
+
+When dealing with long context, you can set "summary_type" to one of "truncate", "map_reduce", or "refine" for better performance.
+
+**summary_type=truncate**
+
+Truncate mode truncates the input text and keeps only the first chunk, whose length is `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`.
+
+```bash
+curl http://${your_ip}:9000/v1/chat/docsum \
+  -X POST \
+  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' \
+  -H 'Content-Type: application/json'
+```
+
+**summary_type=map_reduce**
+
+Map_reduce mode splits the input into multiple chunks, maps each chunk to an individual summary, then consolidates those summaries into a single global summary. `streaming=True` is not allowed here.
+
+In this mode, the default `chunk_size` is `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`.
+
+```bash
+curl http://${your_ip}:9000/v1/chat/docsum \
+  -X POST \
+  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' \
+  -H 'Content-Type: application/json'
+```
+
+**summary_type=refine**
+
+Refine mode splits the input into multiple chunks, generates a summary for the first chunk, combines it with the second chunk, and loops over every remaining chunk to produce the final summary.
+
+In this mode, the default `chunk_size` is `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`.
+
+```bash
+curl http://${your_ip}:9000/v1/chat/docsum \
+  -X POST \
+  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \
+  -H 'Content-Type: application/json'
+```
````
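The curl examples above translate directly to a small Python client. A minimal sketch using only the standard library; the request-body fields come from the README, while the helper name and the `localhost:9000` host are assumptions for illustration:

```python
import json


def docsum_request_body(query, summary_type="stuff", chunk_size=-1, chunk_overlap=-1):
    """Build the JSON body for POST /v1/chat/docsum, mirroring the curl examples."""
    return json.dumps({
        "query": query,
        "max_tokens": 32,
        "language": "en",
        "summary_type": summary_type,
        "chunk_size": chunk_size,       # -1 lets the service pick its default
        "chunk_overlap": chunk_overlap,  # -1 -> 0.1 * chunk_size on the service side
        "streaming": False,
    })


body = docsum_request_body("TEI is a toolkit ...", summary_type="map_reduce", chunk_size=2000)
print(body)
# To send it (assumes the service from section 2.3 is up on localhost:9000):
#   urllib.request.Request("http://localhost:9000/v1/chat/docsum", data=body.encode(),
#                          headers={"Content-Type": "application/json"})
```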

comps/llms/summarization/tgi/langchain/docker_compose_llm.yaml

Lines changed: 4 additions & 1 deletion

```diff
@@ -14,7 +14,7 @@ services:
     environment:
       HF_TOKEN: ${HF_TOKEN}
     shm_size: 1g
-    command: --model-id ${LLM_MODEL_ID}
+    command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
   llm:
     image: opea/llm-docsum-tgi:latest
     container_name: llm-docsum-tgi-server
@@ -27,6 +27,9 @@ services:
       https_proxy: ${https_proxy}
       TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
       HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
+      MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
+      MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
     restart: unless-stopped

 networks:
```
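The README's constraint on these two environment variables (MAX_TOTAL_TOKENS > MAX_INPUT_TOKENS + max_new_tokens + 50) can be checked before launching the stack. A small sketch; the helper name is hypothetical, not part of the service:

```python
def check_token_budget(max_input_tokens, max_total_tokens, max_new_tokens, reserved_prompt=50):
    """True if TGI's limits leave room for input + generation + reserved prompt tokens,
    per the rule stated in the README."""
    return max_total_tokens > max_input_tokens + max_new_tokens + reserved_prompt


# Defaults from docker_compose_llm.yaml: 2048 + 1024 + 50 = 3122 < 4096 -> OK
print(check_token_budget(2048, 4096, 1024))  # → True
```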

comps/llms/summarization/tgi/langchain/llm.py

Lines changed: 132 additions & 26 deletions

```diff
@@ -4,10 +4,14 @@
 import os

 from fastapi.responses import StreamingResponse
-from huggingface_hub import AsyncInferenceClient
+from langchain.chains.summarize import load_summarize_chain
+from langchain.docstore.document import Document
 from langchain.prompts import PromptTemplate
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
+from transformers import AutoTokenizer

-from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
+from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, ServiceType, opea_microservices, register_microservice
 from comps.cores.mega.utils import get_access_token

 logger = CustomLogger("llm_docsum")
@@ -17,6 +21,9 @@
 TOKEN_URL = os.getenv("TOKEN_URL")
 CLIENTID = os.getenv("CLIENTID")
 CLIENT_SECRET = os.getenv("CLIENT_SECRET")
+MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", 2048))
+MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", 4096))
+LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")

 templ_en = """Write a concise summary of the following:

@@ -35,70 +42,169 @@
 概况:"""


+templ_refine_en = """\
+Your job is to produce a final summary.
+We have provided an existing summary up to a certain point: {existing_answer}
+We have the opportunity to refine the existing summary (only if needed) with some more context below.
+------------
+{text}
+------------
+Given the new context, refine the original summary.
+If the context isn't useful, return the original summary.\
+"""
+
+templ_refine_zh = """\
+你的任务是生成一个最终摘要。
+我们已经提供了部分摘要:{existing_answer}
+如果有需要的话,可以通过以下更多上下文来完善现有摘要。
+------------
+{text}
+------------
+根据新上下文,完善原始摘要。
+如果上下文无用,则返回原始摘要。\
+"""
+
+
 @register_microservice(
     name="opea_service@llm_docsum",
     service_type=ServiceType.LLM,
     endpoint="/v1/chat/docsum",
     host="0.0.0.0",
     port=9000,
 )
-async def llm_generate(input: LLMParamsDoc):
+async def llm_generate(input: DocSumLLMParams):
     if logflag:
         logger.info(input)
+
     if input.language in ["en", "auto"]:
         templ = templ_en
+        templ_refine = templ_refine_en
     elif input.language in ["zh"]:
         templ = templ_zh
+        templ_refine = templ_refine_zh
     else:
         raise NotImplementedError('Please specify the input language in "en", "zh", "auto"')

-    prompt_template = PromptTemplate.from_template(templ)
-    prompt = prompt_template.format(text=input.query)
-
+    ## Prompt
+    PROMPT = PromptTemplate.from_template(templ)
+    if input.summary_type == "refine":
+        PROMPT_REFINE = PromptTemplate.from_template(templ_refine)
     if logflag:
         logger.info("After prompting:")
-        logger.info(prompt)
+        logger.info(PROMPT)
+        if input.summary_type == "refine":
+            logger.info(PROMPT_REFINE)
+
+    ## Split text
+    if input.summary_type == "stuff":
+        text_splitter = CharacterTextSplitter()
+    elif input.summary_type in ["truncate", "map_reduce", "refine"]:
+        if input.summary_type == "refine":
+            if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128:
+                raise RuntimeError("In refine mode, please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)")
+            max_input_tokens = min(
+                MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS
+            )  # 128 is the reserved token length for the prompt
+        else:
+            if MAX_TOTAL_TOKENS <= input.max_tokens + 50:
+                raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than (max_tokens + 50)")
+            max_input_tokens = min(
+                MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS
+            )  # 50 is the reserved token length for the prompt
+        chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens
+        chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size)
+        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+            tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+        )
+        if logflag:
+            logger.info(f"set chunk size to: {chunk_size}")
+            logger.info(f"set chunk overlap to: {chunk_overlap}")
+    else:
+        raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')
+    texts = text_splitter.split_text(input.query)
+    docs = [Document(page_content=t) for t in texts]
+    if logflag:
+        logger.info(f"Split input query into {len(docs)} chunks")
+        logger.info(f"The character length of the first chunk is {len(texts[0])}")

+    ## Access auth
     access_token = (
         get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
     )
-    headers = {}
+    server_kwargs = {}
     if access_token:
-        headers = {"Authorization": f"Bearer {access_token}"}
-    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
-    llm = AsyncInferenceClient(model=llm_endpoint, timeout=600, headers=headers)
+        server_kwargs["headers"] = {"Authorization": f"Bearer {access_token}"}

-    text_generation = await llm.text_generation(
-        prompt=prompt,
-        stream=input.streaming,
+    ## LLM
+    if input.streaming and input.summary_type == "map_reduce":
+        logger.info("map_reduce mode does not support streaming=True, setting streaming=False")
+        input.streaming = False
+    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
+    llm = HuggingFaceEndpoint(
+        endpoint_url=llm_endpoint,
         max_new_tokens=input.max_tokens,
-        repetition_penalty=input.repetition_penalty,
-        temperature=input.temperature,
         top_k=input.top_k,
         top_p=input.top_p,
         typical_p=input.typical_p,
+        temperature=input.temperature,
+        repetition_penalty=input.repetition_penalty,
+        streaming=input.streaming,
+        server_kwargs=server_kwargs,
     )

+    ## LLM chain
+    summary_type = input.summary_type
+    if summary_type == "stuff":
+        llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
+    elif summary_type == "truncate":
+        docs = [docs[0]]
+        llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
+    elif summary_type == "map_reduce":
+        llm_chain = load_summarize_chain(
+            llm=llm, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True
+        )
+    elif summary_type == "refine":
+        llm_chain = load_summarize_chain(
+            llm=llm,
+            question_prompt=PROMPT,
+            refine_prompt=PROMPT_REFINE,
+            chain_type="refine",
+            return_intermediate_steps=True,
+        )
+    else:
+        raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')
+
     if input.streaming:

         async def stream_generator():
-            chat_response = ""
-            async for text in text_generation:
-                chat_response += text
-                chunk_repr = repr(text.encode("utf-8"))
+            from langserve.serialization import WellKnownLCSerializer
+
+            _serializer = WellKnownLCSerializer()
+            async for chunk in llm_chain.astream_log(docs):
+                data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
                 if logflag:
-                    logger.info(f"[ docsum - text_summarize ] chunk:{chunk_repr}")
-                yield f"data: {chunk_repr}\n\n"
-            if logflag:
-                logger.info(f"[ docsum - text_summarize ] stream response: {chat_response}")
+                    logger.info(data)
+                yield f"data: {data}\n\n"
             yield "data: [DONE]\n\n"

         return StreamingResponse(stream_generator(), media_type="text/event-stream")
     else:
+        response = await llm_chain.ainvoke(docs)
+
+        if input.summary_type in ["map_reduce", "refine"]:
+            intermediate_steps = response["intermediate_steps"]
+            if logflag:
+                logger.info("intermediate_steps:")
+                logger.info(intermediate_steps)
+
+        output_text = response["output_text"]
         if logflag:
-            logger.info(text_generation)
-            return GeneratedDoc(text=text_generation, prompt=input.query)
+            logger.info("\n\noutput_text:")
+            logger.info(output_text)
+
+        return GeneratedDoc(text=output_text, prompt=input.query)


 if __name__ == "__main__":
+    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
     opea_microservices["opea_service@llm_docsum"].start()
```
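The chunk-sizing arithmetic in llm.py can be isolated and checked on its own. A minimal sketch of the same logic; the function name and the pure-function shape are illustrative, not part of the service:

```python
def resolve_chunking(summary_type, max_tokens, max_input_tokens_env, max_total_tokens_env,
                     chunk_size=-1, chunk_overlap=-1):
    """Replicate llm.py's chunk sizing for the long-context modes.

    Returns (chunk_size, chunk_overlap); -1 inputs mean "use the computed default".
    """
    if summary_type == "refine":
        # refine holds prompt + an intermediate summary, so it reserves more budget
        reserved = 2 * max_tokens + 128
    else:
        reserved = max_tokens + 50  # reserved prompt length
    if max_total_tokens_env <= reserved:
        raise RuntimeError("MAX_TOTAL_TOKENS too small for this summary_type")
    max_input = min(max_total_tokens_env - reserved, max_input_tokens_env)
    size = min(chunk_size, max_input) if chunk_size > 0 else max_input
    overlap = chunk_overlap if chunk_overlap > 0 else int(0.1 * size)
    return size, overlap


# With the README defaults (MAX_INPUT_TOKENS=2048, MAX_TOTAL_TOKENS=4096, max_tokens=32):
print(resolve_chunking("truncate", 32, 2048, 4096))  # → (2048, 204)
```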

comps/llms/summarization/tgi/langchain/requirements.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -1,5 +1,6 @@
 docarray[full]
 fastapi
+httpx==0.27.2
 huggingface_hub
 langchain #==0.1.12
 langchain-huggingface
```
