update Langchain.rst

This commit is contained in:
bug-orz 2024-04-12 12:39:34 +08:00
parent aaeda027c5
commit 6c2a2df7a6
1 changed files with 32 additions and 7 deletions

View File

@ -15,6 +15,11 @@ incorporating the matched text as context along with the question into the promp
then submitting it to Qwen1.5-7B-Chat to generate an answer.
Below is an example:
.. code:: bash
pip install langchain==0.0.174
pip install faiss-gpu
.. code:: python
from transformers import AutoModelForCausalLM, AutoTokenizer
@ -92,6 +97,7 @@ for retrieval.
.. code:: python
import os
import re
import torch
import argparse
from langchain.vectorstores import FAISS
@ -99,15 +105,35 @@ for retrieval.
from typing import List, Tuple
import numpy as np
from langchain.document_loaders import TextLoader
from chinese_text_splitter import ChineseTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import RetrievalQA
class ChineseTextSplitter(CharacterTextSplitter):
    """Split Chinese text into sentences on full-width terminal punctuation.

    Terminal punctuation (and up to two trailing closing quotation marks)
    stays attached to the sentence it ends.
    """

    def __init__(self, pdf: bool = False, **kwargs):
        super().__init__(**kwargs)
        # When True, normalise whitespace artifacts typical of PDF extraction.
        self.pdf = pdf

    def split_text(self, text: str) -> List[str]:
        """Return the sentences of ``text`` as a list of strings."""
        if self.pdf:
            # Collapse runs of 3+ newlines, flatten remaining whitespace,
            # then drop any leftover blank-line markers.
            text = re.sub(r"\n{3,}", "\n", text)
            text = re.sub('\s', ' ', text)
            text = text.replace("\n\n", "")
        # Capturing group: re.split() then yields the separators interleaved
        # with the sentence fragments.
        boundary = re.compile(
            '([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')
        sentences: List[str] = []
        for piece in boundary.split(text):
            if boundary.match(piece) and sentences:
                # A separator: glue it onto the sentence it terminates.
                sentences[-1] += piece
            elif piece:
                sentences.append(piece)
        return sentences
def load_file(filepath):
    """Load a text file and split it into sentence-level document chunks.

    The diff rendering had left the pre-change signature
    (``load_file(filepath, sentence_size=100)``) interleaved with the
    post-change one; only the post-change version is kept here, since
    ``ChineseTextSplitter`` no longer accepts ``sentence_size``.

    Args:
        filepath: Path of the text file to load.

    Returns:
        The list of Document chunks produced by splitting the file with
        ``ChineseTextSplitter``.
    """
    loader = TextLoader(filepath, autodetect_encoding=True)
    textsplitter = ChineseTextSplitter(pdf=False)
    docs = loader.load_and_split(textsplitter)
    # write_check_file presumably dumps the chunks for manual inspection —
    # defined elsewhere in the document; confirm before relying on it.
    write_check_file(filepath, docs)
    return docs
@ -215,15 +241,14 @@ for retrieval.
# Script-level wiring for the retrieval-QA example.
# The diff rendering interleaved removed lines (SENTENCE_SIZE, QWen(),
# load_file(filepath, sentence_size=...)) with their replacements; only the
# post-change lines are kept here.
EMBEDDING_DEVICE = "cuda"
# return top-k text chunk from vector store
VECTOR_SEARCH_TOP_K = 3
CHAIN_TYPE = 'stuff'
embedding_model_dict = {
    "text2vec": "your text2vec model path",
}
llm = Qwen()
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL], model_kwargs={'device': EMBEDDING_DEVICE})
docs = load_file(filepath)
docsearch = FAISSWrapper.from_documents(docs, embeddings)
@ -246,4 +271,4 @@ Next Step
Now you can chat with Qwen1.5 using your own documents. Continue
to read the documentation and try to figure out more advanced usages of
model retrieval!
model retrieval!