class BaseSplitHandle(ABC):
    """Abstract interface for a handler that splits an uploaded file.

    A concrete handler must implement both methods; the class cannot be
    instantiated until it does.
    """

    @abstractmethod
    def support(self, file, get_buffer):
        """Report whether this handler should process ``file``.

        :param file: uploaded file object; implementations read ``file.name``.
        :param get_buffer: callable taking ``file`` and returning its content.
        :return: truthy when the handler accepts the file.
        """

    @abstractmethod
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Split ``file`` into paragraphs.

        :param file: uploaded file object.
        :param pattern_list: split patterns supplied by the caller; may be
            ``None`` or empty.
        :param with_filter: flag forwarded to the splitting model.
        :param limit: limit forwarded to the splitting model.
        :param get_buffer: callable taking ``file`` and returning its content.
        :return: the split result for the file.
        """
def support(self, file, get_buffer):
    """Return True when ``file`` has a Word document extension.

    The check is case-insensitive and looks only at the file name; the
    content is not inspected, so ``get_buffer`` is unused here.

    :param file: uploaded file object exposing a ``name`` string.
    :param get_buffer: callable returning the file content (unused).
    :return: ``True`` for names ending in ``.docx`` or ``.doc``, else ``False``.
    """
    # NOTE(review): python-docx documents support for .docx only; legacy
    # .doc files are accepted here but may fail to parse - confirm.
    return file.name.lower().endswith((".docx", ".doc"))
def support(self, file, get_buffer):
    """Return True when ``file`` has a ``.pdf`` extension.

    The check is case-insensitive and looks only at the file name; the
    content is not inspected, so ``get_buffer`` is unused here.

    :param file: uploaded file object exposing a ``name`` string.
    :param get_buffer: callable returning the file content (unused).
    :return: ``True`` for names ending in ``.pdf``, else ``False``.
    """
    return file.name.lower().endswith(".pdf")
class FileBufferHandle:
    """Read a file's content once and reuse it on later requests.

    ``file.read()`` consumes the stream, so the first result is cached and
    returned again when the same file object is passed back in.
    """

    # Bytes from the most recent read; None until get_buffer is first called.
    buffer = None
    # File object the cached bytes were read from.
    _source = None

    def get_buffer(self, file):
        """Return the content of ``file``, reading it at most once.

        The cache is tied to the file object it was filled from: passing a
        different file reads that file instead of returning stale bytes
        that belong to an earlier one.

        :param file: object exposing ``read()``.
        :return: whatever ``file.read()`` returned for this file.
        """
        if self.buffer is None or self._source is not file:
            self.buffer = file.read()
            self._source = file
        return self.buffer
with_filter, limit) - else: - split_model = get_split_model(file.name, with_filter=with_filter, limit=limit) - try: - content = data.decode(chardet.detect(data)['encoding']) - except BaseException as e: - return {'name': file.name, - 'content': []} - return {'name': file.name, - 'content': split_model.parse(content) - } + get_buffer = FileBufferHandle().get_buffer + for split_handle in split_handles: + if split_handle.support(file, get_buffer): + return split_handle.handle(file, pattern_list, with_filter, limit, get_buffer) + return default_split_handle.handle(file, pattern_list, with_filter, limit, get_buffer) diff --git a/pyproject.toml b/pyproject.toml index 2675941..b2152f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,8 @@ langchain-openai = "^0.0.8" django-ipware = "^6.0.4" django-apscheduler = "^0.6.2" chardet2 = "^2.0.3" +pymupdf = "^1.24.0" +python-docx = "^1.1.0" [build-system] requires = ["poetry-core"] diff --git a/ui/src/views/dataset/component/UploadComponent.vue b/ui/src/views/dataset/component/UploadComponent.vue index d4d3fc4..b750889 100644 --- a/ui/src/views/dataset/component/UploadComponent.vue +++ b/ui/src/views/dataset/component/UploadComponent.vue @@ -16,7 +16,7 @@ action="#" :auto-upload="false" :show-file-list="false" - accept=".txt, .md, .csv, .log" + accept=".txt, .md, .csv, .log, .doc, .docx, .pdf" :limit="50" :on-exceed="onExceed" >