mirror of
https://github.com/langgenius/dify.git
synced 2026-02-16 07:01:44 -05:00
feat: mypy for all type check (#10921)
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import base64
|
||||
import logging
|
||||
|
||||
-from bs4 import BeautifulSoup
|
||||
+from bs4 import BeautifulSoup # type: ignore
|
||||
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
|
||||
@@ -30,6 +30,9 @@ class UnstructuredEpubExtractor(BaseExtractor):
|
||||
if self._api_url:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
|
||||
+if self._api_key is None:
|
||||
+raise ValueError("api_key is required")
|
||||
|
||||
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
||||
else:
|
||||
from unstructured.partition.epub import partition_epub
|
||||
|
||||
@@ -27,9 +27,11 @@ class UnstructuredPPTExtractor(BaseExtractor):
|
||||
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
||||
else:
|
||||
raise NotImplementedError("Unstructured API Url is not configured")
|
||||
-text_by_page = {}
|
||||
+text_by_page: dict[int, str] = {}
|
||||
for element in elements:
|
||||
page = element.metadata.page_number
|
||||
+if page is None:
|
||||
+continue
|
||||
text = element.text
|
||||
if page in text_by_page:
|
||||
text_by_page[page] += "\n" + text
|
||||
|
||||
@@ -29,14 +29,15 @@ class UnstructuredPPTXExtractor(BaseExtractor):
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
|
||||
elements = partition_pptx(filename=self._file_path)
|
||||
-text_by_page = {}
|
||||
+text_by_page: dict[int, str] = {}
|
||||
for element in elements:
|
||||
page = element.metadata.page_number
|
||||
text = element.text
|
||||
-if page in text_by_page:
|
||||
-text_by_page[page] += "\n" + text
|
||||
-else:
|
||||
-text_by_page[page] = text
|
||||
+if page is not None:
|
||||
+if page in text_by_page:
|
||||
+text_by_page[page] += "\n" + text
|
||||
+else:
|
||||
+text_by_page[page] = text
|
||||
|
||||
combined_texts = list(text_by_page.values())
|
||||
documents = []
|
||||
|
||||
Reference in New Issue
Block a user