feat: mypy for all type check (#10921)

This commit is contained in:
yihong
2024-12-24 18:38:51 +08:00
committed by GitHub
parent c91e8b1737
commit 56e15d09a9
584 changed files with 3975 additions and 2826 deletions

View File

@@ -1,7 +1,7 @@
import base64
import logging
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup # type: ignore
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

View File

@@ -30,6 +30,9 @@ class UnstructuredEpubExtractor(BaseExtractor):
if self._api_url:
from unstructured.partition.api import partition_via_api
if self._api_key is None:
raise ValueError("api_key is required")
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.epub import partition_epub

View File

@@ -27,9 +27,11 @@ class UnstructuredPPTExtractor(BaseExtractor):
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
raise NotImplementedError("Unstructured API Url is not configured")
text_by_page = {}
text_by_page: dict[int, str] = {}
for element in elements:
page = element.metadata.page_number
if page is None:
continue
text = element.text
if page in text_by_page:
text_by_page[page] += "\n" + text

View File

@@ -29,14 +29,15 @@ class UnstructuredPPTXExtractor(BaseExtractor):
from unstructured.partition.pptx import partition_pptx
elements = partition_pptx(filename=self._file_path)
text_by_page = {}
text_by_page: dict[int, str] = {}
for element in elements:
page = element.metadata.page_number
text = element.text
if page in text_by_page:
text_by_page[page] += "\n" + text
else:
text_by_page[page] = text
if page is not None:
if page in text_by_page:
text_by_page[page] += "\n" + text
else:
text_by_page[page] = text
combined_texts = list(text_by_page.values())
documents = []