feat: mypy for all type check (#10921)

2026-02-16 07:01:44 -05:00 · 2024-12-24 18:38:51 +08:00
parent c91e8b1737
commit 56e15d09a9
584 changed files with 3975 additions and 2826 deletions
--- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
@@ -1,7 +1,7 @@
 import base64
 import logging

-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup  # type: ignore

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
--- a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
@@ -30,6 +30,9 @@ class UnstructuredEpubExtractor(BaseExtractor):
        if self._api_url:
            from unstructured.partition.api import partition_via_api

+            if self._api_key is None:
+                raise ValueError("api_key is required")
+
            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
        else:
            from unstructured.partition.epub import partition_epub
--- a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
@@ -27,9 +27,11 @@ class UnstructuredPPTExtractor(BaseExtractor):
            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
        else:
            raise NotImplementedError("Unstructured API Url is not configured")
-        text_by_page = {}
+        text_by_page: dict[int, str] = {}
        for element in elements:
            page = element.metadata.page_number
+            if page is None:
+                continue
            text = element.text
            if page in text_by_page:
                text_by_page[page] += "\n" + text
--- a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
@@ -29,14 +29,15 @@ class UnstructuredPPTXExtractor(BaseExtractor):
            from unstructured.partition.pptx import partition_pptx

            elements = partition_pptx(filename=self._file_path)
-        text_by_page = {}
+        text_by_page: dict[int, str] = {}
        for element in elements:
            page = element.metadata.page_number
            text = element.text
-            if page in text_by_page:
-                text_by_page[page] += "\n" + text
-            else:
-                text_by_page[page] = text
+            if page is not None:
+                if page in text_by_page:
+                    text_by_page[page] += "\n" + text
+                else:
+                    text_by_page[page] = text

        combined_texts = list(text_by_page.values())
        documents = []