perf: commit once (#29590)
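The diff below drops the per-image db.session.commit() calls inside WordExtractor._extract_images_from_docx and commits the session once after the loop over the document's image relationships. A minimal sketch of that pattern with a generic SQLAlchemy-style session follows; store_images and make_record are placeholder names for illustration, not dify code:

from typing import Any, Callable, Mapping

def store_images(session: Any, images: Mapping[str, bytes],
                 make_record: Callable[[str, bytes], Any]) -> dict[str, Any]:
    """Stage one ORM row per image, then commit once for the whole batch."""
    image_map: dict[str, Any] = {}
    for key, blob in images.items():
        record = make_record(key, blob)  # build the ORM object for this image
        session.add(record)              # staged in the unit of work; no commit yet
        image_map[key] = record
    session.commit()                     # a single commit persists all staged rows
    return image_map

With SQLAlchemy, session.add() only stages the object, so deferring commit() to after the loop turns one transaction per image into one per document.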
@@ -84,7 +84,7 @@ class WordExtractor(BaseExtractor):
         image_count = 0
         image_map = {}

-        for rId, rel in doc.part.rels.items():
+        for r_id, rel in doc.part.rels.items():
             if "image" in rel.target_ref:
                 image_count += 1
                 if rel.is_external:
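For context on what this loop iterates: in python-docx, doc.part.rels maps relationship ids ("rId1", "rId2", …) to relationship objects exposing is_external, target_ref and, for embedded parts, target_part; accessing target_part on an external relationship raises ValueError, which is why external images are keyed by r_id in the next hunk. A small inspection sketch (sample.docx is a hypothetical local file with both linked and embedded pictures):

from docx import Document

doc = Document("sample.docx")  # hypothetical sample document
for r_id, rel in doc.part.rels.items():
    if "image" in rel.target_ref:
        if rel.is_external:
            # Linked picture: target_ref is a URL; rel.target_part would raise ValueError
            print(r_id, "external image at", rel.target_ref)
        else:
            # Embedded picture: the bytes live on the related part's .blob
            print(r_id, "embedded image", rel.target_ref, f"({len(rel.target_part.blob)} bytes)")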
@@ -121,9 +121,8 @@ class WordExtractor(BaseExtractor):
                         used_at=naive_utc_now(),
                     )
                     db.session.add(upload_file)
-                    db.session.commit()
-                    # Use rId as key for external images since target_part is undefined
-                    image_map[rId] = f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)"
+                    # Use r_id as key for external images since target_part is undefined
+                    image_map[r_id] = f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)"
                 else:
                     image_ext = rel.target_ref.split(".")[-1]
                     if image_ext is None:
@@ -151,12 +150,11 @@ class WordExtractor(BaseExtractor):
                         used_at=naive_utc_now(),
                     )
                     db.session.add(upload_file)
-                    db.session.commit()
                     # Use target_part as key for internal images
                     image_map[rel.target_part] = (
                         f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)"
                     )
-
+        db.session.commit()
         return image_map

     def _table_to_markdown(self, table, image_map):
@@ -1,7 +1,10 @@
 """Primarily used for testing merged cell scenarios"""

+from types import SimpleNamespace
+
 from docx import Document

+import core.rag.extractor.word_extractor as we
 from core.rag.extractor.word_extractor import WordExtractor

@@ -47,3 +50,85 @@ def test_parse_row():
     extractor = object.__new__(WordExtractor)
     for idx, row in enumerate(table.rows):
         assert extractor._parse_row(row, {}, 3) == gt[idx]
+
+
+def test_extract_images_from_docx(monkeypatch):
+    external_bytes = b"ext-bytes"
+    internal_bytes = b"int-bytes"
+
+    # Patch storage.save to capture writes
+    saves: list[tuple[str, bytes]] = []
+
+    def save(key: str, data: bytes):
+        saves.append((key, data))
+
+    monkeypatch.setattr(we, "storage", SimpleNamespace(save=save))
+
+    # Patch db.session to record adds/commit
+    class DummySession:
+        def __init__(self):
+            self.added = []
+            self.committed = False
+
+        def add(self, obj):
+            self.added.append(obj)
+
+        def commit(self):
+            self.committed = True
+
+    db_stub = SimpleNamespace(session=DummySession())
+    monkeypatch.setattr(we, "db", db_stub)
+
+    # Patch config values used for URL composition and storage type
+    monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
+    monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)
+
+    # Patch UploadFile to avoid real DB models
+    class FakeUploadFile:
+        _i = 0
+
+        def __init__(self, **kwargs):  # kwargs match the real signature fields
+            type(self)._i += 1
+            self.id = f"u{self._i}"
+
+    monkeypatch.setattr(we, "UploadFile", FakeUploadFile)
+
+    # Patch external image fetcher
+    def fake_get(url: str):
+        assert url == "https://example.com/image.png"
+        return SimpleNamespace(status_code=200, headers={"Content-Type": "image/png"}, content=external_bytes)
+
+    monkeypatch.setattr(we, "ssrf_proxy", SimpleNamespace(get=fake_get))
+
+    # A hashable internal part object with a blob attribute
+    class HashablePart:
+        def __init__(self, blob: bytes):
+            self.blob = blob
+
+        def __hash__(self) -> int:  # ensure it can be used as a dict key like real docx parts
+            return id(self)
+
+    # Build a minimal doc object with both external and internal image rels
+    internal_part = HashablePart(blob=internal_bytes)
+    rel_ext = SimpleNamespace(is_external=True, target_ref="https://example.com/image.png")
+    rel_int = SimpleNamespace(is_external=False, target_ref="word/media/image1.png", target_part=internal_part)
+    doc = SimpleNamespace(part=SimpleNamespace(rels={"rId1": rel_ext, "rId2": rel_int}))
+
+    extractor = object.__new__(WordExtractor)
+    extractor.tenant_id = "t1"
+    extractor.user_id = "u1"
+
+    image_map = extractor._extract_images_from_docx(doc)
+
+    # Returned map should contain entries for external (keyed by rId) and internal (keyed by target_part)
+    assert set(image_map.keys()) == {"rId1", internal_part}
+    assert all(v.startswith("![image](") and v.endswith("/file-preview)") for v in image_map.values())
+
+    # Storage should receive both payloads
+    payloads = {data for _, data in saves}
+    assert external_bytes in payloads
+    assert internal_bytes in payloads
+
+    # DB interactions should be recorded
+    assert len(db_stub.session.added) == 2
+    assert db_stub.session.committed is True
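One note on the test: DummySession only records a boolean committed flag, so it verifies that a commit happened but not that it happened exactly once, which is the point of this change. A possible variant (a sketch, not part of the commit) that counts commits instead:

class CountingSession:
    """Session stub that counts commits rather than only flagging them."""

    def __init__(self):
        self.added = []
        self.commit_count = 0

    def add(self, obj):
        self.added.append(obj)

    def commit(self):
        self.commit_count += 1

Swapping this in for DummySession and asserting commit_count == 1 (the fixture doc contains two images) would pin down the single-commit behavior.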