perf: commit once (#29590)

Author: wangxiaolei
Date: 2025-12-15 11:40:26 +08:00
Committed by: GitHub
Parent: 1a18012f98
Commit: 8f3fd9a728
2 changed files with 89 additions and 6 deletions
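
The change itself is small: the per-image db.session.commit() calls inside the image-relationship loop of WordExtractor._extract_images_from_docx are removed, and a single commit is issued after the loop, so every UploadFile row is written in one transaction instead of one transaction per image. A minimal sketch of the pattern, assuming standard SQLAlchemy session semantics; build_upload_file is a hypothetical stand-in for the inline record construction, and the image filtering and error handling of the real method are omitted:

def extract_before(doc, db, build_upload_file):
    # Old behaviour: commit on every iteration, i.e. one transaction per image.
    for r_id, rel in doc.part.rels.items():
        upload_file = build_upload_file(rel)
        db.session.add(upload_file)
        db.session.commit()

def extract_after(doc, db, build_upload_file):
    # New behaviour: stage every row, then commit once after the loop.
    for r_id, rel in doc.part.rels.items():
        upload_file = build_upload_file(rel)
        db.session.add(upload_file)
    db.session.commit()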


@@ -84,7 +84,7 @@ class WordExtractor(BaseExtractor):
image_count = 0
image_map = {}
-for rId, rel in doc.part.rels.items():
+for r_id, rel in doc.part.rels.items():
if "image" in rel.target_ref:
image_count += 1
if rel.is_external:
@@ -121,9 +121,8 @@ class WordExtractor(BaseExtractor):
used_at=naive_utc_now(),
)
db.session.add(upload_file)
-db.session.commit()
-# Use rId as key for external images since target_part is undefined
-image_map[rId] = f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)"
+# Use r_id as key for external images since target_part is undefined
+image_map[r_id] = f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)"
else:
image_ext = rel.target_ref.split(".")[-1]
if image_ext is None:
@@ -151,12 +150,11 @@ class WordExtractor(BaseExtractor):
used_at=naive_utc_now(),
)
db.session.add(upload_file)
-db.session.commit()
# Use target_part as key for internal images
image_map[rel.target_part] = (
f"![image]({dify_config.FILES_URL}/files/{upload_file.id}/file-preview)"
)
+db.session.commit()
return image_map
def _table_to_markdown(self, table, image_map):


@@ -1,7 +1,10 @@
"""Primarily used for testing merged cell scenarios"""
+from types import SimpleNamespace
from docx import Document
+import core.rag.extractor.word_extractor as we
from core.rag.extractor.word_extractor import WordExtractor
@@ -47,3 +50,85 @@ def test_parse_row():
    extractor = object.__new__(WordExtractor)
    for idx, row in enumerate(table.rows):
        assert extractor._parse_row(row, {}, 3) == gt[idx]


def test_extract_images_from_docx(monkeypatch):
    external_bytes = b"ext-bytes"
    internal_bytes = b"int-bytes"

    # Patch storage.save to capture writes
    saves: list[tuple[str, bytes]] = []

    def save(key: str, data: bytes):
        saves.append((key, data))

    monkeypatch.setattr(we, "storage", SimpleNamespace(save=save))

    # Patch db.session to record adds/commit
    class DummySession:
        def __init__(self):
            self.added = []
            self.committed = False

        def add(self, obj):
            self.added.append(obj)

        def commit(self):
            self.committed = True

    db_stub = SimpleNamespace(session=DummySession())
    monkeypatch.setattr(we, "db", db_stub)

    # Patch config values used for URL composition and storage type
    monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
    monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)

    # Patch UploadFile to avoid real DB models
    class FakeUploadFile:
        _i = 0

        def __init__(self, **kwargs):  # kwargs match the real signature fields
            type(self)._i += 1
            self.id = f"u{self._i}"

    monkeypatch.setattr(we, "UploadFile", FakeUploadFile)

    # Patch external image fetcher
    def fake_get(url: str):
        assert url == "https://example.com/image.png"
        return SimpleNamespace(status_code=200, headers={"Content-Type": "image/png"}, content=external_bytes)

    monkeypatch.setattr(we, "ssrf_proxy", SimpleNamespace(get=fake_get))

    # A hashable internal part object with a blob attribute
    class HashablePart:
        def __init__(self, blob: bytes):
            self.blob = blob

        def __hash__(self) -> int:  # ensure it can be used as a dict key like real docx parts
            return id(self)

    # Build a minimal doc object with both external and internal image rels
    internal_part = HashablePart(blob=internal_bytes)
    rel_ext = SimpleNamespace(is_external=True, target_ref="https://example.com/image.png")
    rel_int = SimpleNamespace(is_external=False, target_ref="word/media/image1.png", target_part=internal_part)
    doc = SimpleNamespace(part=SimpleNamespace(rels={"rId1": rel_ext, "rId2": rel_int}))

    extractor = object.__new__(WordExtractor)
    extractor.tenant_id = "t1"
    extractor.user_id = "u1"

    image_map = extractor._extract_images_from_docx(doc)

    # Returned map should contain entries for external (keyed by rId) and internal (keyed by target_part)
    assert set(image_map.keys()) == {"rId1", internal_part}
    assert all(v.startswith("![image](") and v.endswith("/file-preview)") for v in image_map.values())

    # Storage should receive both payloads
    payloads = {data for _, data in saves}
    assert external_bytes in payloads
    assert internal_bytes in payloads

    # DB interactions should be recorded
    assert len(db_stub.session.added) == 2
    assert db_stub.session.committed is True
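
A note on the test design: word_extractor is imported as we so that monkeypatch.setattr can swap the module-level bindings (storage, db, UploadFile, ssrf_proxy, dify_config) that _extract_images_from_docx looks up at call time, and object.__new__(WordExtractor) creates an instance without running __init__, so only tenant_id and user_id need to be set. Since the point of the commit is that the session commits exactly once, a counting variant of the session stub (a sketch only, not part of the diff) would let the test assert that directly:

class CountingSession:
    """Like DummySession above, but counts commit() calls instead of only flagging them."""

    def __init__(self):
        self.added = []
        self.commit_count = 0

    def add(self, obj):
        self.added.append(obj)

    def commit(self):
        self.commit_count += 1

# With db_stub = SimpleNamespace(session=CountingSession()) patched in as above,
# the single-commit behaviour becomes an explicit assertion:
#     assert db_stub.session.commit_count == 1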