fix: dos in annotation import (#29470)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
@@ -670,3 +670,14 @@ SINGLE_CHUNK_ATTACHMENT_LIMIT=10
 ATTACHMENT_IMAGE_FILE_SIZE_LIMIT=2
 ATTACHMENT_IMAGE_DOWNLOAD_TIMEOUT=60
 IMAGE_FILE_BATCH_LIMIT=10
+
+# Maximum allowed CSV file size for annotation import in megabytes
+ANNOTATION_IMPORT_FILE_SIZE_LIMIT=2
+#Maximum number of annotation records allowed in a single import
+ANNOTATION_IMPORT_MAX_RECORDS=10000
+# Minimum number of annotation records required in a single import
+ANNOTATION_IMPORT_MIN_RECORDS=1
+ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE=5
+ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR=20
+# Maximum number of concurrent annotation import tasks per tenant
+ANNOTATION_IMPORT_MAX_CONCURRENT=5
@@ -380,6 +380,37 @@ class FileUploadConfig(BaseSettings):
         default=60,
     )
 
+    # Annotation Import Security Configurations
+    ANNOTATION_IMPORT_FILE_SIZE_LIMIT: NonNegativeInt = Field(
+        description="Maximum allowed CSV file size for annotation import in megabytes",
+        default=2,
+    )
+
+    ANNOTATION_IMPORT_MAX_RECORDS: PositiveInt = Field(
+        description="Maximum number of annotation records allowed in a single import",
+        default=10000,
+    )
+
+    ANNOTATION_IMPORT_MIN_RECORDS: PositiveInt = Field(
+        description="Minimum number of annotation records required in a single import",
+        default=1,
+    )
+
+    ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE: PositiveInt = Field(
+        description="Maximum number of annotation import requests per minute per tenant",
+        default=5,
+    )
+
+    ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR: PositiveInt = Field(
+        description="Maximum number of annotation import requests per hour per tenant",
+        default=20,
+    )
+
+    ANNOTATION_IMPORT_MAX_CONCURRENT: PositiveInt = Field(
+        description="Maximum number of concurrent annotation import tasks per tenant",
+        default=2,
+    )
+
     inner_UPLOAD_FILE_EXTENSION_BLACKLIST: str = Field(
         description=(
             "Comma-separated list of file extensions that are blocked from upload. "
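Editor's note: the fields above are ordinary pydantic-settings fields, so the env vars added to the .env templates in this commit override the in-code defaults. A minimal, self-contained sketch of that mechanism (not part of this diff; the class and variable below are illustrative only, assuming pydantic-settings is installed, as Dify's configs already use it):

import os
from pydantic import Field, NonNegativeInt
from pydantic_settings import BaseSettings

class AnnotationImportDemoSettings(BaseSettings):
    # Mirrors the shape of the new fields: env var name matches the field name,
    # and the default applies only when the variable is unset.
    ANNOTATION_IMPORT_FILE_SIZE_LIMIT: NonNegativeInt = Field(default=2)

os.environ["ANNOTATION_IMPORT_FILE_SIZE_LIMIT"] = "5"  # e.g. set via .env or docker-compose
settings = AnnotationImportDemoSettings()
print(settings.ANNOTATION_IMPORT_FILE_SIZE_LIMIT)      # -> 5, overriding the default of 2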
@@ -1,6 +1,6 @@
 from typing import Any, Literal
 
-from flask import request
+from flask import abort, request
 from flask_restx import Resource, fields, marshal, marshal_with
 from pydantic import BaseModel, Field, field_validator
 
@@ -8,6 +8,8 @@ from controllers.common.errors import NoFileUploadedError, TooManyFilesError
 from controllers.console import console_ns
 from controllers.console.wraps import (
     account_initialization_required,
+    annotation_import_concurrency_limit,
+    annotation_import_rate_limit,
     cloud_edition_billing_resource_check,
     edit_permission_required,
     setup_required,
@@ -314,18 +316,25 @@ class AnnotationUpdateDeleteApi(Resource):
 @console_ns.route("/apps/<uuid:app_id>/annotations/batch-import")
 class AnnotationBatchImportApi(Resource):
     @console_ns.doc("batch_import_annotations")
-    @console_ns.doc(description="Batch import annotations from CSV file")
+    @console_ns.doc(description="Batch import annotations from CSV file with rate limiting and security checks")
     @console_ns.doc(params={"app_id": "Application ID"})
     @console_ns.response(200, "Batch import started successfully")
     @console_ns.response(403, "Insufficient permissions")
     @console_ns.response(400, "No file uploaded or too many files")
+    @console_ns.response(413, "File too large")
+    @console_ns.response(429, "Too many requests or concurrent imports")
     @setup_required
     @login_required
     @account_initialization_required
     @cloud_edition_billing_resource_check("annotation")
+    @annotation_import_rate_limit
+    @annotation_import_concurrency_limit
     @edit_permission_required
     def post(self, app_id):
+        from configs import dify_config
+
         app_id = str(app_id)
 
         # check file
         if "file" not in request.files:
             raise NoFileUploadedError()
@@ -335,9 +344,27 @@ class AnnotationBatchImportApi(Resource):
 
         # get file from request
         file = request.files["file"]
 
         # check file type
         if not file.filename or not file.filename.lower().endswith(".csv"):
             raise ValueError("Invalid file type. Only CSV files are allowed")
 
+        # Check file size before processing
+        file.seek(0, 2)  # Seek to end of file
+        file_size = file.tell()
+        file.seek(0)  # Reset to beginning
+
+        max_size_bytes = dify_config.ANNOTATION_IMPORT_FILE_SIZE_LIMIT * 1024 * 1024
+        if file_size > max_size_bytes:
+            abort(
+                413,
+                f"File size exceeds maximum limit of {dify_config.ANNOTATION_IMPORT_FILE_SIZE_LIMIT}MB. "
+                f"Please reduce the file size and try again.",
+            )
+
+        if file_size == 0:
+            raise ValueError("The uploaded file is empty")
+
         return AppAnnotationService.batch_import_app_annotations(app_id, file)
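Editor's note: a hedged client-side sketch of the hardened endpoint. The route itself comes from the decorator above; the /console/api prefix, host/port, and auth header are assumptions about a typical Dify console deployment and are not shown in this diff.

import requests

resp = requests.post(
    "http://localhost:5001/console/api/apps/<app_id>/annotations/batch-import",  # prefix assumed
    headers={"Authorization": "Bearer <console-access-token>"},                  # auth assumed
    files={"file": ("annotations.csv", open("annotations.csv", "rb"), "text/csv")},
)
# Per the responses documented above: oversized files now return 413, and bursts beyond the
# per-minute/per-hour windows or the concurrency cap return 429.
print(resp.status_code, resp.text)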
@@ -331,3 +331,91 @@ def is_admin_or_owner_required(f: Callable[P, R]):
         return f(*args, **kwargs)
 
     return decorated_function
+
+
+def annotation_import_rate_limit(view: Callable[P, R]):
+    """
+    Rate limiting decorator for annotation import operations.
+
+    Implements sliding window rate limiting with two tiers:
+    - Short-term: Configurable requests per minute (default: 5)
+    - Long-term: Configurable requests per hour (default: 20)
+
+    Uses Redis ZSET for distributed rate limiting across multiple instances.
+    """
+
+    @wraps(view)
+    def decorated(*args: P.args, **kwargs: P.kwargs):
+        _, current_tenant_id = current_account_with_tenant()
+        current_time = int(time.time() * 1000)
+
+        # Check per-minute rate limit
+        minute_key = f"annotation_import_rate_limit:{current_tenant_id}:1min"
+        redis_client.zadd(minute_key, {current_time: current_time})
+        redis_client.zremrangebyscore(minute_key, 0, current_time - 60000)
+        minute_count = redis_client.zcard(minute_key)
+        redis_client.expire(minute_key, 120)  # 2 minutes TTL
+
+        if minute_count > dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE:
+            abort(
+                429,
+                f"Too many annotation import requests. Maximum {dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE} "
+                f"requests per minute allowed. Please try again later.",
+            )
+
+        # Check per-hour rate limit
+        hour_key = f"annotation_import_rate_limit:{current_tenant_id}:1hour"
+        redis_client.zadd(hour_key, {current_time: current_time})
+        redis_client.zremrangebyscore(hour_key, 0, current_time - 3600000)
+        hour_count = redis_client.zcard(hour_key)
+        redis_client.expire(hour_key, 7200)  # 2 hours TTL
+
+        if hour_count > dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR:
+            abort(
+                429,
+                f"Too many annotation import requests. Maximum {dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR} "
+                f"requests per hour allowed. Please try again later.",
+            )
+
+        return view(*args, **kwargs)
+
+    return decorated
+
+
+def annotation_import_concurrency_limit(view: Callable[P, R]):
+    """
+    Concurrency control decorator for annotation import operations.
+
+    Limits the number of concurrent import tasks per tenant to prevent
+    resource exhaustion and ensure fair resource allocation.
+
+    Uses Redis ZSET to track active import jobs with automatic cleanup
+    of stale entries (jobs older than 2 minutes).
+    """
+
+    @wraps(view)
+    def decorated(*args: P.args, **kwargs: P.kwargs):
+        _, current_tenant_id = current_account_with_tenant()
+        current_time = int(time.time() * 1000)
+
+        active_jobs_key = f"annotation_import_active:{current_tenant_id}"
+
+        # Clean up stale entries (jobs that should have completed or timed out)
+        stale_threshold = current_time - 120000  # 2 minutes ago
+        redis_client.zremrangebyscore(active_jobs_key, 0, stale_threshold)
+
+        # Check current active job count
+        active_count = redis_client.zcard(active_jobs_key)
+
+        if active_count >= dify_config.ANNOTATION_IMPORT_MAX_CONCURRENT:
+            abort(
+                429,
+                f"Too many concurrent import tasks. Maximum {dify_config.ANNOTATION_IMPORT_MAX_CONCURRENT} "
+                f"concurrent imports allowed per workspace. Please wait for existing imports to complete.",
+            )
+
+        # Allow the request to proceed
+        # The actual job registration will happen in the service layer
+        return view(*args, **kwargs)
+
+    return decorated
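Editor's note: both decorators share a single Redis pattern — add the current timestamp to a per-tenant ZSET, trim members older than the window, then compare the cardinality to the limit. A standalone sketch of that sliding window, assuming a local redis-py client; the key and function names here are illustrative, not the decorator's internals verbatim.

import time
import redis

r = redis.Redis()  # assumption: a local Redis, standing in for dify's redis_client

def within_sliding_window(tenant_id: str, limit: int, window_ms: int) -> bool:
    """Record one request and report whether the tenant is still under `limit` for the window."""
    now_ms = int(time.time() * 1000)
    key = f"demo_rate_limit:{tenant_id}:{window_ms}"
    r.zadd(key, {str(now_ms): now_ms})              # member scored by its timestamp
    r.zremrangebyscore(key, 0, now_ms - window_ms)  # drop entries that fell out of the window
    r.expire(key, max(window_ms // 500, 1))         # let idle keys expire (~2x the window)
    return r.zcard(key) <= limit

# e.g. 5 requests per minute, mirroring ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE's default
allowed = within_sliding_window("tenant-123", limit=5, window_ms=60_000)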
@@ -1,6 +1,9 @@
+import logging
 import uuid
 
 import pandas as pd
+
+logger = logging.getLogger(__name__)
 from sqlalchemy import or_, select
 from werkzeug.datastructures import FileStorage
 from werkzeug.exceptions import NotFound
@@ -330,6 +333,18 @@ class AppAnnotationService:
 
     @classmethod
     def batch_import_app_annotations(cls, app_id, file: FileStorage):
+        """
+        Batch import annotations from CSV file with enhanced security checks.
+
+        Security features:
+        - File size validation
+        - Row count limits (min/max)
+        - Memory-efficient CSV parsing
+        - Subscription quota validation
+        - Concurrency tracking
+        """
+        from configs import dify_config
+
         # get app info
         current_user, current_tenant_id = current_account_with_tenant()
         app = (
@@ -341,16 +356,80 @@ class AppAnnotationService:
         if not app:
             raise NotFound("App not found")
 
+        job_id: str | None = None  # Initialize to avoid unbound variable error
         try:
-            # Skip the first row
-            df = pd.read_csv(file.stream, dtype=str)
-            result = []
-            for _, row in df.iterrows():
-                content = {"question": row.iloc[0], "answer": row.iloc[1]}
+            # Quick row count check before full parsing (memory efficient)
+            # Read only first chunk to estimate row count
+            file.stream.seek(0)
+            first_chunk = file.stream.read(8192)  # Read first 8KB
+            file.stream.seek(0)
+
+            # Estimate row count from first chunk
+            newline_count = first_chunk.count(b"\n")
+            if newline_count == 0:
+                raise ValueError("The CSV file appears to be empty or invalid.")
+
+            # Parse CSV with row limit to prevent memory exhaustion
+            # Use chunksize for memory-efficient processing
+            max_records = dify_config.ANNOTATION_IMPORT_MAX_RECORDS
+            min_records = dify_config.ANNOTATION_IMPORT_MIN_RECORDS
+
+            # Read CSV in chunks to avoid loading entire file into memory
+            df = pd.read_csv(
+                file.stream,
+                dtype=str,
+                nrows=max_records + 1,  # Read one extra to detect overflow
+                engine="python",
+                on_bad_lines="skip",  # Skip malformed lines instead of crashing
+            )
+
+            # Validate column count
+            if len(df.columns) < 2:
+                raise ValueError("Invalid CSV format. The file must contain at least 2 columns (question and answer).")
+
+            # Build result list with validation
+            result: list[dict] = []
+            for idx, row in df.iterrows():
+                # Stop if we exceed the limit
+                if len(result) >= max_records:
+                    raise ValueError(
+                        f"The CSV file contains too many records. Maximum {max_records} records allowed per import. "
+                        f"Please split your file into smaller batches."
+                    )
+
+                # Extract and validate question and answer
+                try:
+                    question_raw = row.iloc[0]
+                    answer_raw = row.iloc[1]
+                except (IndexError, KeyError):
+                    continue  # Skip malformed rows
+
+                # Convert to string and strip whitespace
+                question = str(question_raw).strip() if question_raw is not None else ""
+                answer = str(answer_raw).strip() if answer_raw is not None else ""
+
+                # Skip empty entries or NaN values
+                if not question or not answer or question.lower() == "nan" or answer.lower() == "nan":
+                    continue
+
+                # Validate length constraints (idx is pandas index, convert to int for display)
+                row_num = int(idx) + 2 if isinstance(idx, (int, float)) else len(result) + 2
+                if len(question) > 2000:
+                    raise ValueError(f"Question at row {row_num} is too long. Maximum 2000 characters allowed.")
+                if len(answer) > 10000:
+                    raise ValueError(f"Answer at row {row_num} is too long. Maximum 10000 characters allowed.")
+
+                content = {"question": question, "answer": answer}
                 result.append(content)
-            if len(result) == 0:
-                raise ValueError("The CSV file is empty.")
-            # check annotation limit
+
+            # Validate minimum records
+            if len(result) < min_records:
+                raise ValueError(
+                    f"The CSV file must contain at least {min_records} valid annotation record(s). "
+                    f"Found {len(result)} valid record(s)."
+                )
+
+            # Check annotation quota limit
             features = FeatureService.get_features(current_tenant_id)
             if features.billing.enabled:
                 annotation_quota_limit = features.annotation_quota_limit
@@ -359,12 +438,34 @@ class AppAnnotationService:
             # async job
             job_id = str(uuid.uuid4())
             indexing_cache_key = f"app_annotation_batch_import_{str(job_id)}"
-            # send batch add segments task
+
+            # Register job in active tasks list for concurrency tracking
+            current_time = int(naive_utc_now().timestamp() * 1000)
+            active_jobs_key = f"annotation_import_active:{current_tenant_id}"
+            redis_client.zadd(active_jobs_key, {job_id: current_time})
+            redis_client.expire(active_jobs_key, 7200)  # 2 hours TTL
+
+            # Set job status
             redis_client.setnx(indexing_cache_key, "waiting")
             batch_import_annotations_task.delay(str(job_id), result, app_id, current_tenant_id, current_user.id)
-        except Exception as e:
+
+        except ValueError as e:
             return {"error_msg": str(e)}
-        return {"job_id": job_id, "job_status": "waiting"}
+        except Exception as e:
+            # Clean up active job registration on error (only if job was created)
+            if job_id is not None:
+                try:
+                    active_jobs_key = f"annotation_import_active:{current_tenant_id}"
+                    redis_client.zrem(active_jobs_key, job_id)
+                except Exception:
+                    # Silently ignore cleanup errors - the job will be auto-expired
+                    logger.debug("Failed to clean up active job tracking during error handling")
+
+            # Check if it's a CSV parsing error
+            error_str = str(e)
+            return {"error_msg": f"An error occurred while processing the file: {error_str}"}
+
+        return {"job_id": job_id, "job_status": "waiting", "record_count": len(result)}
 
     @classmethod
     def get_annotation_hit_histories(cls, app_id: str, annotation_id: str, page, limit):
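Editor's note: the parsing change above bounds memory by reading at most max_records + 1 data rows and skipping malformed lines. A minimal standalone sketch of that bounded read, assuming a local CSV file in place of file.stream and a pandas version that supports on_bad_lines (1.3+):

import pandas as pd

MAX_RECORDS = 10000  # mirrors ANNOTATION_IMPORT_MAX_RECORDS' default

# Read at most MAX_RECORDS + 1 rows; the extra row exists only to detect overflow.
df = pd.read_csv(
    "annotations.csv",      # assumption: a local file standing in for the uploaded stream
    dtype=str,
    nrows=MAX_RECORDS + 1,
    engine="python",
    on_bad_lines="skip",    # malformed lines are dropped instead of raising ParserError
)

if len(df) > MAX_RECORDS:
    raise ValueError(f"Too many records: more than {MAX_RECORDS} rows in the import file")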
@@ -30,6 +30,8 @@ def batch_import_annotations_task(job_id: str, content_list: list[dict], app_id:
     logger.info(click.style(f"Start batch import annotation: {job_id}", fg="green"))
     start_at = time.perf_counter()
     indexing_cache_key = f"app_annotation_batch_import_{str(job_id)}"
+    active_jobs_key = f"annotation_import_active:{tenant_id}"
+
     # get app info
     app = db.session.query(App).where(App.id == app_id, App.tenant_id == tenant_id, App.status == "normal").first()
 
@@ -91,4 +93,13 @@ def batch_import_annotations_task(job_id: str, content_list: list[dict], app_id:
         redis_client.setex(indexing_error_msg_key, 600, str(e))
         logger.exception("Build index for batch import annotations failed")
     finally:
+        # Clean up active job tracking to release concurrency slot
+        try:
+            redis_client.zrem(active_jobs_key, job_id)
+            logger.debug("Released concurrency slot for job: %s", job_id)
+        except Exception as cleanup_error:
+            # Log but don't fail if cleanup fails - the job will be auto-expired
+            logger.warning("Failed to clean up active job tracking for %s: %s", job_id, cleanup_error)
+
+        # Close database session
         db.session.close()
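Editor's note: the unit tests below assert that this task exposes time_limit and soft_time_limit attributes, but the task decorator itself is not shown in this diff. As a hedged illustration only, a Celery task can declare such limits as follows; the 300/360 second values come from the test's comments and are not necessarily what the codebase uses.

from celery import shared_task
from celery.exceptions import SoftTimeLimitExceeded

@shared_task(bind=True, soft_time_limit=300, time_limit=360)  # 5 min soft, 6 min hard -- illustrative
def long_running_import(self, job_id: str):
    try:
        ...  # do the batched work
    except SoftTimeLimitExceeded:
        # Soft limit fired: clean up (e.g. release the concurrency slot) before the hard kill.
        pass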
@@ -0,0 +1,344 @@
+"""
+Unit tests for annotation import security features.
+
+Tests rate limiting, concurrency control, file validation, and other
+security features added to prevent DoS attacks on the annotation import endpoint.
+"""
+
+import io
+from unittest.mock import MagicMock, patch
+
+import pytest
+from werkzeug.datastructures import FileStorage
+
+from configs import dify_config
+
+
+class TestAnnotationImportRateLimiting:
+    """Test rate limiting for annotation import operations."""
+
+    @pytest.fixture
+    def mock_redis(self):
+        """Mock Redis client for testing."""
+        with patch("controllers.console.wraps.redis_client") as mock:
+            yield mock
+
+    @pytest.fixture
+    def mock_current_account(self):
+        """Mock current account with tenant."""
+        with patch("controllers.console.wraps.current_account_with_tenant") as mock:
+            mock.return_value = (MagicMock(id="user_id"), "test_tenant_id")
+            yield mock
+
+    def test_rate_limit_per_minute_enforced(self, mock_redis, mock_current_account):
+        """Test that per-minute rate limit is enforced."""
+        from controllers.console.wraps import annotation_import_rate_limit
+
+        # Simulate exceeding per-minute limit
+        mock_redis.zcard.side_effect = [
+            dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE + 1,  # Minute check
+            10,  # Hour check
+        ]
+
+        @annotation_import_rate_limit
+        def dummy_view():
+            return "success"
+
+        # Should abort with 429
+        with pytest.raises(Exception) as exc_info:
+            dummy_view()
+
+        # Verify it's a rate limit error
+        assert "429" in str(exc_info.value) or "Too many" in str(exc_info.value)
+
+    def test_rate_limit_per_hour_enforced(self, mock_redis, mock_current_account):
+        """Test that per-hour rate limit is enforced."""
+        from controllers.console.wraps import annotation_import_rate_limit
+
+        # Simulate exceeding per-hour limit
+        mock_redis.zcard.side_effect = [
+            3,  # Minute check (under limit)
+            dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR + 1,  # Hour check (over limit)
+        ]
+
+        @annotation_import_rate_limit
+        def dummy_view():
+            return "success"
+
+        # Should abort with 429
+        with pytest.raises(Exception) as exc_info:
+            dummy_view()
+
+        assert "429" in str(exc_info.value) or "Too many" in str(exc_info.value)
+
+    def test_rate_limit_within_limits_passes(self, mock_redis, mock_current_account):
+        """Test that requests within limits are allowed."""
+        from controllers.console.wraps import annotation_import_rate_limit
+
+        # Simulate being under both limits
+        mock_redis.zcard.return_value = 2
+
+        @annotation_import_rate_limit
+        def dummy_view():
+            return "success"
+
+        # Should succeed
+        result = dummy_view()
+        assert result == "success"
+
+        # Verify Redis operations were called
+        assert mock_redis.zadd.called
+        assert mock_redis.zremrangebyscore.called
+
+
+class TestAnnotationImportConcurrencyControl:
+    """Test concurrency control for annotation import operations."""
+
+    @pytest.fixture
+    def mock_redis(self):
+        """Mock Redis client for testing."""
+        with patch("controllers.console.wraps.redis_client") as mock:
+            yield mock
+
+    @pytest.fixture
+    def mock_current_account(self):
+        """Mock current account with tenant."""
+        with patch("controllers.console.wraps.current_account_with_tenant") as mock:
+            mock.return_value = (MagicMock(id="user_id"), "test_tenant_id")
+            yield mock
+
+    def test_concurrency_limit_enforced(self, mock_redis, mock_current_account):
+        """Test that concurrent task limit is enforced."""
+        from controllers.console.wraps import annotation_import_concurrency_limit
+
+        # Simulate max concurrent tasks already running
+        mock_redis.zcard.return_value = dify_config.ANNOTATION_IMPORT_MAX_CONCURRENT
+
+        @annotation_import_concurrency_limit
+        def dummy_view():
+            return "success"
+
+        # Should abort with 429
+        with pytest.raises(Exception) as exc_info:
+            dummy_view()
+
+        assert "429" in str(exc_info.value) or "concurrent" in str(exc_info.value).lower()
+
+    def test_concurrency_within_limit_passes(self, mock_redis, mock_current_account):
+        """Test that requests within concurrency limits are allowed."""
+        from controllers.console.wraps import annotation_import_concurrency_limit
+
+        # Simulate being under concurrent task limit
+        mock_redis.zcard.return_value = 1
+
+        @annotation_import_concurrency_limit
+        def dummy_view():
+            return "success"
+
+        # Should succeed
+        result = dummy_view()
+        assert result == "success"
+
+    def test_stale_jobs_are_cleaned_up(self, mock_redis, mock_current_account):
+        """Test that old/stale job entries are removed."""
+        from controllers.console.wraps import annotation_import_concurrency_limit
+
+        mock_redis.zcard.return_value = 0
+
+        @annotation_import_concurrency_limit
+        def dummy_view():
+            return "success"
+
+        dummy_view()
+
+        # Verify cleanup was called
+        assert mock_redis.zremrangebyscore.called
+
+
+class TestAnnotationImportFileValidation:
+    """Test file validation in annotation import."""
+
+    def test_file_size_limit_enforced(self):
+        """Test that files exceeding size limit are rejected."""
+        # Create a file larger than the limit
+        max_size = dify_config.ANNOTATION_IMPORT_FILE_SIZE_LIMIT * 1024 * 1024
+        large_content = b"x" * (max_size + 1024)  # Exceed by 1KB
+
+        file = FileStorage(stream=io.BytesIO(large_content), filename="test.csv", content_type="text/csv")
+
+        # Should be rejected in controller
+        # This would be tested in integration tests with actual endpoint
+
+    def test_empty_file_rejected(self):
+        """Test that empty files are rejected."""
+        file = FileStorage(stream=io.BytesIO(b""), filename="test.csv", content_type="text/csv")
+
+        # Should be rejected
+        # This would be tested in integration tests
+
+    def test_non_csv_file_rejected(self):
+        """Test that non-CSV files are rejected."""
+        file = FileStorage(stream=io.BytesIO(b"test"), filename="test.txt", content_type="text/plain")
+
+        # Should be rejected based on extension
+        # This would be tested in integration tests
+
+
+class TestAnnotationImportServiceValidation:
+    """Test service layer validation for annotation import."""
+
+    @pytest.fixture
+    def mock_app(self):
+        """Mock application object."""
+        app = MagicMock()
+        app.id = "app_id"
+        return app
+
+    @pytest.fixture
+    def mock_db_session(self):
+        """Mock database session."""
+        with patch("services.annotation_service.db.session") as mock:
+            yield mock
+
+    def test_max_records_limit_enforced(self, mock_app, mock_db_session):
+        """Test that files with too many records are rejected."""
+        from services.annotation_service import AppAnnotationService
+
+        # Create CSV with too many records
+        max_records = dify_config.ANNOTATION_IMPORT_MAX_RECORDS
+        csv_content = "question,answer\n"
+        for i in range(max_records + 100):
+            csv_content += f"Question {i},Answer {i}\n"
+
+        file = FileStorage(stream=io.BytesIO(csv_content.encode()), filename="test.csv", content_type="text/csv")
+
+        mock_db_session.query.return_value.where.return_value.first.return_value = mock_app
+
+        with patch("services.annotation_service.current_account_with_tenant") as mock_auth:
+            mock_auth.return_value = (MagicMock(id="user_id"), "tenant_id")
+
+            with patch("services.annotation_service.FeatureService") as mock_features:
+                mock_features.get_features.return_value.billing.enabled = False
+
+                result = AppAnnotationService.batch_import_app_annotations("app_id", file)
+
+        # Should return error about too many records
+        assert "error_msg" in result
+        assert "too many" in result["error_msg"].lower() or "maximum" in result["error_msg"].lower()
+
+    def test_min_records_limit_enforced(self, mock_app, mock_db_session):
+        """Test that files with too few valid records are rejected."""
+        from services.annotation_service import AppAnnotationService
+
+        # Create CSV with only header (no data rows)
+        csv_content = "question,answer\n"
+
+        file = FileStorage(stream=io.BytesIO(csv_content.encode()), filename="test.csv", content_type="text/csv")
+
+        mock_db_session.query.return_value.where.return_value.first.return_value = mock_app
+
+        with patch("services.annotation_service.current_account_with_tenant") as mock_auth:
+            mock_auth.return_value = (MagicMock(id="user_id"), "tenant_id")
+
+            result = AppAnnotationService.batch_import_app_annotations("app_id", file)
+
+        # Should return error about insufficient records
+        assert "error_msg" in result
+        assert "at least" in result["error_msg"].lower() or "minimum" in result["error_msg"].lower()
+
+    def test_invalid_csv_format_handled(self, mock_app, mock_db_session):
+        """Test that invalid CSV format is handled gracefully."""
+        from services.annotation_service import AppAnnotationService
+
+        # Create invalid CSV content
+        csv_content = 'invalid,csv,format\nwith,unbalanced,quotes,and"stuff'
+
+        file = FileStorage(stream=io.BytesIO(csv_content.encode()), filename="test.csv", content_type="text/csv")
+
+        mock_db_session.query.return_value.where.return_value.first.return_value = mock_app
+
+        with patch("services.annotation_service.current_account_with_tenant") as mock_auth:
+            mock_auth.return_value = (MagicMock(id="user_id"), "tenant_id")
+
+            result = AppAnnotationService.batch_import_app_annotations("app_id", file)
+
+        # Should return error message
+        assert "error_msg" in result
+
+    def test_valid_import_succeeds(self, mock_app, mock_db_session):
+        """Test that valid import request succeeds."""
+        from services.annotation_service import AppAnnotationService
+
+        # Create valid CSV
+        csv_content = "question,answer\nWhat is AI?,Artificial Intelligence\nWhat is ML?,Machine Learning\n"
+
+        file = FileStorage(stream=io.BytesIO(csv_content.encode()), filename="test.csv", content_type="text/csv")
+
+        mock_db_session.query.return_value.where.return_value.first.return_value = mock_app
+
+        with patch("services.annotation_service.current_account_with_tenant") as mock_auth:
+            mock_auth.return_value = (MagicMock(id="user_id"), "tenant_id")
+
+            with patch("services.annotation_service.FeatureService") as mock_features:
+                mock_features.get_features.return_value.billing.enabled = False
+
+                with patch("services.annotation_service.batch_import_annotations_task") as mock_task:
+                    with patch("services.annotation_service.redis_client"):
+                        result = AppAnnotationService.batch_import_app_annotations("app_id", file)
+
+        # Should return success response
+        assert "job_id" in result
+        assert "job_status" in result
+        assert result["job_status"] == "waiting"
+        assert "record_count" in result
+        assert result["record_count"] == 2
+
+
+class TestAnnotationImportTaskOptimization:
+    """Test optimizations in batch import task."""
+
+    def test_task_has_timeout_configured(self):
+        """Test that task has proper timeout configuration."""
+        from tasks.annotation.batch_import_annotations_task import batch_import_annotations_task
+
+        # Verify task configuration
+        assert hasattr(batch_import_annotations_task, "time_limit")
+        assert hasattr(batch_import_annotations_task, "soft_time_limit")
+
+        # Check timeout values are reasonable
+        # Hard limit should be 6 minutes (360s)
+        # Soft limit should be 5 minutes (300s)
+        # Note: actual values depend on Celery configuration
+
+
+class TestConfigurationValues:
+    """Test that security configuration values are properly set."""
+
+    def test_rate_limit_configs_exist(self):
+        """Test that rate limit configurations are defined."""
+        assert hasattr(dify_config, "ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE")
+        assert hasattr(dify_config, "ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR")
+
+        assert dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE > 0
+        assert dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR > 0
+
+    def test_file_size_limit_config_exists(self):
+        """Test that file size limit configuration is defined."""
+        assert hasattr(dify_config, "ANNOTATION_IMPORT_FILE_SIZE_LIMIT")
+        assert dify_config.ANNOTATION_IMPORT_FILE_SIZE_LIMIT > 0
+        assert dify_config.ANNOTATION_IMPORT_FILE_SIZE_LIMIT <= 10  # Reasonable max (10MB)
+
+    def test_record_limit_configs_exist(self):
+        """Test that record limit configurations are defined."""
+        assert hasattr(dify_config, "ANNOTATION_IMPORT_MAX_RECORDS")
+        assert hasattr(dify_config, "ANNOTATION_IMPORT_MIN_RECORDS")
+
+        assert dify_config.ANNOTATION_IMPORT_MAX_RECORDS > 0
+        assert dify_config.ANNOTATION_IMPORT_MIN_RECORDS > 0
+        assert dify_config.ANNOTATION_IMPORT_MIN_RECORDS < dify_config.ANNOTATION_IMPORT_MAX_RECORDS
+
+    def test_concurrency_limit_config_exists(self):
+        """Test that concurrency limit configuration is defined."""
+        assert hasattr(dify_config, "ANNOTATION_IMPORT_MAX_CONCURRENT")
+        assert dify_config.ANNOTATION_IMPORT_MAX_CONCURRENT > 0
+        assert dify_config.ANNOTATION_IMPORT_MAX_CONCURRENT <= 10  # Reasonable upper bound
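Editor's note: these are pure unit tests (Redis, the DB session, and the account helper are all patched), so they can be selected by class name without standing up any services. The exact test file path is not shown in this diff, hence the -k selection in this illustrative runner:

import pytest

# Select the new security tests by class name; run from the api directory of the repo (assumed layout).
pytest.main(["-q", "-k", "AnnotationImportRateLimiting or AnnotationImportServiceValidation"])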
@@ -1448,5 +1448,16 @@ WORKFLOW_SCHEDULE_MAX_DISPATCH_PER_TICK=0
 # Tenant isolated task queue configuration
 TENANT_ISOLATED_TASK_CONCURRENCY=1
+
+# Maximum allowed CSV file size for annotation import in megabytes
+ANNOTATION_IMPORT_FILE_SIZE_LIMIT=2
+#Maximum number of annotation records allowed in a single import
+ANNOTATION_IMPORT_MAX_RECORDS=10000
+# Minimum number of annotation records required in a single import
+ANNOTATION_IMPORT_MIN_RECORDS=1
+ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE=5
+ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR=20
+# Maximum number of concurrent annotation import tasks per tenant
+ANNOTATION_IMPORT_MAX_CONCURRENT=5
 
 # The API key of amplitude
 AMPLITUDE_API_KEY=
@@ -648,6 +648,12 @@ x-shared-env: &shared-api-worker-env
   WORKFLOW_SCHEDULE_POLLER_BATCH_SIZE: ${WORKFLOW_SCHEDULE_POLLER_BATCH_SIZE:-100}
   WORKFLOW_SCHEDULE_MAX_DISPATCH_PER_TICK: ${WORKFLOW_SCHEDULE_MAX_DISPATCH_PER_TICK:-0}
   TENANT_ISOLATED_TASK_CONCURRENCY: ${TENANT_ISOLATED_TASK_CONCURRENCY:-1}
+  ANNOTATION_IMPORT_FILE_SIZE_LIMIT: ${ANNOTATION_IMPORT_FILE_SIZE_LIMIT:-2}
+  ANNOTATION_IMPORT_MAX_RECORDS: ${ANNOTATION_IMPORT_MAX_RECORDS:-10000}
+  ANNOTATION_IMPORT_MIN_RECORDS: ${ANNOTATION_IMPORT_MIN_RECORDS:-1}
+  ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE: ${ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE:-5}
+  ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR: ${ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR:-20}
+  ANNOTATION_IMPORT_MAX_CONCURRENT: ${ANNOTATION_IMPORT_MAX_CONCURRENT:-5}
   AMPLITUDE_API_KEY: ${AMPLITUDE_API_KEY:-}
 
 services:
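Editor's note: the ${VAR:-default} syntax above means docker compose falls back to the baked-in default when the variable is absent from the env file, so operators only need to add one line there to change a limit. Inside the api/worker container the value then surfaces as a plain environment variable, roughly as in this hedged sketch:

import os

# Unset means the compose default applies (2 MB for the file size limit shown above).
limit_mb = int(os.environ.get("ANNOTATION_IMPORT_FILE_SIZE_LIMIT", "2"))
print(f"annotation import file size limit: {limit_mb} MB")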