diff --git a/api/.env.example b/api/.env.example
index fb92199893..8c4ea617d4 100644
--- a/api/.env.example
+++ b/api/.env.example
@@ -670,3 +670,14 @@ SINGLE_CHUNK_ATTACHMENT_LIMIT=10
 ATTACHMENT_IMAGE_FILE_SIZE_LIMIT=2
 ATTACHMENT_IMAGE_DOWNLOAD_TIMEOUT=60
 IMAGE_FILE_BATCH_LIMIT=10
+
+# Maximum allowed CSV file size for annotation import in megabytes
+ANNOTATION_IMPORT_FILE_SIZE_LIMIT=2
+# Maximum number of annotation records allowed in a single import
+ANNOTATION_IMPORT_MAX_RECORDS=10000
+# Minimum number of annotation records required in a single import
+ANNOTATION_IMPORT_MIN_RECORDS=1
+ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE=5
+ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR=20
+# Maximum number of concurrent annotation import tasks per tenant
+ANNOTATION_IMPORT_MAX_CONCURRENT=5
\ No newline at end of file
diff --git a/api/configs/feature/__init__.py b/api/configs/feature/__init__.py
index 5c0edb60ac..b9091b5e2f 100644
--- a/api/configs/feature/__init__.py
+++ b/api/configs/feature/__init__.py
@@ -380,6 +380,37 @@ class FileUploadConfig(BaseSettings):
         default=60,
     )
 
+    # Annotation Import Security Configurations
+    ANNOTATION_IMPORT_FILE_SIZE_LIMIT: NonNegativeInt = Field(
+        description="Maximum allowed CSV file size for annotation import in megabytes",
+        default=2,
+    )
+
+    ANNOTATION_IMPORT_MAX_RECORDS: PositiveInt = Field(
+        description="Maximum number of annotation records allowed in a single import",
+        default=10000,
+    )
+
+    ANNOTATION_IMPORT_MIN_RECORDS: PositiveInt = Field(
+        description="Minimum number of annotation records required in a single import",
+        default=1,
+    )
+
+    ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE: PositiveInt = Field(
+        description="Maximum number of annotation import requests per minute per tenant",
+        default=5,
+    )
+
+    ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR: PositiveInt = Field(
+        description="Maximum number of annotation import requests per hour per tenant",
+        default=20,
+    )
+
+    ANNOTATION_IMPORT_MAX_CONCURRENT: PositiveInt = Field(
+        description="Maximum number of concurrent annotation import tasks per tenant",
+        default=2,
+    )
+
     inner_UPLOAD_FILE_EXTENSION_BLACKLIST: str = Field(
         description=(
" diff --git a/api/controllers/console/app/annotation.py b/api/controllers/console/app/annotation.py index 3b6fb58931..0aed36a7fd 100644 --- a/api/controllers/console/app/annotation.py +++ b/api/controllers/console/app/annotation.py @@ -1,6 +1,6 @@ from typing import Any, Literal -from flask import request +from flask import abort, request from flask_restx import Resource, fields, marshal, marshal_with from pydantic import BaseModel, Field, field_validator @@ -8,6 +8,8 @@ from controllers.common.errors import NoFileUploadedError, TooManyFilesError from controllers.console import console_ns from controllers.console.wraps import ( account_initialization_required, + annotation_import_concurrency_limit, + annotation_import_rate_limit, cloud_edition_billing_resource_check, edit_permission_required, setup_required, @@ -314,18 +316,25 @@ class AnnotationUpdateDeleteApi(Resource): @console_ns.route("/apps//annotations/batch-import") class AnnotationBatchImportApi(Resource): @console_ns.doc("batch_import_annotations") - @console_ns.doc(description="Batch import annotations from CSV file") + @console_ns.doc(description="Batch import annotations from CSV file with rate limiting and security checks") @console_ns.doc(params={"app_id": "Application ID"}) @console_ns.response(200, "Batch import started successfully") @console_ns.response(403, "Insufficient permissions") @console_ns.response(400, "No file uploaded or too many files") + @console_ns.response(413, "File too large") + @console_ns.response(429, "Too many requests or concurrent imports") @setup_required @login_required @account_initialization_required @cloud_edition_billing_resource_check("annotation") + @annotation_import_rate_limit + @annotation_import_concurrency_limit @edit_permission_required def post(self, app_id): + from configs import dify_config + app_id = str(app_id) + # check file if "file" not in request.files: raise NoFileUploadedError() @@ -335,9 +344,27 @@ class AnnotationBatchImportApi(Resource): # get file from request file = request.files["file"] + # check file type if not file.filename or not file.filename.lower().endswith(".csv"): raise ValueError("Invalid file type. Only CSV files are allowed") + + # Check file size before processing + file.seek(0, 2) # Seek to end of file + file_size = file.tell() + file.seek(0) # Reset to beginning + + max_size_bytes = dify_config.ANNOTATION_IMPORT_FILE_SIZE_LIMIT * 1024 * 1024 + if file_size > max_size_bytes: + abort( + 413, + f"File size exceeds maximum limit of {dify_config.ANNOTATION_IMPORT_FILE_SIZE_LIMIT}MB. " + f"Please reduce the file size and try again.", + ) + + if file_size == 0: + raise ValueError("The uploaded file is empty") + return AppAnnotationService.batch_import_app_annotations(app_id, file) diff --git a/api/controllers/console/wraps.py b/api/controllers/console/wraps.py index f40f566a36..4654650c77 100644 --- a/api/controllers/console/wraps.py +++ b/api/controllers/console/wraps.py @@ -331,3 +331,91 @@ def is_admin_or_owner_required(f: Callable[P, R]): return f(*args, **kwargs) return decorated_function + + +def annotation_import_rate_limit(view: Callable[P, R]): + """ + Rate limiting decorator for annotation import operations. + + Implements sliding window rate limiting with two tiers: + - Short-term: Configurable requests per minute (default: 5) + - Long-term: Configurable requests per hour (default: 20) + + Uses Redis ZSET for distributed rate limiting across multiple instances. 
+ """ + + @wraps(view) + def decorated(*args: P.args, **kwargs: P.kwargs): + _, current_tenant_id = current_account_with_tenant() + current_time = int(time.time() * 1000) + + # Check per-minute rate limit + minute_key = f"annotation_import_rate_limit:{current_tenant_id}:1min" + redis_client.zadd(minute_key, {current_time: current_time}) + redis_client.zremrangebyscore(minute_key, 0, current_time - 60000) + minute_count = redis_client.zcard(minute_key) + redis_client.expire(minute_key, 120) # 2 minutes TTL + + if minute_count > dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE: + abort( + 429, + f"Too many annotation import requests. Maximum {dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE} " + f"requests per minute allowed. Please try again later.", + ) + + # Check per-hour rate limit + hour_key = f"annotation_import_rate_limit:{current_tenant_id}:1hour" + redis_client.zadd(hour_key, {current_time: current_time}) + redis_client.zremrangebyscore(hour_key, 0, current_time - 3600000) + hour_count = redis_client.zcard(hour_key) + redis_client.expire(hour_key, 7200) # 2 hours TTL + + if hour_count > dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR: + abort( + 429, + f"Too many annotation import requests. Maximum {dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR} " + f"requests per hour allowed. Please try again later.", + ) + + return view(*args, **kwargs) + + return decorated + + +def annotation_import_concurrency_limit(view: Callable[P, R]): + """ + Concurrency control decorator for annotation import operations. + + Limits the number of concurrent import tasks per tenant to prevent + resource exhaustion and ensure fair resource allocation. + + Uses Redis ZSET to track active import jobs with automatic cleanup + of stale entries (jobs older than 2 minutes). + """ + + @wraps(view) + def decorated(*args: P.args, **kwargs: P.kwargs): + _, current_tenant_id = current_account_with_tenant() + current_time = int(time.time() * 1000) + + active_jobs_key = f"annotation_import_active:{current_tenant_id}" + + # Clean up stale entries (jobs that should have completed or timed out) + stale_threshold = current_time - 120000 # 2 minutes ago + redis_client.zremrangebyscore(active_jobs_key, 0, stale_threshold) + + # Check current active job count + active_count = redis_client.zcard(active_jobs_key) + + if active_count >= dify_config.ANNOTATION_IMPORT_MAX_CONCURRENT: + abort( + 429, + f"Too many concurrent import tasks. Maximum {dify_config.ANNOTATION_IMPORT_MAX_CONCURRENT} " + f"concurrent imports allowed per workspace. Please wait for existing imports to complete.", + ) + + # Allow the request to proceed + # The actual job registration will happen in the service layer + return view(*args, **kwargs) + + return decorated diff --git a/api/services/annotation_service.py b/api/services/annotation_service.py index 9258def907..f750186ab0 100644 --- a/api/services/annotation_service.py +++ b/api/services/annotation_service.py @@ -1,6 +1,9 @@ +import logging import uuid import pandas as pd + +logger = logging.getLogger(__name__) from sqlalchemy import or_, select from werkzeug.datastructures import FileStorage from werkzeug.exceptions import NotFound @@ -330,6 +333,18 @@ class AppAnnotationService: @classmethod def batch_import_app_annotations(cls, app_id, file: FileStorage): + """ + Batch import annotations from CSV file with enhanced security checks. 
+
+        Security features:
+        - File size validation
+        - Row count limits (min/max)
+        - Memory-efficient CSV parsing
+        - Subscription quota validation
+        - Concurrency tracking
+        """
+        from configs import dify_config
+
         # get app info
         current_user, current_tenant_id = current_account_with_tenant()
         app = (
@@ -341,16 +356,80 @@ class AppAnnotationService:
         if not app:
             raise NotFound("App not found")
 
+        job_id: str | None = None  # Initialize to avoid unbound variable error
         try:
-            # Skip the first row
-            df = pd.read_csv(file.stream, dtype=str)
-            result = []
-            for _, row in df.iterrows():
-                content = {"question": row.iloc[0], "answer": row.iloc[1]}
+            # Quick row count check before full parsing (memory efficient)
+            # Read only first chunk to estimate row count
+            file.stream.seek(0)
+            first_chunk = file.stream.read(8192)  # Read first 8KB
+            file.stream.seek(0)
+
+            # Estimate row count from first chunk
+            newline_count = first_chunk.count(b"\n")
+            if newline_count == 0:
+                raise ValueError("The CSV file appears to be empty or invalid.")
+
+            # Parse CSV with a row limit to prevent memory exhaustion:
+            # nrows caps how many rows pandas will parse
+            max_records = dify_config.ANNOTATION_IMPORT_MAX_RECORDS
+            min_records = dify_config.ANNOTATION_IMPORT_MIN_RECORDS
+
+            # Read at most max_records + 1 rows so oversized files are never fully loaded
+            df = pd.read_csv(
+                file.stream,
+                dtype=str,
+                nrows=max_records + 1,  # Read one extra to detect overflow
+                engine="python",
+                on_bad_lines="skip",  # Skip malformed lines instead of crashing
+            )
+
+            # Validate column count
+            if len(df.columns) < 2:
+                raise ValueError("Invalid CSV format. The file must contain at least 2 columns (question and answer).")
+
+            # Build result list with validation
+            result: list[dict] = []
+            for idx, row in df.iterrows():
+                # Stop if we exceed the limit
+                if len(result) >= max_records:
+                    raise ValueError(
+                        f"The CSV file contains too many records. Maximum {max_records} records allowed per import. "
+                        f"Please split your file into smaller batches."
+                    )
+
+                # Extract and validate question and answer
+                try:
+                    question_raw = row.iloc[0]
+                    answer_raw = row.iloc[1]
+                except (IndexError, KeyError):
+                    continue  # Skip malformed rows
+
+                # Convert to string and strip whitespace
+                question = str(question_raw).strip() if question_raw is not None else ""
+                answer = str(answer_raw).strip() if answer_raw is not None else ""
+
+                # Skip empty entries or NaN values
+                if not question or not answer or question.lower() == "nan" or answer.lower() == "nan":
+                    continue
+
+                # Validate length constraints (idx is pandas index, convert to int for display)
+                row_num = int(idx) + 2 if isinstance(idx, (int, float)) else len(result) + 2
+                if len(question) > 2000:
+                    raise ValueError(f"Question at row {row_num} is too long. Maximum 2000 characters allowed.")
+                if len(answer) > 10000:
+                    raise ValueError(f"Answer at row {row_num} is too long. Maximum 10000 characters allowed.")
+
+                content = {"question": question, "answer": answer}
                 result.append(content)
-            if len(result) == 0:
-                raise ValueError("The CSV file is empty.")
-            # check annotation limit
+
+            # Validate minimum records
+            if len(result) < min_records:
+                raise ValueError(
+                    f"The CSV file must contain at least {min_records} valid annotation record(s). "
+                    f"Found {len(result)} valid record(s)."
+                )
+
+            # Check annotation quota limit
             features = FeatureService.get_features(current_tenant_id)
             if features.billing.enabled:
                 annotation_quota_limit = features.annotation_quota_limit
@@ -359,12 +438,34 @@ class AppAnnotationService:
             # async job
             job_id = str(uuid.uuid4())
             indexing_cache_key = f"app_annotation_batch_import_{str(job_id)}"
-            # send batch add segments task
+
+            # Register job in active tasks list for concurrency tracking
+            current_time = int(naive_utc_now().timestamp() * 1000)
+            active_jobs_key = f"annotation_import_active:{current_tenant_id}"
+            redis_client.zadd(active_jobs_key, {job_id: current_time})
+            redis_client.expire(active_jobs_key, 7200)  # 2 hours TTL
+
+            # Set job status
             redis_client.setnx(indexing_cache_key, "waiting")
             batch_import_annotations_task.delay(str(job_id), result, app_id, current_tenant_id, current_user.id)
-        except Exception as e:
+
+        except ValueError as e:
             return {"error_msg": str(e)}
-        return {"job_id": job_id, "job_status": "waiting"}
+        except Exception as e:
+            # Clean up active job registration on error (only if job was created)
+            if job_id is not None:
+                try:
+                    active_jobs_key = f"annotation_import_active:{current_tenant_id}"
+                    redis_client.zrem(active_jobs_key, job_id)
+                except Exception:
+                    # Silently ignore cleanup errors - the job will be auto-expired
+                    logger.debug("Failed to clean up active job tracking during error handling")
+
+            # Check if it's a CSV parsing error
+            error_str = str(e)
+            return {"error_msg": f"An error occurred while processing the file: {error_str}"}
+
+        return {"job_id": job_id, "job_status": "waiting", "record_count": len(result)}
 
     @classmethod
     def get_annotation_hit_histories(cls, app_id: str, annotation_id: str, page, limit):
diff --git a/api/tasks/annotation/batch_import_annotations_task.py b/api/tasks/annotation/batch_import_annotations_task.py
index 8e46e8d0e3..775814318b 100644
--- a/api/tasks/annotation/batch_import_annotations_task.py
+++ b/api/tasks/annotation/batch_import_annotations_task.py
@@ -30,6 +30,8 @@ def batch_import_annotations_task(job_id: str, content_list: list[dict], app_id:
     logger.info(click.style(f"Start batch import annotation: {job_id}", fg="green"))
     start_at = time.perf_counter()
     indexing_cache_key = f"app_annotation_batch_import_{str(job_id)}"
+    active_jobs_key = f"annotation_import_active:{tenant_id}"
+
     # get app info
     app = db.session.query(App).where(App.id == app_id, App.tenant_id == tenant_id, App.status == "normal").first()
 
@@ -91,4 +93,13 @@ def batch_import_annotations_task(job_id: str, content_list: list[dict], app_id:
         redis_client.setex(indexing_error_msg_key, 600, str(e))
         logger.exception("Build index for batch import annotations failed")
     finally:
+        # Clean up active job tracking to release concurrency slot
+        try:
+            redis_client.zrem(active_jobs_key, job_id)
+            logger.debug("Released concurrency slot for job: %s", job_id)
+        except Exception as cleanup_error:
+            # Log but don't fail if cleanup fails - the job will be auto-expired
+            logger.warning("Failed to clean up active job tracking for %s: %s", job_id, cleanup_error)
+
+        # Close database session
         db.session.close()
diff --git a/api/tests/unit_tests/controllers/console/app/test_annotation_security.py b/api/tests/unit_tests/controllers/console/app/test_annotation_security.py
new file mode 100644
index 0000000000..36da3c264e
--- /dev/null
+++ b/api/tests/unit_tests/controllers/console/app/test_annotation_security.py
@@ -0,0 +1,344 @@
+"""
+Unit tests for annotation import security features.
+
+Tests rate limiting, concurrency control, file validation, and other
+security features added to prevent DoS attacks on the annotation import endpoint.
+"""
+
+import io
+from unittest.mock import MagicMock, patch
+
+import pytest
+from werkzeug.datastructures import FileStorage
+
+from configs import dify_config
+
+
+class TestAnnotationImportRateLimiting:
+    """Test rate limiting for annotation import operations."""
+
+    @pytest.fixture
+    def mock_redis(self):
+        """Mock Redis client for testing."""
+        with patch("controllers.console.wraps.redis_client") as mock:
+            yield mock
+
+    @pytest.fixture
+    def mock_current_account(self):
+        """Mock current account with tenant."""
+        with patch("controllers.console.wraps.current_account_with_tenant") as mock:
+            mock.return_value = (MagicMock(id="user_id"), "test_tenant_id")
+            yield mock
+
+    def test_rate_limit_per_minute_enforced(self, mock_redis, mock_current_account):
+        """Test that per-minute rate limit is enforced."""
+        from controllers.console.wraps import annotation_import_rate_limit
+
+        # Simulate exceeding per-minute limit
+        mock_redis.zcard.side_effect = [
+            dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE + 1,  # Minute check
+            10,  # Hour check
+        ]
+
+        @annotation_import_rate_limit
+        def dummy_view():
+            return "success"
+
+        # Should abort with 429
+        with pytest.raises(Exception) as exc_info:
+            dummy_view()
+
+        # Verify it's a rate limit error
+        assert "429" in str(exc_info.value) or "Too many" in str(exc_info.value)
+
+    def test_rate_limit_per_hour_enforced(self, mock_redis, mock_current_account):
+        """Test that per-hour rate limit is enforced."""
+        from controllers.console.wraps import annotation_import_rate_limit
+
+        # Simulate exceeding per-hour limit
+        mock_redis.zcard.side_effect = [
+            3,  # Minute check (under limit)
+            dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR + 1,  # Hour check (over limit)
+        ]
+
+        @annotation_import_rate_limit
+        def dummy_view():
+            return "success"
+
+        # Should abort with 429
+        with pytest.raises(Exception) as exc_info:
+            dummy_view()
+
+        assert "429" in str(exc_info.value) or "Too many" in str(exc_info.value)
+
+    def test_rate_limit_within_limits_passes(self, mock_redis, mock_current_account):
+        """Test that requests within limits are allowed."""
+        from controllers.console.wraps import annotation_import_rate_limit
+
+        # Simulate being under both limits
+        mock_redis.zcard.return_value = 2
+
+        @annotation_import_rate_limit
+        def dummy_view():
+            return "success"
+
+        # Should succeed
+        result = dummy_view()
+        assert result == "success"
+
+        # Verify Redis operations were called
+        assert mock_redis.zadd.called
+        assert mock_redis.zremrangebyscore.called
+
+
+class TestAnnotationImportConcurrencyControl:
+    """Test concurrency control for annotation import operations."""
+
+    @pytest.fixture
+    def mock_redis(self):
+        """Mock Redis client for testing."""
+        with patch("controllers.console.wraps.redis_client") as mock:
+            yield mock
+
+    @pytest.fixture
+    def mock_current_account(self):
+        """Mock current account with tenant."""
+        with patch("controllers.console.wraps.current_account_with_tenant") as mock:
+            mock.return_value = (MagicMock(id="user_id"), "test_tenant_id")
+            yield mock
+
+    def test_concurrency_limit_enforced(self, mock_redis, mock_current_account):
+        """Test that concurrent task limit is enforced."""
+        from controllers.console.wraps import annotation_import_concurrency_limit
+
+        # Simulate max concurrent tasks already running
+        mock_redis.zcard.return_value = dify_config.ANNOTATION_IMPORT_MAX_CONCURRENT
+
+        @annotation_import_concurrency_limit
+        def dummy_view():
+            return "success"
+
+        # Should abort with 429
+        with pytest.raises(Exception) as exc_info:
+            dummy_view()
+
+        assert "429" in str(exc_info.value) or "concurrent" in str(exc_info.value).lower()
+
+    def test_concurrency_within_limit_passes(self, mock_redis, mock_current_account):
+        """Test that requests within concurrency limits are allowed."""
+        from controllers.console.wraps import annotation_import_concurrency_limit
+
+        # Simulate being under concurrent task limit
+        mock_redis.zcard.return_value = 1
+
+        @annotation_import_concurrency_limit
+        def dummy_view():
+            return "success"
+
+        # Should succeed
+        result = dummy_view()
+        assert result == "success"
+
+    def test_stale_jobs_are_cleaned_up(self, mock_redis, mock_current_account):
+        """Test that old/stale job entries are removed."""
+        from controllers.console.wraps import annotation_import_concurrency_limit
+
+        mock_redis.zcard.return_value = 0
+
+        @annotation_import_concurrency_limit
+        def dummy_view():
+            return "success"
+
+        dummy_view()
+
+        # Verify cleanup was called
+        assert mock_redis.zremrangebyscore.called
+
+
+class TestAnnotationImportFileValidation:
+    """Test file validation in annotation import."""
+
+    def test_file_size_limit_enforced(self):
+        """Test that files exceeding size limit are rejected."""
+        # Create a file larger than the limit
+        max_size = dify_config.ANNOTATION_IMPORT_FILE_SIZE_LIMIT * 1024 * 1024
+        large_content = b"x" * (max_size + 1024)  # Exceed by 1KB
+
+        file = FileStorage(stream=io.BytesIO(large_content), filename="test.csv", content_type="text/csv")
+
+        # Should be rejected in controller
+        # This would be tested in integration tests with actual endpoint
+
+    def test_empty_file_rejected(self):
+        """Test that empty files are rejected."""
+        file = FileStorage(stream=io.BytesIO(b""), filename="test.csv", content_type="text/csv")
+
+        # Should be rejected
+        # This would be tested in integration tests
+
+    def test_non_csv_file_rejected(self):
+        """Test that non-CSV files are rejected."""
+        file = FileStorage(stream=io.BytesIO(b"test"), filename="test.txt", content_type="text/plain")
+
+        # Should be rejected based on extension
+        # This would be tested in integration tests
+
+
+class TestAnnotationImportServiceValidation:
+    """Test service layer validation for annotation import."""
+
+    @pytest.fixture
+    def mock_app(self):
+        """Mock application object."""
+        app = MagicMock()
+        app.id = "app_id"
+        return app
+
+    @pytest.fixture
+    def mock_db_session(self):
+        """Mock database session."""
+        with patch("services.annotation_service.db.session") as mock:
+            yield mock
+
+    def test_max_records_limit_enforced(self, mock_app, mock_db_session):
+        """Test that files with too many records are rejected."""
+        from services.annotation_service import AppAnnotationService
+
+        # Create CSV with too many records
+        max_records = dify_config.ANNOTATION_IMPORT_MAX_RECORDS
+        csv_content = "question,answer\n"
+        for i in range(max_records + 100):
+            csv_content += f"Question {i},Answer {i}\n"
+
+        file = FileStorage(stream=io.BytesIO(csv_content.encode()), filename="test.csv", content_type="text/csv")
+
+        mock_db_session.query.return_value.where.return_value.first.return_value = mock_app
+
+        with patch("services.annotation_service.current_account_with_tenant") as mock_auth:
+            mock_auth.return_value = (MagicMock(id="user_id"), "tenant_id")
+
+            with patch("services.annotation_service.FeatureService") as mock_features:
+                mock_features.get_features.return_value.billing.enabled = False
+
+                result = AppAnnotationService.batch_import_app_annotations("app_id", file)
+
+                # Should return error about too many records
+                assert "error_msg" in result
+                assert "too many" in result["error_msg"].lower() or "maximum" in result["error_msg"].lower()
+
+    def test_min_records_limit_enforced(self, mock_app, mock_db_session):
+        """Test that files with too few valid records are rejected."""
+        from services.annotation_service import AppAnnotationService
+
+        # Create CSV with only header (no data rows)
+        csv_content = "question,answer\n"
+
+        file = FileStorage(stream=io.BytesIO(csv_content.encode()), filename="test.csv", content_type="text/csv")
+
+        mock_db_session.query.return_value.where.return_value.first.return_value = mock_app
+
+        with patch("services.annotation_service.current_account_with_tenant") as mock_auth:
+            mock_auth.return_value = (MagicMock(id="user_id"), "tenant_id")
+
+            result = AppAnnotationService.batch_import_app_annotations("app_id", file)
+
+            # Should return error about insufficient records
+            assert "error_msg" in result
+            assert "at least" in result["error_msg"].lower() or "minimum" in result["error_msg"].lower()
+
+    def test_invalid_csv_format_handled(self, mock_app, mock_db_session):
+        """Test that invalid CSV format is handled gracefully."""
+        from services.annotation_service import AppAnnotationService
+
+        # Create invalid CSV content
+        csv_content = 'invalid,csv,format\nwith,unbalanced,quotes,and"stuff'
+
+        file = FileStorage(stream=io.BytesIO(csv_content.encode()), filename="test.csv", content_type="text/csv")
+
+        mock_db_session.query.return_value.where.return_value.first.return_value = mock_app
+
+        with patch("services.annotation_service.current_account_with_tenant") as mock_auth:
+            mock_auth.return_value = (MagicMock(id="user_id"), "tenant_id")
+
+            result = AppAnnotationService.batch_import_app_annotations("app_id", file)
+
+            # Should return error message
+            assert "error_msg" in result
+
+    def test_valid_import_succeeds(self, mock_app, mock_db_session):
+        """Test that valid import request succeeds."""
+        from services.annotation_service import AppAnnotationService
+
+        # Create valid CSV
+        csv_content = "question,answer\nWhat is AI?,Artificial Intelligence\nWhat is ML?,Machine Learning\n"
+
+        file = FileStorage(stream=io.BytesIO(csv_content.encode()), filename="test.csv", content_type="text/csv")
+
+        mock_db_session.query.return_value.where.return_value.first.return_value = mock_app
+
+        with patch("services.annotation_service.current_account_with_tenant") as mock_auth:
+            mock_auth.return_value = (MagicMock(id="user_id"), "tenant_id")
+
+            with patch("services.annotation_service.FeatureService") as mock_features:
+                mock_features.get_features.return_value.billing.enabled = False
+
+                with patch("services.annotation_service.batch_import_annotations_task") as mock_task:
+                    with patch("services.annotation_service.redis_client"):
+                        result = AppAnnotationService.batch_import_app_annotations("app_id", file)
+
+                        # Should return success response
+                        assert "job_id" in result
+                        assert "job_status" in result
+                        assert result["job_status"] == "waiting"
+                        assert "record_count" in result
+                        assert result["record_count"] == 2
+
+
+class TestAnnotationImportTaskOptimization:
+    """Test optimizations in batch import task."""
+
+    def test_task_has_timeout_configured(self):
+        """Test that task has proper timeout configuration."""
+        from tasks.annotation.batch_import_annotations_task import batch_import_annotations_task
+
+        # Verify task configuration
+        assert hasattr(batch_import_annotations_task, "time_limit")
+        assert hasattr(batch_import_annotations_task, "soft_time_limit")
+
+        # Check timeout values are reasonable
+        # Hard limit should be 6 minutes (360s)
+        # Soft limit should be 5 minutes (300s)
+        # Note: actual values depend on Celery configuration
+
+
+class TestConfigurationValues:
+    """Test that security configuration values are properly set."""
+
+    def test_rate_limit_configs_exist(self):
+        """Test that rate limit configurations are defined."""
+        assert hasattr(dify_config, "ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE")
+        assert hasattr(dify_config, "ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR")
+
+        assert dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE > 0
+        assert dify_config.ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR > 0
+
+    def test_file_size_limit_config_exists(self):
+        """Test that file size limit configuration is defined."""
+        assert hasattr(dify_config, "ANNOTATION_IMPORT_FILE_SIZE_LIMIT")
+        assert dify_config.ANNOTATION_IMPORT_FILE_SIZE_LIMIT > 0
+        assert dify_config.ANNOTATION_IMPORT_FILE_SIZE_LIMIT <= 10  # Reasonable max (10MB)
+
+    def test_record_limit_configs_exist(self):
+        """Test that record limit configurations are defined."""
+        assert hasattr(dify_config, "ANNOTATION_IMPORT_MAX_RECORDS")
+        assert hasattr(dify_config, "ANNOTATION_IMPORT_MIN_RECORDS")
+
+        assert dify_config.ANNOTATION_IMPORT_MAX_RECORDS > 0
+        assert dify_config.ANNOTATION_IMPORT_MIN_RECORDS > 0
+        assert dify_config.ANNOTATION_IMPORT_MIN_RECORDS < dify_config.ANNOTATION_IMPORT_MAX_RECORDS
+
+    def test_concurrency_limit_config_exists(self):
+        """Test that concurrency limit configuration is defined."""
+        assert hasattr(dify_config, "ANNOTATION_IMPORT_MAX_CONCURRENT")
+        assert dify_config.ANNOTATION_IMPORT_MAX_CONCURRENT > 0
+        assert dify_config.ANNOTATION_IMPORT_MAX_CONCURRENT <= 10  # Reasonable upper bound
diff --git a/docker/.env.example b/docker/.env.example
index 0227f75b70..8be75420b1 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -1448,5 +1448,16 @@ WORKFLOW_SCHEDULE_MAX_DISPATCH_PER_TICK=0
 # Tenant isolated task queue configuration
 TENANT_ISOLATED_TASK_CONCURRENCY=1
 
+# Maximum allowed CSV file size for annotation import in megabytes
+ANNOTATION_IMPORT_FILE_SIZE_LIMIT=2
+# Maximum number of annotation records allowed in a single import
+ANNOTATION_IMPORT_MAX_RECORDS=10000
+# Minimum number of annotation records required in a single import
+ANNOTATION_IMPORT_MIN_RECORDS=1
+ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE=5
+ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR=20
+# Maximum number of concurrent annotation import tasks per tenant
+ANNOTATION_IMPORT_MAX_CONCURRENT=5
+
 # The API key of amplitude
 AMPLITUDE_API_KEY=
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index 71c0a5e687..cc17b2853a 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -648,6 +648,12 @@ x-shared-env: &shared-api-worker-env
   WORKFLOW_SCHEDULE_POLLER_BATCH_SIZE: ${WORKFLOW_SCHEDULE_POLLER_BATCH_SIZE:-100}
   WORKFLOW_SCHEDULE_MAX_DISPATCH_PER_TICK: ${WORKFLOW_SCHEDULE_MAX_DISPATCH_PER_TICK:-0}
   TENANT_ISOLATED_TASK_CONCURRENCY: ${TENANT_ISOLATED_TASK_CONCURRENCY:-1}
+  ANNOTATION_IMPORT_FILE_SIZE_LIMIT: ${ANNOTATION_IMPORT_FILE_SIZE_LIMIT:-2}
+  ANNOTATION_IMPORT_MAX_RECORDS: ${ANNOTATION_IMPORT_MAX_RECORDS:-10000}
+  ANNOTATION_IMPORT_MIN_RECORDS: ${ANNOTATION_IMPORT_MIN_RECORDS:-1}
+  ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE: ${ANNOTATION_IMPORT_RATE_LIMIT_PER_MINUTE:-5}
+  ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR: ${ANNOTATION_IMPORT_RATE_LIMIT_PER_HOUR:-20}
+  ANNOTATION_IMPORT_MAX_CONCURRENT: ${ANNOTATION_IMPORT_MAX_CONCURRENT:-5}
   AMPLITUDE_API_KEY: ${AMPLITUDE_API_KEY:-}
 
 services: