dify/api/tests/unit_tests/services/test_variable_truncator.py

"""
Comprehensive unit tests for VariableTruncator class based on current implementation.

This test suite covers all functionality of the current VariableTruncator including:
- JSON size calculation for different data types
- String, array, and object truncation logic
- Segment-based truncation interface
- Helper methods for budget-based truncation
- Edge cases and error handling
"""

import functools
import json
import uuid
from typing import Any
from uuid import uuid4

import pytest

from core.variables.segments import (
    ArrayFileSegment,
    ArrayNumberSegment,
    ArraySegment,
    FileSegment,
    FloatSegment,
    IntegerSegment,
    NoneSegment,
    ObjectSegment,
    StringSegment,
)
from core.workflow.file.enums import FileTransferMethod, FileType
from core.workflow.file.models import File
from services.variable_truncator import (
    DummyVariableTruncator,
    MaxDepthExceededError,
    TruncationResult,
    UnknownTypeError,
    VariableTruncator,
)


@pytest.fixture
def file() -> File:
    """Build a ``File`` describing a small local plain-text document.

    The ``id``, ``tenant_id`` and ``related_id`` fields receive freshly
    generated random UUID strings on every call; every other field is a
    fixed literal.
    """
    file_id, tenant_id, related_id = (str(uuid.uuid4()) for _ in range(3))
    return File(
        id=file_id,
        tenant_id=tenant_id,
        type=FileType.DOCUMENT,
        transfer_method=FileTransferMethod.LOCAL_FILE,
        related_id=related_id,
        filename="test_file.txt",
        extension=".txt",
        mime_type="text/plain",
        size=1024,
        storage_key="initial_key",
    )


# ``json.dumps`` preconfigured with separators that contain no whitespace,
# so the serialized text is as compact as possible.
_compact_json_dumps = functools.partial(json.dumps, separators=(",", ":"))


class TestCalculateJsonSize:
    """Exercise ``calculate_json_size`` against each kind of JSON value."""

    @pytest.fixture
    def truncator(self):
        """Provide a ``VariableTruncator`` built with no constructor arguments."""
        return VariableTruncator()

    def test_string_size_calculation(self):
        """A string is expected to cost its character count plus two quote characters."""
        size_of = VariableTruncator.calculate_json_size

        # Five ASCII characters wrapped in a pair of quotes.
        assert size_of("hello") == 7

        # Only the two quotes remain for the empty string.
        assert size_of("") == 2

        # Two CJK characters: the expected size counts characters, not UTF-8 bytes.
        assert size_of("你好") == 4

    def test_number_size_calculation(self, truncator):
        """Numbers are expected to cost the length of their literal text."""
        expectations = ((123, 3), (12.34, 5), (-456, 4), (0, 1))
        for number, expected_size in expectations:
            assert truncator.calculate_json_size(number) == expected_size

    def test_boolean_size_calculation(self, truncator):
        """Booleans are expected to cost the length of their JSON keyword."""
        assert truncator.calculate_json_size(False) == 5  # len("false")
        assert truncator.calculate_json_size(True) == 4  # len("true")

    def test_null_size_calculation(self, truncator):
        """``None`` is expected to cost the length of the JSON keyword ``null``."""
        assert truncator.calculate_json_size(None) == 4  # len("null")

    def test_array_size_calculation(self, truncator):
        """Arrays are expected to cost their elements plus commas and two brackets."""
        # "[]" -> the two brackets only.
        assert truncator.calculate_json_size([]) == 2

        # [1,2,3] -> three one-digit numbers, two commas, two brackets.
        digits = [1, 2, 3]
        assert truncator.calculate_json_size(digits) == 7

        # ["a","b"] -> two 3-character quoted strings, one comma, two brackets.
        letters = ["a", "b"]
        assert truncator.calculate_json_size(letters) == 9

    def test_object_size_calculation(self, truncator):
        """Objects are expected to cost keys, colons, values, commas and two braces."""
        # "{}" -> the two braces only.
        assert truncator.calculate_json_size({}) == 2

        # {"a":1} -> 3-character quoted key, colon, one digit, two braces.
        single_entry = {"a": 1}
        assert truncator.calculate_json_size(single_entry) == 7

        # {"a":1,"b":2} -> two 5-character entries, one comma, two braces.
        two_entries = {"a": 1, "b": 2}
        assert truncator.calculate_json_size(two_entries) == 13

    def test_nested_structure_size_calculation(self, truncator):
        """A nested payload's size should land within 5 of its compact JSON byte length."""
        payload = {"items": [1, 2, {"nested": "value"}]}
        estimated = truncator.calculate_json_size(payload)
        # The computation must complete and yield a positive size.
        assert estimated > 0

        # Compare against the real compact serialization; a small tolerance is
        # allowed rather than demanding an exact match.
        encoded_length = len(_compact_json_dumps(payload).encode())
        assert abs(estimated - encoded_length) <= 5

    def test_calculate_json_size_max_depth_exceeded(self, truncator):
        """A chain of 106 nested dicts is expected to raise ``MaxDepthExceededError``."""
        # Build {"level": 0, "next": {"level": 1, "next": {...}}} down to level 105.
        root: dict[str, Any] = {"level": 0}
        cursor = root
        for depth in range(1, 106):
            cursor["next"] = {"level": depth}
            cursor = cursor["next"]

        # The test requires an exception here; silently returning a size would fail it.
        with pytest.raises(MaxDepthExceededError):
            truncator.calculate_json_size(root)

    def test_calculate_json_size_unknown_type(self, truncator):
        """An instance of an arbitrary user class is expected to raise ``UnknownTypeError``."""

        class Unsupported:
            pass

        with pytest.raises(UnknownTypeError):
            truncator.calculate_json_size(Unsupported())


class TestStringTruncation:
    """Test string truncation functionality."""

    # Size budget passed to ``_truncate_string`` in every test; the fixture
    # reuses it as ``string_length_limit`` so the two cannot drift apart.
    LENGTH_LIMIT = 10

    @pytest.fixture
    def small_truncator(self):
        """Return a truncator whose ``string_length_limit`` equals ``LENGTH_LIMIT``."""
        return VariableTruncator(string_length_limit=self.LENGTH_LIMIT)

    def test_short_string_no_truncation(self, small_truncator):
        """Test that short strings are not truncated."""
        short_str = "hello"
        result = small_truncator._truncate_string(short_str, self.LENGTH_LIMIT)
        assert result.value == short_str
        assert result.truncated is False
        # The reported size must agree with the public size calculation.
        assert result.value_size == VariableTruncator.calculate_json_size(short_str)

    def test_long_string_truncation(self, small_truncator: VariableTruncator):
        """Test that long strings are truncated with ellipsis."""
        long_str = "this is a very long string that exceeds the limit"
        result = small_truncator._truncate_string(long_str, self.LENGTH_LIMIT)

        assert result.truncated is True
        assert result.value == long_str[:5] + "..."
        assert result.value_size == 10  # 5 kept chars + "..." + 2 quotes

    def test_exact_limit_string(self, small_truncator: VariableTruncator):
        """Test a string whose character count equals the limit.

        The string is still expected to be truncated.
        NOTE(review): presumably because its quoted JSON size (12) exceeds the
        budget of 10 -- confirm against ``_truncate_string``.
        """
        exact_str = "1234567890"  # Exactly 10 chars
        result = small_truncator._truncate_string(exact_str, self.LENGTH_LIMIT)
        assert result.value == "12345..."
        assert result.truncated is True
        assert result.value_size == 10  # 5 kept chars + "..." + 2 quotes


class TestArrayTruncation:
    """Exercise ``_truncate_array`` under element-count and size-budget limits."""

    @pytest.fixture
    def small_truncator(self):
        """Provide a truncator with ``array_element_limit=3`` and ``max_size_bytes=100``."""
        return VariableTruncator(max_size_bytes=100, array_element_limit=3)

    def test_small_array_no_truncation(self, small_truncator: VariableTruncator):
        """A two-element array with a generous budget must come back untouched."""
        pair = [1, 2]
        outcome = small_truncator._truncate_array(pair, 1000)
        assert outcome.truncated is False
        assert outcome.value == pair

    def test_array_element_limit_truncation(self, small_truncator: VariableTruncator):
        """Six elements against a limit of three: only the first three survive."""
        six_items = [1, 2, 3, 4, 5, 6]
        outcome = small_truncator._truncate_array(six_items, 1000)

        assert outcome.value == [1, 2, 3]
        assert outcome.truncated is True

    def test_array_size_budget_truncation(self, small_truncator: VariableTruncator):
        """Long string elements must be cut down until the array fits a 50-unit budget."""
        # Two strings that are each well beyond the budget on their own.
        oversized = ["very long string " * 5, "another long string " * 5]
        outcome = small_truncator._truncate_array(oversized, 50)

        assert outcome.truncated is True
        # Elements stay strings after truncation ...
        assert all(isinstance(element, str) for element in outcome.value)
        # ... and the whole array respects the budget.
        assert VariableTruncator.calculate_json_size(outcome.value) <= 50

    def test_array_with_nested_objects(self, small_truncator):
        """An array of dicts stays a list of dicts with at most three entries."""
        payloads = ("some data", "more data", "even more data")
        records = [{"name": f"item{index}", "data": text} for index, text in enumerate(payloads, start=1)]
        outcome = small_truncator._truncate_array(records, 30)

        assert isinstance(outcome.value, list)
        assert len(outcome.value) <= 3
        assert all(isinstance(entry, dict) for entry in outcome.value)


class TestObjectTruncation:
    """Exercise ``_truncate_object`` with small, oversized and nested dictionaries."""

    @pytest.fixture
    def small_truncator(self):
        """Provide a truncator constructed with ``max_size_bytes=100``."""
        return VariableTruncator(max_size_bytes=100)

    def test_small_object_no_truncation(self, small_truncator):
        """A two-key object with a generous budget must come back untouched."""
        tiny = {"a": 1, "b": 2}
        outcome = small_truncator._truncate_object(tiny, 1000)
        assert outcome.truncated is False
        assert outcome.value == tiny

    def test_empty_object_no_truncation(self, small_truncator):
        """An empty object must come back untouched."""
        nothing = {}
        outcome = small_truncator._truncate_object(nothing, 100)
        assert outcome.truncated is False
        assert outcome.value == nothing

    def test_object_value_truncation(self, small_truncator):
        """Oversized string values are shortened (or their keys dropped) to fit the budget."""
        source = {
            "key1": "very long string " * 10,
            "key2": "another long string " * 10,
            "key3": "third long string " * 10,
        }
        outcome = small_truncator._truncate_object(source, 80)

        assert outcome.truncated is True
        assert isinstance(outcome.value, dict)

        # No key may appear in the result that was not in the input.
        assert set(outcome.value) <= set(source)

        # Every surviving string value is no longer than the one it came from.
        surviving_strings = ((key, value) for key, value in outcome.value.items() if isinstance(value, str))
        for key, shortened in surviving_strings:
            assert len(shortened) <= len(source[key])

    def test_object_key_dropping(self, small_truncator):
        """Twenty entries against a 50-unit budget: some keys must be dropped."""
        source = {f"key{i:02d}": f"value{i}" for i in range(20)}
        outcome = small_truncator._truncate_object(source, 50)

        assert outcome.truncated is True
        assert len(outcome.value) < len(source)

        # The keys that remain are expected to be in sorted order.
        kept_keys = list(outcome.value.keys())
        assert kept_keys == sorted(kept_keys)

    def test_object_with_nested_structures(self, small_truncator):
        """An object containing a list and a nested dict still yields a dict."""
        source = {
            "simple": "value",
            "array": [1, 2, 3, 4, 5],
            "nested": {"inner": "data", "more": ["a", "b", "c"]},
        }
        outcome = small_truncator._truncate_object(source, 60)

        assert isinstance(outcome.value, dict)


class TestSegmentBasedTruncation:
    """Test the main truncate method that works with Segments."""

    @pytest.fixture
    def truncator(self):
        """Return a truncator built with no constructor arguments."""
        return VariableTruncator()

    @pytest.fixture
    def small_truncator(self):
        """Return a truncator with deliberately tight limits so modest inputs get truncated."""
        return VariableTruncator(string_length_limit=20, array_element_limit=3, max_size_bytes=200)

    def test_integer_segment_no_truncation(self, truncator):
        """Test that integer segments are never truncated."""
        segment = IntegerSegment(value=12345)
        result = truncator.truncate(segment)

        assert isinstance(result, TruncationResult)
        assert result.truncated is False
        assert result.result == segment

    def test_boolean_as_integer_segment(self, truncator):
        """Test boolean values in IntegerSegment are converted to int."""
        segment = IntegerSegment(value=True)
        result = truncator.truncate(segment)

        assert isinstance(result, TruncationResult)
        assert result.truncated is False
        assert isinstance(result.result, IntegerSegment)
        # Note: ``True == 1`` in Python, so this comparison alone does not
        # distinguish a converted int from an unconverted bool.
        assert result.result.value == 1  # True converted to 1

    def test_float_segment_no_truncation(self, truncator):
        """Test that float segments are never truncated."""
        segment = FloatSegment(value=123.456)
        result = truncator.truncate(segment)

        assert isinstance(result, TruncationResult)
        assert result.truncated is False
        assert result.result == segment

    def test_none_segment_no_truncation(self, truncator):
        """Test that None segments are never truncated."""
        segment = NoneSegment()
        result = truncator.truncate(segment)

        assert isinstance(result, TruncationResult)
        assert result.truncated is False
        assert result.result == segment

    def test_file_segment_no_truncation(self, truncator, file):
        """Test that file segments are never truncated."""
        file_segment = FileSegment(value=file)
        result = truncator.truncate(file_segment)
        assert result.result == file_segment
        assert result.truncated is False

    def test_array_file_segment_no_truncation(self, truncator, file):
        """Test that array file segments are never truncated."""
        # Twenty references to the same file object.
        array_file_segment = ArrayFileSegment(value=[file] * 20)
        result = truncator.truncate(array_file_segment)
        assert result.result == array_file_segment
        assert result.truncated is False

    def test_string_segment_small_no_truncation(self, truncator):
        """Test small string segments are not truncated."""
        segment = StringSegment(value="hello world")
        result = truncator.truncate(segment)

        assert isinstance(result, TruncationResult)
        assert result.truncated is False
        assert result.result == segment

    def test_string_segment_large_truncation(self, small_truncator):
        """Test large string segments are truncated."""
        long_text = "this is a very long string that will definitely exceed the limit"
        segment = StringSegment(value=long_text)
        result = small_truncator.truncate(segment)

        assert isinstance(result, TruncationResult)
        assert result.truncated is True
        assert isinstance(result.result, StringSegment)
        assert len(result.result.value) < len(long_text)
        # Truncated strings are expected to carry an ellipsis suffix.
        assert result.result.value.endswith("...")

    def test_array_segment_small_no_truncation(self, truncator):
        """Test small array segments are not truncated."""
        from factories.variable_factory import build_segment

        segment = build_segment([1, 2, 3])
        result = truncator.truncate(segment)

        assert isinstance(result, TruncationResult)
        assert result.truncated is False
        assert result.result == segment

    def test_array_segment_large_truncation(self, small_truncator):
        """Test large array segments are truncated."""
        from factories.variable_factory import build_segment

        large_array = list(range(10))  # Exceeds element limit of 3
        segment = build_segment(large_array)
        result = small_truncator.truncate(segment)

        assert isinstance(result, TruncationResult)
        assert result.truncated is True
        assert isinstance(result.result, ArraySegment)
        assert len(result.result.value) <= 3

    def test_object_segment_small_no_truncation(self, truncator):
        """Test small object segments are not truncated."""
        segment = ObjectSegment(value={"key": "value"})
        result = truncator.truncate(segment)

        assert isinstance(result, TruncationResult)
        assert result.truncated is False
        assert result.result == segment

    def test_object_segment_large_truncation(self, small_truncator):
        """Test large object segments are truncated."""
        large_obj = {f"key{i}": f"very long value {i}" * 5 for i in range(5)}
        segment = ObjectSegment(value=large_obj)
        result = small_truncator.truncate(segment)

        assert isinstance(result, TruncationResult)
        assert result.truncated is True
        assert isinstance(result.result, ObjectSegment)
        # Object should be smaller or equal than original
        original_size = small_truncator.calculate_json_size(large_obj)
        result_size = small_truncator.calculate_json_size(result.result.value)
        assert result_size <= original_size

    def test_final_size_fallback_to_json_string(self):
        """Test final fallback when truncated result still exceeds size limit."""
        # Create data that will still be large after initial truncation
        large_nested_data = {"data": ["very long string " * 5] * 5, "more": {"nested": "content " * 20}}
        segment = ObjectSegment(value=large_nested_data)

        # Use very small limit to force JSON string fallback; the shared
        # fixtures are too permissive for this, so build a dedicated truncator.
        tiny_truncator = VariableTruncator(max_size_bytes=50)
        result = tiny_truncator.truncate(segment)

        assert isinstance(result, TruncationResult)
        assert result.truncated is True
        # The object is expected to be replaced by a string segment.
        assert isinstance(result.result, StringSegment)
        # Should be JSON string with possible truncation
        assert len(result.result.value) <= 53  # 50 + "..." = 53

    def test_final_size_fallback_string_truncation(self):
        """Test final fallback for string that still exceeds limit."""
        # Far longer than both limits configured on the truncator below.
        very_long_string = "x" * 6000
        segment = StringSegment(value=very_long_string)

        # Use small limits to test the string fallback path; the shared
        # fixtures are too permissive for this, so build a dedicated truncator.
        tiny_truncator = VariableTruncator(string_length_limit=100, max_size_bytes=50)
        result = tiny_truncator.truncate(segment)

        assert isinstance(result, TruncationResult)
        assert result.truncated is True
        assert isinstance(result.result, StringSegment)
        # Should be truncated due to string limit or final size limit
        assert len(result.result.value) <= 1000  # Much smaller than original


class TestEdgeCases:
    """Test edge cases and error conditions."""

    def test_empty_inputs(self):
        """Test truncator with empty inputs."""
        truncator = VariableTruncator()

        # Empty string
        result = truncator.truncate(StringSegment(value=""))
        assert not result.truncated
        assert result.result.value == ""

        # Empty array
        from factories.variable_factory import build_segment

        result = truncator.truncate(build_segment([]))
        assert not result.truncated
        assert result.result.value == []

        # Empty object
        result = truncator.truncate(ObjectSegment(value={}))
        assert not result.truncated
        assert result.result.value == {}

    def test_zero_and_negative_limits(self):
        """Test that the constructor rejects zero or very small limits.

        NOTE(review): despite the test name, no negative limit is exercised here.
        """
        # A string limit of 3 is expected to be rejected.
        # NOTE(review): presumably too small to hold any text besides the
        # "..." suffix -- confirm against the constructor's validation.
        with pytest.raises(ValueError):
            VariableTruncator(string_length_limit=3)

        # Zero array element limit
        with pytest.raises(ValueError):
            VariableTruncator(array_element_limit=0)

        # Zero overall size limit
        with pytest.raises(ValueError):
            VariableTruncator(max_size_bytes=0)

    def test_unicode_and_special_characters(self):
        """Test truncator with unicode and special characters."""
        truncator = VariableTruncator(string_length_limit=10)

        # Unicode characters
        unicode_text = "🌍🚀🌍🚀🌍🚀🌍🚀🌍🚀"  # Each emoji counts as 1 character
        result = truncator.truncate(StringSegment(value=unicode_text))
        # NOTE(review): unicode_text appears to contain exactly 10 code points,
        # so this guard is False and the assertion below never runs. Decide the
        # intended boundary behaviour and make the check unconditional.
        if len(unicode_text) > 10:
            assert result.truncated is True

        # Special JSON characters
        special_chars = '{"key": "value with \\"quotes\\" and \\n newlines"}'
        result = truncator.truncate(StringSegment(value=special_chars))
        # Only the result type is checked; the content is not inspected.
        assert isinstance(result.result, StringSegment)


class TestTruncateJsonPrimitives:
    """Exercise ``_truncate_json_primitives`` with ``File`` values, bare and nested."""

    @pytest.fixture
    def truncator(self):
        """Provide a ``VariableTruncator`` built with no constructor arguments."""
        return VariableTruncator()

    def test_truncate_json_primitives_file_type(self, truncator, file):
        """A bare ``File`` must pass through unchanged with a consistent reported size."""
        outcome = truncator._truncate_json_primitives(file, 1000)

        assert outcome.value == file
        assert outcome.truncated is False
        # The reported size must agree with the public size calculation.
        assert outcome.value_size == VariableTruncator.calculate_json_size(file)

    def test_truncate_json_primitives_file_type_small_budget(self, truncator, file):
        """A bare ``File`` must pass through unchanged even when the budget is only 10."""
        outcome = truncator._truncate_json_primitives(file, 10)

        assert outcome.value == file
        assert outcome.truncated is False

    def test_truncate_json_primitives_file_type_in_array(self, truncator, file):
        """Two ``File`` values inside a list must both survive, untruncated."""
        files = [file] * 2
        outcome = truncator._truncate_json_primitives(files, 1000)

        assert isinstance(outcome.value, list)
        assert len(outcome.value) == 2
        for element in outcome.value:
            assert element == file
        assert outcome.truncated is False

    def test_truncate_json_primitives_file_type_in_object(self, truncator, file):
        """Two ``File`` values inside a dict must both survive, untruncated."""
        files_by_key = {"file1": file, "file2": file}
        outcome = truncator._truncate_json_primitives(files_by_key, 1000)

        assert isinstance(outcome.value, dict)
        assert len(outcome.value) == 2
        for key in ("file1", "file2"):
            assert outcome.value[key] == file
        assert outcome.truncated is False


class TestIntegrationScenarios:
    """End-to-end style checks using payloads shaped like real workflow data."""

    def test_workflow_output_scenario(self):
        """A nested workflow-style payload must come back as an object or string segment."""
        truncator = VariableTruncator()

        # Two user records repeated three times gives a six-entry list.
        user_records = [
            {"id": 1, "name": "Alice", "email": "alice@example.com"},
            {"id": 2, "name": "Bob", "email": "bob@example.com"},
        ] * 3
        run_metadata = {
            "count": 6,
            "processing_time": "1.23s",
            "details": "x" * 200,  # a longish, but not enormous, string
        }
        payload = {
            "result": "success",
            "data": {"users": user_records, "metadata": run_metadata},
        }

        outcome = truncator.truncate(ObjectSegment(value=payload))

        assert isinstance(outcome, TruncationResult)
        # Either the object survives as an object or it is replaced by a string.
        assert isinstance(outcome.result, (ObjectSegment, StringSegment))

    def test_large_text_processing_scenario(self):
        """A 700-character text with a 100-character limit is cut and suffixed with an ellipsis."""
        truncator = VariableTruncator(string_length_limit=100)

        # 35 characters repeated 20 times, well past the limit.
        document = "This is a very long text document. " * 20

        outcome = truncator.truncate(StringSegment(value=document))

        assert isinstance(outcome, TruncationResult)
        assert outcome.truncated is True
        assert isinstance(outcome.result, StringSegment)
        shortened = outcome.result.value
        assert len(shortened) <= 103  # 100 + "..."
        assert shortened.endswith("...")

    def test_mixed_data_types_scenario(self):
        """A payload mixing every JSON value kind must not grow when truncated."""
        truncator = VariableTruncator(string_length_limit=30, array_element_limit=3, max_size_bytes=300)

        payload = {
            "strings": ["short", "medium length", "very long string " * 3],
            "numbers": [1, 2.5, 999999],
            "booleans": [True, False, True],
            "nested": {
                "more_strings": ["nested string " * 2],
                "more_numbers": list(range(5)),
                "deep": {"level": 3, "content": "deep content " * 3},
            },
            "nulls": [None, None],
        }

        outcome = truncator.truncate(ObjectSegment(value=payload))

        assert isinstance(outcome, TruncationResult)
        # Nothing further to verify when the payload was left alone.
        if not outcome.truncated:
            return

        # When truncation happened and the result is still an object, it must
        # be no larger than the input.
        baseline_size = truncator.calculate_json_size(payload)
        if isinstance(outcome.result, ObjectSegment):
            assert truncator.calculate_json_size(outcome.result.value) <= baseline_size

    def test_file_and_array_file_variable_mapping(self, file):
        """A mapping whose only value is a list containing a file must pass through unchanged."""
        truncator = VariableTruncator(string_length_limit=30, array_element_limit=3, max_size_bytes=300)

        source_mapping = {"array_file": [file]}
        output_mapping, was_truncated = truncator.truncate_variable_mapping(source_mapping)
        assert was_truncated is False
        assert output_mapping == source_mapping


def test_dummy_variable_truncator_methods():
    """Check that ``DummyVariableTruncator`` returns its inputs unchanged and untruncated."""
    dummy = DummyVariableTruncator()

    # truncate_variable_mapping: even a 2000-element list must pass through.
    mapping: dict[str, Any] = {
        "key1": "value1",
        "key2": ["item1", "item2"],
        "large_array": list(range(2000)),
    }
    returned_mapping, was_truncated = dummy.truncate_variable_mapping(mapping)

    assert returned_mapping == mapping
    assert not was_truncated

    # truncate: a short string segment comes back as the same segment.
    string_segment = StringSegment(value="test string")
    outcome = dummy.truncate(string_segment)
    assert isinstance(outcome, TruncationResult)
    assert outcome.result == string_segment
    assert outcome.truncated is False

    # truncate: a 2000-number array segment also comes back as the same segment.
    number_segment = ArrayNumberSegment(value=list(range(2000)))
    outcome = dummy.truncate(number_segment)
    assert isinstance(outcome, TruncationResult)
    assert outcome.result == number_segment
    assert outcome.truncated is False