feat: Add Aliyun SLS (Simple Log Service) integration for workflow execution logging (#28986)

Co-authored-by: hieheihei <270985384@qq.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: -LAN- <laipz8200@outlook.com>
This commit is contained in:
fanadong
2025-12-17 13:43:54 +08:00
committed by GitHub
parent 94a5fd3617
commit 44f8915e30
16 changed files with 5439 additions and 2308 deletions

View File

@@ -543,6 +543,25 @@ APP_MAX_EXECUTION_TIME=1200
APP_DEFAULT_ACTIVE_REQUESTS=0 APP_DEFAULT_ACTIVE_REQUESTS=0
APP_MAX_ACTIVE_REQUESTS=0 APP_MAX_ACTIVE_REQUESTS=0
# Aliyun SLS Logstore Configuration
# Aliyun Access Key ID
ALIYUN_SLS_ACCESS_KEY_ID=
# Aliyun Access Key Secret
ALIYUN_SLS_ACCESS_KEY_SECRET=
# Aliyun SLS Endpoint (e.g., cn-hangzhou.log.aliyuncs.com)
ALIYUN_SLS_ENDPOINT=
# Aliyun SLS Region (e.g., cn-hangzhou)
ALIYUN_SLS_REGION=
# Aliyun SLS Project Name
ALIYUN_SLS_PROJECT_NAME=
# Number of days to retain workflow run logs (default: 365; set to 3650 for permanent storage)
ALIYUN_SLS_LOGSTORE_TTL=365
# Enable dual-write to both SLS LogStore and SQL database (default: false)
LOGSTORE_DUAL_WRITE_ENABLED=false
# Enable dual-read fallback to SQL database when LogStore returns no results (default: true)
# Useful for migration scenarios where historical data exists only in SQL database
LOGSTORE_DUAL_READ_ENABLED=true
# Celery beat configuration # Celery beat configuration
CELERY_BEAT_SCHEDULER_TIME=1 CELERY_BEAT_SCHEDULER_TIME=1

View File

@@ -75,6 +75,7 @@ def initialize_extensions(app: DifyApp):
ext_import_modules, ext_import_modules,
ext_logging, ext_logging,
ext_login, ext_login,
ext_logstore,
ext_mail, ext_mail,
ext_migrate, ext_migrate,
ext_orjson, ext_orjson,
@@ -105,6 +106,7 @@ def initialize_extensions(app: DifyApp):
ext_migrate, ext_migrate,
ext_redis, ext_redis,
ext_storage, ext_storage,
ext_logstore, # Initialize logstore after storage, before celery
ext_celery, ext_celery,
ext_login, ext_login,
ext_mail, ext_mail,

View File

@@ -0,0 +1,74 @@
"""
Logstore extension for Dify application.
This extension initializes the logstore (Aliyun SLS) on application startup,
creating necessary projects, logstores, and indexes if they don't exist.
"""
import logging
import os
from dotenv import load_dotenv
from dify_app import DifyApp
logger = logging.getLogger(__name__)
def is_enabled() -> bool:
    """
    Determine whether the logstore extension should be activated.

    The extension is enabled only when every required Aliyun SLS setting is
    present (and non-empty) in the process environment.

    Returns:
        True when all required Aliyun SLS environment variables are set,
        False otherwise.
    """
    # Pull settings from a .env file into the process environment first.
    load_dotenv()
    required = (
        "ALIYUN_SLS_ACCESS_KEY_ID",
        "ALIYUN_SLS_ACCESS_KEY_SECRET",
        "ALIYUN_SLS_ENDPOINT",
        "ALIYUN_SLS_REGION",
        "ALIYUN_SLS_PROJECT_NAME",
    )
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        logger.info("Logstore extension disabled: required Aliyun SLS environment variables not set")
        return False
    return True
def init_app(app: DifyApp):
    """
    Initialize logstore on application startup.

    This function:
    1. Creates Aliyun SLS project if it doesn't exist
    2. Creates logstores (workflow_execution, workflow_node_execution) if they don't exist
    3. Creates indexes with field configurations based on PostgreSQL table structures

    This operation is idempotent and only executes once during application startup.

    Args:
        app: The Dify application instance
    """
    try:
        # Imported lazily so the Aliyun SDK is only required when the extension runs.
        from extensions.logstore.aliyun_logstore import AliyunLogStore

        logger.info("Initializing logstore...")
        # Create logstore client and initialize project/logstores/indexes.
        # AliyunLogStore is a singleton, so repeated calls reuse one client.
        logstore_client = AliyunLogStore()
        logstore_client.init_project_logstore()
        # Attach to app for potential later use
        app.extensions["logstore"] = logstore_client
        logger.info("Logstore initialized successfully")
    except Exception:
        logger.exception("Failed to initialize logstore")
        # Don't raise - allow application to continue even if logstore init fails
        # This ensures that the application can still run if logstore is misconfigured

View File

View File

@@ -0,0 +1,890 @@
import logging
import os
import threading
import time
from collections.abc import Sequence
from typing import Any
import sqlalchemy as sa
from aliyun.log import ( # type: ignore[import-untyped]
GetLogsRequest,
IndexConfig,
IndexKeyConfig,
IndexLineConfig,
LogClient,
LogItem,
PutLogsRequest,
)
from aliyun.log.auth import AUTH_VERSION_4 # type: ignore[import-untyped]
from aliyun.log.logexception import LogException # type: ignore[import-untyped]
from dotenv import load_dotenv
from sqlalchemy.orm import DeclarativeBase
from configs import dify_config
from extensions.logstore.aliyun_logstore_pg import AliyunLogStorePG
logger = logging.getLogger(__name__)
class AliyunLogStore:
    """
    Singleton class for Aliyun SLS LogStore operations.
    Ensures only one instance exists to prevent multiple PG connection pools.
    """

    # Singleton instance and one-shot init guard (see __new__ / __init__).
    _instance: "AliyunLogStore | None" = None
    _initialized: bool = False
    # Track delayed PG connection for newly created projects
    _pg_connection_timer: threading.Timer | None = None
    _pg_connection_delay: int = 90  # delay seconds
    # Default tokenizer for text/json fields and full-text index
    # Common delimiters: comma, space, quotes, punctuation, operators, brackets, special chars
    DEFAULT_TOKEN_LIST = [
        ",",
        " ",
        '"',
        '"',  # NOTE(review): duplicate of the previous entry — possibly a curly quote mangled in transit; confirm against the intended token set
        ";",
        "=",
        "(",
        ")",
        "[",
        "]",
        "{",
        "}",
        "?",
        "@",
        "&",
        "<",
        ">",
        "/",
        ":",
        "\n",
        "\t",
    ]

    def __new__(cls) -> "AliyunLogStore":
        """Implement singleton pattern."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    # SLS resource names: project description and the two logstores Dify writes to.
    project_des = "dify"
    workflow_execution_logstore = "workflow_execution"
    workflow_node_execution_logstore = "workflow_node_execution"
@staticmethod
def _sqlalchemy_type_to_logstore_type(column: Any) -> str:
    """
    Map SQLAlchemy column type to Aliyun LogStore index type.

    Args:
        column: SQLAlchemy column object

    Returns:
        LogStore index type: 'text', 'long', 'double', or 'json'
    """
    col_type = column.type
    # Ordered (SQLAlchemy types, logstore type) pairs; first isinstance match wins.
    # DateTime is stored as an ISO string and Boolean as 0/1 in logstore.
    mapping: list[tuple[tuple[type, ...], str]] = [
        ((sa.Integer, sa.BigInteger, sa.SmallInteger), "long"),
        ((sa.Float, sa.Numeric), "double"),
        ((sa.String, sa.Text), "text"),
        ((sa.DateTime,), "text"),
        ((sa.Boolean,), "long"),
        ((sa.JSON,), "json"),
    ]
    for sa_types, logstore_type in mapping:
        if isinstance(col_type, sa_types):
            return logstore_type
    # Unknown types fall back to full-text indexing.
    return "text"
@staticmethod
def _generate_index_keys_from_model(model_class: type[DeclarativeBase]) -> dict[str, IndexKeyConfig]:
    """
    Automatically generate LogStore field index configuration from SQLAlchemy model.

    This method introspects the SQLAlchemy model's column definitions and creates
    corresponding LogStore index configurations. When the PG schema is updated via
    Flask-Migrate, this method will automatically pick up the new fields on next startup.

    Args:
        model_class: SQLAlchemy model class (e.g., WorkflowRun, WorkflowNodeExecutionModel)

    Returns:
        Dictionary mapping field names to IndexKeyConfig objects
    """
    index_keys = {}
    # Iterate over all mapped columns in the model
    if hasattr(model_class, "__mapper__"):
        for column_name, column_property in model_class.__mapper__.columns.items():
            # Skip relationship properties and other non-column attributes
            if not hasattr(column_property, "type"):
                continue
            # Map SQLAlchemy type to LogStore type
            logstore_type = AliyunLogStore._sqlalchemy_type_to_logstore_type(column_property)
            # Create index configuration
            # - text fields: case_insensitive for better search, with tokenizer and Chinese support
            # - all fields: doc_value=True for analytics
            if logstore_type == "text":
                index_keys[column_name] = IndexKeyConfig(
                    index_type="text",
                    case_sensitive=False,
                    doc_value=True,
                    token_list=AliyunLogStore.DEFAULT_TOKEN_LIST,
                    chinese=True,
                )
            else:
                index_keys[column_name] = IndexKeyConfig(index_type=logstore_type, doc_value=True)
    # Add log_version field (not in PG model, but used in logstore for versioning)
    index_keys["log_version"] = IndexKeyConfig(index_type="long", doc_value=True)
    return index_keys
def __init__(self) -> None:
    """
    Read Aliyun SLS settings from the environment and build the SDK client.

    Runs at most once per process: the singleton guard below makes repeated
    construction a no-op, so only one SDK client (and later one PG pool) exists.
    """
    # Skip initialization if already initialized (singleton pattern)
    if self.__class__._initialized:
        return
    load_dotenv()
    self.access_key_id: str = os.environ.get("ALIYUN_SLS_ACCESS_KEY_ID", "")
    self.access_key_secret: str = os.environ.get("ALIYUN_SLS_ACCESS_KEY_SECRET", "")
    self.endpoint: str = os.environ.get("ALIYUN_SLS_ENDPOINT", "")
    self.region: str = os.environ.get("ALIYUN_SLS_REGION", "")
    self.project_name: str = os.environ.get("ALIYUN_SLS_PROJECT_NAME", "")
    self.logstore_ttl: int = int(os.environ.get("ALIYUN_SLS_LOGSTORE_TTL", 365))
    # Verbose logstore logging piggybacks on the SQLALCHEMY_ECHO flag.
    self.log_enabled: bool = os.environ.get("SQLALCHEMY_ECHO", "false").lower() == "true"
    self.pg_mode_enabled: bool = os.environ.get("LOGSTORE_PG_MODE_ENABLED", "true").lower() == "true"
    # Initialize SDK client
    self.client = LogClient(
        self.endpoint, self.access_key_id, self.access_key_secret, auth_version=AUTH_VERSION_4, region=self.region
    )
    # Append Dify identification to the existing user agent
    original_user_agent = self.client._user_agent  # pyright: ignore[reportPrivateUsage]
    dify_version = dify_config.project.version
    enhanced_user_agent = f"Dify,Dify-{dify_version},{original_user_agent}"
    self.client.set_user_agent(enhanced_user_agent)
    # PG client will be initialized in init_project_logstore
    self._pg_client: AliyunLogStorePG | None = None
    self._use_pg_protocol: bool = False
    self.__class__._initialized = True
@property
def supports_pg_protocol(self) -> bool:
    """Check if PG protocol is supported and enabled (set by connection init)."""
    return self._use_pg_protocol
def _attempt_pg_connection_init(self) -> bool:
    """
    Attempt to initialize PG connection.

    This method tries to establish PG connection and performs necessary checks.
    It's used both for immediate connection (existing projects) and delayed
    connection (new projects).

    Returns:
        True if PG connection was successfully established, False otherwise.
    """
    # Nothing to do when PG mode is off or the client was never constructed.
    if not self.pg_mode_enabled or not self._pg_client:
        return False
    try:
        self._use_pg_protocol = self._pg_client.init_connection()
        if self._use_pg_protocol:
            logger.info("Successfully connected to project %s using PG protocol", self.project_name)
            # Check if scan_index is enabled for all logstores; may flip us back to SDK mode.
            self._check_and_disable_pg_if_scan_index_disabled()
            return True
        else:
            logger.info("PG connection failed for project %s. Will use SDK mode.", self.project_name)
            return False
    except Exception as e:
        # Any failure here is non-fatal: SDK mode remains available as fallback.
        logger.warning(
            "Failed to establish PG connection for project %s: %s. Will use SDK mode.",
            self.project_name,
            str(e),
        )
        self._use_pg_protocol = False
        return False
def _delayed_pg_connection_init(self) -> None:
    """
    Delayed initialization of PG connection for newly created projects.

    This method is called by a background timer `_pg_connection_delay` seconds
    (currently 90) after project creation.
    """
    # Double check conditions in case state changed
    if self._use_pg_protocol:
        return
    logger.info(
        "Attempting delayed PG connection for newly created project %s ...",
        self.project_name,
    )
    self._attempt_pg_connection_init()
    # Clear the timer handle so a future init can schedule a fresh attempt.
    self.__class__._pg_connection_timer = None
def init_project_logstore(self):
"""
Initialize project, logstore, index, and PG connection.
This method should be called once during application startup to ensure
all required resources exist and connections are established.
"""
# Step 1: Ensure project and logstore exist
project_is_new = False
if not self.is_project_exist():
self.create_project()
project_is_new = True
self.create_logstore_if_not_exist()
# Step 2: Initialize PG client and connection (if enabled)
if not self.pg_mode_enabled:
logger.info("PG mode is disabled. Will use SDK mode.")
return
# Create PG client if not already created
if self._pg_client is None:
logger.info("Initializing PG client for project %s...", self.project_name)
self._pg_client = AliyunLogStorePG(
self.access_key_id, self.access_key_secret, self.endpoint, self.project_name
)
# Step 3: Establish PG connection based on project status
if project_is_new:
# For newly created projects, schedule delayed PG connection
self._use_pg_protocol = False
logger.info(
"Project %s is newly created. Will use SDK mode and schedule PG connection attempt in %d seconds.",
self.project_name,
self.__class__._pg_connection_delay,
)
if self.__class__._pg_connection_timer is not None:
self.__class__._pg_connection_timer.cancel()
self.__class__._pg_connection_timer = threading.Timer(
self.__class__._pg_connection_delay,
self._delayed_pg_connection_init,
)
self.__class__._pg_connection_timer.daemon = True # Don't block app shutdown
self.__class__._pg_connection_timer.start()
else:
# For existing projects, attempt PG connection immediately
logger.info("Project %s already exists. Attempting PG connection...", self.project_name)
self._attempt_pg_connection_init()
def _check_and_disable_pg_if_scan_index_disabled(self) -> None:
    """
    Check if scan_index is enabled for all logstores.

    If any logstore has scan_index=false, disable PG protocol.
    This is necessary because PG protocol requires scan_index to be enabled.
    """
    logstore_name_list = [
        AliyunLogStore.workflow_execution_logstore,
        AliyunLogStore.workflow_node_execution_logstore,
    ]
    for logstore_name in logstore_name_list:
        existing_config = self.get_existing_index_config(logstore_name)
        if existing_config and not existing_config.scan_index:
            logger.info(
                "Logstore %s has scan_index=false, USE SDK mode for read/write operations. "
                "PG protocol requires scan_index to be enabled.",
                logstore_name,
            )
            self._use_pg_protocol = False
            # Close PG connection if it was initialized
            if self._pg_client:
                self._pg_client.close()
                self._pg_client = None
            # One offending logstore is enough to force SDK mode; stop scanning.
            return
def is_project_exist(self) -> bool:
    """
    Return True if the configured SLS project exists.

    Raises:
        LogException: for any SDK error other than "ProjectNotExist".
    """
    try:
        self.client.get_project(self.project_name)
        return True
    except LogException as e:
        # Bug fix: the original caught bare Exception and read e.args[0],
        # which raises IndexError for exceptions with empty args and can
        # misclassify non-SDK errors. Use the SDK's error-code accessor.
        if e.get_error_code() == "ProjectNotExist":
            return False
        raise
def create_project(self):
    """Create the SLS project, logging and re-raising SDK failures."""
    try:
        self.client.create_project(self.project_name, AliyunLogStore.project_des)
    except LogException as e:
        # Surface full SDK error context before propagating.
        logger.exception(
            "Failed to create project %s: errorCode=%s, errorMessage=%s, requestId=%s",
            self.project_name,
            e.get_error_code(),
            e.get_error_message(),
            e.get_request_id(),
        )
        raise
    logger.info("Project %s created successfully", self.project_name)
def is_logstore_exist(self, logstore_name: str) -> bool:
    """
    Return True if the given logstore exists in the configured project.

    Raises:
        LogException: for any SDK error other than "LogStoreNotExist".
    """
    try:
        _ = self.client.get_logstore(self.project_name, logstore_name)
        return True
    except LogException as e:
        # Bug fix: avoid bare Exception + e.args[0] (IndexError-prone, and
        # misclassifies non-SDK errors); use the SDK's error-code accessor.
        if e.get_error_code() == "LogStoreNotExist":
            return False
        raise
def create_logstore_if_not_exist(self) -> None:
    """
    Ensure both Dify logstores exist with the configured TTL, then ensure
    their index configuration covers all Dify-required fields.

    Raises:
        LogException: if a missing logstore cannot be created.
    """
    logstore_name_list = [
        AliyunLogStore.workflow_execution_logstore,
        AliyunLogStore.workflow_node_execution_logstore,
    ]
    for logstore_name in logstore_name_list:
        if not self.is_logstore_exist(logstore_name):
            try:
                self.client.create_logstore(
                    project_name=self.project_name, logstore_name=logstore_name, ttl=self.logstore_ttl
                )
                logger.info("logstore %s created successfully", logstore_name)
            except LogException as e:
                logger.exception(
                    "Failed to create logstore %s: errorCode=%s, errorMessage=%s, requestId=%s",
                    logstore_name,
                    e.get_error_code(),
                    e.get_error_message(),
                    e.get_request_id(),
                )
                raise
        # Ensure index contains all Dify-required fields
        # This intelligently merges with existing config, preserving custom indexes
        self.ensure_index_config(logstore_name)
def is_index_exist(self, logstore_name: str) -> bool:
    """
    Return True if the given logstore already has an index configured.

    Raises:
        LogException: for any SDK error other than "IndexConfigNotExist".
    """
    try:
        _ = self.client.get_index_config(self.project_name, logstore_name)
        return True
    except LogException as e:
        # Bug fix: avoid bare Exception + e.args[0] (IndexError-prone, and
        # misclassifies non-SDK errors); use the SDK's error-code accessor.
        if e.get_error_code() == "IndexConfigNotExist":
            return False
        raise
def get_existing_index_config(self, logstore_name: str) -> IndexConfig | None:
    """
    Get existing index configuration from logstore.

    Args:
        logstore_name: Name of the logstore

    Returns:
        IndexConfig object if index exists, None otherwise

    Raises:
        LogException: for any SDK error other than "IndexConfigNotExist".
    """
    try:
        response = self.client.get_index_config(self.project_name, logstore_name)
        return response.get_index_config()
    except LogException as e:
        # Bug fix: avoid bare Exception + e.args[0] (IndexError-prone, and
        # misclassifies non-SDK errors); use the SDK's error-code accessor.
        if e.get_error_code() == "IndexConfigNotExist":
            return None
        logger.exception("Failed to get index config for logstore %s", logstore_name)
        raise
def _get_workflow_execution_index_keys(self) -> dict[str, IndexKeyConfig]:
    """
    Get field index configuration for workflow_execution logstore.

    This method automatically generates index configuration from the WorkflowRun
    SQLAlchemy model. When the PG schema is updated via Flask-Migrate, the index
    configuration will be automatically updated on next application startup.
    """
    # Imported lazily to avoid a module-level dependency cycle with models.
    from models.workflow import WorkflowRun

    index_keys = self._generate_index_keys_from_model(WorkflowRun)
    # Add custom fields that are in logstore but not in PG model
    # These fields are added by the repository layer
    index_keys["error_message"] = IndexKeyConfig(
        index_type="text",
        case_sensitive=False,
        doc_value=True,
        token_list=self.DEFAULT_TOKEN_LIST,
        chinese=True,
    )  # Maps to 'error' in PG
    index_keys["started_at"] = IndexKeyConfig(
        index_type="text",
        case_sensitive=False,
        doc_value=True,
        token_list=self.DEFAULT_TOKEN_LIST,
        chinese=True,
    )  # Maps to 'created_at' in PG
    # NOTE(review): this logs at INFO while the node-execution variant logs at
    # DEBUG — consider aligning the levels.
    logger.info("Generated %d index keys for workflow_execution from WorkflowRun model", len(index_keys))
    return index_keys
def _get_workflow_node_execution_index_keys(self) -> dict[str, IndexKeyConfig]:
    """
    Get field index configuration for workflow_node_execution logstore.

    This method automatically generates index configuration from the
    WorkflowNodeExecutionModel. When the PG schema is updated via Flask-Migrate,
    the index configuration will be automatically updated on next application startup.
    """
    # Imported lazily to avoid a module-level dependency cycle with models.
    from models.workflow import WorkflowNodeExecutionModel

    index_keys = self._generate_index_keys_from_model(WorkflowNodeExecutionModel)
    logger.debug(
        "Generated %d index keys for workflow_node_execution from WorkflowNodeExecutionModel", len(index_keys)
    )
    return index_keys
def _get_index_config(self, logstore_name: str) -> IndexConfig:
    """
    Get index configuration for the specified logstore.

    Args:
        logstore_name: Name of the logstore

    Returns:
        IndexConfig object with line and field indexes
    """
    # Full-text (line) index shares the default tokenizer settings.
    line_config = IndexLineConfig(token_list=self.DEFAULT_TOKEN_LIST, case_sensitive=False, chinese=True)
    # Choose the per-field index builder for this logstore; unknown names get no field keys.
    builders = {
        AliyunLogStore.workflow_execution_logstore: self._get_workflow_execution_index_keys,
        AliyunLogStore.workflow_node_execution_logstore: self._get_workflow_node_execution_index_keys,
    }
    builder = builders.get(logstore_name)
    field_keys = builder() if builder is not None else {}
    # key_config_list is a dict keyed by field name (not a list).
    return IndexConfig(line_config=line_config, key_config_list=field_keys, scan_index=True)
def create_index(self, logstore_name: str) -> None:
    """
    Create index for the specified logstore with both full-text and field indexes.

    Field indexes are automatically generated from the corresponding SQLAlchemy model.

    Raises:
        LogException: if index creation fails.
    """
    index_config = self._get_index_config(logstore_name)
    try:
        self.client.create_index(self.project_name, logstore_name, index_config)
        logger.info(
            "index for %s created successfully with %d field indexes",
            logstore_name,
            len(index_config.key_config_list or {}),
        )
    except LogException as e:
        logger.exception(
            "Failed to create index for logstore %s: errorCode=%s, errorMessage=%s, requestId=%s",
            logstore_name,
            e.get_error_code(),
            e.get_error_message(),
            e.get_request_id(),
        )
        raise
def _merge_index_configs(
    self, existing_config: IndexConfig, required_keys: dict[str, IndexKeyConfig], logstore_name: str
) -> tuple[IndexConfig, bool]:
    """
    Intelligently merge existing index config with Dify's required field indexes.

    This method:
    1. Preserves all existing field indexes in logstore (including custom fields)
    2. Adds missing Dify-required fields
    3. Updates fields where type doesn't match (with json/text compatibility)
    4. Corrects case mismatches (e.g., if Dify needs 'status' but logstore has 'Status')

    Type compatibility rules:
    - json and text types are considered compatible (users can manually choose either)
    - All other type mismatches will be corrected to match Dify requirements

    Note: Logstore is case-sensitive and doesn't allow duplicate fields with different cases.
    Case mismatch means: existing field name differs from required name only in case.

    Args:
        existing_config: Current index configuration from logstore
        required_keys: Dify's required field index configurations
        logstore_name: Name of the logstore (for logging)

    Returns:
        Tuple of (merged_config, needs_update)
    """
    # key_config_list is already a dict in the SDK
    # Make a copy to avoid modifying the original
    existing_keys = dict(existing_config.key_config_list) if existing_config.key_config_list else {}
    # Track changes
    needs_update = False
    case_corrections = []  # Fields that need case correction (e.g., 'Status' -> 'status')
    missing_fields = []
    type_mismatches = []
    # First pass: Check for and resolve case mismatches with required fields
    # Note: Logstore itself doesn't allow duplicate fields with different cases,
    # so we only need to check if the existing case matches the required case
    for required_name in required_keys:
        lower_name = required_name.lower()
        # Find key that matches case-insensitively but not exactly
        wrong_case_key = None
        for existing_key in existing_keys:
            if existing_key.lower() == lower_name and existing_key != required_name:
                wrong_case_key = existing_key
                break
        if wrong_case_key:
            # Field exists but with wrong case (e.g., 'Status' when we need 'status')
            # Remove the wrong-case key, will be added back with correct case later
            # (deletion happens here; the second pass re-adds under the required name)
            case_corrections.append((wrong_case_key, required_name))
            del existing_keys[wrong_case_key]
            needs_update = True
    # Second pass: Check each required field
    for required_name, required_config in required_keys.items():
        # Check for exact match (case-sensitive)
        if required_name in existing_keys:
            existing_type = existing_keys[required_name].index_type
            required_type = required_config.index_type
            # Check if type matches
            # Special case: json and text are interchangeable for JSON content fields
            # Allow users to manually configure text instead of json (or vice versa) without forcing updates
            is_compatible = existing_type == required_type or ({existing_type, required_type} == {"json", "text"})
            if not is_compatible:
                type_mismatches.append((required_name, existing_type, required_type))
                # Update with correct type
                existing_keys[required_name] = required_config
                needs_update = True
            # else: field exists with compatible type, no action needed
        else:
            # Field doesn't exist (may have been removed in first pass due to case conflict)
            missing_fields.append(required_name)
            existing_keys[required_name] = required_config
            needs_update = True
    # Log changes (summaries are truncated to keep log lines readable)
    if missing_fields:
        logger.info(
            "Logstore %s: Adding %d missing Dify-required fields: %s",
            logstore_name,
            len(missing_fields),
            ", ".join(missing_fields[:10]) + ("..." if len(missing_fields) > 10 else ""),
        )
    if type_mismatches:
        logger.info(
            "Logstore %s: Fixing %d type mismatches: %s",
            logstore_name,
            len(type_mismatches),
            ", ".join([f"{name}({old}->{new})" for name, old, new in type_mismatches[:5]])
            + ("..." if len(type_mismatches) > 5 else ""),
        )
    if case_corrections:
        logger.info(
            "Logstore %s: Correcting %d field name cases: %s",
            logstore_name,
            len(case_corrections),
            ", ".join([f"'{old}' -> '{new}'" for old, new in case_corrections[:5]])
            + ("..." if len(case_corrections) > 5 else ""),
        )
    # Create merged config
    # key_config_list should be a dict, not a list
    # Preserve the original scan_index value - don't force it to True
    merged_config = IndexConfig(
        line_config=existing_config.line_config
        or IndexLineConfig(token_list=self.DEFAULT_TOKEN_LIST, case_sensitive=False, chinese=True),
        key_config_list=existing_keys,
        scan_index=existing_config.scan_index,
    )
    return merged_config, needs_update
def ensure_index_config(self, logstore_name: str) -> None:
    """
    Ensure index configuration includes all Dify-required fields.

    This method intelligently manages index configuration:
    1. If index doesn't exist, create it with Dify's required fields
    2. If index exists:
       - Check if all Dify-required fields are present
       - Check if field types match requirements
       - Only update if fields are missing or types are incorrect
       - Preserve any additional custom index configurations

    This approach allows users to add their own custom indexes without being overwritten.

    Raises:
        LogException: if index creation or update fails.
    """
    # Get Dify's required field indexes
    required_keys = {}
    if logstore_name == AliyunLogStore.workflow_execution_logstore:
        required_keys = self._get_workflow_execution_index_keys()
    elif logstore_name == AliyunLogStore.workflow_node_execution_logstore:
        required_keys = self._get_workflow_node_execution_index_keys()
    # Check if index exists
    existing_config = self.get_existing_index_config(logstore_name)
    if existing_config is None:
        # Index doesn't exist, create it
        logger.info(
            "Logstore %s: Index doesn't exist, creating with %d required fields",
            logstore_name,
            len(required_keys),
        )
        self.create_index(logstore_name)
    else:
        merged_config, needs_update = self._merge_index_configs(existing_config, required_keys, logstore_name)
        if needs_update:
            logger.info("Logstore %s: Updating index to include Dify-required fields", logstore_name)
            try:
                self.client.update_index(self.project_name, logstore_name, merged_config)
                logger.info(
                    "Logstore %s: Index updated successfully, now has %d total field indexes",
                    logstore_name,
                    len(merged_config.key_config_list or {}),
                )
            except LogException as e:
                logger.exception(
                    "Failed to update index for logstore %s: errorCode=%s, errorMessage=%s, requestId=%s",
                    logstore_name,
                    e.get_error_code(),
                    e.get_error_message(),
                    e.get_request_id(),
                )
                raise
        else:
            logger.info(
                "Logstore %s: Index already contains all %d Dify-required fields with correct types, "
                "no update needed",
                logstore_name,
                len(required_keys),
            )
def put_log(self, logstore: str, contents: Sequence[tuple[str, str]]) -> None:
    """
    Write one log entry (a sequence of key/value string pairs) to a logstore.

    Uses the PG protocol client when active, otherwise falls back to the SLS SDK.

    Raises:
        LogException: if the SDK write fails.
    """
    # Route to PG or SDK based on protocol availability
    if self._use_pg_protocol and self._pg_client:
        self._pg_client.put_log(logstore, contents, self.log_enabled)
    else:
        log_item = LogItem(contents=contents)
        request = PutLogsRequest(project=self.project_name, logstore=logstore, logitems=[log_item])
        # Verbose request logging is gated on the SQLALCHEMY_ECHO-derived flag.
        if self.log_enabled:
            logger.info(
                "[LogStore-SDK] PUT_LOG | logstore=%s | project=%s | items_count=%d",
                logstore,
                self.project_name,
                len(contents),
            )
        try:
            self.client.put_logs(request)
        except LogException as e:
            logger.exception(
                "Failed to put logs to logstore %s: errorCode=%s, errorMessage=%s, requestId=%s",
                logstore,
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )
            raise
def get_logs(
    self,
    logstore: str,
    from_time: int,
    to_time: int,
    topic: str = "",
    query: str = "",
    line: int = 100,
    offset: int = 0,
    reverse: bool = True,
) -> list[dict]:
    """
    Fetch raw log entries from a logstore via the SLS SDK.

    Args:
        logstore: Name of the logstore to read from
        from_time: Start time (Unix timestamp)
        to_time: End time (Unix timestamp)
        topic: Log topic filter (default: all topics)
        query: Search query string (default: no filter)
        line: Maximum number of entries to return (default: 100)
        offset: Pagination offset (default: 0)
        reverse: Return newest entries first when True (default: True)

    Returns:
        List of log entries, each as a dict of field name to value.

    Raises:
        LogException: if the SDK read fails.
    """
    request = GetLogsRequest(
        project=self.project_name,
        logstore=logstore,
        fromTime=from_time,
        toTime=to_time,
        topic=topic,
        query=query,
        line=line,
        offset=offset,
        reverse=reverse,
    )
    # Log query info if SQLALCHEMY_ECHO is enabled
    if self.log_enabled:
        logger.info(
            "[LogStore] GET_LOGS | logstore=%s | project=%s | query=%s | "
            "from_time=%d | to_time=%d | line=%d | offset=%d | reverse=%s",
            logstore,
            self.project_name,
            query,
            from_time,
            to_time,
            line,
            offset,
            reverse,
        )
    try:
        response = self.client.get_logs(request)
        result = []
        logs = response.get_logs() if response else []
        for log in logs:
            result.append(log.get_contents())
        # Log result count if SQLALCHEMY_ECHO is enabled
        if self.log_enabled:
            logger.info(
                "[LogStore] GET_LOGS RESULT | logstore=%s | returned_count=%d",
                logstore,
                len(result),
            )
        return result
    except LogException as e:
        logger.exception(
            "Failed to get logs from logstore %s with query '%s': errorCode=%s, errorMessage=%s, requestId=%s",
            logstore,
            query,
            e.get_error_code(),
            e.get_error_message(),
            e.get_request_id(),
        )
        raise
def execute_sql(
    self,
    sql: str,
    logstore: str | None = None,
    query: str = "*",
    from_time: int | None = None,
    to_time: int | None = None,
    power_sql: bool = False,
) -> list[dict]:
    """
    Execute SQL query for aggregation and analysis.

    Args:
        sql: SQL query string (SELECT statement)
        logstore: Name of the logstore (required)
        query: Search/filter query for SDK mode (default: "*" for all logs).
            Only used in SDK mode. PG mode ignores this parameter.
        from_time: Start time (Unix timestamp) - only used in SDK mode
        to_time: End time (Unix timestamp) - only used in SDK mode
        power_sql: Whether to use enhanced SQL mode (default: False).
            NOTE(review): currently unused in both code paths — confirm intent.

    Returns:
        List of result rows as dictionaries

    Raises:
        ValueError: if logstore is not provided.
        LogException: if the SDK query fails.

    Note:
        - PG mode: Only executes the SQL directly
        - SDK mode: Combines query and sql as "query | sql"
    """
    # Logstore is required
    if not logstore:
        raise ValueError("logstore parameter is required for execute_sql")
    # Route to PG or SDK based on protocol availability
    if self._use_pg_protocol and self._pg_client:
        # PG mode: execute SQL directly (ignore query parameter)
        return self._pg_client.execute_sql(sql, logstore, self.log_enabled)
    else:
        # SDK mode: combine query and sql as "query | sql"
        full_query = f"{query} | {sql}"
        # Provide default time range if not specified
        if from_time is None:
            from_time = 0
        if to_time is None:
            to_time = int(time.time())  # now
        request = GetLogsRequest(
            project=self.project_name,
            logstore=logstore,
            fromTime=from_time,
            toTime=to_time,
            query=full_query,
        )
        # Log query info if SQLALCHEMY_ECHO is enabled
        if self.log_enabled:
            # Bug fix: the format string has exactly five placeholders, but the
            # original passed six args (`query, sql`), which makes the logging
            # call fail with "not all arguments converted" at runtime. Pass the
            # already-combined full_query instead.
            logger.info(
                "[LogStore-SDK] EXECUTE_SQL | logstore=%s | project=%s | from_time=%d | to_time=%d | full_query=%s",
                logstore,
                self.project_name,
                from_time,
                to_time,
                full_query,
            )
        try:
            response = self.client.get_logs(request)
            result = []
            logs = response.get_logs() if response else []
            for log in logs:
                result.append(log.get_contents())
            # Log result count if SQLALCHEMY_ECHO is enabled
            if self.log_enabled:
                logger.info(
                    "[LogStore-SDK] EXECUTE_SQL RESULT | logstore=%s | returned_count=%d",
                    logstore,
                    len(result),
                )
            return result
        except LogException as e:
            logger.exception(
                "Failed to execute SQL, logstore %s: errorCode=%s, errorMessage=%s, requestId=%s, full_query=%s",
                logstore,
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
                full_query,
            )
            raise
if __name__ == "__main__":
    # Ad-hoc manual smoke test: writes one key/value entry to the
    # workflow_execution logstore using env-configured credentials.
    aliyun_logstore = AliyunLogStore()
    # aliyun_logstore.init_project_logstore()
    aliyun_logstore.put_log(AliyunLogStore.workflow_execution_logstore, [("key1", "value1")])

View File

@@ -0,0 +1,407 @@
import logging
import os
import socket
import time
from collections.abc import Sequence
from contextlib import contextmanager
from typing import Any
import psycopg2
import psycopg2.pool
from psycopg2 import InterfaceError, OperationalError
from configs import dify_config
logger = logging.getLogger(__name__)
class AliyunLogStorePG:
    """
    PostgreSQL protocol support for Aliyun SLS LogStore.
    Handles PG connection pooling and operations for regions that support PG protocol.
    """

    def __init__(self, access_key_id: str, access_key_secret: str, endpoint: str, project_name: str):
        """
        Initialize PG connection for SLS.

        No network activity happens here; the psycopg2 pool is created later
        by init_connection().

        Args:
            access_key_id: Aliyun access key ID
            access_key_secret: Aliyun access key secret
            endpoint: SLS endpoint
            project_name: SLS project name
        """
        self._access_key_id = access_key_id
        self._access_key_secret = access_key_secret
        self._endpoint = endpoint
        self.project_name = project_name
        # Lazily-created pool; stays None until init_connection() succeeds.
        self._pg_pool: psycopg2.pool.SimpleConnectionPool | None = None
        self._use_pg_protocol = False
def _check_port_connectivity(self, host: str, port: int, timeout: float = 2.0) -> bool:
    """
    Check if a TCP port is reachable using socket connection.

    This provides a fast check before attempting full database connection,
    preventing long waits when connecting to unsupported regions.

    Args:
        host: Hostname or IP address
        port: Port number
        timeout: Connection timeout in seconds (default: 2.0)

    Returns:
        True if port is reachable, False otherwise
    """
    try:
        # Bug fix: use the socket as a context manager so the file descriptor
        # is always closed — the original only closed it on the success path
        # and leaked the fd if connect_ex raised.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            return sock.connect_ex((host, port)) == 0
    except Exception as e:
        logger.debug("Port connectivity check failed for %s:%d: %s", host, port, str(e))
        return False
def init_connection(self) -> bool:
"""
Initialize PostgreSQL connection pool for SLS PG protocol support.
Attempts to connect to SLS using PostgreSQL protocol. If successful, sets
_use_pg_protocol to True and creates a connection pool. If connection fails
(region doesn't support PG protocol or other errors), returns False.
Returns:
True if PG protocol is supported and initialized, False otherwise
"""
try:
# Extract hostname from endpoint (remove protocol if present)
pg_host = self._endpoint.replace("http://", "").replace("https://", "")
# Get pool configuration
pg_max_connections = int(os.environ.get("ALIYUN_SLS_PG_MAX_CONNECTIONS", 10))
logger.debug(
"Check PG protocol connection to SLS: host=%s, project=%s",
pg_host,
self.project_name,
)
# Fast port connectivity check before attempting full connection
# This prevents long waits when connecting to unsupported regions
if not self._check_port_connectivity(pg_host, 5432, timeout=1.0):
logger.info(
"USE SDK mode for read/write operations, host=%s",
pg_host,
)
return False
# Create connection pool
self._pg_pool = psycopg2.pool.SimpleConnectionPool(
minconn=1,
maxconn=pg_max_connections,
host=pg_host,
port=5432,
database=self.project_name,
user=self._access_key_id,
password=self._access_key_secret,
sslmode="require",
connect_timeout=5,
application_name=f"Dify-{dify_config.project.version}",
)
# Note: Skip test query because SLS PG protocol only supports SELECT/INSERT on actual tables
# Connection pool creation success already indicates connectivity
self._use_pg_protocol = True
logger.info(
"PG protocol initialized successfully for SLS project=%s. Will use PG for read/write operations.",
self.project_name,
)
return True
except Exception as e:
# PG connection failed - fallback to SDK mode
self._use_pg_protocol = False
if self._pg_pool:
try:
self._pg_pool.closeall()
except Exception:
logger.debug("Failed to close PG connection pool during cleanup, ignoring")
self._pg_pool = None
logger.info(
"PG protocol connection failed (region may not support PG protocol): %s. "
"Falling back to SDK mode for read/write operations.",
str(e),
)
return False
def _is_connection_valid(self, conn: Any) -> bool:
"""
Check if a connection is still valid.
Args:
conn: psycopg2 connection object
Returns:
True if connection is valid, False otherwise
"""
try:
# Check if connection is closed
if conn.closed:
return False
# Quick ping test - execute a lightweight query
# For SLS PG protocol, we can't use SELECT 1 without FROM,
# so we just check the connection status
with conn.cursor() as cursor:
cursor.execute("SELECT 1")
cursor.fetchone()
return True
except Exception:
return False
@contextmanager
def _get_connection(self):
"""
Context manager to get a PostgreSQL connection from the pool.
Automatically validates and refreshes stale connections.
Note: Aliyun SLS PG protocol does not support transactions, so we always
use autocommit mode.
Yields:
psycopg2 connection object
Raises:
RuntimeError: If PG pool is not initialized
"""
if not self._pg_pool:
raise RuntimeError("PG connection pool is not initialized")
conn = self._pg_pool.getconn()
try:
# Validate connection and get a fresh one if needed
if not self._is_connection_valid(conn):
logger.debug("Connection is stale, marking as bad and getting a new one")
# Mark connection as bad and get a new one
self._pg_pool.putconn(conn, close=True)
conn = self._pg_pool.getconn()
# Aliyun SLS PG protocol does not support transactions, always use autocommit
conn.autocommit = True
yield conn
finally:
# Return connection to pool (or close if it's bad)
if self._is_connection_valid(conn):
self._pg_pool.putconn(conn)
else:
self._pg_pool.putconn(conn, close=True)
def close(self) -> None:
"""Close the PostgreSQL connection pool."""
if self._pg_pool:
try:
self._pg_pool.closeall()
logger.info("PG connection pool closed")
except Exception:
logger.exception("Failed to close PG connection pool")
def _is_retriable_error(self, error: Exception) -> bool:
"""
Check if an error is retriable (connection-related issues).
Args:
error: Exception to check
Returns:
True if the error is retriable, False otherwise
"""
# Retry on connection-related errors
if isinstance(error, (OperationalError, InterfaceError)):
return True
# Check error message for specific connection issues
error_msg = str(error).lower()
retriable_patterns = [
"connection",
"timeout",
"closed",
"broken pipe",
"reset by peer",
"no route to host",
"network",
]
return any(pattern in error_msg for pattern in retriable_patterns)
def put_log(self, logstore: str, contents: Sequence[tuple[str, str]], log_enabled: bool = False) -> None:
"""
Write log to SLS using PostgreSQL protocol with automatic retry.
Note: SLS PG protocol only supports INSERT (not UPDATE). This uses append-only
writes with log_version field for versioning, same as SDK implementation.
Args:
logstore: Name of the logstore table
contents: List of (field_name, value) tuples
log_enabled: Whether to enable logging
Raises:
psycopg2.Error: If database operation fails after all retries
"""
if not contents:
return
# Extract field names and values from contents
fields = [field_name for field_name, _ in contents]
values = [value for _, value in contents]
# Build INSERT statement with literal values
# Note: Aliyun SLS PG protocol doesn't support parameterized queries,
# so we need to use mogrify to safely create literal values
field_list = ", ".join([f'"{field}"' for field in fields])
if log_enabled:
logger.info(
"[LogStore-PG] PUT_LOG | logstore=%s | project=%s | items_count=%d",
logstore,
self.project_name,
len(contents),
)
# Retry configuration
max_retries = 3
retry_delay = 0.1 # Start with 100ms
for attempt in range(max_retries):
try:
with self._get_connection() as conn:
with conn.cursor() as cursor:
# Use mogrify to safely convert values to SQL literals
placeholders = ", ".join(["%s"] * len(fields))
values_literal = cursor.mogrify(f"({placeholders})", values).decode("utf-8")
insert_sql = f'INSERT INTO "{logstore}" ({field_list}) VALUES {values_literal}'
cursor.execute(insert_sql)
# Success - exit retry loop
return
except psycopg2.Error as e:
# Check if error is retriable
if not self._is_retriable_error(e):
# Not a retriable error (e.g., data validation error), fail immediately
logger.exception(
"Failed to put logs to logstore %s via PG protocol (non-retriable error)",
logstore,
)
raise
# Retriable error - log and retry if we have attempts left
if attempt < max_retries - 1:
logger.warning(
"Failed to put logs to logstore %s via PG protocol (attempt %d/%d): %s. Retrying...",
logstore,
attempt + 1,
max_retries,
str(e),
)
time.sleep(retry_delay)
retry_delay *= 2 # Exponential backoff
else:
# Last attempt failed
logger.exception(
"Failed to put logs to logstore %s via PG protocol after %d attempts",
logstore,
max_retries,
)
raise
def execute_sql(self, sql: str, logstore: str, log_enabled: bool = False) -> list[dict[str, Any]]:
"""
Execute SQL query using PostgreSQL protocol with automatic retry.
Args:
sql: SQL query string
logstore: Name of the logstore (for logging purposes)
log_enabled: Whether to enable logging
Returns:
List of result rows as dictionaries
Raises:
psycopg2.Error: If database operation fails after all retries
"""
if log_enabled:
logger.info(
"[LogStore-PG] EXECUTE_SQL | logstore=%s | project=%s | sql=%s",
logstore,
self.project_name,
sql,
)
# Retry configuration
max_retries = 3
retry_delay = 0.1 # Start with 100ms
for attempt in range(max_retries):
try:
with self._get_connection() as conn:
with conn.cursor() as cursor:
cursor.execute(sql)
# Get column names from cursor description
columns = [desc[0] for desc in cursor.description]
# Fetch all results and convert to list of dicts
result = []
for row in cursor.fetchall():
row_dict = {}
for col, val in zip(columns, row):
row_dict[col] = "" if val is None else str(val)
result.append(row_dict)
if log_enabled:
logger.info(
"[LogStore-PG] EXECUTE_SQL RESULT | logstore=%s | returned_count=%d",
logstore,
len(result),
)
return result
except psycopg2.Error as e:
# Check if error is retriable
if not self._is_retriable_error(e):
# Not a retriable error (e.g., SQL syntax error), fail immediately
logger.exception(
"Failed to execute SQL query on logstore %s via PG protocol (non-retriable error): sql=%s",
logstore,
sql,
)
raise
# Retriable error - log and retry if we have attempts left
if attempt < max_retries - 1:
logger.warning(
"Failed to execute SQL query on logstore %s via PG protocol (attempt %d/%d): %s. Retrying...",
logstore,
attempt + 1,
max_retries,
str(e),
)
time.sleep(retry_delay)
retry_delay *= 2 # Exponential backoff
else:
# Last attempt failed
logger.exception(
"Failed to execute SQL query on logstore %s via PG protocol after %d attempts: sql=%s",
logstore,
max_retries,
sql,
)
raise
# This line should never be reached due to raise above, but makes type checker happy
return []

View File

@@ -0,0 +1,365 @@
"""
LogStore implementation of DifyAPIWorkflowNodeExecutionRepository.
This module provides the LogStore-based implementation for service-layer
WorkflowNodeExecutionModel operations using Aliyun SLS LogStore.
"""
import logging
import time
from collections.abc import Sequence
from datetime import datetime
from typing import Any
from sqlalchemy.orm import sessionmaker
from extensions.logstore.aliyun_logstore import AliyunLogStore
from models.workflow import WorkflowNodeExecutionModel
from repositories.api_workflow_node_execution_repository import DifyAPIWorkflowNodeExecutionRepository
logger = logging.getLogger(__name__)
def _coerce_datetime(value: Any) -> datetime | None:
    """Coerce a LogStore field into a datetime, or None when absent/unparseable.

    LogStore results may carry ISO-8601 strings, epoch numbers, or already-built
    datetime objects.
    """
    if not value:
        return None
    if isinstance(value, str):
        try:
            return datetime.fromisoformat(value)
        except ValueError:
            return None
    if isinstance(value, (int, float)):
        return datetime.fromtimestamp(value)
    # Presumably already a datetime object - pass it through unchanged.
    return value


def _coerce_int(value: Any, default: int = 0) -> int:
    """Best-effort int conversion; PG-mode results map NULL to "", which int() rejects."""
    try:
        return int(value)
    except (TypeError, ValueError):
        try:
            return int(float(value))
        except (TypeError, ValueError):
            return default


def _coerce_float(value: Any, default: float = 0.0) -> float:
    """Best-effort float conversion ("" / None / garbage -> default)."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _dict_to_workflow_node_execution_model(data: dict[str, Any]) -> WorkflowNodeExecutionModel:
    """
    Convert LogStore result dictionary to WorkflowNodeExecutionModel instance.

    Args:
        data: Dictionary from LogStore query result

    Returns:
        WorkflowNodeExecutionModel instance (detached from session)

    Note:
        The returned model is not attached to any SQLAlchemy session.
        Relationship fields (like offload_data) are not loaded from LogStore.
    """
    logger.debug("_dict_to_workflow_node_execution_model: data keys=%s", list(data.keys())[:5])
    # Create model instance without session
    model = WorkflowNodeExecutionModel()
    # Critical fields - must not be None
    model.id = data.get("id") or ""
    model.tenant_id = data.get("tenant_id") or ""
    model.app_id = data.get("app_id") or ""
    model.workflow_id = data.get("workflow_id") or ""
    model.triggered_from = data.get("triggered_from") or ""
    model.node_id = data.get("node_id") or ""
    model.node_type = data.get("node_type") or ""
    model.status = data.get("status") or "running"  # Default status if missing
    model.title = data.get("title") or ""
    model.created_by_role = data.get("created_by_role") or ""
    model.created_by = data.get("created_by") or ""
    # Numeric fields with defaults. LogStore returns field values as strings and
    # maps NULL to "", so a bare int()/float() call would raise ValueError here.
    model.index = _coerce_int(data.get("index", 0))
    model.elapsed_time = _coerce_float(data.get("elapsed_time", 0))
    # Optional fields
    model.workflow_run_id = data.get("workflow_run_id")
    model.predecessor_node_id = data.get("predecessor_node_id")
    model.node_execution_id = data.get("node_execution_id")
    model.inputs = data.get("inputs")
    model.process_data = data.get("process_data")
    model.outputs = data.get("outputs")
    model.error = data.get("error")
    model.execution_metadata = data.get("execution_metadata")
    # Datetime fields: created_at falls back to "now" so the model is always usable.
    model.created_at = _coerce_datetime(data.get("created_at")) or datetime.now()
    finished_at = _coerce_datetime(data.get("finished_at"))
    if finished_at is not None:
        model.finished_at = finished_at
    return model
class LogstoreAPIWorkflowNodeExecutionRepository(DifyAPIWorkflowNodeExecutionRepository):
    """
    LogStore implementation of DifyAPIWorkflowNodeExecutionRepository.

    Provides service-layer database operations for WorkflowNodeExecutionModel
    using LogStore SQL queries with optimized deduplication strategies.

    Records are append-only: each update writes a new row with a higher
    log_version, so reads must keep only the newest version per id. In
    PG-protocol mode the deduplication happens in SQL (window function); in
    SDK mode it happens client-side via _dedup_latest().
    """

    def __init__(self, session_maker: sessionmaker | None = None):
        """
        Initialize the repository with LogStore client.

        Args:
            session_maker: SQLAlchemy sessionmaker (unused, for compatibility with factory pattern)
        """
        logger.debug("LogstoreAPIWorkflowNodeExecutionRepository.__init__: initializing")
        self.logstore_client = AliyunLogStore()

    @staticmethod
    def _log_version(row: dict[str, Any]) -> int:
        """Integer log_version of a row; LogStore returns strings and "" for NULL."""
        try:
            return int(row.get("log_version") or 0)
        except (TypeError, ValueError):
            return 0

    @classmethod
    def _dedup_latest(cls, rows: Sequence[dict[str, Any]]) -> list[dict[str, Any]]:
        """Keep only the highest-log_version row per id (rows without an id are dropped)."""
        latest: dict[str, dict[str, Any]] = {}
        for row in rows:
            row_id = row.get("id")
            if not row_id:
                continue
            current = latest.get(row_id)
            if current is None or cls._log_version(row) > cls._log_version(current):
                latest[row_id] = row
        return list(latest.values())

    @staticmethod
    def _created_at_key(row: dict[str, Any]) -> float:
        """Numeric sort key for created_at.

        The previous implementation only accepted int/float values, but LogStore
        returns every field as a string, so the recency sort was silently a
        no-op. Parse numeric strings; non-numeric values sort as 0.
        """
        try:
            return float(row.get("created_at"))
        except (TypeError, ValueError):
            return 0.0

    def get_node_last_execution(
        self,
        tenant_id: str,
        app_id: str,
        workflow_id: str,
        node_id: str,
    ) -> WorkflowNodeExecutionModel | None:
        """
        Get the most recent execution for a specific node.

        Uses query syntax to get raw logs and selects the one with max log_version.
        Returns the most recent execution ordered by created_at.
        """
        logger.debug(
            "get_node_last_execution: tenant_id=%s, app_id=%s, workflow_id=%s, node_id=%s",
            tenant_id,
            app_id,
            workflow_id,
            node_id,
        )
        try:
            if self.logstore_client.supports_pg_protocol:
                # PG mode: the window function already keeps only the newest
                # log_version of each record.
                # NOTE(review): identifiers are interpolated into the SQL text;
                # assumes they are trusted UUIDs - confirm upstream validation.
                sql_query = f"""
                    SELECT * FROM (
                        SELECT *,
                               ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) as rn
                        FROM "{AliyunLogStore.workflow_node_execution_logstore}"
                        WHERE tenant_id = '{tenant_id}'
                          AND app_id = '{app_id}'
                          AND workflow_id = '{workflow_id}'
                          AND node_id = '{node_id}'
                          AND __time__ > 0
                    ) AS subquery WHERE rn = 1
                    LIMIT 100
                """
                results = self.logstore_client.execute_sql(
                    sql=sql_query,
                    logstore=AliyunLogStore.workflow_node_execution_logstore,
                )
                deduplicated = list(results) if results else []
            else:
                # SDK mode: fetch raw logs and deduplicate client-side.
                query = (
                    f"tenant_id: {tenant_id} and app_id: {app_id} and workflow_id: {workflow_id} and node_id: {node_id}"
                )
                results = self.logstore_client.get_logs(
                    logstore=AliyunLogStore.workflow_node_execution_logstore,
                    from_time=0,
                    to_time=int(time.time()),  # now
                    query=query,
                    line=100,
                    reverse=False,
                )
                deduplicated = self._dedup_latest(results) if results else []
            if not deduplicated:
                return None
            # Most recent execution first.
            deduplicated.sort(key=self._created_at_key, reverse=True)
            return _dict_to_workflow_node_execution_model(deduplicated[0])
        except Exception:
            logger.exception("Failed to get node last execution from LogStore")
            raise

    def get_executions_by_workflow_run(
        self,
        tenant_id: str,
        app_id: str,
        workflow_run_id: str,
    ) -> Sequence[WorkflowNodeExecutionModel]:
        """
        Get all node executions for a specific workflow run.

        Uses query syntax to get raw logs and selects the one with max log_version
        for each node execution. Ordered by index DESC for trace visualization.
        """
        logger.debug(
            "[LogStore] get_executions_by_workflow_run: tenant_id=%s, app_id=%s, workflow_run_id=%s",
            tenant_id,
            app_id,
            workflow_run_id,
        )
        try:
            if self.logstore_client.supports_pg_protocol:
                # PG mode: deduplication is done by the window function.
                sql_query = f"""
                    SELECT * FROM (
                        SELECT *,
                               ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) as rn
                        FROM "{AliyunLogStore.workflow_node_execution_logstore}"
                        WHERE tenant_id = '{tenant_id}'
                          AND app_id = '{app_id}'
                          AND workflow_run_id = '{workflow_run_id}'
                          AND __time__ > 0
                    ) AS subquery WHERE rn = 1
                    LIMIT 1000
                """
                results = self.logstore_client.execute_sql(
                    sql=sql_query,
                    logstore=AliyunLogStore.workflow_node_execution_logstore,
                )
                rows = list(results) if results else []
            else:
                # SDK mode: fetch raw logs and deduplicate client-side.
                query = f"tenant_id: {tenant_id} and app_id: {app_id} and workflow_run_id: {workflow_run_id}"
                results = self.logstore_client.get_logs(
                    logstore=AliyunLogStore.workflow_node_execution_logstore,
                    from_time=0,
                    to_time=int(time.time()),  # now
                    query=query,
                    line=1000,  # Get more results for node executions
                    reverse=False,
                )
                rows = self._dedup_latest(results) if results else []
            models = []
            for row in rows:
                model = _dict_to_workflow_node_execution_model(row)
                if model and model.id:  # Ensure model is valid
                    models.append(model)
            # Sort by index DESC for trace visualization
            models.sort(key=lambda m: m.index, reverse=True)
            return models
        except Exception:
            logger.exception("Failed to get executions by workflow run from LogStore")
            raise

    def get_execution_by_id(
        self,
        execution_id: str,
        tenant_id: str | None = None,
    ) -> WorkflowNodeExecutionModel | None:
        """
        Get a workflow node execution by its ID.

        Uses query syntax to get raw logs and selects the one with max log_version.
        """
        logger.debug("get_execution_by_id: execution_id=%s, tenant_id=%s", execution_id, tenant_id)
        try:
            if self.logstore_client.supports_pg_protocol:
                # PG mode: query already returns only the latest version.
                tenant_filter = f"AND tenant_id = '{tenant_id}'" if tenant_id else ""
                sql_query = f"""
                    SELECT * FROM (
                        SELECT *,
                               ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) as rn
                        FROM "{AliyunLogStore.workflow_node_execution_logstore}"
                        WHERE id = '{execution_id}' {tenant_filter} AND __time__ > 0
                    ) AS subquery WHERE rn = 1
                    LIMIT 1
                """
                results = self.logstore_client.execute_sql(
                    sql=sql_query,
                    logstore=AliyunLogStore.workflow_node_execution_logstore,
                )
            else:
                # SDK mode: plain query, latest version selected below.
                if tenant_id:
                    query = f"id: {execution_id} and tenant_id: {tenant_id}"
                else:
                    query = f"id: {execution_id}"
                results = self.logstore_client.get_logs(
                    logstore=AliyunLogStore.workflow_node_execution_logstore,
                    from_time=0,
                    to_time=int(time.time()),  # now
                    query=query,
                    line=100,
                    reverse=False,
                )
            if not results:
                return None
            # For PG mode, result is already the latest version.
            if self.logstore_client.supports_pg_protocol or len(results) == 1:
                return _dict_to_workflow_node_execution_model(results[0])
            # SDK mode with multiple rows: pick the highest log_version.
            return _dict_to_workflow_node_execution_model(max(results, key=self._log_version))
        except Exception:
            logger.exception("Failed to get execution by ID from LogStore: execution_id=%s", execution_id)
            raise

View File

@@ -0,0 +1,757 @@
"""
LogStore API WorkflowRun Repository Implementation
This module provides the LogStore-based implementation of the APIWorkflowRunRepository
protocol. It handles service-layer WorkflowRun database operations using Aliyun SLS LogStore
with optimized queries for statistics and pagination.
Key Features:
- LogStore SQL queries for aggregation and statistics
- Optimized deduplication using finished_at IS NOT NULL filter
- Window functions only when necessary (running status queries)
- Multi-tenant data isolation and security
"""
import logging
import os
import time
from collections.abc import Sequence
from datetime import datetime
from typing import Any, cast
from sqlalchemy.orm import sessionmaker
from extensions.logstore.aliyun_logstore import AliyunLogStore
from libs.infinite_scroll_pagination import InfiniteScrollPagination
from models.enums import WorkflowRunTriggeredFrom
from models.workflow import WorkflowRun
from repositories.api_workflow_run_repository import APIWorkflowRunRepository
from repositories.types import (
AverageInteractionStats,
DailyRunsStats,
DailyTerminalsStats,
DailyTokenCostStats,
)
logger = logging.getLogger(__name__)
def _parse_logstore_datetime(value: Any) -> datetime | None:
    """Coerce a LogStore field into a datetime, or None when absent/unparseable.

    Values may be ISO-8601 strings, epoch numbers, or datetime objects.
    """
    if not value:
        return None
    if isinstance(value, str):
        try:
            return datetime.fromisoformat(value)
        except ValueError:
            return None
    if isinstance(value, (int, float)):
        return datetime.fromtimestamp(value)
    # Presumably already a datetime object - pass it through unchanged.
    return value


def _parse_logstore_int(value: Any, default: int = 0) -> int:
    """Best-effort int conversion; LogStore maps NULL to "", which int() rejects."""
    try:
        return int(value)
    except (TypeError, ValueError):
        try:
            return int(float(value))
        except (TypeError, ValueError):
            return default


def _parse_logstore_float(value: Any, default: float = 0.0) -> float:
    """Best-effort float conversion ("" / None / garbage -> default)."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _dict_to_workflow_run(data: dict[str, Any]) -> WorkflowRun:
    """
    Convert LogStore result dictionary to WorkflowRun instance.

    Args:
        data: Dictionary from LogStore query result

    Returns:
        WorkflowRun instance (not attached to any SQLAlchemy session)
    """
    logger.debug("_dict_to_workflow_run: data keys=%s", list(data.keys())[:5])
    # Create model instance without session
    model = WorkflowRun()
    # Critical fields - must not be None
    model.id = data.get("id") or ""
    model.tenant_id = data.get("tenant_id") or ""
    model.app_id = data.get("app_id") or ""
    model.workflow_id = data.get("workflow_id") or ""
    model.type = data.get("type") or ""
    model.triggered_from = data.get("triggered_from") or ""
    model.version = data.get("version") or ""
    model.status = data.get("status") or "running"  # Default status if missing
    model.created_by_role = data.get("created_by_role") or ""
    model.created_by = data.get("created_by") or ""
    # Numeric fields with defaults. LogStore returns values as strings and maps
    # NULL to "", so a bare int()/float() call would raise ValueError here.
    model.total_tokens = _parse_logstore_int(data.get("total_tokens", 0))
    model.total_steps = _parse_logstore_int(data.get("total_steps", 0))
    model.exceptions_count = _parse_logstore_int(data.get("exceptions_count", 0))
    # Optional fields
    model.graph = data.get("graph")
    model.inputs = data.get("inputs")
    model.outputs = data.get("outputs")
    model.error = data.get("error_message") or data.get("error")
    # Datetime fields: started_at falls back to created_at, then to "now".
    started_at = _parse_logstore_datetime(data.get("started_at") or data.get("created_at"))
    model.created_at = started_at or datetime.now()
    finished_at = _parse_logstore_datetime(data.get("finished_at"))
    if finished_at is not None:
        model.finished_at = finished_at
    # Compute elapsed_time from started_at and finished_at.
    # LogStore doesn't store elapsed_time, it's computed in WorkflowExecution domain entity.
    if finished_at is not None and model.created_at:
        model.elapsed_time = (finished_at - model.created_at).total_seconds()
    else:
        model.elapsed_time = _parse_logstore_float(data.get("elapsed_time", 0))
    return model
class LogstoreAPIWorkflowRunRepository(APIWorkflowRunRepository):
"""
LogStore implementation of APIWorkflowRunRepository.
Provides service-layer WorkflowRun database operations using LogStore SQL
with optimized query strategies:
- Use finished_at IS NOT NULL for deduplication (10-100x faster)
- Use window functions only when running status is required
- Proper time range filtering for LogStore queries
"""
def __init__(self, session_maker: sessionmaker | None = None):
"""
Initialize the repository with LogStore client.
Args:
session_maker: SQLAlchemy sessionmaker (unused, for compatibility with factory pattern)
"""
logger.debug("LogstoreAPIWorkflowRunRepository.__init__: initializing")
self.logstore_client = AliyunLogStore()
# Control flag for dual-read (fallback to PostgreSQL when LogStore returns no results)
# Set to True to enable fallback for safe migration from PostgreSQL to LogStore
# Set to False for new deployments without legacy data in PostgreSQL
self._enable_dual_read = os.environ.get("LOGSTORE_DUAL_READ_ENABLED", "true").lower() == "true"
    def get_paginated_workflow_runs(
        self,
        tenant_id: str,
        app_id: str,
        triggered_from: WorkflowRunTriggeredFrom | Sequence[WorkflowRunTriggeredFrom],
        limit: int = 20,
        last_id: str | None = None,
        status: str | None = None,
    ) -> InfiniteScrollPagination:
        """
        Get paginated workflow runs with filtering.

        Uses window function for deduplication to support both running and finished states.

        Args:
            tenant_id: Tenant identifier for multi-tenant isolation
            app_id: Application identifier
            triggered_from: Filter by trigger source(s)
            limit: Maximum number of records to return (default: 20)
            last_id: Cursor for pagination - ID of the last record from previous page
                (currently ignored; see TODO below)
            status: Optional filter by status

        Returns:
            InfiniteScrollPagination object
        """
        logger.debug(
            "get_paginated_workflow_runs: tenant_id=%s, app_id=%s, limit=%d, status=%s",
            tenant_id,
            app_id,
            limit,
            status,
        )
        # Convert triggered_from to list if needed (a single enum value is allowed)
        if isinstance(triggered_from, WorkflowRunTriggeredFrom):
            triggered_from_list = [triggered_from]
        else:
            triggered_from_list = list(triggered_from)
        # Build triggered_from filter: OR of equality checks, one per trigger source
        triggered_from_filter = " OR ".join([f"triggered_from='{tf.value}'" for tf in triggered_from_list])
        # Build status filter
        status_filter = f"AND status='{status}'" if status else ""
        # Build last_id filter for pagination
        # Note: This is simplified. In production, you'd need to track created_at from last record
        last_id_filter = ""
        if last_id:
            # TODO: Implement proper cursor-based pagination with created_at
            logger.warning("last_id pagination not fully implemented for LogStore")
        # Use window function to get latest log_version of each workflow run
        # (records are append-only; rn = 1 keeps the newest version per id).
        # NOTE(review): filter values are interpolated directly into the SQL text;
        # assumes tenant_id/app_id are trusted identifiers - confirm upstream validation.
        sql = f"""
            SELECT * FROM (
                SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) AS rn
                FROM {AliyunLogStore.workflow_execution_logstore}
                WHERE tenant_id='{tenant_id}'
                  AND app_id='{app_id}'
                  AND ({triggered_from_filter})
                  {status_filter}
                  {last_id_filter}
            ) t
            WHERE rn = 1
            ORDER BY created_at DESC
            LIMIT {limit + 1}
        """
        try:
            # query/from_time/to_time are presumably consumed by the SDK branch of
            # AliyunLogStore.execute_sql - TODO confirm against that signature.
            results = self.logstore_client.execute_sql(
                sql=sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore, from_time=None, to_time=None
            )
            # Check if there are more records: we fetched limit + 1 rows above,
            # so an overflow row signals another page exists.
            has_more = len(results) > limit
            if has_more:
                results = results[:limit]
            # Convert results to WorkflowRun models
            workflow_runs = [_dict_to_workflow_run(row) for row in results]
            return InfiniteScrollPagination(data=workflow_runs, limit=limit, has_more=has_more)
        except Exception:
            logger.exception("Failed to get paginated workflow runs from LogStore")
            raise
    def get_workflow_run_by_id(
        self,
        tenant_id: str,
        app_id: str,
        run_id: str,
    ) -> WorkflowRun | None:
        """
        Get a specific workflow run by ID with tenant and app isolation.

        Uses query syntax to get raw logs and selects the one with max log_version in code.
        Falls back to PostgreSQL if not found in LogStore (for data consistency during migration).

        Args:
            tenant_id: Tenant identifier for multi-tenant isolation
            app_id: Application identifier
            run_id: Workflow run identifier

        Returns:
            The latest version of the matching WorkflowRun, or None when absent
            from both LogStore and (when dual-read is enabled) PostgreSQL.
        """
        logger.debug("get_workflow_run_by_id: tenant_id=%s, app_id=%s, run_id=%s", tenant_id, app_id, run_id)
        try:
            # Check if PG protocol is supported
            if self.logstore_client.supports_pg_protocol:
                # Use PG protocol with SQL query (get latest version of record);
                # records are append-only, rn = 1 keeps the newest log_version.
                # NOTE(review): identifiers are interpolated into the SQL text;
                # assumes they are trusted UUIDs - confirm upstream validation.
                sql_query = f"""
                    SELECT * FROM (
                        SELECT *,
                               ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) as rn
                        FROM "{AliyunLogStore.workflow_execution_logstore}"
                        WHERE id = '{run_id}' AND tenant_id = '{tenant_id}' AND app_id = '{app_id}' AND __time__ > 0
                    ) AS subquery WHERE rn = 1
                    LIMIT 100
                """
                results = self.logstore_client.execute_sql(
                    sql=sql_query,
                    logstore=AliyunLogStore.workflow_execution_logstore,
                )
            else:
                # Use SDK with LogStore query syntax
                query = f"id: {run_id} and tenant_id: {tenant_id} and app_id: {app_id}"
                from_time = 0
                to_time = int(time.time())  # now
                results = self.logstore_client.get_logs(
                    logstore=AliyunLogStore.workflow_execution_logstore,
                    from_time=from_time,
                    to_time=to_time,
                    query=query,
                    line=100,
                    reverse=False,
                )
            if not results:
                # Fallback to PostgreSQL for records created before LogStore migration
                if self._enable_dual_read:
                    logger.debug(
                        "WorkflowRun not found in LogStore, falling back to PostgreSQL: "
                        "run_id=%s, tenant_id=%s, app_id=%s",
                        run_id,
                        tenant_id,
                        app_id,
                    )
                    return self._fallback_get_workflow_run_by_id_with_tenant(run_id, tenant_id, app_id)
                return None
            # For PG mode, results are already deduplicated by the SQL query
            # For SDK mode, if multiple results, select the one with max log_version
            if self.logstore_client.supports_pg_protocol or len(results) == 1:
                return _dict_to_workflow_run(results[0])
            else:
                max_result = max(results, key=lambda x: int(x.get("log_version", 0)))
                return _dict_to_workflow_run(max_result)
        except Exception:
            logger.exception("Failed to get workflow run by ID from LogStore: run_id=%s", run_id)
            # Try PostgreSQL fallback on any error (only if dual-read is enabled);
            # if the fallback also fails, the original LogStore error is re-raised.
            if self._enable_dual_read:
                try:
                    return self._fallback_get_workflow_run_by_id_with_tenant(run_id, tenant_id, app_id)
                except Exception:
                    logger.exception(
                        "PostgreSQL fallback also failed: run_id=%s, tenant_id=%s, app_id=%s", run_id, tenant_id, app_id
                    )
            raise
def _fallback_get_workflow_run_by_id_with_tenant(
self, run_id: str, tenant_id: str, app_id: str
) -> WorkflowRun | None:
"""Fallback to PostgreSQL query for records not in LogStore (with tenant isolation)."""
from sqlalchemy import select
from sqlalchemy.orm import Session
from extensions.ext_database import db
with Session(db.engine) as session:
stmt = select(WorkflowRun).where(
WorkflowRun.id == run_id, WorkflowRun.tenant_id == tenant_id, WorkflowRun.app_id == app_id
)
return session.scalar(stmt)
    def get_workflow_run_by_id_without_tenant(
        self,
        run_id: str,
    ) -> WorkflowRun | None:
        """
        Get a specific workflow run by ID without tenant/app context.

        Uses query syntax to get raw logs and selects the one with max log_version.
        Falls back to PostgreSQL if not found in LogStore (controlled by LOGSTORE_DUAL_READ_ENABLED).

        Args:
            run_id: Workflow run identifier

        Returns:
            The latest version of the matching WorkflowRun, or None when absent
            from both LogStore and (when dual-read is enabled) PostgreSQL.
        """
        logger.debug("get_workflow_run_by_id_without_tenant: run_id=%s", run_id)
        try:
            # Check if PG protocol is supported
            if self.logstore_client.supports_pg_protocol:
                # Use PG protocol with SQL query (get latest version of record);
                # records are append-only, rn = 1 keeps the newest log_version.
                # NOTE(review): run_id is interpolated into the SQL text;
                # assumes it is a trusted UUID - confirm upstream validation.
                sql_query = f"""
                    SELECT * FROM (
                        SELECT *,
                               ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) as rn
                        FROM "{AliyunLogStore.workflow_execution_logstore}"
                        WHERE id = '{run_id}' AND __time__ > 0
                    ) AS subquery WHERE rn = 1
                    LIMIT 100
                """
                results = self.logstore_client.execute_sql(
                    sql=sql_query,
                    logstore=AliyunLogStore.workflow_execution_logstore,
                )
            else:
                # Use SDK with LogStore query syntax
                query = f"id: {run_id}"
                from_time = 0
                to_time = int(time.time())  # now
                results = self.logstore_client.get_logs(
                    logstore=AliyunLogStore.workflow_execution_logstore,
                    from_time=from_time,
                    to_time=to_time,
                    query=query,
                    line=100,
                    reverse=False,
                )
            if not results:
                # Fallback to PostgreSQL for records created before LogStore migration
                if self._enable_dual_read:
                    logger.debug("WorkflowRun not found in LogStore, falling back to PostgreSQL: run_id=%s", run_id)
                    return self._fallback_get_workflow_run_by_id(run_id)
                return None
            # For PG mode, results are already deduplicated by the SQL query
            # For SDK mode, if multiple results, select the one with max log_version
            if self.logstore_client.supports_pg_protocol or len(results) == 1:
                return _dict_to_workflow_run(results[0])
            else:
                max_result = max(results, key=lambda x: int(x.get("log_version", 0)))
                return _dict_to_workflow_run(max_result)
        except Exception:
            logger.exception("Failed to get workflow run without tenant: run_id=%s", run_id)
            # Try PostgreSQL fallback on any error (only if dual-read is enabled);
            # if the fallback also fails, the original LogStore error is re-raised.
            if self._enable_dual_read:
                try:
                    return self._fallback_get_workflow_run_by_id(run_id)
                except Exception:
                    logger.exception("PostgreSQL fallback also failed: run_id=%s", run_id)
            raise
def _fallback_get_workflow_run_by_id(self, run_id: str) -> WorkflowRun | None:
"""Fallback to PostgreSQL query for records not in LogStore."""
from sqlalchemy import select
from sqlalchemy.orm import Session
from extensions.ext_database import db
with Session(db.engine) as session:
stmt = select(WorkflowRun).where(WorkflowRun.id == run_id)
return session.scalar(stmt)
def get_workflow_runs_count(
self,
tenant_id: str,
app_id: str,
triggered_from: str,
status: str | None = None,
time_range: str | None = None,
) -> dict[str, int]:
"""
Get workflow runs count statistics grouped by status.
Optimization: Use finished_at IS NOT NULL for completed runs (10-50x faster)
"""
logger.debug(
"get_workflow_runs_count: tenant_id=%s, app_id=%s, triggered_from=%s, status=%s",
tenant_id,
app_id,
triggered_from,
status,
)
# Build time range filter
time_filter = ""
if time_range:
# TODO: Parse time_range and convert to from_time/to_time
logger.warning("time_range filter not implemented")
# If status is provided, simple count
if status:
if status == "running":
# Running status requires window function
sql = f"""
SELECT COUNT(*) as count
FROM (
SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) AS rn
FROM {AliyunLogStore.workflow_execution_logstore}
WHERE tenant_id='{tenant_id}'
AND app_id='{app_id}'
AND triggered_from='{triggered_from}'
AND status='running'
{time_filter}
) t
WHERE rn = 1
"""
else:
# Finished status uses optimized filter
sql = f"""
SELECT COUNT(DISTINCT id) as count
FROM {AliyunLogStore.workflow_execution_logstore}
WHERE tenant_id='{tenant_id}'
AND app_id='{app_id}'
AND triggered_from='{triggered_from}'
AND status='{status}'
AND finished_at IS NOT NULL
{time_filter}
"""
try:
results = self.logstore_client.execute_sql(
sql=sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
)
count = results[0]["count"] if results and len(results) > 0 else 0
return {
"total": count,
"running": count if status == "running" else 0,
"succeeded": count if status == "succeeded" else 0,
"failed": count if status == "failed" else 0,
"stopped": count if status == "stopped" else 0,
"partial-succeeded": count if status == "partial-succeeded" else 0,
}
except Exception:
logger.exception("Failed to get workflow runs count")
raise
# No status filter - get counts grouped by status
# Use optimized query for finished runs, separate query for running
try:
# Count finished runs grouped by status
finished_sql = f"""
SELECT status, COUNT(DISTINCT id) as count
FROM {AliyunLogStore.workflow_execution_logstore}
WHERE tenant_id='{tenant_id}'
AND app_id='{app_id}'
AND triggered_from='{triggered_from}'
AND finished_at IS NOT NULL
{time_filter}
GROUP BY status
"""
# Count running runs
running_sql = f"""
SELECT COUNT(*) as count
FROM (
SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) AS rn
FROM {AliyunLogStore.workflow_execution_logstore}
WHERE tenant_id='{tenant_id}'
AND app_id='{app_id}'
AND triggered_from='{triggered_from}'
AND status='running'
{time_filter}
) t
WHERE rn = 1
"""
finished_results = self.logstore_client.execute_sql(
sql=finished_sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
)
running_results = self.logstore_client.execute_sql(
sql=running_sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
)
# Build response
status_counts = {
"running": 0,
"succeeded": 0,
"failed": 0,
"stopped": 0,
"partial-succeeded": 0,
}
total = 0
for result in finished_results:
status_val = result.get("status")
count = result.get("count", 0)
if status_val in status_counts:
status_counts[status_val] = count
total += count
# Add running count
running_count = running_results[0]["count"] if running_results and len(running_results) > 0 else 0
status_counts["running"] = running_count
total += running_count
return {"total": total} | status_counts
except Exception:
logger.exception("Failed to get workflow runs count")
raise
def get_daily_runs_statistics(
self,
tenant_id: str,
app_id: str,
triggered_from: str,
start_date: datetime | None = None,
end_date: datetime | None = None,
timezone: str = "UTC",
) -> list[DailyRunsStats]:
"""
Get daily runs statistics using optimized query.
Optimization: Use finished_at IS NOT NULL + COUNT(DISTINCT id) (20-100x faster)
"""
logger.debug(
"get_daily_runs_statistics: tenant_id=%s, app_id=%s, triggered_from=%s", tenant_id, app_id, triggered_from
)
# Build time range filter
time_filter = ""
if start_date:
time_filter += f" AND __time__ >= to_unixtime(from_iso8601_timestamp('{start_date.isoformat()}'))"
if end_date:
time_filter += f" AND __time__ < to_unixtime(from_iso8601_timestamp('{end_date.isoformat()}'))"
# Optimized query: Use finished_at filter to avoid window function
sql = f"""
SELECT DATE(from_unixtime(__time__)) as date, COUNT(DISTINCT id) as runs
FROM {AliyunLogStore.workflow_execution_logstore}
WHERE tenant_id='{tenant_id}'
AND app_id='{app_id}'
AND triggered_from='{triggered_from}'
AND finished_at IS NOT NULL
{time_filter}
GROUP BY date
ORDER BY date
"""
try:
results = self.logstore_client.execute_sql(
sql=sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
)
response_data = []
for row in results:
response_data.append({"date": str(row.get("date", "")), "runs": row.get("runs", 0)})
return cast(list[DailyRunsStats], response_data)
except Exception:
logger.exception("Failed to get daily runs statistics")
raise
def get_daily_terminals_statistics(
self,
tenant_id: str,
app_id: str,
triggered_from: str,
start_date: datetime | None = None,
end_date: datetime | None = None,
timezone: str = "UTC",
) -> list[DailyTerminalsStats]:
"""
Get daily terminals statistics using optimized query.
Optimization: Use finished_at IS NOT NULL + COUNT(DISTINCT created_by) (20-100x faster)
"""
logger.debug(
"get_daily_terminals_statistics: tenant_id=%s, app_id=%s, triggered_from=%s",
tenant_id,
app_id,
triggered_from,
)
# Build time range filter
time_filter = ""
if start_date:
time_filter += f" AND __time__ >= to_unixtime(from_iso8601_timestamp('{start_date.isoformat()}'))"
if end_date:
time_filter += f" AND __time__ < to_unixtime(from_iso8601_timestamp('{end_date.isoformat()}'))"
sql = f"""
SELECT DATE(from_unixtime(__time__)) as date, COUNT(DISTINCT created_by) as terminal_count
FROM {AliyunLogStore.workflow_execution_logstore}
WHERE tenant_id='{tenant_id}'
AND app_id='{app_id}'
AND triggered_from='{triggered_from}'
AND finished_at IS NOT NULL
{time_filter}
GROUP BY date
ORDER BY date
"""
try:
results = self.logstore_client.execute_sql(
sql=sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
)
response_data = []
for row in results:
response_data.append({"date": str(row.get("date", "")), "terminal_count": row.get("terminal_count", 0)})
return cast(list[DailyTerminalsStats], response_data)
except Exception:
logger.exception("Failed to get daily terminals statistics")
raise
def get_daily_token_cost_statistics(
self,
tenant_id: str,
app_id: str,
triggered_from: str,
start_date: datetime | None = None,
end_date: datetime | None = None,
timezone: str = "UTC",
) -> list[DailyTokenCostStats]:
"""
Get daily token cost statistics using optimized query.
Optimization: Use finished_at IS NOT NULL + SUM(total_tokens) (20-100x faster)
"""
logger.debug(
"get_daily_token_cost_statistics: tenant_id=%s, app_id=%s, triggered_from=%s",
tenant_id,
app_id,
triggered_from,
)
# Build time range filter
time_filter = ""
if start_date:
time_filter += f" AND __time__ >= to_unixtime(from_iso8601_timestamp('{start_date.isoformat()}'))"
if end_date:
time_filter += f" AND __time__ < to_unixtime(from_iso8601_timestamp('{end_date.isoformat()}'))"
sql = f"""
SELECT DATE(from_unixtime(__time__)) as date, SUM(total_tokens) as token_count
FROM {AliyunLogStore.workflow_execution_logstore}
WHERE tenant_id='{tenant_id}'
AND app_id='{app_id}'
AND triggered_from='{triggered_from}'
AND finished_at IS NOT NULL
{time_filter}
GROUP BY date
ORDER BY date
"""
try:
results = self.logstore_client.execute_sql(
sql=sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
)
response_data = []
for row in results:
response_data.append({"date": str(row.get("date", "")), "token_count": row.get("token_count", 0)})
return cast(list[DailyTokenCostStats], response_data)
except Exception:
logger.exception("Failed to get daily token cost statistics")
raise
def get_average_app_interaction_statistics(
self,
tenant_id: str,
app_id: str,
triggered_from: str,
start_date: datetime | None = None,
end_date: datetime | None = None,
timezone: str = "UTC",
) -> list[AverageInteractionStats]:
"""
Get average app interaction statistics using optimized query.
Optimization: Use finished_at IS NOT NULL + AVG (20-100x faster)
"""
logger.debug(
"get_average_app_interaction_statistics: tenant_id=%s, app_id=%s, triggered_from=%s",
tenant_id,
app_id,
triggered_from,
)
# Build time range filter
time_filter = ""
if start_date:
time_filter += f" AND __time__ >= to_unixtime(from_iso8601_timestamp('{start_date.isoformat()}'))"
if end_date:
time_filter += f" AND __time__ < to_unixtime(from_iso8601_timestamp('{end_date.isoformat()}'))"
sql = f"""
SELECT
AVG(sub.interactions) AS interactions,
sub.date
FROM (
SELECT
DATE(from_unixtime(__time__)) AS date,
created_by,
COUNT(DISTINCT id) AS interactions
FROM {AliyunLogStore.workflow_execution_logstore}
WHERE tenant_id='{tenant_id}'
AND app_id='{app_id}'
AND triggered_from='{triggered_from}'
AND finished_at IS NOT NULL
{time_filter}
GROUP BY date, created_by
) sub
GROUP BY sub.date
"""
try:
results = self.logstore_client.execute_sql(
sql=sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
)
response_data = []
for row in results:
response_data.append(
{
"date": str(row.get("date", "")),
"interactions": float(row.get("interactions", 0)),
}
)
return cast(list[AverageInteractionStats], response_data)
except Exception:
logger.exception("Failed to get average app interaction statistics")
raise

View File

@@ -0,0 +1,164 @@
import json
import logging
import os
import time
from typing import Union
from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker
from core.repositories.sqlalchemy_workflow_execution_repository import SQLAlchemyWorkflowExecutionRepository
from core.workflow.entities import WorkflowExecution
from core.workflow.repositories.workflow_execution_repository import WorkflowExecutionRepository
from extensions.logstore.aliyun_logstore import AliyunLogStore
from libs.helper import extract_tenant_id
from models import (
Account,
CreatorUserRole,
EndUser,
)
from models.enums import WorkflowRunTriggeredFrom
logger = logging.getLogger(__name__)
class LogstoreWorkflowExecutionRepository(WorkflowExecutionRepository):
    """
    Aliyun SLS LogStore implementation of WorkflowExecutionRepository.

    Writes are append-only: every save() emits a complete record tagged with a
    nanosecond ``log_version`` timestamp, so "updates" are simply new records
    with a higher version. Optionally dual-writes to the SQL repository
    (controlled by the LOGSTORE_DUAL_WRITE_ENABLED environment variable) to
    support safe migration.
    """

    def __init__(
        self,
        session_factory: sessionmaker | Engine,
        user: Union[Account, EndUser],
        app_id: str | None,
        triggered_from: WorkflowRunTriggeredFrom | None,
    ):
        """
        Initialize the repository with a SQLAlchemy sessionmaker or engine and context information.

        Args:
            session_factory: SQLAlchemy sessionmaker or engine for creating sessions
            user: Account or EndUser object containing tenant_id, user ID, and role information
            app_id: App ID for filtering by application (can be None)
            triggered_from: Source of the execution trigger (DEBUGGING or APP_RUN)

        Raises:
            ValueError: If no tenant_id can be extracted from ``user``.
        """
        logger.debug(
            "LogstoreWorkflowExecutionRepository.__init__: app_id=%s, triggered_from=%s", app_id, triggered_from
        )
        # Initialize LogStore client
        # Note: Project/logstore/index initialization is done at app startup via ext_logstore
        self.logstore_client = AliyunLogStore()
        # Extract tenant_id from user
        tenant_id = extract_tenant_id(user)
        if not tenant_id:
            raise ValueError("User must have a tenant_id or current_tenant_id")
        self._tenant_id = tenant_id
        # Store app context
        self._app_id = app_id
        # Extract user context
        self._triggered_from = triggered_from
        self._creator_user_id = user.id
        # Determine user role based on user type
        self._creator_user_role = CreatorUserRole.ACCOUNT if isinstance(user, Account) else CreatorUserRole.END_USER
        # Initialize SQL repository for dual-write support
        self.sql_repository = SQLAlchemyWorkflowExecutionRepository(session_factory, user, app_id, triggered_from)
        # Control flag for dual-write (write to both LogStore and SQL database)
        # Set to True to enable dual-write for safe migration, False to use LogStore only
        # NOTE(review): the in-code fallback default is "true", while the .env examples
        # document LOGSTORE_DUAL_WRITE_ENABLED as defaulting to false — confirm which
        # default is intended when the environment variable is unset.
        self._enable_dual_write = os.environ.get("LOGSTORE_DUAL_WRITE_ENABLED", "true").lower() == "true"

    def _to_logstore_model(self, domain_model: WorkflowExecution) -> list[tuple[str, str]]:
        """
        Convert a domain model to a logstore model (List[Tuple[str, str]]).

        All values are serialized to strings; JSON payloads default to "{}" and
        missing optional values to "" so every record carries the full field set.

        Args:
            domain_model: The domain model to convert

        Returns:
            The logstore model as a list of key-value tuples

        Raises:
            ValueError: If triggered_from or creator user context is missing.
        """
        logger.debug(
            "_to_logstore_model: id=%s, workflow_id=%s, status=%s",
            domain_model.id_,
            domain_model.workflow_id,
            domain_model.status.value,
        )
        # Use values from constructor if provided
        if not self._triggered_from:
            raise ValueError("triggered_from is required in repository constructor")
        if not self._creator_user_id:
            raise ValueError("created_by is required in repository constructor")
        if not self._creator_user_role:
            raise ValueError("created_by_role is required in repository constructor")
        # Generate log_version as nanosecond timestamp for record versioning
        log_version = str(time.time_ns())
        logstore_model = [
            ("id", domain_model.id_),
            ("log_version", log_version),  # Add log_version field for append-only writes
            ("tenant_id", self._tenant_id),
            ("app_id", self._app_id or ""),
            ("workflow_id", domain_model.workflow_id),
            (
                "triggered_from",
                self._triggered_from.value if hasattr(self._triggered_from, "value") else str(self._triggered_from),
            ),
            ("type", domain_model.workflow_type.value),
            ("version", domain_model.workflow_version),
            ("graph", json.dumps(domain_model.graph, ensure_ascii=False) if domain_model.graph else "{}"),
            ("inputs", json.dumps(domain_model.inputs, ensure_ascii=False) if domain_model.inputs else "{}"),
            ("outputs", json.dumps(domain_model.outputs, ensure_ascii=False) if domain_model.outputs else "{}"),
            ("status", domain_model.status.value),
            ("error_message", domain_model.error_message or ""),
            ("total_tokens", str(domain_model.total_tokens)),
            ("total_steps", str(domain_model.total_steps)),
            ("exceptions_count", str(domain_model.exceptions_count)),
            (
                "created_by_role",
                self._creator_user_role.value
                if hasattr(self._creator_user_role, "value")
                else str(self._creator_user_role),
            ),
            ("created_by", self._creator_user_id),
            ("started_at", domain_model.started_at.isoformat() if domain_model.started_at else ""),
            ("finished_at", domain_model.finished_at.isoformat() if domain_model.finished_at else ""),
        ]
        return logstore_model

    def save(self, execution: WorkflowExecution) -> None:
        """
        Save or update a WorkflowExecution domain entity to the logstore.
        This method serves as a domain-to-logstore adapter that:
        1. Converts the domain entity to its logstore representation
        2. Persists the logstore model using Aliyun SLS
        3. Maintains proper multi-tenancy by including tenant context during conversion
        4. Optionally writes to SQL database for dual-write support (controlled by LOGSTORE_DUAL_WRITE_ENABLED)

        Args:
            execution: The WorkflowExecution domain entity to persist

        Raises:
            Exception: Re-raised when the LogStore write fails. SQL dual-write
                failures are logged but deliberately never raised.
        """
        logger.debug(
            "save: id=%s, workflow_id=%s, status=%s", execution.id_, execution.workflow_id, execution.status.value
        )
        try:
            logstore_model = self._to_logstore_model(execution)
            self.logstore_client.put_log(AliyunLogStore.workflow_execution_logstore, logstore_model)
            logger.debug("Saved workflow execution to logstore: id=%s", execution.id_)
        except Exception:
            logger.exception("Failed to save workflow execution to logstore: id=%s", execution.id_)
            raise
        # Dual-write to SQL database if enabled (for safe migration)
        if self._enable_dual_write:
            try:
                self.sql_repository.save(execution)
                logger.debug("Dual-write: saved workflow execution to SQL database: id=%s", execution.id_)
            except Exception:
                logger.exception("Failed to dual-write workflow execution to SQL database: id=%s", execution.id_)
                # Don't raise - LogStore write succeeded, SQL is just a backup

View File

@@ -0,0 +1,366 @@
"""
LogStore implementation of the WorkflowNodeExecutionRepository.
This module provides a LogStore-based repository for WorkflowNodeExecution entities,
using Aliyun SLS LogStore with append-only writes and version control.
"""
import json
import logging
import os
import time
from collections.abc import Sequence
from datetime import datetime
from typing import Any, Union
from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker
from core.model_runtime.utils.encoders import jsonable_encoder
from core.repositories import SQLAlchemyWorkflowNodeExecutionRepository
from core.workflow.entities import WorkflowNodeExecution
from core.workflow.entities.workflow_node_execution import WorkflowNodeExecutionMetadataKey, WorkflowNodeExecutionStatus
from core.workflow.enums import NodeType
from core.workflow.repositories.workflow_node_execution_repository import OrderConfig, WorkflowNodeExecutionRepository
from core.workflow.workflow_type_encoder import WorkflowRuntimeTypeConverter
from extensions.logstore.aliyun_logstore import AliyunLogStore
from libs.helper import extract_tenant_id
from models import (
Account,
CreatorUserRole,
EndUser,
WorkflowNodeExecutionTriggeredFrom,
)
logger = logging.getLogger(__name__)
def _dict_to_workflow_node_execution(data: dict[str, Any]) -> WorkflowNodeExecution:
    """
    Convert LogStore result dictionary to WorkflowNodeExecution domain model.

    Args:
        data: Dictionary from LogStore query result. Values arrive as strings;
            JSON payloads are stored as serialized text ("{}" when empty).

    Returns:
        WorkflowNodeExecution domain model instance
    """
    logger.debug("_dict_to_workflow_node_execution: data keys=%s", list(data.keys())[:5])
    # Parse JSON fields. Use `or "{}"` (not a .get default) so that present-but-empty
    # values — e.g. NULL columns surfaced as "" by the PG-protocol SQL path — fall
    # back to "{}" instead of crashing json.loads, which rejects an empty string.
    inputs = json.loads(data.get("inputs") or "{}")
    process_data = json.loads(data.get("process_data") or "{}")
    outputs = json.loads(data.get("outputs") or "{}")
    metadata = json.loads(data.get("execution_metadata") or "{}")
    # Convert metadata to domain enum keys
    domain_metadata = {}
    for k, v in metadata.items():
        try:
            domain_metadata[WorkflowNodeExecutionMetadataKey(k)] = v
        except ValueError:
            # Skip invalid metadata keys (e.g. keys removed from the enum since the
            # record was written) so old records remain readable.
            continue
    # Convert status to domain enum
    status = WorkflowNodeExecutionStatus(data.get("status", "running"))
    # Parse datetime fields (ISO-8601 strings written by the repository).
    # NOTE(review): the datetime.now() fallback is naive local time — confirm
    # callers expect that rather than UTC.
    created_at = datetime.fromisoformat(data.get("created_at", "")) if data.get("created_at") else datetime.now()
    finished_at = datetime.fromisoformat(data.get("finished_at", "")) if data.get("finished_at") else None
    return WorkflowNodeExecution(
        id=data.get("id", ""),
        node_execution_id=data.get("node_execution_id"),
        workflow_id=data.get("workflow_id", ""),
        workflow_execution_id=data.get("workflow_run_id"),
        # `or` guards tolerate present-but-empty numeric fields ("" would crash int/float).
        index=int(data.get("index") or 0),
        predecessor_node_id=data.get("predecessor_node_id"),
        node_id=data.get("node_id", ""),
        node_type=NodeType(data.get("node_type", "start")),
        title=data.get("title", ""),
        inputs=inputs,
        process_data=process_data,
        outputs=outputs,
        status=status,
        error=data.get("error"),
        elapsed_time=float(data.get("elapsed_time") or 0.0),
        metadata=domain_metadata,
        created_at=created_at,
        finished_at=finished_at,
    )
class LogstoreWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository):
    """
    LogStore implementation of the WorkflowNodeExecutionRepository interface.
    This implementation uses Aliyun SLS LogStore with an append-only write strategy:
    - Each save() operation appends a new record with a version timestamp
    - Updates are simulated by writing new records with higher version numbers
    - Queries retrieve the latest version using finished_at IS NOT NULL filter
    - Multi-tenancy is maintained through tenant_id filtering
    Version Strategy:
        version = time.time_ns()  # Nanosecond timestamp for unique ordering
    """

    def __init__(
        self,
        session_factory: sessionmaker | Engine,
        user: Union[Account, EndUser],
        app_id: str | None,
        triggered_from: WorkflowNodeExecutionTriggeredFrom | None,
    ):
        """
        Initialize the repository with a SQLAlchemy sessionmaker or engine and context information.

        Args:
            session_factory: SQLAlchemy sessionmaker or engine for creating sessions
            user: Account or EndUser object containing tenant_id, user ID, and role information
            app_id: App ID for filtering by application (can be None)
            triggered_from: Source of the execution trigger (SINGLE_STEP or WORKFLOW_RUN)

        Raises:
            ValueError: If no tenant_id can be extracted from ``user``.
        """
        logger.debug(
            "LogstoreWorkflowNodeExecutionRepository.__init__: app_id=%s, triggered_from=%s", app_id, triggered_from
        )
        # Initialize LogStore client
        self.logstore_client = AliyunLogStore()
        # Extract tenant_id from user
        tenant_id = extract_tenant_id(user)
        if not tenant_id:
            raise ValueError("User must have a tenant_id or current_tenant_id")
        self._tenant_id = tenant_id
        # Store app context
        self._app_id = app_id
        # Extract user context
        self._triggered_from = triggered_from
        self._creator_user_id = user.id
        # Determine user role based on user type
        self._creator_user_role = CreatorUserRole.ACCOUNT if isinstance(user, Account) else CreatorUserRole.END_USER
        # Initialize SQL repository for dual-write support
        self.sql_repository = SQLAlchemyWorkflowNodeExecutionRepository(session_factory, user, app_id, triggered_from)
        # Control flag for dual-write (write to both LogStore and SQL database)
        # Set to True to enable dual-write for safe migration, False to use LogStore only
        # NOTE(review): the in-code fallback default is "true", while the .env examples
        # document LOGSTORE_DUAL_WRITE_ENABLED as defaulting to false — confirm which
        # default is intended when the environment variable is unset.
        self._enable_dual_write = os.environ.get("LOGSTORE_DUAL_WRITE_ENABLED", "true").lower() == "true"

    def _to_logstore_model(self, domain_model: WorkflowNodeExecution) -> Sequence[tuple[str, str]]:
        """
        Convert a WorkflowNodeExecution domain model into LogStore key-value pairs.

        All values are serialized to strings; the JSON payloads (inputs,
        process_data, outputs, execution_metadata) default to "{}" when empty so
        readers can always json.loads them.

        Args:
            domain_model: The domain model to convert

        Returns:
            The logstore model as a sequence of key-value tuples

        Raises:
            ValueError: If triggered_from or creator user context is missing.
        """
        logger.debug(
            "_to_logstore_model: id=%s, node_id=%s, status=%s",
            domain_model.id,
            domain_model.node_id,
            domain_model.status.value,
        )
        if not self._triggered_from:
            raise ValueError("triggered_from is required in repository constructor")
        if not self._creator_user_id:
            raise ValueError("created_by is required in repository constructor")
        if not self._creator_user_role:
            raise ValueError("created_by_role is required in repository constructor")
        # Generate log_version as nanosecond timestamp for record versioning
        log_version = str(time.time_ns())
        json_converter = WorkflowRuntimeTypeConverter()
        logstore_model = [
            ("id", domain_model.id),
            ("log_version", log_version),  # Add log_version field for append-only writes
            ("tenant_id", self._tenant_id),
            ("app_id", self._app_id or ""),
            ("workflow_id", domain_model.workflow_id),
            (
                "triggered_from",
                self._triggered_from.value if hasattr(self._triggered_from, "value") else str(self._triggered_from),
            ),
            ("workflow_run_id", domain_model.workflow_execution_id or ""),
            ("index", str(domain_model.index)),
            ("predecessor_node_id", domain_model.predecessor_node_id or ""),
            ("node_execution_id", domain_model.node_execution_id or ""),
            ("node_id", domain_model.node_id),
            ("node_type", domain_model.node_type.value),
            ("title", domain_model.title),
            (
                "inputs",
                json.dumps(json_converter.to_json_encodable(domain_model.inputs), ensure_ascii=False)
                if domain_model.inputs
                else "{}",
            ),
            (
                "process_data",
                json.dumps(json_converter.to_json_encodable(domain_model.process_data), ensure_ascii=False)
                if domain_model.process_data
                else "{}",
            ),
            (
                "outputs",
                json.dumps(json_converter.to_json_encodable(domain_model.outputs), ensure_ascii=False)
                if domain_model.outputs
                else "{}",
            ),
            ("status", domain_model.status.value),
            ("error", domain_model.error or ""),
            ("elapsed_time", str(domain_model.elapsed_time)),
            (
                "execution_metadata",
                json.dumps(jsonable_encoder(domain_model.metadata), ensure_ascii=False)
                if domain_model.metadata
                else "{}",
            ),
            ("created_at", domain_model.created_at.isoformat() if domain_model.created_at else ""),
            ("created_by_role", self._creator_user_role.value),
            ("created_by", self._creator_user_id),
            ("finished_at", domain_model.finished_at.isoformat() if domain_model.finished_at else ""),
        ]
        return logstore_model

    def save(self, execution: WorkflowNodeExecution) -> None:
        """
        Save or update a NodeExecution domain entity to LogStore.
        This method serves as a domain-to-logstore adapter that:
        1. Converts the domain entity to its logstore representation
        2. Appends a new record with a log_version timestamp
        3. Maintains proper multi-tenancy by including tenant context during conversion
        4. Optionally writes to SQL database for dual-write support (controlled by LOGSTORE_DUAL_WRITE_ENABLED)
        Each save operation creates a new record. Updates are simulated by writing
        new records with higher log_version numbers.

        Args:
            execution: The NodeExecution domain entity to persist

        Raises:
            Exception: Re-raised when the LogStore write fails. SQL dual-write
                failures are logged but deliberately never raised.
        """
        logger.debug(
            "save: id=%s, node_execution_id=%s, status=%s",
            execution.id,
            execution.node_execution_id,
            execution.status.value,
        )
        try:
            logstore_model = self._to_logstore_model(execution)
            self.logstore_client.put_log(AliyunLogStore.workflow_node_execution_logstore, logstore_model)
            logger.debug(
                "Saved node execution to LogStore: id=%s, node_execution_id=%s, status=%s",
                execution.id,
                execution.node_execution_id,
                execution.status.value,
            )
        except Exception:
            logger.exception(
                "Failed to save node execution to LogStore: id=%s, node_execution_id=%s",
                execution.id,
                execution.node_execution_id,
            )
            raise
        # Dual-write to SQL database if enabled (for safe migration)
        if self._enable_dual_write:
            try:
                self.sql_repository.save(execution)
                logger.debug("Dual-write: saved node execution to SQL database: id=%s", execution.id)
            except Exception:
                logger.exception("Failed to dual-write node execution to SQL database: id=%s", execution.id)
                # Don't raise - LogStore write succeeded, SQL is just a backup

    def save_execution_data(self, execution: WorkflowNodeExecution) -> None:
        """
        Save or update the inputs, process_data, or outputs associated with a specific
        node_execution record.
        For LogStore implementation, this is similar to save() since we always write
        complete records. We append a new record with updated data fields.

        Args:
            execution: The NodeExecution instance with data to save
        """
        logger.debug("save_execution_data: id=%s, node_execution_id=%s", execution.id, execution.node_execution_id)
        # In LogStore, we simply write a new complete record with the data
        # The log_version timestamp will ensure this is treated as the latest version
        self.save(execution)

    def get_by_workflow_run(
        self,
        workflow_run_id: str,
        order_config: OrderConfig | None = None,
    ) -> Sequence[WorkflowNodeExecution]:
        """
        Retrieve all NodeExecution instances for a specific workflow run.
        Uses LogStore SQL query with finished_at IS NOT NULL filter for deduplication.
        This ensures we only get the final version of each node execution.

        Args:
            workflow_run_id: The workflow run ID
            order_config: Optional configuration for ordering results
                order_config.order_by: List of fields to order by (e.g., ["index", "created_at"])
                order_config.order_direction: Direction to order ("asc" or "desc")

        Returns:
            A list of NodeExecution instances

        Note:
            This method filters by finished_at IS NOT NULL to avoid duplicates from
            version updates. For complete history including intermediate states,
            a different query strategy would be needed.
        """
        logger.debug("get_by_workflow_run: workflow_run_id=%s, order_config=%s", workflow_run_id, order_config)
        # Build SQL query with deduplication using finished_at IS NOT NULL
        # This optimization avoids window functions for common case where we only
        # want the final state of each node execution
        # Build ORDER BY clause
        order_clause = ""
        if order_config and order_config.order_by:
            order_fields = []
            for field in order_config.order_by:
                # Map domain field names to logstore field names if needed
                field_name = field
                if order_config.order_direction == "desc":
                    order_fields.append(f"{field_name} DESC")
                else:
                    order_fields.append(f"{field_name} ASC")
            if order_fields:
                order_clause = "ORDER BY " + ", ".join(order_fields)
        # NOTE(review): values are interpolated directly into the SQL string;
        # confirm workflow_run_id / order_by fields are always internally
        # generated and never user-controlled before widening this API.
        sql = f"""
        SELECT *
        FROM {AliyunLogStore.workflow_node_execution_logstore}
        WHERE workflow_run_id='{workflow_run_id}'
            AND tenant_id='{self._tenant_id}'
            AND finished_at IS NOT NULL
        """
        if self._app_id:
            sql += f" AND app_id='{self._app_id}'"
        if order_clause:
            sql += f" {order_clause}"
        try:
            # Execute SQL query
            results = self.logstore_client.execute_sql(
                sql=sql,
                query="*",
                logstore=AliyunLogStore.workflow_node_execution_logstore,
            )
            # Convert LogStore results to WorkflowNodeExecution domain models
            executions = []
            for row in results:
                try:
                    execution = _dict_to_workflow_node_execution(row)
                    executions.append(execution)
                except Exception as e:
                    # Tolerate individual malformed rows instead of failing the whole query.
                    logger.warning("Failed to convert row to WorkflowNodeExecution: %s, row=%s", e, row)
                    continue
            return executions
        except Exception:
            logger.exception("Failed to retrieve node executions from LogStore: workflow_run_id=%s", workflow_run_id)
            raise

View File

@@ -4,6 +4,7 @@ version = "1.11.1"
requires-python = ">=3.11,<3.13" requires-python = ">=3.11,<3.13"
dependencies = [ dependencies = [
"aliyun-log-python-sdk~=0.9.37",
"arize-phoenix-otel~=0.9.2", "arize-phoenix-otel~=0.9.2",
"azure-identity==1.16.1", "azure-identity==1.16.1",
"beautifulsoup4==4.12.2", "beautifulsoup4==4.12.2",
@@ -11,7 +12,7 @@ dependencies = [
"bs4~=0.0.1", "bs4~=0.0.1",
"cachetools~=5.3.0", "cachetools~=5.3.0",
"celery~=5.5.2", "celery~=5.5.2",
"charset-normalizer>=3.4.4", "chardet~=5.1.0",
"flask~=3.1.2", "flask~=3.1.2",
"flask-compress>=1.17,<1.18", "flask-compress>=1.17,<1.18",
"flask-cors~=6.0.0", "flask-cors~=6.0.0",
@@ -91,7 +92,6 @@ dependencies = [
"weaviate-client==4.17.0", "weaviate-client==4.17.0",
"apscheduler>=3.11.0", "apscheduler>=3.11.0",
"weave>=0.52.16", "weave>=0.52.16",
"jsonschema>=4.25.1",
] ]
# Before adding new dependency, consider place it in # Before adding new dependency, consider place it in
# alphabet order (a-z) and suitable group. # alphabet order (a-z) and suitable group.

4651
api/uv.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1044,6 +1044,25 @@ WORKFLOW_LOG_RETENTION_DAYS=30
# Batch size for workflow log cleanup operations (default: 100) # Batch size for workflow log cleanup operations (default: 100)
WORKFLOW_LOG_CLEANUP_BATCH_SIZE=100 WORKFLOW_LOG_CLEANUP_BATCH_SIZE=100
# Aliyun SLS Logstore Configuration
# Aliyun Access Key ID
ALIYUN_SLS_ACCESS_KEY_ID=
# Aliyun Access Key Secret
ALIYUN_SLS_ACCESS_KEY_SECRET=
# Aliyun SLS Endpoint (e.g., cn-hangzhou.log.aliyuncs.com)
ALIYUN_SLS_ENDPOINT=
# Aliyun SLS Region (e.g., cn-hangzhou)
ALIYUN_SLS_REGION=
# Aliyun SLS Project Name
ALIYUN_SLS_PROJECT_NAME=
# Number of days to retain workflow run logs (default: 365 days; set to 3650 for long-term storage)
ALIYUN_SLS_LOGSTORE_TTL=365
# Enable dual-write to both SLS LogStore and SQL database (default: false)
LOGSTORE_DUAL_WRITE_ENABLED=false
# Enable dual-read fallback to SQL database when LogStore returns no results (default: true)
# Useful for migration scenarios where historical data exists only in SQL database
LOGSTORE_DUAL_READ_ENABLED=true
# HTTP request node in workflow configuration # HTTP request node in workflow configuration
HTTP_REQUEST_NODE_MAX_BINARY_SIZE=10485760 HTTP_REQUEST_NODE_MAX_BINARY_SIZE=10485760
HTTP_REQUEST_NODE_MAX_TEXT_SIZE=1048576 HTTP_REQUEST_NODE_MAX_TEXT_SIZE=1048576

View File

@@ -455,6 +455,14 @@ x-shared-env: &shared-api-worker-env
WORKFLOW_LOG_CLEANUP_ENABLED: ${WORKFLOW_LOG_CLEANUP_ENABLED:-false} WORKFLOW_LOG_CLEANUP_ENABLED: ${WORKFLOW_LOG_CLEANUP_ENABLED:-false}
WORKFLOW_LOG_RETENTION_DAYS: ${WORKFLOW_LOG_RETENTION_DAYS:-30} WORKFLOW_LOG_RETENTION_DAYS: ${WORKFLOW_LOG_RETENTION_DAYS:-30}
WORKFLOW_LOG_CLEANUP_BATCH_SIZE: ${WORKFLOW_LOG_CLEANUP_BATCH_SIZE:-100} WORKFLOW_LOG_CLEANUP_BATCH_SIZE: ${WORKFLOW_LOG_CLEANUP_BATCH_SIZE:-100}
ALIYUN_SLS_ACCESS_KEY_ID: ${ALIYUN_SLS_ACCESS_KEY_ID:-}
ALIYUN_SLS_ACCESS_KEY_SECRET: ${ALIYUN_SLS_ACCESS_KEY_SECRET:-}
ALIYUN_SLS_ENDPOINT: ${ALIYUN_SLS_ENDPOINT:-}
ALIYUN_SLS_REGION: ${ALIYUN_SLS_REGION:-}
ALIYUN_SLS_PROJECT_NAME: ${ALIYUN_SLS_PROJECT_NAME:-}
ALIYUN_SLS_LOGSTORE_TTL: ${ALIYUN_SLS_LOGSTORE_TTL:-365}
LOGSTORE_DUAL_WRITE_ENABLED: ${LOGSTORE_DUAL_WRITE_ENABLED:-false}
LOGSTORE_DUAL_READ_ENABLED: ${LOGSTORE_DUAL_READ_ENABLED:-true}
HTTP_REQUEST_NODE_MAX_BINARY_SIZE: ${HTTP_REQUEST_NODE_MAX_BINARY_SIZE:-10485760} HTTP_REQUEST_NODE_MAX_BINARY_SIZE: ${HTTP_REQUEST_NODE_MAX_BINARY_SIZE:-10485760}
HTTP_REQUEST_NODE_MAX_TEXT_SIZE: ${HTTP_REQUEST_NODE_MAX_TEXT_SIZE:-1048576} HTTP_REQUEST_NODE_MAX_TEXT_SIZE: ${HTTP_REQUEST_NODE_MAX_TEXT_SIZE:-1048576}
HTTP_REQUEST_NODE_SSL_VERIFY: ${HTTP_REQUEST_NODE_SSL_VERIFY:-True} HTTP_REQUEST_NODE_SSL_VERIFY: ${HTTP_REQUEST_NODE_SSL_VERIFY:-True}

View File

@@ -213,3 +213,24 @@ PLUGIN_VOLCENGINE_TOS_ENDPOINT=
PLUGIN_VOLCENGINE_TOS_ACCESS_KEY= PLUGIN_VOLCENGINE_TOS_ACCESS_KEY=
PLUGIN_VOLCENGINE_TOS_SECRET_KEY= PLUGIN_VOLCENGINE_TOS_SECRET_KEY=
PLUGIN_VOLCENGINE_TOS_REGION= PLUGIN_VOLCENGINE_TOS_REGION=
# ------------------------------
# Environment Variables for Aliyun SLS (Simple Log Service)
# ------------------------------
# Aliyun SLS Access Key ID
ALIYUN_SLS_ACCESS_KEY_ID=
# Aliyun SLS Access Key Secret
ALIYUN_SLS_ACCESS_KEY_SECRET=
# Aliyun SLS Endpoint (e.g., cn-hangzhou.log.aliyuncs.com)
ALIYUN_SLS_ENDPOINT=
# Aliyun SLS Region (e.g., cn-hangzhou)
ALIYUN_SLS_REGION=
# Aliyun SLS Project Name
ALIYUN_SLS_PROJECT_NAME=
# Aliyun SLS Logstore TTL in days (default: 365 days; set to 3650 for long-term storage)
ALIYUN_SLS_LOGSTORE_TTL=365
# Enable dual-write to both LogStore and SQL database (default: true)
LOGSTORE_DUAL_WRITE_ENABLED=true
# Enable dual-read fallback to SQL database when LogStore returns no results (default: true)
# Useful for migration scenarios where historical data exists only in SQL database
LOGSTORE_DUAL_READ_ENABLED=true