Mirror of https://github.com/langgenius/dify.git (synced 2025-12-19 09:17:19 -05:00)
feat: Add Aliyun SLS (Simple Log Service) integration for workflow execution logging (#28986)
Co-authored-by: hieheihei <270985384@qq.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: -LAN- <laipz8200@outlook.com>
@@ -543,6 +543,25 @@ APP_MAX_EXECUTION_TIME=1200
APP_DEFAULT_ACTIVE_REQUESTS=0
APP_MAX_ACTIVE_REQUESTS=0

# Aliyun SLS Logstore Configuration
# Aliyun Access Key ID
ALIYUN_SLS_ACCESS_KEY_ID=
# Aliyun Access Key Secret
ALIYUN_SLS_ACCESS_KEY_SECRET=
# Aliyun SLS Endpoint (e.g., cn-hangzhou.log.aliyuncs.com)
ALIYUN_SLS_ENDPOINT=
# Aliyun SLS Region (e.g., cn-hangzhou)
ALIYUN_SLS_REGION=
# Aliyun SLS Project Name
ALIYUN_SLS_PROJECT_NAME=
# Number of days to retain workflow run logs (default: 365 days, 3650 for permanent storage)
ALIYUN_SLS_LOGSTORE_TTL=365
# Enable dual-write to both SLS LogStore and SQL database (default: false)
LOGSTORE_DUAL_WRITE_ENABLED=false
# Enable dual-read fallback to SQL database when LogStore returns no results (default: true)
# Useful for migration scenarios where historical data exists only in SQL database
LOGSTORE_DUAL_READ_ENABLED=true

# Celery beat configuration
CELERY_BEAT_SCHEDULER_TIME=1
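With the variables above filled in, the connection can be smoke-tested outside Dify. The sketch below is illustrative only: it assumes the ALIYUN_SLS_* variables are exported in the environment and uses the same aliyun-log SDK calls the new extension relies on.

import os

from aliyun.log import LogClient
from aliyun.log.auth import AUTH_VERSION_4

client = LogClient(
    os.environ["ALIYUN_SLS_ENDPOINT"],
    os.environ["ALIYUN_SLS_ACCESS_KEY_ID"],
    os.environ["ALIYUN_SLS_ACCESS_KEY_SECRET"],
    auth_version=AUTH_VERSION_4,
    region=os.environ["ALIYUN_SLS_REGION"],
)
# Raises a LogException if the project does not exist or the credentials are wrong.
client.get_project(os.environ["ALIYUN_SLS_PROJECT_NAME"])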
@@ -75,6 +75,7 @@ def initialize_extensions(app: DifyApp):
        ext_import_modules,
        ext_logging,
        ext_login,
        ext_logstore,
        ext_mail,
        ext_migrate,
        ext_orjson,
@@ -105,6 +106,7 @@ def initialize_extensions(app: DifyApp):
        ext_migrate,
        ext_redis,
        ext_storage,
        ext_logstore,  # Initialize logstore after storage, before celery
        ext_celery,
        ext_login,
        ext_mail,
api/extensions/ext_logstore.py (new file, 74 lines)
@@ -0,0 +1,74 @@
"""
Logstore extension for Dify application.

This extension initializes the logstore (Aliyun SLS) on application startup,
creating necessary projects, logstores, and indexes if they don't exist.
"""

import logging
import os

from dotenv import load_dotenv

from dify_app import DifyApp

logger = logging.getLogger(__name__)


def is_enabled() -> bool:
    """
    Check if logstore extension is enabled.

    Returns:
        True if all required Aliyun SLS environment variables are set, False otherwise
    """
    # Load environment variables from .env file
    load_dotenv()

    required_vars = [
        "ALIYUN_SLS_ACCESS_KEY_ID",
        "ALIYUN_SLS_ACCESS_KEY_SECRET",
        "ALIYUN_SLS_ENDPOINT",
        "ALIYUN_SLS_REGION",
        "ALIYUN_SLS_PROJECT_NAME",
    ]

    all_set = all(os.environ.get(var) for var in required_vars)

    if not all_set:
        logger.info("Logstore extension disabled: required Aliyun SLS environment variables not set")

    return all_set


def init_app(app: DifyApp):
    """
    Initialize logstore on application startup.

    This function:
    1. Creates Aliyun SLS project if it doesn't exist
    2. Creates logstores (workflow_execution, workflow_node_execution) if they don't exist
    3. Creates indexes with field configurations based on PostgreSQL table structures

    This operation is idempotent and only executes once during application startup.

    Args:
        app: The Dify application instance
    """
    try:
        from extensions.logstore.aliyun_logstore import AliyunLogStore

        logger.info("Initializing logstore...")

        # Create logstore client and initialize project/logstores/indexes
        logstore_client = AliyunLogStore()
        logstore_client.init_project_logstore()

        # Attach to app for potential later use
        app.extensions["logstore"] = logstore_client

        logger.info("Logstore initialized successfully")
    except Exception:
        logger.exception("Failed to initialize logstore")
        # Don't raise - allow application to continue even if logstore init fails
        # This ensures that the application can still run if logstore is misconfigured
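After init_app runs, the initialized client is available on the Flask extensions registry. A minimal usage sketch (illustrative; it assumes initialization succeeded, `app` is the Dify application instance, and the field values are placeholders):

logstore = app.extensions.get("logstore")
if logstore is not None:
    # put_log appends (field, value) pairs as one log entry to the named logstore.
    logstore.put_log("workflow_execution", [("id", "run-123"), ("status", "succeeded")])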
api/extensions/logstore/__init__.py (new file, 0 lines)

api/extensions/logstore/aliyun_logstore.py (new file, 890 lines)
@@ -0,0 +1,890 @@
import logging
import os
import threading
import time
from collections.abc import Sequence
from typing import Any

import sqlalchemy as sa
from aliyun.log import (  # type: ignore[import-untyped]
    GetLogsRequest,
    IndexConfig,
    IndexKeyConfig,
    IndexLineConfig,
    LogClient,
    LogItem,
    PutLogsRequest,
)
from aliyun.log.auth import AUTH_VERSION_4  # type: ignore[import-untyped]
from aliyun.log.logexception import LogException  # type: ignore[import-untyped]
from dotenv import load_dotenv
from sqlalchemy.orm import DeclarativeBase

from configs import dify_config
from extensions.logstore.aliyun_logstore_pg import AliyunLogStorePG

logger = logging.getLogger(__name__)


class AliyunLogStore:
    """
    Singleton class for Aliyun SLS LogStore operations.

    Ensures only one instance exists to prevent multiple PG connection pools.
    """

    _instance: "AliyunLogStore | None" = None
    _initialized: bool = False

    # Track delayed PG connection for newly created projects
    _pg_connection_timer: threading.Timer | None = None
    _pg_connection_delay: int = 90  # delay seconds

    # Default tokenizer for text/json fields and full-text index
    # Common delimiters: comma, space, quotes, punctuation, operators, brackets, special chars
    DEFAULT_TOKEN_LIST = [
        ",",
        " ",
        '"',
        '"',
        ";",
        "=",
        "(",
        ")",
        "[",
        "]",
        "{",
        "}",
        "?",
        "@",
        "&",
        "<",
        ">",
        "/",
        ":",
        "\n",
        "\t",
    ]

    def __new__(cls) -> "AliyunLogStore":
        """Implement singleton pattern."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    project_des = "dify"

    workflow_execution_logstore = "workflow_execution"

    workflow_node_execution_logstore = "workflow_node_execution"

    @staticmethod
    def _sqlalchemy_type_to_logstore_type(column: Any) -> str:
        """
        Map SQLAlchemy column type to Aliyun LogStore index type.

        Args:
            column: SQLAlchemy column object

        Returns:
            LogStore index type: 'text', 'long', 'double', or 'json'
        """
        column_type = column.type

        # Integer types -> long
        if isinstance(column_type, (sa.Integer, sa.BigInteger, sa.SmallInteger)):
            return "long"

        # Float types -> double
        if isinstance(column_type, (sa.Float, sa.Numeric)):
            return "double"

        # String and Text types -> text
        if isinstance(column_type, (sa.String, sa.Text)):
            return "text"

        # DateTime -> text (stored as ISO format string in logstore)
        if isinstance(column_type, sa.DateTime):
            return "text"

        # Boolean -> long (stored as 0/1)
        if isinstance(column_type, sa.Boolean):
            return "long"

        # JSON -> json
        if isinstance(column_type, sa.JSON):
            return "json"

        # Default to text for unknown types
        return "text"

    @staticmethod
    def _generate_index_keys_from_model(model_class: type[DeclarativeBase]) -> dict[str, IndexKeyConfig]:
        """
        Automatically generate LogStore field index configuration from SQLAlchemy model.

        This method introspects the SQLAlchemy model's column definitions and creates
        corresponding LogStore index configurations. When the PG schema is updated via
        Flask-Migrate, this method will automatically pick up the new fields on next startup.

        Args:
            model_class: SQLAlchemy model class (e.g., WorkflowRun, WorkflowNodeExecutionModel)

        Returns:
            Dictionary mapping field names to IndexKeyConfig objects
        """
        index_keys = {}

        # Iterate over all mapped columns in the model
        if hasattr(model_class, "__mapper__"):
            for column_name, column_property in model_class.__mapper__.columns.items():
                # Skip relationship properties and other non-column attributes
                if not hasattr(column_property, "type"):
                    continue

                # Map SQLAlchemy type to LogStore type
                logstore_type = AliyunLogStore._sqlalchemy_type_to_logstore_type(column_property)

                # Create index configuration
                # - text fields: case_insensitive for better search, with tokenizer and Chinese support
                # - all fields: doc_value=True for analytics
                if logstore_type == "text":
                    index_keys[column_name] = IndexKeyConfig(
                        index_type="text",
                        case_sensitive=False,
                        doc_value=True,
                        token_list=AliyunLogStore.DEFAULT_TOKEN_LIST,
                        chinese=True,
                    )
                else:
                    index_keys[column_name] = IndexKeyConfig(index_type=logstore_type, doc_value=True)

        # Add log_version field (not in PG model, but used in logstore for versioning)
        index_keys["log_version"] = IndexKeyConfig(index_type="long", doc_value=True)

        return index_keys

    def __init__(self) -> None:
        # Skip initialization if already initialized (singleton pattern)
        if self.__class__._initialized:
            return

        load_dotenv()

        self.access_key_id: str = os.environ.get("ALIYUN_SLS_ACCESS_KEY_ID", "")
        self.access_key_secret: str = os.environ.get("ALIYUN_SLS_ACCESS_KEY_SECRET", "")
        self.endpoint: str = os.environ.get("ALIYUN_SLS_ENDPOINT", "")
        self.region: str = os.environ.get("ALIYUN_SLS_REGION", "")
        self.project_name: str = os.environ.get("ALIYUN_SLS_PROJECT_NAME", "")
        self.logstore_ttl: int = int(os.environ.get("ALIYUN_SLS_LOGSTORE_TTL", 365))
        self.log_enabled: bool = os.environ.get("SQLALCHEMY_ECHO", "false").lower() == "true"
        self.pg_mode_enabled: bool = os.environ.get("LOGSTORE_PG_MODE_ENABLED", "true").lower() == "true"

        # Initialize SDK client
        self.client = LogClient(
            self.endpoint, self.access_key_id, self.access_key_secret, auth_version=AUTH_VERSION_4, region=self.region
        )

        # Append Dify identification to the existing user agent
        original_user_agent = self.client._user_agent  # pyright: ignore[reportPrivateUsage]
        dify_version = dify_config.project.version
        enhanced_user_agent = f"Dify,Dify-{dify_version},{original_user_agent}"
        self.client.set_user_agent(enhanced_user_agent)

        # PG client will be initialized in init_project_logstore
        self._pg_client: AliyunLogStorePG | None = None
        self._use_pg_protocol: bool = False

        self.__class__._initialized = True

    @property
    def supports_pg_protocol(self) -> bool:
        """Check if PG protocol is supported and enabled."""
        return self._use_pg_protocol

    def _attempt_pg_connection_init(self) -> bool:
        """
        Attempt to initialize PG connection.

        This method tries to establish PG connection and performs necessary checks.
        It's used both for immediate connection (existing projects) and delayed connection (new projects).

        Returns:
            True if PG connection was successfully established, False otherwise.
        """
        if not self.pg_mode_enabled or not self._pg_client:
            return False

        try:
            self._use_pg_protocol = self._pg_client.init_connection()
            if self._use_pg_protocol:
                logger.info("Successfully connected to project %s using PG protocol", self.project_name)
                # Check if scan_index is enabled for all logstores
                self._check_and_disable_pg_if_scan_index_disabled()
                return True
            else:
                logger.info("PG connection failed for project %s. Will use SDK mode.", self.project_name)
                return False
        except Exception as e:
            logger.warning(
                "Failed to establish PG connection for project %s: %s. Will use SDK mode.",
                self.project_name,
                str(e),
            )
            self._use_pg_protocol = False
            return False

    def _delayed_pg_connection_init(self) -> None:
        """
        Delayed initialization of PG connection for newly created projects.

        This method is called by a background timer _pg_connection_delay seconds (90 by default)
        after project creation.
        """
        # Double check conditions in case state changed
        if self._use_pg_protocol:
            return

        logger.info(
            "Attempting delayed PG connection for newly created project %s ...",
            self.project_name,
        )
        self._attempt_pg_connection_init()
        self.__class__._pg_connection_timer = None

    def init_project_logstore(self):
        """
        Initialize project, logstore, index, and PG connection.

        This method should be called once during application startup to ensure
        all required resources exist and connections are established.
        """
        # Step 1: Ensure project and logstore exist
        project_is_new = False
        if not self.is_project_exist():
            self.create_project()
            project_is_new = True

        self.create_logstore_if_not_exist()

        # Step 2: Initialize PG client and connection (if enabled)
        if not self.pg_mode_enabled:
            logger.info("PG mode is disabled. Will use SDK mode.")
            return

        # Create PG client if not already created
        if self._pg_client is None:
            logger.info("Initializing PG client for project %s...", self.project_name)
            self._pg_client = AliyunLogStorePG(
                self.access_key_id, self.access_key_secret, self.endpoint, self.project_name
            )

        # Step 3: Establish PG connection based on project status
        if project_is_new:
            # For newly created projects, schedule delayed PG connection
            self._use_pg_protocol = False
            logger.info(
                "Project %s is newly created. Will use SDK mode and schedule PG connection attempt in %d seconds.",
                self.project_name,
                self.__class__._pg_connection_delay,
            )
            if self.__class__._pg_connection_timer is not None:
                self.__class__._pg_connection_timer.cancel()
            self.__class__._pg_connection_timer = threading.Timer(
                self.__class__._pg_connection_delay,
                self._delayed_pg_connection_init,
            )
            self.__class__._pg_connection_timer.daemon = True  # Don't block app shutdown
            self.__class__._pg_connection_timer.start()
        else:
            # For existing projects, attempt PG connection immediately
            logger.info("Project %s already exists. Attempting PG connection...", self.project_name)
            self._attempt_pg_connection_init()

    def _check_and_disable_pg_if_scan_index_disabled(self) -> None:
        """
        Check if scan_index is enabled for all logstores.
        If any logstore has scan_index=false, disable PG protocol.

        This is necessary because PG protocol requires scan_index to be enabled.
        """
        logstore_name_list = [
            AliyunLogStore.workflow_execution_logstore,
            AliyunLogStore.workflow_node_execution_logstore,
        ]

        for logstore_name in logstore_name_list:
            existing_config = self.get_existing_index_config(logstore_name)
            if existing_config and not existing_config.scan_index:
                logger.info(
                    "Logstore %s has scan_index=false, USE SDK mode for read/write operations. "
                    "PG protocol requires scan_index to be enabled.",
                    logstore_name,
                )
                self._use_pg_protocol = False
                # Close PG connection if it was initialized
                if self._pg_client:
                    self._pg_client.close()
                    self._pg_client = None
                return

    def is_project_exist(self) -> bool:
        try:
            self.client.get_project(self.project_name)
            return True
        except Exception as e:
            if e.args[0] == "ProjectNotExist":
                return False
            else:
                raise e

    def create_project(self):
        try:
            self.client.create_project(self.project_name, AliyunLogStore.project_des)
            logger.info("Project %s created successfully", self.project_name)
        except LogException as e:
            logger.exception(
                "Failed to create project %s: errorCode=%s, errorMessage=%s, requestId=%s",
                self.project_name,
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )
            raise

    def is_logstore_exist(self, logstore_name: str) -> bool:
        try:
            _ = self.client.get_logstore(self.project_name, logstore_name)
            return True
        except Exception as e:
            if e.args[0] == "LogStoreNotExist":
                return False
            else:
                raise e

    def create_logstore_if_not_exist(self) -> None:
        logstore_name_list = [
            AliyunLogStore.workflow_execution_logstore,
            AliyunLogStore.workflow_node_execution_logstore,
        ]

        for logstore_name in logstore_name_list:
            if not self.is_logstore_exist(logstore_name):
                try:
                    self.client.create_logstore(
                        project_name=self.project_name, logstore_name=logstore_name, ttl=self.logstore_ttl
                    )
                    logger.info("logstore %s created successfully", logstore_name)
                except LogException as e:
                    logger.exception(
                        "Failed to create logstore %s: errorCode=%s, errorMessage=%s, requestId=%s",
                        logstore_name,
                        e.get_error_code(),
                        e.get_error_message(),
                        e.get_request_id(),
                    )
                    raise

            # Ensure index contains all Dify-required fields
            # This intelligently merges with existing config, preserving custom indexes
            self.ensure_index_config(logstore_name)

    def is_index_exist(self, logstore_name: str) -> bool:
        try:
            _ = self.client.get_index_config(self.project_name, logstore_name)
            return True
        except Exception as e:
            if e.args[0] == "IndexConfigNotExist":
                return False
            else:
                raise e

    def get_existing_index_config(self, logstore_name: str) -> IndexConfig | None:
        """
        Get existing index configuration from logstore.

        Args:
            logstore_name: Name of the logstore

        Returns:
            IndexConfig object if index exists, None otherwise
        """
        try:
            response = self.client.get_index_config(self.project_name, logstore_name)
            return response.get_index_config()
        except Exception as e:
            if e.args[0] == "IndexConfigNotExist":
                return None
            else:
                logger.exception("Failed to get index config for logstore %s", logstore_name)
                raise e

    def _get_workflow_execution_index_keys(self) -> dict[str, IndexKeyConfig]:
        """
        Get field index configuration for workflow_execution logstore.

        This method automatically generates index configuration from the WorkflowRun SQLAlchemy model.
        When the PG schema is updated via Flask-Migrate, the index configuration will be automatically
        updated on next application startup.
        """
        from models.workflow import WorkflowRun

        index_keys = self._generate_index_keys_from_model(WorkflowRun)

        # Add custom fields that are in logstore but not in PG model
        # These fields are added by the repository layer
        index_keys["error_message"] = IndexKeyConfig(
            index_type="text",
            case_sensitive=False,
            doc_value=True,
            token_list=self.DEFAULT_TOKEN_LIST,
            chinese=True,
        )  # Maps to 'error' in PG
        index_keys["started_at"] = IndexKeyConfig(
            index_type="text",
            case_sensitive=False,
            doc_value=True,
            token_list=self.DEFAULT_TOKEN_LIST,
            chinese=True,
        )  # Maps to 'created_at' in PG

        logger.info("Generated %d index keys for workflow_execution from WorkflowRun model", len(index_keys))
        return index_keys

    def _get_workflow_node_execution_index_keys(self) -> dict[str, IndexKeyConfig]:
        """
        Get field index configuration for workflow_node_execution logstore.

        This method automatically generates index configuration from the WorkflowNodeExecutionModel.
        When the PG schema is updated via Flask-Migrate, the index configuration will be automatically
        updated on next application startup.
        """
        from models.workflow import WorkflowNodeExecutionModel

        index_keys = self._generate_index_keys_from_model(WorkflowNodeExecutionModel)

        logger.debug(
            "Generated %d index keys for workflow_node_execution from WorkflowNodeExecutionModel", len(index_keys)
        )
        return index_keys

    def _get_index_config(self, logstore_name: str) -> IndexConfig:
        """
        Get index configuration for the specified logstore.

        Args:
            logstore_name: Name of the logstore

        Returns:
            IndexConfig object with line and field indexes
        """
        # Create full-text index (line config) with tokenizer
        line_config = IndexLineConfig(token_list=self.DEFAULT_TOKEN_LIST, case_sensitive=False, chinese=True)

        # Get field index configuration based on logstore name
        field_keys = {}
        if logstore_name == AliyunLogStore.workflow_execution_logstore:
            field_keys = self._get_workflow_execution_index_keys()
        elif logstore_name == AliyunLogStore.workflow_node_execution_logstore:
            field_keys = self._get_workflow_node_execution_index_keys()

        # key_config_list should be a dict, not a list
        # Create index config with both line and field indexes
        return IndexConfig(line_config=line_config, key_config_list=field_keys, scan_index=True)

    def create_index(self, logstore_name: str) -> None:
        """
        Create index for the specified logstore with both full-text and field indexes.
        Field indexes are automatically generated from the corresponding SQLAlchemy model.
        """
        index_config = self._get_index_config(logstore_name)

        try:
            self.client.create_index(self.project_name, logstore_name, index_config)
            logger.info(
                "index for %s created successfully with %d field indexes",
                logstore_name,
                len(index_config.key_config_list or {}),
            )
        except LogException as e:
            logger.exception(
                "Failed to create index for logstore %s: errorCode=%s, errorMessage=%s, requestId=%s",
                logstore_name,
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )
            raise

    def _merge_index_configs(
        self, existing_config: IndexConfig, required_keys: dict[str, IndexKeyConfig], logstore_name: str
    ) -> tuple[IndexConfig, bool]:
        """
        Intelligently merge existing index config with Dify's required field indexes.

        This method:
        1. Preserves all existing field indexes in logstore (including custom fields)
        2. Adds missing Dify-required fields
        3. Updates fields where type doesn't match (with json/text compatibility)
        4. Corrects case mismatches (e.g., if Dify needs 'status' but logstore has 'Status')

        Type compatibility rules:
        - json and text types are considered compatible (users can manually choose either)
        - All other type mismatches will be corrected to match Dify requirements

        Note: Logstore is case-sensitive and doesn't allow duplicate fields with different cases.
        Case mismatch means: existing field name differs from required name only in case.

        Args:
            existing_config: Current index configuration from logstore
            required_keys: Dify's required field index configurations
            logstore_name: Name of the logstore (for logging)

        Returns:
            Tuple of (merged_config, needs_update)
        """
        # key_config_list is already a dict in the SDK
        # Make a copy to avoid modifying the original
        existing_keys = dict(existing_config.key_config_list) if existing_config.key_config_list else {}

        # Track changes
        needs_update = False
        case_corrections = []  # Fields that need case correction (e.g., 'Status' -> 'status')
        missing_fields = []
        type_mismatches = []

        # First pass: Check for and resolve case mismatches with required fields
        # Note: Logstore itself doesn't allow duplicate fields with different cases,
        # so we only need to check if the existing case matches the required case
        for required_name in required_keys:
            lower_name = required_name.lower()
            # Find key that matches case-insensitively but not exactly
            wrong_case_key = None
            for existing_key in existing_keys:
                if existing_key.lower() == lower_name and existing_key != required_name:
                    wrong_case_key = existing_key
                    break

            if wrong_case_key:
                # Field exists but with wrong case (e.g., 'Status' when we need 'status')
                # Remove the wrong-case key, will be added back with correct case later
                case_corrections.append((wrong_case_key, required_name))
                del existing_keys[wrong_case_key]
                needs_update = True

        # Second pass: Check each required field
        for required_name, required_config in required_keys.items():
            # Check for exact match (case-sensitive)
            if required_name in existing_keys:
                existing_type = existing_keys[required_name].index_type
                required_type = required_config.index_type

                # Check if type matches
                # Special case: json and text are interchangeable for JSON content fields
                # Allow users to manually configure text instead of json (or vice versa) without forcing updates
                is_compatible = existing_type == required_type or ({existing_type, required_type} == {"json", "text"})

                if not is_compatible:
                    type_mismatches.append((required_name, existing_type, required_type))
                    # Update with correct type
                    existing_keys[required_name] = required_config
                    needs_update = True
                # else: field exists with compatible type, no action needed
            else:
                # Field doesn't exist (may have been removed in first pass due to case conflict)
                missing_fields.append(required_name)
                existing_keys[required_name] = required_config
                needs_update = True

        # Log changes
        if missing_fields:
            logger.info(
                "Logstore %s: Adding %d missing Dify-required fields: %s",
                logstore_name,
                len(missing_fields),
                ", ".join(missing_fields[:10]) + ("..." if len(missing_fields) > 10 else ""),
            )

        if type_mismatches:
            logger.info(
                "Logstore %s: Fixing %d type mismatches: %s",
                logstore_name,
                len(type_mismatches),
                ", ".join([f"{name}({old}->{new})" for name, old, new in type_mismatches[:5]])
                + ("..." if len(type_mismatches) > 5 else ""),
            )

        if case_corrections:
            logger.info(
                "Logstore %s: Correcting %d field name cases: %s",
                logstore_name,
                len(case_corrections),
                ", ".join([f"'{old}' -> '{new}'" for old, new in case_corrections[:5]])
                + ("..." if len(case_corrections) > 5 else ""),
            )

        # Create merged config
        # key_config_list should be a dict, not a list
        # Preserve the original scan_index value - don't force it to True
        merged_config = IndexConfig(
            line_config=existing_config.line_config
            or IndexLineConfig(token_list=self.DEFAULT_TOKEN_LIST, case_sensitive=False, chinese=True),
            key_config_list=existing_keys,
            scan_index=existing_config.scan_index,
        )

        return merged_config, needs_update

    def ensure_index_config(self, logstore_name: str) -> None:
        """
        Ensure index configuration includes all Dify-required fields.

        This method intelligently manages index configuration:
        1. If index doesn't exist, create it with Dify's required fields
        2. If index exists:
           - Check if all Dify-required fields are present
           - Check if field types match requirements
           - Only update if fields are missing or types are incorrect
           - Preserve any additional custom index configurations

        This approach allows users to add their own custom indexes without being overwritten.
        """
        # Get Dify's required field indexes
        required_keys = {}
        if logstore_name == AliyunLogStore.workflow_execution_logstore:
            required_keys = self._get_workflow_execution_index_keys()
        elif logstore_name == AliyunLogStore.workflow_node_execution_logstore:
            required_keys = self._get_workflow_node_execution_index_keys()

        # Check if index exists
        existing_config = self.get_existing_index_config(logstore_name)

        if existing_config is None:
            # Index doesn't exist, create it
            logger.info(
                "Logstore %s: Index doesn't exist, creating with %d required fields",
                logstore_name,
                len(required_keys),
            )
            self.create_index(logstore_name)
        else:
            merged_config, needs_update = self._merge_index_configs(existing_config, required_keys, logstore_name)

            if needs_update:
                logger.info("Logstore %s: Updating index to include Dify-required fields", logstore_name)
                try:
                    self.client.update_index(self.project_name, logstore_name, merged_config)
                    logger.info(
                        "Logstore %s: Index updated successfully, now has %d total field indexes",
                        logstore_name,
                        len(merged_config.key_config_list or {}),
                    )
                except LogException as e:
                    logger.exception(
                        "Failed to update index for logstore %s: errorCode=%s, errorMessage=%s, requestId=%s",
                        logstore_name,
                        e.get_error_code(),
                        e.get_error_message(),
                        e.get_request_id(),
                    )
                    raise
            else:
                logger.info(
                    "Logstore %s: Index already contains all %d Dify-required fields with correct types, "
                    "no update needed",
                    logstore_name,
                    len(required_keys),
                )

    def put_log(self, logstore: str, contents: Sequence[tuple[str, str]]) -> None:
        # Route to PG or SDK based on protocol availability
        if self._use_pg_protocol and self._pg_client:
            self._pg_client.put_log(logstore, contents, self.log_enabled)
        else:
            log_item = LogItem(contents=contents)
            request = PutLogsRequest(project=self.project_name, logstore=logstore, logitems=[log_item])

            if self.log_enabled:
                logger.info(
                    "[LogStore-SDK] PUT_LOG | logstore=%s | project=%s | items_count=%d",
                    logstore,
                    self.project_name,
                    len(contents),
                )

            try:
                self.client.put_logs(request)
            except LogException as e:
                logger.exception(
                    "Failed to put logs to logstore %s: errorCode=%s, errorMessage=%s, requestId=%s",
                    logstore,
                    e.get_error_code(),
                    e.get_error_message(),
                    e.get_request_id(),
                )
                raise

    def get_logs(
        self,
        logstore: str,
        from_time: int,
        to_time: int,
        topic: str = "",
        query: str = "",
        line: int = 100,
        offset: int = 0,
        reverse: bool = True,
    ) -> list[dict]:
        request = GetLogsRequest(
            project=self.project_name,
            logstore=logstore,
            fromTime=from_time,
            toTime=to_time,
            topic=topic,
            query=query,
            line=line,
            offset=offset,
            reverse=reverse,
        )

        # Log query info if SQLALCHEMY_ECHO is enabled
        if self.log_enabled:
            logger.info(
                "[LogStore] GET_LOGS | logstore=%s | project=%s | query=%s | "
                "from_time=%d | to_time=%d | line=%d | offset=%d | reverse=%s",
                logstore,
                self.project_name,
                query,
                from_time,
                to_time,
                line,
                offset,
                reverse,
            )

        try:
            response = self.client.get_logs(request)
            result = []
            logs = response.get_logs() if response else []
            for log in logs:
                result.append(log.get_contents())

            # Log result count if SQLALCHEMY_ECHO is enabled
            if self.log_enabled:
                logger.info(
                    "[LogStore] GET_LOGS RESULT | logstore=%s | returned_count=%d",
                    logstore,
                    len(result),
                )

            return result
        except LogException as e:
            logger.exception(
                "Failed to get logs from logstore %s with query '%s': errorCode=%s, errorMessage=%s, requestId=%s",
                logstore,
                query,
                e.get_error_code(),
                e.get_error_message(),
                e.get_request_id(),
            )
            raise

    def execute_sql(
        self,
        sql: str,
        logstore: str | None = None,
        query: str = "*",
        from_time: int | None = None,
        to_time: int | None = None,
        power_sql: bool = False,
    ) -> list[dict]:
        """
        Execute SQL query for aggregation and analysis.

        Args:
            sql: SQL query string (SELECT statement)
            logstore: Name of the logstore (required)
            query: Search/filter query for SDK mode (default: "*" for all logs).
                Only used in SDK mode. PG mode ignores this parameter.
            from_time: Start time (Unix timestamp) - only used in SDK mode
            to_time: End time (Unix timestamp) - only used in SDK mode
            power_sql: Whether to use enhanced SQL mode (default: False)

        Returns:
            List of result rows as dictionaries

        Note:
            - PG mode: Only executes the SQL directly
            - SDK mode: Combines query and sql as "query | sql"
        """
        # Logstore is required
        if not logstore:
            raise ValueError("logstore parameter is required for execute_sql")

        # Route to PG or SDK based on protocol availability
        if self._use_pg_protocol and self._pg_client:
            # PG mode: execute SQL directly (ignore query parameter)
            return self._pg_client.execute_sql(sql, logstore, self.log_enabled)
        else:
            # SDK mode: combine query and sql as "query | sql"
            full_query = f"{query} | {sql}"
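            # Illustration: query="status: failed" and sql="SELECT count(*) AS cnt" combine into
            # "status: failed | SELECT count(*) AS cnt" (SLS "search | analysis" syntax).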

            # Provide default time range if not specified
            if from_time is None:
                from_time = 0

            if to_time is None:
                to_time = int(time.time())  # now

            request = GetLogsRequest(
                project=self.project_name,
                logstore=logstore,
                fromTime=from_time,
                toTime=to_time,
                query=full_query,
            )

            # Log query info if SQLALCHEMY_ECHO is enabled
            if self.log_enabled:
                logger.info(
                    "[LogStore-SDK] EXECUTE_SQL | logstore=%s | project=%s | from_time=%d | to_time=%d | full_query=%s",
                    logstore,
                    self.project_name,
                    from_time,
                    to_time,
                    full_query,
                )

            try:
                response = self.client.get_logs(request)

                result = []
                logs = response.get_logs() if response else []
                for log in logs:
                    result.append(log.get_contents())

                # Log result count if SQLALCHEMY_ECHO is enabled
                if self.log_enabled:
                    logger.info(
                        "[LogStore-SDK] EXECUTE_SQL RESULT | logstore=%s | returned_count=%d",
                        logstore,
                        len(result),
                    )

                return result
            except LogException as e:
                logger.exception(
                    "Failed to execute SQL, logstore %s: errorCode=%s, errorMessage=%s, requestId=%s, full_query=%s",
                    logstore,
                    e.get_error_code(),
                    e.get_error_message(),
                    e.get_request_id(),
                    full_query,
                )
                raise


if __name__ == "__main__":
    aliyun_logstore = AliyunLogStore()
    # aliyun_logstore.init_project_logstore()
    aliyun_logstore.put_log(AliyunLogStore.workflow_execution_logstore, [("key1", "value1")])
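A read-back sketch in the same spirit as the `__main__` block above (illustrative only; it assumes the logstore already holds recent entries):

import time

from extensions.logstore.aliyun_logstore import AliyunLogStore

store = AliyunLogStore()
recent = store.get_logs(
    logstore=AliyunLogStore.workflow_execution_logstore,
    from_time=int(time.time()) - 3600,
    to_time=int(time.time()),
    query="status: succeeded",  # SLS search syntax, same style the repository layer uses
    line=10,
)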
api/extensions/logstore/aliyun_logstore_pg.py (new file, 407 lines)
@@ -0,0 +1,407 @@
import logging
import os
import socket
import time
from collections.abc import Sequence
from contextlib import contextmanager
from typing import Any

import psycopg2
import psycopg2.pool
from psycopg2 import InterfaceError, OperationalError

from configs import dify_config

logger = logging.getLogger(__name__)


class AliyunLogStorePG:
    """
    PostgreSQL protocol support for Aliyun SLS LogStore.

    Handles PG connection pooling and operations for regions that support PG protocol.
    """

    def __init__(self, access_key_id: str, access_key_secret: str, endpoint: str, project_name: str):
        """
        Initialize PG connection for SLS.

        Args:
            access_key_id: Aliyun access key ID
            access_key_secret: Aliyun access key secret
            endpoint: SLS endpoint
            project_name: SLS project name
        """
        self._access_key_id = access_key_id
        self._access_key_secret = access_key_secret
        self._endpoint = endpoint
        self.project_name = project_name
        self._pg_pool: psycopg2.pool.SimpleConnectionPool | None = None
        self._use_pg_protocol = False

    def _check_port_connectivity(self, host: str, port: int, timeout: float = 2.0) -> bool:
        """
        Check if a TCP port is reachable using socket connection.

        This provides a fast check before attempting full database connection,
        preventing long waits when connecting to unsupported regions.

        Args:
            host: Hostname or IP address
            port: Port number
            timeout: Connection timeout in seconds (default: 2.0)

        Returns:
            True if port is reachable, False otherwise
        """
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.settimeout(timeout)
            result = sock.connect_ex((host, port))
            sock.close()
            return result == 0
        except Exception as e:
            logger.debug("Port connectivity check failed for %s:%d: %s", host, port, str(e))
            return False

    def init_connection(self) -> bool:
        """
        Initialize PostgreSQL connection pool for SLS PG protocol support.

        Attempts to connect to SLS using PostgreSQL protocol. If successful, sets
        _use_pg_protocol to True and creates a connection pool. If connection fails
        (region doesn't support PG protocol or other errors), returns False.

        Returns:
            True if PG protocol is supported and initialized, False otherwise
        """
        try:
            # Extract hostname from endpoint (remove protocol if present)
            pg_host = self._endpoint.replace("http://", "").replace("https://", "")

            # Get pool configuration
            pg_max_connections = int(os.environ.get("ALIYUN_SLS_PG_MAX_CONNECTIONS", 10))

            logger.debug(
                "Check PG protocol connection to SLS: host=%s, project=%s",
                pg_host,
                self.project_name,
            )

            # Fast port connectivity check before attempting full connection
            # This prevents long waits when connecting to unsupported regions
            if not self._check_port_connectivity(pg_host, 5432, timeout=1.0):
                logger.info(
                    "USE SDK mode for read/write operations, host=%s",
                    pg_host,
                )
                return False

            # Create connection pool
            self._pg_pool = psycopg2.pool.SimpleConnectionPool(
                minconn=1,
                maxconn=pg_max_connections,
                host=pg_host,
                port=5432,
                database=self.project_name,
                user=self._access_key_id,
                password=self._access_key_secret,
                sslmode="require",
                connect_timeout=5,
                application_name=f"Dify-{dify_config.project.version}",
            )

            # Note: Skip test query because SLS PG protocol only supports SELECT/INSERT on actual tables
            # Connection pool creation success already indicates connectivity

            self._use_pg_protocol = True
            logger.info(
                "PG protocol initialized successfully for SLS project=%s. Will use PG for read/write operations.",
                self.project_name,
            )
            return True

        except Exception as e:
            # PG connection failed - fallback to SDK mode
            self._use_pg_protocol = False
            if self._pg_pool:
                try:
                    self._pg_pool.closeall()
                except Exception:
                    logger.debug("Failed to close PG connection pool during cleanup, ignoring")
                self._pg_pool = None

            logger.info(
                "PG protocol connection failed (region may not support PG protocol): %s. "
                "Falling back to SDK mode for read/write operations.",
                str(e),
            )
            return False

    def _is_connection_valid(self, conn: Any) -> bool:
        """
        Check if a connection is still valid.

        Args:
            conn: psycopg2 connection object

        Returns:
            True if connection is valid, False otherwise
        """
        try:
            # Check if connection is closed
            if conn.closed:
                return False

            # Quick ping test - execute a lightweight query; if the SLS PG endpoint
            # rejects it, the except below treats the connection as invalid
            with conn.cursor() as cursor:
                cursor.execute("SELECT 1")
                cursor.fetchone()
            return True
        except Exception:
            return False

    @contextmanager
    def _get_connection(self):
        """
        Context manager to get a PostgreSQL connection from the pool.

        Automatically validates and refreshes stale connections.

        Note: Aliyun SLS PG protocol does not support transactions, so we always
        use autocommit mode.

        Yields:
            psycopg2 connection object

        Raises:
            RuntimeError: If PG pool is not initialized
        """
        if not self._pg_pool:
            raise RuntimeError("PG connection pool is not initialized")

        conn = self._pg_pool.getconn()
        try:
            # Validate connection and get a fresh one if needed
            if not self._is_connection_valid(conn):
                logger.debug("Connection is stale, marking as bad and getting a new one")
                # Mark connection as bad and get a new one
                self._pg_pool.putconn(conn, close=True)
                conn = self._pg_pool.getconn()

            # Aliyun SLS PG protocol does not support transactions, always use autocommit
            conn.autocommit = True
            yield conn
        finally:
            # Return connection to pool (or close if it's bad)
            if self._is_connection_valid(conn):
                self._pg_pool.putconn(conn)
            else:
                self._pg_pool.putconn(conn, close=True)

    def close(self) -> None:
        """Close the PostgreSQL connection pool."""
        if self._pg_pool:
            try:
                self._pg_pool.closeall()
                logger.info("PG connection pool closed")
            except Exception:
                logger.exception("Failed to close PG connection pool")

    def _is_retriable_error(self, error: Exception) -> bool:
        """
        Check if an error is retriable (connection-related issues).

        Args:
            error: Exception to check

        Returns:
            True if the error is retriable, False otherwise
        """
        # Retry on connection-related errors
        if isinstance(error, (OperationalError, InterfaceError)):
            return True

        # Check error message for specific connection issues
        error_msg = str(error).lower()
        retriable_patterns = [
            "connection",
            "timeout",
            "closed",
            "broken pipe",
            "reset by peer",
            "no route to host",
            "network",
        ]
        return any(pattern in error_msg for pattern in retriable_patterns)

    def put_log(self, logstore: str, contents: Sequence[tuple[str, str]], log_enabled: bool = False) -> None:
        """
        Write log to SLS using PostgreSQL protocol with automatic retry.

        Note: SLS PG protocol only supports INSERT (not UPDATE). This uses append-only
        writes with log_version field for versioning, same as SDK implementation.

        Args:
            logstore: Name of the logstore table
            contents: List of (field_name, value) tuples
            log_enabled: Whether to enable logging

        Raises:
            psycopg2.Error: If database operation fails after all retries
        """
        if not contents:
            return

        # Extract field names and values from contents
        fields = [field_name for field_name, _ in contents]
        values = [value for _, value in contents]

        # Build INSERT statement with literal values
        # Note: Aliyun SLS PG protocol doesn't support parameterized queries,
        # so we need to use mogrify to safely create literal values
        field_list = ", ".join([f'"{field}"' for field in fields])

        if log_enabled:
            logger.info(
                "[LogStore-PG] PUT_LOG | logstore=%s | project=%s | items_count=%d",
                logstore,
                self.project_name,
                len(contents),
            )

        # Retry configuration
        max_retries = 3
        retry_delay = 0.1  # Start with 100ms

        for attempt in range(max_retries):
            try:
                with self._get_connection() as conn:
                    with conn.cursor() as cursor:
                        # Use mogrify to safely convert values to SQL literals
                        placeholders = ", ".join(["%s"] * len(fields))
                        values_literal = cursor.mogrify(f"({placeholders})", values).decode("utf-8")
                        insert_sql = f'INSERT INTO "{logstore}" ({field_list}) VALUES {values_literal}'
                        cursor.execute(insert_sql)
                        # Success - exit retry loop
                        return

            except psycopg2.Error as e:
                # Check if error is retriable
                if not self._is_retriable_error(e):
                    # Not a retriable error (e.g., data validation error), fail immediately
                    logger.exception(
                        "Failed to put logs to logstore %s via PG protocol (non-retriable error)",
                        logstore,
                    )
                    raise

                # Retriable error - log and retry if we have attempts left
                if attempt < max_retries - 1:
                    logger.warning(
                        "Failed to put logs to logstore %s via PG protocol (attempt %d/%d): %s. Retrying...",
                        logstore,
                        attempt + 1,
                        max_retries,
                        str(e),
                    )
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential backoff
                else:
                    # Last attempt failed
                    logger.exception(
                        "Failed to put logs to logstore %s via PG protocol after %d attempts",
                        logstore,
                        max_retries,
                    )
                    raise

    def execute_sql(self, sql: str, logstore: str, log_enabled: bool = False) -> list[dict[str, Any]]:
        """
        Execute SQL query using PostgreSQL protocol with automatic retry.

        Args:
            sql: SQL query string
            logstore: Name of the logstore (for logging purposes)
            log_enabled: Whether to enable logging

        Returns:
            List of result rows as dictionaries

        Raises:
            psycopg2.Error: If database operation fails after all retries
        """
        if log_enabled:
            logger.info(
                "[LogStore-PG] EXECUTE_SQL | logstore=%s | project=%s | sql=%s",
                logstore,
                self.project_name,
                sql,
            )

        # Retry configuration
        max_retries = 3
        retry_delay = 0.1  # Start with 100ms

        for attempt in range(max_retries):
            try:
                with self._get_connection() as conn:
                    with conn.cursor() as cursor:
                        cursor.execute(sql)

                        # Get column names from cursor description
                        columns = [desc[0] for desc in cursor.description]

                        # Fetch all results and convert to list of dicts
                        result = []
                        for row in cursor.fetchall():
                            row_dict = {}
                            for col, val in zip(columns, row):
                                row_dict[col] = "" if val is None else str(val)
                            result.append(row_dict)

                        if log_enabled:
                            logger.info(
                                "[LogStore-PG] EXECUTE_SQL RESULT | logstore=%s | returned_count=%d",
                                logstore,
                                len(result),
                            )

                        return result

            except psycopg2.Error as e:
                # Check if error is retriable
                if not self._is_retriable_error(e):
                    # Not a retriable error (e.g., SQL syntax error), fail immediately
                    logger.exception(
                        "Failed to execute SQL query on logstore %s via PG protocol (non-retriable error): sql=%s",
                        logstore,
                        sql,
                    )
                    raise

                # Retriable error - log and retry if we have attempts left
                if attempt < max_retries - 1:
                    logger.warning(
                        "Failed to execute SQL query on logstore %s via PG protocol (attempt %d/%d): %s. Retrying...",
                        logstore,
                        attempt + 1,
                        max_retries,
                        str(e),
                    )
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential backoff
                else:
                    # Last attempt failed
                    logger.exception(
                        "Failed to execute SQL query on logstore %s via PG protocol after %d attempts: sql=%s",
                        logstore,
                        max_retries,
                        sql,
                    )
                    raise

        # This line should never be reached due to raise above, but makes type checker happy
        return []
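The PG client can also be exercised on its own. The sketch below is illustrative (the credentials, endpoint, and project name are placeholders) and only works in regions where the SLS PG protocol is available:

from extensions.logstore.aliyun_logstore_pg import AliyunLogStorePG

pg = AliyunLogStorePG("<access-key-id>", "<access-key-secret>", "cn-hangzhou.log.aliyuncs.com", "my-dify-project")
if pg.init_connection():
    rows = pg.execute_sql('SELECT count(*) AS cnt FROM "workflow_execution"', "workflow_execution")
    pg.close()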
api/extensions/logstore/repositories/__init__.py (new file, 0 lines)

@@ -0,0 +1,365 @@
"""
LogStore implementation of DifyAPIWorkflowNodeExecutionRepository.

This module provides the LogStore-based implementation for service-layer
WorkflowNodeExecutionModel operations using Aliyun SLS LogStore.
"""

import logging
import time
from collections.abc import Sequence
from datetime import datetime
from typing import Any

from sqlalchemy.orm import sessionmaker

from extensions.logstore.aliyun_logstore import AliyunLogStore
from models.workflow import WorkflowNodeExecutionModel
from repositories.api_workflow_node_execution_repository import DifyAPIWorkflowNodeExecutionRepository

logger = logging.getLogger(__name__)


def _dict_to_workflow_node_execution_model(data: dict[str, Any]) -> WorkflowNodeExecutionModel:
    """
    Convert LogStore result dictionary to WorkflowNodeExecutionModel instance.

    Args:
        data: Dictionary from LogStore query result

    Returns:
        WorkflowNodeExecutionModel instance (detached from session)

    Note:
        The returned model is not attached to any SQLAlchemy session.
        Relationship fields (like offload_data) are not loaded from LogStore.
    """
    logger.debug("_dict_to_workflow_node_execution_model: data keys=%s", list(data.keys())[:5])
    # Create model instance without session
    model = WorkflowNodeExecutionModel()

    # Map all required fields with validation
    # Critical fields - must not be None
    model.id = data.get("id") or ""
    model.tenant_id = data.get("tenant_id") or ""
    model.app_id = data.get("app_id") or ""
    model.workflow_id = data.get("workflow_id") or ""
    model.triggered_from = data.get("triggered_from") or ""
    model.node_id = data.get("node_id") or ""
    model.node_type = data.get("node_type") or ""
    model.status = data.get("status") or "running"  # Default status if missing
    model.title = data.get("title") or ""
    model.created_by_role = data.get("created_by_role") or ""
    model.created_by = data.get("created_by") or ""

    # Numeric fields with defaults
    model.index = int(data.get("index", 0))
    model.elapsed_time = float(data.get("elapsed_time", 0))

    # Optional fields
    model.workflow_run_id = data.get("workflow_run_id")
    model.predecessor_node_id = data.get("predecessor_node_id")
    model.node_execution_id = data.get("node_execution_id")
    model.inputs = data.get("inputs")
    model.process_data = data.get("process_data")
    model.outputs = data.get("outputs")
    model.error = data.get("error")
    model.execution_metadata = data.get("execution_metadata")

    # Handle datetime fields
    created_at = data.get("created_at")
    if created_at:
        if isinstance(created_at, str):
            model.created_at = datetime.fromisoformat(created_at)
        elif isinstance(created_at, (int, float)):
            model.created_at = datetime.fromtimestamp(created_at)
        else:
            model.created_at = created_at
    else:
        # Provide default created_at if missing
        model.created_at = datetime.now()

    finished_at = data.get("finished_at")
    if finished_at:
        if isinstance(finished_at, str):
            model.finished_at = datetime.fromisoformat(finished_at)
        elif isinstance(finished_at, (int, float)):
            model.finished_at = datetime.fromtimestamp(finished_at)
        else:
            model.finished_at = finished_at

    return model


class LogstoreAPIWorkflowNodeExecutionRepository(DifyAPIWorkflowNodeExecutionRepository):
    """
    LogStore implementation of DifyAPIWorkflowNodeExecutionRepository.

    Provides service-layer database operations for WorkflowNodeExecutionModel
    using LogStore SQL queries with optimized deduplication strategies.
    """

    def __init__(self, session_maker: sessionmaker | None = None):
        """
        Initialize the repository with LogStore client.

        Args:
            session_maker: SQLAlchemy sessionmaker (unused, for compatibility with factory pattern)
        """
        logger.debug("LogstoreAPIWorkflowNodeExecutionRepository.__init__: initializing")
        self.logstore_client = AliyunLogStore()

    def get_node_last_execution(
        self,
        tenant_id: str,
        app_id: str,
        workflow_id: str,
        node_id: str,
    ) -> WorkflowNodeExecutionModel | None:
        """
        Get the most recent execution for a specific node.

        Uses query syntax to get raw logs and selects the one with max log_version.
        Returns the most recent execution ordered by created_at.
        """
        logger.debug(
            "get_node_last_execution: tenant_id=%s, app_id=%s, workflow_id=%s, node_id=%s",
            tenant_id,
            app_id,
            workflow_id,
            node_id,
        )
        try:
            # Check if PG protocol is supported
            if self.logstore_client.supports_pg_protocol:
                # Use PG protocol with SQL query (get latest version of each record)
                sql_query = f"""
                    SELECT * FROM (
                        SELECT *,
                               ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) as rn
                        FROM "{AliyunLogStore.workflow_node_execution_logstore}"
                        WHERE tenant_id = '{tenant_id}'
                          AND app_id = '{app_id}'
                          AND workflow_id = '{workflow_id}'
                          AND node_id = '{node_id}'
                          AND __time__ > 0
                    ) AS subquery WHERE rn = 1
                    LIMIT 100
                """
                results = self.logstore_client.execute_sql(
                    sql=sql_query,
                    logstore=AliyunLogStore.workflow_node_execution_logstore,
                )
            else:
                # Use SDK with LogStore query syntax
                query = (
                    f"tenant_id: {tenant_id} and app_id: {app_id} and workflow_id: {workflow_id} and node_id: {node_id}"
                )
                from_time = 0
                to_time = int(time.time())  # now

                results = self.logstore_client.get_logs(
                    logstore=AliyunLogStore.workflow_node_execution_logstore,
                    from_time=from_time,
                    to_time=to_time,
                    query=query,
                    line=100,
                    reverse=False,
                )

            if not results:
                return None

            # For SDK mode, group by id and select the one with max log_version for each group
            # For PG mode, this is already done by the SQL query
            if not self.logstore_client.supports_pg_protocol:
                id_to_results: dict[str, list[dict[str, Any]]] = {}
                for row in results:
                    row_id = row.get("id")
                    if row_id:
                        if row_id not in id_to_results:
                            id_to_results[row_id] = []
                        id_to_results[row_id].append(row)

                # For each id, select the row with max log_version
                deduplicated_results = []
                for rows in id_to_results.values():
                    if len(rows) > 1:
                        max_row = max(rows, key=lambda x: int(x.get("log_version", 0)))
                    else:
                        max_row = rows[0]
                    deduplicated_results.append(max_row)
            else:
                # For PG mode, results are already deduplicated by the SQL query
                deduplicated_results = results

            # Sort by created_at DESC and return the most recent one
            deduplicated_results.sort(
                key=lambda x: x.get("created_at", 0) if isinstance(x.get("created_at"), (int, float)) else 0,
                reverse=True,
            )

            if deduplicated_results:
                return _dict_to_workflow_node_execution_model(deduplicated_results[0])

            return None

        except Exception:
            logger.exception("Failed to get node last execution from LogStore")
            raise

    def get_executions_by_workflow_run(
        self,
        tenant_id: str,
        app_id: str,
        workflow_run_id: str,
    ) -> Sequence[WorkflowNodeExecutionModel]:
        """
        Get all node executions for a specific workflow run.

        Uses query syntax to get raw logs and selects the one with max log_version for each node execution.
        Ordered by index DESC for trace visualization.
        """
        logger.debug(
            "[LogStore] get_executions_by_workflow_run: tenant_id=%s, app_id=%s, workflow_run_id=%s",
            tenant_id,
            app_id,
            workflow_run_id,
        )
        try:
            # Check if PG protocol is supported
            if self.logstore_client.supports_pg_protocol:
                # Use PG protocol with SQL query (get latest version of each record)
                sql_query = f"""
                    SELECT * FROM (
                        SELECT *,
                               ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) as rn
                        FROM "{AliyunLogStore.workflow_node_execution_logstore}"
                        WHERE tenant_id = '{tenant_id}'
                          AND app_id = '{app_id}'
                          AND workflow_run_id = '{workflow_run_id}'
                          AND __time__ > 0
                    ) AS subquery WHERE rn = 1
                    LIMIT 1000
                """
                results = self.logstore_client.execute_sql(
                    sql=sql_query,
                    logstore=AliyunLogStore.workflow_node_execution_logstore,
                )
            else:
                # Use SDK with LogStore query syntax
                query = f"tenant_id: {tenant_id} and app_id: {app_id} and workflow_run_id: {workflow_run_id}"
                from_time = 0
                to_time = int(time.time())  # now

                results = self.logstore_client.get_logs(
                    logstore=AliyunLogStore.workflow_node_execution_logstore,
                    from_time=from_time,
                    to_time=to_time,
                    query=query,
                    line=1000,  # Get more results for node executions
                    reverse=False,
                )
|
||||
if not results:
|
||||
return []
|
||||
|
||||
# For SDK mode, group by id and select the one with max log_version for each group
|
||||
# For PG mode, this is already done by the SQL query
|
||||
models = []
|
||||
if not self.logstore_client.supports_pg_protocol:
|
||||
id_to_results: dict[str, list[dict[str, Any]]] = {}
|
||||
for row in results:
|
||||
row_id = row.get("id")
|
||||
if row_id:
|
||||
if row_id not in id_to_results:
|
||||
id_to_results[row_id] = []
|
||||
id_to_results[row_id].append(row)
|
||||
|
||||
# For each id, select the row with max log_version
|
||||
for rows in id_to_results.values():
|
||||
if len(rows) > 1:
|
||||
max_row = max(rows, key=lambda x: int(x.get("log_version", 0)))
|
||||
else:
|
||||
max_row = rows[0]
|
||||
|
||||
model = _dict_to_workflow_node_execution_model(max_row)
|
||||
if model and model.id: # Ensure model is valid
|
||||
models.append(model)
|
||||
else:
|
||||
# For PG mode, results are already deduplicated by the SQL query
|
||||
for row in results:
|
||||
model = _dict_to_workflow_node_execution_model(row)
|
||||
if model and model.id: # Ensure model is valid
|
||||
models.append(model)
|
||||
|
||||
# Sort by index DESC for trace visualization
|
||||
models.sort(key=lambda x: x.index, reverse=True)
|
||||
|
||||
return models
|
||||
|
||||
except Exception:
|
||||
logger.exception("Failed to get executions by workflow run from LogStore")
|
||||
raise
|
||||
|
||||
def get_execution_by_id(
|
||||
self,
|
||||
execution_id: str,
|
||||
tenant_id: str | None = None,
|
||||
) -> WorkflowNodeExecutionModel | None:
|
||||
"""
|
||||
Get a workflow node execution by its ID.
|
||||
Uses query syntax to get raw logs and selects the one with max log_version.
|
||||
"""
|
||||
logger.debug("get_execution_by_id: execution_id=%s, tenant_id=%s", execution_id, tenant_id)
|
||||
try:
|
||||
# Check if PG protocol is supported
|
||||
if self.logstore_client.supports_pg_protocol:
|
||||
# Use PG protocol with SQL query (get latest version of record)
|
||||
tenant_filter = f"AND tenant_id = '{tenant_id}'" if tenant_id else ""
|
||||
sql_query = f"""
|
||||
SELECT * FROM (
|
||||
SELECT *,
|
||||
ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) as rn
|
||||
FROM "{AliyunLogStore.workflow_node_execution_logstore}"
|
||||
WHERE id = '{execution_id}' {tenant_filter} AND __time__ > 0
|
||||
) AS subquery WHERE rn = 1
|
||||
LIMIT 1
|
||||
"""
|
||||
results = self.logstore_client.execute_sql(
|
||||
sql=sql_query,
|
||||
logstore=AliyunLogStore.workflow_node_execution_logstore,
|
||||
)
|
||||
else:
|
||||
# Use SDK with LogStore query syntax
|
||||
if tenant_id:
|
||||
query = f"id: {execution_id} and tenant_id: {tenant_id}"
|
||||
else:
|
||||
query = f"id: {execution_id}"
|
||||
|
||||
from_time = 0
|
||||
to_time = int(time.time()) # now
|
||||
|
||||
results = self.logstore_client.get_logs(
|
||||
logstore=AliyunLogStore.workflow_node_execution_logstore,
|
||||
from_time=from_time,
|
||||
to_time=to_time,
|
||||
query=query,
|
||||
line=100,
|
||||
reverse=False,
|
||||
)
|
||||
|
||||
if not results:
|
||||
return None
|
||||
|
||||
# For PG mode, result is already the latest version
|
||||
# For SDK mode, if multiple results, select the one with max log_version
|
||||
if self.logstore_client.supports_pg_protocol or len(results) == 1:
|
||||
return _dict_to_workflow_node_execution_model(results[0])
|
||||
else:
|
||||
max_result = max(results, key=lambda x: int(x.get("log_version", 0)))
|
||||
return _dict_to_workflow_node_execution_model(max_result)
|
||||
|
||||
except Exception:
|
||||
logger.exception("Failed to get execution by ID from LogStore: execution_id=%s", execution_id)
|
||||
raise
|
||||
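# Illustrative usage sketch (not from the diff above): the import path is assumed for
# the example's sake and the IDs are placeholders; only the constructor and method
# signatures defined above are relied on.
from extensions.logstore.repositories.logstore_api_workflow_node_execution_repository import (  # hypothetical path
    LogstoreAPIWorkflowNodeExecutionRepository,
)

repo = LogstoreAPIWorkflowNodeExecutionRepository()  # session_maker is optional and unused
latest = repo.get_node_last_execution(
    tenant_id="tenant-id",  # placeholder
    app_id="app-id",  # placeholder
    workflow_id="workflow-id",  # placeholder
    node_id="llm",  # placeholder
)
if latest is not None:
    print(latest.id, latest.status)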
@@ -0,0 +1,757 @@
"""
LogStore API WorkflowRun Repository Implementation

This module provides the LogStore-based implementation of the APIWorkflowRunRepository
protocol. It handles service-layer WorkflowRun database operations using Aliyun SLS LogStore
with optimized queries for statistics and pagination.

Key Features:
- LogStore SQL queries for aggregation and statistics
- Optimized deduplication using finished_at IS NOT NULL filter
- Window functions only when necessary (running status queries)
- Multi-tenant data isolation and security
"""

import logging
import os
import time
from collections.abc import Sequence
from datetime import datetime
from typing import Any, cast

from sqlalchemy.orm import sessionmaker

from extensions.logstore.aliyun_logstore import AliyunLogStore
from libs.infinite_scroll_pagination import InfiniteScrollPagination
from models.enums import WorkflowRunTriggeredFrom
from models.workflow import WorkflowRun
from repositories.api_workflow_run_repository import APIWorkflowRunRepository
from repositories.types import (
    AverageInteractionStats,
    DailyRunsStats,
    DailyTerminalsStats,
    DailyTokenCostStats,
)

logger = logging.getLogger(__name__)


def _dict_to_workflow_run(data: dict[str, Any]) -> WorkflowRun:
    """
    Convert LogStore result dictionary to WorkflowRun instance.

    Args:
        data: Dictionary from LogStore query result

    Returns:
        WorkflowRun instance
    """
    logger.debug("_dict_to_workflow_run: data keys=%s", list(data.keys())[:5])
    # Create model instance without session
    model = WorkflowRun()

    # Map all required fields with validation
    # Critical fields - must not be None
    model.id = data.get("id") or ""
    model.tenant_id = data.get("tenant_id") or ""
    model.app_id = data.get("app_id") or ""
    model.workflow_id = data.get("workflow_id") or ""
    model.type = data.get("type") or ""
    model.triggered_from = data.get("triggered_from") or ""
    model.version = data.get("version") or ""
    model.status = data.get("status") or "running"  # Default status if missing
    model.created_by_role = data.get("created_by_role") or ""
    model.created_by = data.get("created_by") or ""

    # Numeric fields with defaults
    model.total_tokens = int(data.get("total_tokens", 0))
    model.total_steps = int(data.get("total_steps", 0))
    model.exceptions_count = int(data.get("exceptions_count", 0))

    # Optional fields
    model.graph = data.get("graph")
    model.inputs = data.get("inputs")
    model.outputs = data.get("outputs")
    model.error = data.get("error_message") or data.get("error")

    # Handle datetime fields
    started_at = data.get("started_at") or data.get("created_at")
    if started_at:
        if isinstance(started_at, str):
            model.created_at = datetime.fromisoformat(started_at)
        elif isinstance(started_at, (int, float)):
            model.created_at = datetime.fromtimestamp(started_at)
        else:
            model.created_at = started_at
    else:
        # Provide default created_at if missing
        model.created_at = datetime.now()

    finished_at = data.get("finished_at")
    if finished_at:
        if isinstance(finished_at, str):
            model.finished_at = datetime.fromisoformat(finished_at)
        elif isinstance(finished_at, (int, float)):
            model.finished_at = datetime.fromtimestamp(finished_at)
        else:
            model.finished_at = finished_at

    # Compute elapsed_time from started_at and finished_at
    # LogStore doesn't store elapsed_time, it's computed in WorkflowExecution domain entity
    if model.finished_at and model.created_at:
        model.elapsed_time = (model.finished_at - model.created_at).total_seconds()
    else:
        model.elapsed_time = float(data.get("elapsed_time", 0))

    return model


class LogstoreAPIWorkflowRunRepository(APIWorkflowRunRepository):
    """
    LogStore implementation of APIWorkflowRunRepository.

    Provides service-layer WorkflowRun database operations using LogStore SQL
    with optimized query strategies:
    - Use finished_at IS NOT NULL for deduplication (10-100x faster)
    - Use window functions only when running status is required
    - Proper time range filtering for LogStore queries
    """

    def __init__(self, session_maker: sessionmaker | None = None):
        """
        Initialize the repository with LogStore client.

        Args:
            session_maker: SQLAlchemy sessionmaker (unused, for compatibility with factory pattern)
        """
        logger.debug("LogstoreAPIWorkflowRunRepository.__init__: initializing")
        self.logstore_client = AliyunLogStore()

        # Control flag for dual-read (fallback to PostgreSQL when LogStore returns no results)
        # Set to True to enable fallback for safe migration from PostgreSQL to LogStore
        # Set to False for new deployments without legacy data in PostgreSQL
        self._enable_dual_read = os.environ.get("LOGSTORE_DUAL_READ_ENABLED", "true").lower() == "true"

    def get_paginated_workflow_runs(
        self,
        tenant_id: str,
        app_id: str,
        triggered_from: WorkflowRunTriggeredFrom | Sequence[WorkflowRunTriggeredFrom],
        limit: int = 20,
        last_id: str | None = None,
        status: str | None = None,
    ) -> InfiniteScrollPagination:
        """
        Get paginated workflow runs with filtering.

        Uses window function for deduplication to support both running and finished states.

        Args:
            tenant_id: Tenant identifier for multi-tenant isolation
            app_id: Application identifier
            triggered_from: Filter by trigger source(s)
            limit: Maximum number of records to return (default: 20)
            last_id: Cursor for pagination - ID of the last record from previous page
            status: Optional filter by status

        Returns:
            InfiniteScrollPagination object
        """
        logger.debug(
            "get_paginated_workflow_runs: tenant_id=%s, app_id=%s, limit=%d, status=%s",
            tenant_id,
            app_id,
            limit,
            status,
        )
        # Convert triggered_from to list if needed
        if isinstance(triggered_from, WorkflowRunTriggeredFrom):
            triggered_from_list = [triggered_from]
        else:
            triggered_from_list = list(triggered_from)

        # Build triggered_from filter
        triggered_from_filter = " OR ".join([f"triggered_from='{tf.value}'" for tf in triggered_from_list])

        # Build status filter
        status_filter = f"AND status='{status}'" if status else ""

        # Build last_id filter for pagination
        # Note: This is simplified. In production, you'd need to track created_at from last record
        last_id_filter = ""
        if last_id:
            # TODO: Implement proper cursor-based pagination with created_at
            logger.warning("last_id pagination not fully implemented for LogStore")

        # Use window function to get latest log_version of each workflow run
        sql = f"""
            SELECT * FROM (
                SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) AS rn
                FROM {AliyunLogStore.workflow_execution_logstore}
                WHERE tenant_id='{tenant_id}'
                    AND app_id='{app_id}'
                    AND ({triggered_from_filter})
                    {status_filter}
                    {last_id_filter}
            ) t
            WHERE rn = 1
            ORDER BY created_at DESC
            LIMIT {limit + 1}
        """

        try:
            results = self.logstore_client.execute_sql(
                sql=sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore, from_time=None, to_time=None
            )

            # Check if there are more records
            has_more = len(results) > limit
            if has_more:
                results = results[:limit]

            # Convert results to WorkflowRun models
            workflow_runs = [_dict_to_workflow_run(row) for row in results]
            return InfiniteScrollPagination(data=workflow_runs, limit=limit, has_more=has_more)

        except Exception:
            logger.exception("Failed to get paginated workflow runs from LogStore")
            raise

    def get_workflow_run_by_id(
        self,
        tenant_id: str,
        app_id: str,
        run_id: str,
    ) -> WorkflowRun | None:
        """
        Get a specific workflow run by ID with tenant and app isolation.

        Uses query syntax to get raw logs and selects the one with max log_version in code.
        Falls back to PostgreSQL if not found in LogStore (for data consistency during migration).
        """
        logger.debug("get_workflow_run_by_id: tenant_id=%s, app_id=%s, run_id=%s", tenant_id, app_id, run_id)

        try:
            # Check if PG protocol is supported
            if self.logstore_client.supports_pg_protocol:
                # Use PG protocol with SQL query (get latest version of record)
                sql_query = f"""
                    SELECT * FROM (
                        SELECT *,
                            ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) as rn
                        FROM "{AliyunLogStore.workflow_execution_logstore}"
                        WHERE id = '{run_id}' AND tenant_id = '{tenant_id}' AND app_id = '{app_id}' AND __time__ > 0
                    ) AS subquery WHERE rn = 1
                    LIMIT 100
                """
                results = self.logstore_client.execute_sql(
                    sql=sql_query,
                    logstore=AliyunLogStore.workflow_execution_logstore,
                )
            else:
                # Use SDK with LogStore query syntax
                query = f"id: {run_id} and tenant_id: {tenant_id} and app_id: {app_id}"
                from_time = 0
                to_time = int(time.time())  # now

                results = self.logstore_client.get_logs(
                    logstore=AliyunLogStore.workflow_execution_logstore,
                    from_time=from_time,
                    to_time=to_time,
                    query=query,
                    line=100,
                    reverse=False,
                )

            if not results:
                # Fallback to PostgreSQL for records created before LogStore migration
                if self._enable_dual_read:
                    logger.debug(
                        "WorkflowRun not found in LogStore, falling back to PostgreSQL: "
                        "run_id=%s, tenant_id=%s, app_id=%s",
                        run_id,
                        tenant_id,
                        app_id,
                    )
                    return self._fallback_get_workflow_run_by_id_with_tenant(run_id, tenant_id, app_id)
                return None

            # For PG mode, results are already deduplicated by the SQL query
            # For SDK mode, if multiple results, select the one with max log_version
            if self.logstore_client.supports_pg_protocol or len(results) == 1:
                return _dict_to_workflow_run(results[0])
            else:
                max_result = max(results, key=lambda x: int(x.get("log_version", 0)))
                return _dict_to_workflow_run(max_result)

        except Exception:
            logger.exception("Failed to get workflow run by ID from LogStore: run_id=%s", run_id)
            # Try PostgreSQL fallback on any error (only if dual-read is enabled)
            if self._enable_dual_read:
                try:
                    return self._fallback_get_workflow_run_by_id_with_tenant(run_id, tenant_id, app_id)
                except Exception:
                    logger.exception(
                        "PostgreSQL fallback also failed: run_id=%s, tenant_id=%s, app_id=%s", run_id, tenant_id, app_id
                    )
            raise

    def _fallback_get_workflow_run_by_id_with_tenant(
        self, run_id: str, tenant_id: str, app_id: str
    ) -> WorkflowRun | None:
        """Fallback to PostgreSQL query for records not in LogStore (with tenant isolation)."""
        from sqlalchemy import select
        from sqlalchemy.orm import Session

        from extensions.ext_database import db

        with Session(db.engine) as session:
            stmt = select(WorkflowRun).where(
                WorkflowRun.id == run_id, WorkflowRun.tenant_id == tenant_id, WorkflowRun.app_id == app_id
            )
            return session.scalar(stmt)

    def get_workflow_run_by_id_without_tenant(
        self,
        run_id: str,
    ) -> WorkflowRun | None:
        """
        Get a specific workflow run by ID without tenant/app context.
        Uses query syntax to get raw logs and selects the one with max log_version.
        Falls back to PostgreSQL if not found in LogStore (controlled by LOGSTORE_DUAL_READ_ENABLED).
        """
        logger.debug("get_workflow_run_by_id_without_tenant: run_id=%s", run_id)

        try:
            # Check if PG protocol is supported
            if self.logstore_client.supports_pg_protocol:
                # Use PG protocol with SQL query (get latest version of record)
                sql_query = f"""
                    SELECT * FROM (
                        SELECT *,
                            ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) as rn
                        FROM "{AliyunLogStore.workflow_execution_logstore}"
                        WHERE id = '{run_id}' AND __time__ > 0
                    ) AS subquery WHERE rn = 1
                    LIMIT 100
                """
                results = self.logstore_client.execute_sql(
                    sql=sql_query,
                    logstore=AliyunLogStore.workflow_execution_logstore,
                )
            else:
                # Use SDK with LogStore query syntax
                query = f"id: {run_id}"
                from_time = 0
                to_time = int(time.time())  # now

                results = self.logstore_client.get_logs(
                    logstore=AliyunLogStore.workflow_execution_logstore,
                    from_time=from_time,
                    to_time=to_time,
                    query=query,
                    line=100,
                    reverse=False,
                )

            if not results:
                # Fallback to PostgreSQL for records created before LogStore migration
                if self._enable_dual_read:
                    logger.debug("WorkflowRun not found in LogStore, falling back to PostgreSQL: run_id=%s", run_id)
                    return self._fallback_get_workflow_run_by_id(run_id)
                return None

            # For PG mode, results are already deduplicated by the SQL query
            # For SDK mode, if multiple results, select the one with max log_version
            if self.logstore_client.supports_pg_protocol or len(results) == 1:
                return _dict_to_workflow_run(results[0])
            else:
                max_result = max(results, key=lambda x: int(x.get("log_version", 0)))
                return _dict_to_workflow_run(max_result)

        except Exception:
            logger.exception("Failed to get workflow run without tenant: run_id=%s", run_id)
            # Try PostgreSQL fallback on any error (only if dual-read is enabled)
            if self._enable_dual_read:
                try:
                    return self._fallback_get_workflow_run_by_id(run_id)
                except Exception:
                    logger.exception("PostgreSQL fallback also failed: run_id=%s", run_id)
            raise

    def _fallback_get_workflow_run_by_id(self, run_id: str) -> WorkflowRun | None:
        """Fallback to PostgreSQL query for records not in LogStore."""
        from sqlalchemy import select
        from sqlalchemy.orm import Session

        from extensions.ext_database import db

        with Session(db.engine) as session:
            stmt = select(WorkflowRun).where(WorkflowRun.id == run_id)
            return session.scalar(stmt)

    def get_workflow_runs_count(
        self,
        tenant_id: str,
        app_id: str,
        triggered_from: str,
        status: str | None = None,
        time_range: str | None = None,
    ) -> dict[str, int]:
        """
        Get workflow runs count statistics grouped by status.

        Optimization: Use finished_at IS NOT NULL for completed runs (10-50x faster)
        """
        logger.debug(
            "get_workflow_runs_count: tenant_id=%s, app_id=%s, triggered_from=%s, status=%s",
            tenant_id,
            app_id,
            triggered_from,
            status,
        )
        # Build time range filter
        time_filter = ""
        if time_range:
            # TODO: Parse time_range and convert to from_time/to_time
            logger.warning("time_range filter not implemented")

        # If status is provided, simple count
        if status:
            if status == "running":
                # Running status requires window function
                sql = f"""
                    SELECT COUNT(*) as count
                    FROM (
                        SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) AS rn
                        FROM {AliyunLogStore.workflow_execution_logstore}
                        WHERE tenant_id='{tenant_id}'
                            AND app_id='{app_id}'
                            AND triggered_from='{triggered_from}'
                            AND status='running'
                            {time_filter}
                    ) t
                    WHERE rn = 1
                """
            else:
                # Finished status uses optimized filter
                sql = f"""
                    SELECT COUNT(DISTINCT id) as count
                    FROM {AliyunLogStore.workflow_execution_logstore}
                    WHERE tenant_id='{tenant_id}'
                        AND app_id='{app_id}'
                        AND triggered_from='{triggered_from}'
                        AND status='{status}'
                        AND finished_at IS NOT NULL
                        {time_filter}
                """

            try:
                results = self.logstore_client.execute_sql(
                    sql=sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
                )
                count = results[0]["count"] if results and len(results) > 0 else 0

                return {
                    "total": count,
                    "running": count if status == "running" else 0,
                    "succeeded": count if status == "succeeded" else 0,
                    "failed": count if status == "failed" else 0,
                    "stopped": count if status == "stopped" else 0,
                    "partial-succeeded": count if status == "partial-succeeded" else 0,
                }
            except Exception:
                logger.exception("Failed to get workflow runs count")
                raise

        # No status filter - get counts grouped by status
        # Use optimized query for finished runs, separate query for running
        try:
            # Count finished runs grouped by status
            finished_sql = f"""
                SELECT status, COUNT(DISTINCT id) as count
                FROM {AliyunLogStore.workflow_execution_logstore}
                WHERE tenant_id='{tenant_id}'
                    AND app_id='{app_id}'
                    AND triggered_from='{triggered_from}'
                    AND finished_at IS NOT NULL
                    {time_filter}
                GROUP BY status
            """

            # Count running runs
            running_sql = f"""
                SELECT COUNT(*) as count
                FROM (
                    SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY log_version DESC) AS rn
                    FROM {AliyunLogStore.workflow_execution_logstore}
                    WHERE tenant_id='{tenant_id}'
                        AND app_id='{app_id}'
                        AND triggered_from='{triggered_from}'
                        AND status='running'
                        {time_filter}
                ) t
                WHERE rn = 1
            """

            finished_results = self.logstore_client.execute_sql(
                sql=finished_sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
            )
            running_results = self.logstore_client.execute_sql(
                sql=running_sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
            )

            # Build response
            status_counts = {
                "running": 0,
                "succeeded": 0,
                "failed": 0,
                "stopped": 0,
                "partial-succeeded": 0,
            }

            total = 0
            for result in finished_results:
                status_val = result.get("status")
                count = result.get("count", 0)
                if status_val in status_counts:
                    status_counts[status_val] = count
                    total += count

            # Add running count
            running_count = running_results[0]["count"] if running_results and len(running_results) > 0 else 0
            status_counts["running"] = running_count
            total += running_count

            return {"total": total} | status_counts

        except Exception:
            logger.exception("Failed to get workflow runs count")
            raise

    def get_daily_runs_statistics(
        self,
        tenant_id: str,
        app_id: str,
        triggered_from: str,
        start_date: datetime | None = None,
        end_date: datetime | None = None,
        timezone: str = "UTC",
    ) -> list[DailyRunsStats]:
        """
        Get daily runs statistics using optimized query.

        Optimization: Use finished_at IS NOT NULL + COUNT(DISTINCT id) (20-100x faster)
        """
        logger.debug(
            "get_daily_runs_statistics: tenant_id=%s, app_id=%s, triggered_from=%s", tenant_id, app_id, triggered_from
        )
        # Build time range filter
        time_filter = ""
        if start_date:
            time_filter += f" AND __time__ >= to_unixtime(from_iso8601_timestamp('{start_date.isoformat()}'))"
        if end_date:
            time_filter += f" AND __time__ < to_unixtime(from_iso8601_timestamp('{end_date.isoformat()}'))"

        # Optimized query: Use finished_at filter to avoid window function
        sql = f"""
            SELECT DATE(from_unixtime(__time__)) as date, COUNT(DISTINCT id) as runs
            FROM {AliyunLogStore.workflow_execution_logstore}
            WHERE tenant_id='{tenant_id}'
                AND app_id='{app_id}'
                AND triggered_from='{triggered_from}'
                AND finished_at IS NOT NULL
                {time_filter}
            GROUP BY date
            ORDER BY date
        """

        try:
            results = self.logstore_client.execute_sql(
                sql=sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
            )

            response_data = []
            for row in results:
                response_data.append({"date": str(row.get("date", "")), "runs": row.get("runs", 0)})

            return cast(list[DailyRunsStats], response_data)

        except Exception:
            logger.exception("Failed to get daily runs statistics")
            raise

    def get_daily_terminals_statistics(
        self,
        tenant_id: str,
        app_id: str,
        triggered_from: str,
        start_date: datetime | None = None,
        end_date: datetime | None = None,
        timezone: str = "UTC",
    ) -> list[DailyTerminalsStats]:
        """
        Get daily terminals statistics using optimized query.

        Optimization: Use finished_at IS NOT NULL + COUNT(DISTINCT created_by) (20-100x faster)
        """
        logger.debug(
            "get_daily_terminals_statistics: tenant_id=%s, app_id=%s, triggered_from=%s",
            tenant_id,
            app_id,
            triggered_from,
        )
        # Build time range filter
        time_filter = ""
        if start_date:
            time_filter += f" AND __time__ >= to_unixtime(from_iso8601_timestamp('{start_date.isoformat()}'))"
        if end_date:
            time_filter += f" AND __time__ < to_unixtime(from_iso8601_timestamp('{end_date.isoformat()}'))"

        sql = f"""
            SELECT DATE(from_unixtime(__time__)) as date, COUNT(DISTINCT created_by) as terminal_count
            FROM {AliyunLogStore.workflow_execution_logstore}
            WHERE tenant_id='{tenant_id}'
                AND app_id='{app_id}'
                AND triggered_from='{triggered_from}'
                AND finished_at IS NOT NULL
                {time_filter}
            GROUP BY date
            ORDER BY date
        """

        try:
            results = self.logstore_client.execute_sql(
                sql=sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
            )

            response_data = []
            for row in results:
                response_data.append({"date": str(row.get("date", "")), "terminal_count": row.get("terminal_count", 0)})

            return cast(list[DailyTerminalsStats], response_data)

        except Exception:
            logger.exception("Failed to get daily terminals statistics")
            raise

    def get_daily_token_cost_statistics(
        self,
        tenant_id: str,
        app_id: str,
        triggered_from: str,
        start_date: datetime | None = None,
        end_date: datetime | None = None,
        timezone: str = "UTC",
    ) -> list[DailyTokenCostStats]:
        """
        Get daily token cost statistics using optimized query.

        Optimization: Use finished_at IS NOT NULL + SUM(total_tokens) (20-100x faster)
        """
        logger.debug(
            "get_daily_token_cost_statistics: tenant_id=%s, app_id=%s, triggered_from=%s",
            tenant_id,
            app_id,
            triggered_from,
        )
        # Build time range filter
        time_filter = ""
        if start_date:
            time_filter += f" AND __time__ >= to_unixtime(from_iso8601_timestamp('{start_date.isoformat()}'))"
        if end_date:
            time_filter += f" AND __time__ < to_unixtime(from_iso8601_timestamp('{end_date.isoformat()}'))"

        sql = f"""
            SELECT DATE(from_unixtime(__time__)) as date, SUM(total_tokens) as token_count
            FROM {AliyunLogStore.workflow_execution_logstore}
            WHERE tenant_id='{tenant_id}'
                AND app_id='{app_id}'
                AND triggered_from='{triggered_from}'
                AND finished_at IS NOT NULL
                {time_filter}
            GROUP BY date
            ORDER BY date
        """

        try:
            results = self.logstore_client.execute_sql(
                sql=sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
            )

            response_data = []
            for row in results:
                response_data.append({"date": str(row.get("date", "")), "token_count": row.get("token_count", 0)})

            return cast(list[DailyTokenCostStats], response_data)

        except Exception:
            logger.exception("Failed to get daily token cost statistics")
            raise

    def get_average_app_interaction_statistics(
        self,
        tenant_id: str,
        app_id: str,
        triggered_from: str,
        start_date: datetime | None = None,
        end_date: datetime | None = None,
        timezone: str = "UTC",
    ) -> list[AverageInteractionStats]:
        """
        Get average app interaction statistics using optimized query.

        Optimization: Use finished_at IS NOT NULL + AVG (20-100x faster)
        """
        logger.debug(
            "get_average_app_interaction_statistics: tenant_id=%s, app_id=%s, triggered_from=%s",
            tenant_id,
            app_id,
            triggered_from,
        )
        # Build time range filter
        time_filter = ""
        if start_date:
            time_filter += f" AND __time__ >= to_unixtime(from_iso8601_timestamp('{start_date.isoformat()}'))"
        if end_date:
            time_filter += f" AND __time__ < to_unixtime(from_iso8601_timestamp('{end_date.isoformat()}'))"

        sql = f"""
            SELECT
                AVG(sub.interactions) AS interactions,
                sub.date
            FROM (
                SELECT
                    DATE(from_unixtime(__time__)) AS date,
                    created_by,
                    COUNT(DISTINCT id) AS interactions
                FROM {AliyunLogStore.workflow_execution_logstore}
                WHERE tenant_id='{tenant_id}'
                    AND app_id='{app_id}'
                    AND triggered_from='{triggered_from}'
                    AND finished_at IS NOT NULL
                    {time_filter}
                GROUP BY date, created_by
            ) sub
            GROUP BY sub.date
        """

        try:
            results = self.logstore_client.execute_sql(
                sql=sql, query="*", logstore=AliyunLogStore.workflow_execution_logstore
            )

            response_data = []
            for row in results:
                response_data.append(
                    {
                        "date": str(row.get("date", "")),
                        "interactions": float(row.get("interactions", 0)),
                    }
                )

            return cast(list[AverageInteractionStats], response_data)

        except Exception:
            logger.exception("Failed to get average app interaction statistics")
            raise
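# Illustrative usage sketch (not from the diff above): the repository import path is an
# assumption made for this example; the call signature follows the class above and the
# IDs are placeholders.
from extensions.logstore.repositories.logstore_api_workflow_run_repository import (  # hypothetical path
    LogstoreAPIWorkflowRunRepository,
)
from models.enums import WorkflowRunTriggeredFrom

repo = LogstoreAPIWorkflowRunRepository()
page = repo.get_paginated_workflow_runs(
    tenant_id="tenant-id",  # placeholder
    app_id="app-id",  # placeholder
    triggered_from=WorkflowRunTriggeredFrom.APP_RUN,
    limit=20,
)
for run in page.data:  # InfiniteScrollPagination carries the WorkflowRun page in `data`
    print(run.id, run.status)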
@@ -0,0 +1,164 @@
import json
import logging
import os
import time
from typing import Union

from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker

from core.repositories.sqlalchemy_workflow_execution_repository import SQLAlchemyWorkflowExecutionRepository
from core.workflow.entities import WorkflowExecution
from core.workflow.repositories.workflow_execution_repository import WorkflowExecutionRepository
from extensions.logstore.aliyun_logstore import AliyunLogStore
from libs.helper import extract_tenant_id
from models import (
    Account,
    CreatorUserRole,
    EndUser,
)
from models.enums import WorkflowRunTriggeredFrom

logger = logging.getLogger(__name__)


class LogstoreWorkflowExecutionRepository(WorkflowExecutionRepository):
    def __init__(
        self,
        session_factory: sessionmaker | Engine,
        user: Union[Account, EndUser],
        app_id: str | None,
        triggered_from: WorkflowRunTriggeredFrom | None,
    ):
        """
        Initialize the repository with a SQLAlchemy sessionmaker or engine and context information.

        Args:
            session_factory: SQLAlchemy sessionmaker or engine for creating sessions
            user: Account or EndUser object containing tenant_id, user ID, and role information
            app_id: App ID for filtering by application (can be None)
            triggered_from: Source of the execution trigger (DEBUGGING or APP_RUN)
        """
        logger.debug(
            "LogstoreWorkflowExecutionRepository.__init__: app_id=%s, triggered_from=%s", app_id, triggered_from
        )
        # Initialize LogStore client
        # Note: Project/logstore/index initialization is done at app startup via ext_logstore
        self.logstore_client = AliyunLogStore()

        # Extract tenant_id from user
        tenant_id = extract_tenant_id(user)
        if not tenant_id:
            raise ValueError("User must have a tenant_id or current_tenant_id")
        self._tenant_id = tenant_id

        # Store app context
        self._app_id = app_id

        # Extract user context
        self._triggered_from = triggered_from
        self._creator_user_id = user.id

        # Determine user role based on user type
        self._creator_user_role = CreatorUserRole.ACCOUNT if isinstance(user, Account) else CreatorUserRole.END_USER

        # Initialize SQL repository for dual-write support
        self.sql_repository = SQLAlchemyWorkflowExecutionRepository(session_factory, user, app_id, triggered_from)

        # Control flag for dual-write (write to both LogStore and SQL database)
        # Set to True to enable dual-write for safe migration, False to use LogStore only
        self._enable_dual_write = os.environ.get("LOGSTORE_DUAL_WRITE_ENABLED", "true").lower() == "true"

    def _to_logstore_model(self, domain_model: WorkflowExecution) -> list[tuple[str, str]]:
        """
        Convert a domain model to a logstore model (List[Tuple[str, str]]).

        Args:
            domain_model: The domain model to convert

        Returns:
            The logstore model as a list of key-value tuples
        """
        logger.debug(
            "_to_logstore_model: id=%s, workflow_id=%s, status=%s",
            domain_model.id_,
            domain_model.workflow_id,
            domain_model.status.value,
        )
        # Use values from constructor if provided
        if not self._triggered_from:
            raise ValueError("triggered_from is required in repository constructor")
        if not self._creator_user_id:
            raise ValueError("created_by is required in repository constructor")
        if not self._creator_user_role:
            raise ValueError("created_by_role is required in repository constructor")

        # Generate log_version as nanosecond timestamp for record versioning
        log_version = str(time.time_ns())

        logstore_model = [
            ("id", domain_model.id_),
            ("log_version", log_version),  # Add log_version field for append-only writes
            ("tenant_id", self._tenant_id),
            ("app_id", self._app_id or ""),
            ("workflow_id", domain_model.workflow_id),
            (
                "triggered_from",
                self._triggered_from.value if hasattr(self._triggered_from, "value") else str(self._triggered_from),
            ),
            ("type", domain_model.workflow_type.value),
            ("version", domain_model.workflow_version),
            ("graph", json.dumps(domain_model.graph, ensure_ascii=False) if domain_model.graph else "{}"),
            ("inputs", json.dumps(domain_model.inputs, ensure_ascii=False) if domain_model.inputs else "{}"),
            ("outputs", json.dumps(domain_model.outputs, ensure_ascii=False) if domain_model.outputs else "{}"),
            ("status", domain_model.status.value),
            ("error_message", domain_model.error_message or ""),
            ("total_tokens", str(domain_model.total_tokens)),
            ("total_steps", str(domain_model.total_steps)),
            ("exceptions_count", str(domain_model.exceptions_count)),
            (
                "created_by_role",
                self._creator_user_role.value
                if hasattr(self._creator_user_role, "value")
                else str(self._creator_user_role),
            ),
            ("created_by", self._creator_user_id),
            ("started_at", domain_model.started_at.isoformat() if domain_model.started_at else ""),
            ("finished_at", domain_model.finished_at.isoformat() if domain_model.finished_at else ""),
        ]

        return logstore_model

    def save(self, execution: WorkflowExecution) -> None:
        """
        Save or update a WorkflowExecution domain entity to the logstore.

        This method serves as a domain-to-logstore adapter that:
        1. Converts the domain entity to its logstore representation
        2. Persists the logstore model using Aliyun SLS
        3. Maintains proper multi-tenancy by including tenant context during conversion
        4. Optionally writes to SQL database for dual-write support (controlled by LOGSTORE_DUAL_WRITE_ENABLED)

        Args:
            execution: The WorkflowExecution domain entity to persist
        """
        logger.debug(
            "save: id=%s, workflow_id=%s, status=%s", execution.id_, execution.workflow_id, execution.status.value
        )
        try:
            logstore_model = self._to_logstore_model(execution)
            self.logstore_client.put_log(AliyunLogStore.workflow_execution_logstore, logstore_model)

            logger.debug("Saved workflow execution to logstore: id=%s", execution.id_)
        except Exception:
            logger.exception("Failed to save workflow execution to logstore: id=%s", execution.id_)
            raise

        # Dual-write to SQL database if enabled (for safe migration)
        if self._enable_dual_write:
            try:
                self.sql_repository.save(execution)
                logger.debug("Dual-write: saved workflow execution to SQL database: id=%s", execution.id_)
            except Exception:
                logger.exception("Failed to dual-write workflow execution to SQL database: id=%s", execution.id_)
                # Don't raise - LogStore write succeeded, SQL is just a backup
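# Illustrative wiring sketch (not from the diff above): the repository import path is an
# assumption; callers supply an existing Account (or EndUser) and a WorkflowExecution
# domain entity, and the constructor signature matches the class above.
from core.workflow.entities import WorkflowExecution
from extensions.ext_database import db
from extensions.logstore.repositories.logstore_workflow_execution_repository import (  # hypothetical path
    LogstoreWorkflowExecutionRepository,
)
from models import Account
from models.enums import WorkflowRunTriggeredFrom


def save_to_logstore(user: Account, execution: WorkflowExecution) -> None:
    repo = LogstoreWorkflowExecutionRepository(
        session_factory=db.engine,  # an Engine is accepted as well as a sessionmaker
        user=user,
        app_id="app-id",  # placeholder
        triggered_from=WorkflowRunTriggeredFrom.APP_RUN,
    )
    # Appends a record keyed by a fresh log_version; also writes to SQL
    # when LOGSTORE_DUAL_WRITE_ENABLED is true.
    repo.save(execution)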
@@ -0,0 +1,366 @@
|
||||
"""
|
||||
LogStore implementation of the WorkflowNodeExecutionRepository.
|
||||
|
||||
This module provides a LogStore-based repository for WorkflowNodeExecution entities,
|
||||
using Aliyun SLS LogStore with append-only writes and version control.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from collections.abc import Sequence
|
||||
from datetime import datetime
|
||||
from typing import Any, Union
|
||||
|
||||
from sqlalchemy.engine import Engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from core.model_runtime.utils.encoders import jsonable_encoder
|
||||
from core.repositories import SQLAlchemyWorkflowNodeExecutionRepository
|
||||
from core.workflow.entities import WorkflowNodeExecution
|
||||
from core.workflow.entities.workflow_node_execution import WorkflowNodeExecutionMetadataKey, WorkflowNodeExecutionStatus
|
||||
from core.workflow.enums import NodeType
|
||||
from core.workflow.repositories.workflow_node_execution_repository import OrderConfig, WorkflowNodeExecutionRepository
|
||||
from core.workflow.workflow_type_encoder import WorkflowRuntimeTypeConverter
|
||||
from extensions.logstore.aliyun_logstore import AliyunLogStore
|
||||
from libs.helper import extract_tenant_id
|
||||
from models import (
|
||||
Account,
|
||||
CreatorUserRole,
|
||||
EndUser,
|
||||
WorkflowNodeExecutionTriggeredFrom,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _dict_to_workflow_node_execution(data: dict[str, Any]) -> WorkflowNodeExecution:
|
||||
"""
|
||||
Convert LogStore result dictionary to WorkflowNodeExecution domain model.
|
||||
|
||||
Args:
|
||||
data: Dictionary from LogStore query result
|
||||
|
||||
Returns:
|
||||
WorkflowNodeExecution domain model instance
|
||||
"""
|
||||
logger.debug("_dict_to_workflow_node_execution: data keys=%s", list(data.keys())[:5])
|
||||
# Parse JSON fields
|
||||
inputs = json.loads(data.get("inputs", "{}"))
|
||||
process_data = json.loads(data.get("process_data", "{}"))
|
||||
outputs = json.loads(data.get("outputs", "{}"))
|
||||
metadata = json.loads(data.get("execution_metadata", "{}"))
|
||||
|
||||
# Convert metadata to domain enum keys
|
||||
domain_metadata = {}
|
||||
for k, v in metadata.items():
|
||||
try:
|
||||
domain_metadata[WorkflowNodeExecutionMetadataKey(k)] = v
|
||||
except ValueError:
|
||||
# Skip invalid metadata keys
|
||||
continue
|
||||
|
||||
# Convert status to domain enum
|
||||
status = WorkflowNodeExecutionStatus(data.get("status", "running"))
|
||||
|
||||
# Parse datetime fields
|
||||
created_at = datetime.fromisoformat(data.get("created_at", "")) if data.get("created_at") else datetime.now()
|
||||
finished_at = datetime.fromisoformat(data.get("finished_at", "")) if data.get("finished_at") else None
|
||||
|
||||
return WorkflowNodeExecution(
|
||||
id=data.get("id", ""),
|
||||
node_execution_id=data.get("node_execution_id"),
|
||||
workflow_id=data.get("workflow_id", ""),
|
||||
workflow_execution_id=data.get("workflow_run_id"),
|
||||
index=int(data.get("index", 0)),
|
||||
predecessor_node_id=data.get("predecessor_node_id"),
|
||||
node_id=data.get("node_id", ""),
|
||||
node_type=NodeType(data.get("node_type", "start")),
|
||||
title=data.get("title", ""),
|
||||
inputs=inputs,
|
||||
process_data=process_data,
|
||||
outputs=outputs,
|
||||
status=status,
|
||||
error=data.get("error"),
|
||||
elapsed_time=float(data.get("elapsed_time", 0.0)),
|
||||
metadata=domain_metadata,
|
||||
created_at=created_at,
|
||||
finished_at=finished_at,
|
||||
)
|
||||
|
||||
|
||||
class LogstoreWorkflowNodeExecutionRepository(WorkflowNodeExecutionRepository):
|
||||
"""
|
||||
LogStore implementation of the WorkflowNodeExecutionRepository interface.
|
||||
|
||||
This implementation uses Aliyun SLS LogStore with an append-only write strategy:
|
||||
- Each save() operation appends a new record with a version timestamp
|
||||
- Updates are simulated by writing new records with higher version numbers
|
||||
- Queries retrieve the latest version using finished_at IS NOT NULL filter
|
||||
- Multi-tenancy is maintained through tenant_id filtering
|
||||
|
||||
Version Strategy:
|
||||
version = time.time_ns() # Nanosecond timestamp for unique ordering
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
session_factory: sessionmaker | Engine,
|
||||
user: Union[Account, EndUser],
|
||||
app_id: str | None,
|
||||
triggered_from: WorkflowNodeExecutionTriggeredFrom | None,
|
||||
):
|
||||
"""
|
||||
Initialize the repository with a SQLAlchemy sessionmaker or engine and context information.
|
||||
|
||||
Args:
|
||||
session_factory: SQLAlchemy sessionmaker or engine for creating sessions
|
||||
user: Account or EndUser object containing tenant_id, user ID, and role information
|
||||
app_id: App ID for filtering by application (can be None)
|
||||
triggered_from: Source of the execution trigger (SINGLE_STEP or WORKFLOW_RUN)
|
||||
"""
|
||||
logger.debug(
|
||||
"LogstoreWorkflowNodeExecutionRepository.__init__: app_id=%s, triggered_from=%s", app_id, triggered_from
|
||||
)
|
||||
# Initialize LogStore client
|
||||
self.logstore_client = AliyunLogStore()
|
||||
|
||||
# Extract tenant_id from user
|
||||
tenant_id = extract_tenant_id(user)
|
||||
if not tenant_id:
|
||||
raise ValueError("User must have a tenant_id or current_tenant_id")
|
||||
self._tenant_id = tenant_id
|
||||
|
||||
# Store app context
|
||||
self._app_id = app_id
|
||||
|
||||
# Extract user context
|
||||
self._triggered_from = triggered_from
|
||||
self._creator_user_id = user.id
|
||||
|
||||
# Determine user role based on user type
|
||||
self._creator_user_role = CreatorUserRole.ACCOUNT if isinstance(user, Account) else CreatorUserRole.END_USER
|
||||
|
||||
# Initialize SQL repository for dual-write support
|
||||
self.sql_repository = SQLAlchemyWorkflowNodeExecutionRepository(session_factory, user, app_id, triggered_from)
|
||||
|
||||
# Control flag for dual-write (write to both LogStore and SQL database)
|
||||
# Set to True to enable dual-write for safe migration, False to use LogStore only
|
||||
self._enable_dual_write = os.environ.get("LOGSTORE_DUAL_WRITE_ENABLED", "true").lower() == "true"
|
||||
|
||||
def _to_logstore_model(self, domain_model: WorkflowNodeExecution) -> Sequence[tuple[str, str]]:
|
||||
logger.debug(
|
||||
"_to_logstore_model: id=%s, node_id=%s, status=%s",
|
||||
domain_model.id,
|
||||
domain_model.node_id,
|
||||
domain_model.status.value,
|
||||
)
|
||||
if not self._triggered_from:
|
||||
raise ValueError("triggered_from is required in repository constructor")
|
||||
if not self._creator_user_id:
|
||||
raise ValueError("created_by is required in repository constructor")
|
||||
if not self._creator_user_role:
|
||||
raise ValueError("created_by_role is required in repository constructor")
|
||||
|
||||
# Generate log_version as nanosecond timestamp for record versioning
|
||||
log_version = str(time.time_ns())
|
||||
|
||||
json_converter = WorkflowRuntimeTypeConverter()
|
||||
|
||||
logstore_model = [
|
||||
("id", domain_model.id),
|
||||
("log_version", log_version), # Add log_version field for append-only writes
|
||||
("tenant_id", self._tenant_id),
|
||||
("app_id", self._app_id or ""),
|
||||
("workflow_id", domain_model.workflow_id),
|
||||
(
|
||||
"triggered_from",
|
||||
self._triggered_from.value if hasattr(self._triggered_from, "value") else str(self._triggered_from),
|
||||
),
|
||||
("workflow_run_id", domain_model.workflow_execution_id or ""),
|
||||
("index", str(domain_model.index)),
|
||||
("predecessor_node_id", domain_model.predecessor_node_id or ""),
|
||||
("node_execution_id", domain_model.node_execution_id or ""),
|
||||
("node_id", domain_model.node_id),
|
||||
("node_type", domain_model.node_type.value),
|
||||
("title", domain_model.title),
|
||||
(
|
||||
"inputs",
|
||||
json.dumps(json_converter.to_json_encodable(domain_model.inputs), ensure_ascii=False)
|
||||
if domain_model.inputs
|
||||
else "{}",
|
||||
),
|
||||
(
|
||||
"process_data",
|
||||
json.dumps(json_converter.to_json_encodable(domain_model.process_data), ensure_ascii=False)
|
||||
if domain_model.process_data
|
||||
else "{}",
|
||||
),
|
||||
(
|
||||
"outputs",
|
||||
json.dumps(json_converter.to_json_encodable(domain_model.outputs), ensure_ascii=False)
|
||||
if domain_model.outputs
|
||||
else "{}",
|
||||
),
|
||||
("status", domain_model.status.value),
|
||||
("error", domain_model.error or ""),
|
||||
("elapsed_time", str(domain_model.elapsed_time)),
|
||||
(
|
||||
"execution_metadata",
|
||||
json.dumps(jsonable_encoder(domain_model.metadata), ensure_ascii=False)
|
||||
if domain_model.metadata
|
||||
else "{}",
|
||||
),
|
||||
("created_at", domain_model.created_at.isoformat() if domain_model.created_at else ""),
|
||||
("created_by_role", self._creator_user_role.value),
|
||||
("created_by", self._creator_user_id),
|
||||
("finished_at", domain_model.finished_at.isoformat() if domain_model.finished_at else ""),
|
||||
]
|
||||
|
||||
return logstore_model
|
||||
|
||||
def save(self, execution: WorkflowNodeExecution) -> None:
|
||||
"""
|
||||
Save or update a NodeExecution domain entity to LogStore.
|
||||
|
||||
This method serves as a domain-to-logstore adapter that:
|
||||
1. Converts the domain entity to its logstore representation
|
||||
2. Appends a new record with a log_version timestamp
|
||||
3. Maintains proper multi-tenancy by including tenant context during conversion
|
||||
4. Optionally writes to SQL database for dual-write support (controlled by LOGSTORE_DUAL_WRITE_ENABLED)
|
||||
|
||||
Each save operation creates a new record. Updates are simulated by writing
|
||||
new records with higher log_version numbers.
|
||||
|
||||
Args:
|
||||
execution: The NodeExecution domain entity to persist
|
||||
"""
|
||||
logger.debug(
|
||||
"save: id=%s, node_execution_id=%s, status=%s",
|
||||
execution.id,
|
||||
execution.node_execution_id,
|
||||
execution.status.value,
|
||||
)
|
||||
try:
|
||||
logstore_model = self._to_logstore_model(execution)
|
||||
self.logstore_client.put_log(AliyunLogStore.workflow_node_execution_logstore, logstore_model)
|
||||
|
||||
logger.debug(
|
||||
"Saved node execution to LogStore: id=%s, node_execution_id=%s, status=%s",
|
||||
execution.id,
|
||||
execution.node_execution_id,
|
||||
execution.status.value,
|
||||
)
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"Failed to save node execution to LogStore: id=%s, node_execution_id=%s",
|
||||
execution.id,
|
||||
execution.node_execution_id,
|
||||
)
|
||||
raise
|
||||
|
||||
# Dual-write to SQL database if enabled (for safe migration)
|
||||
if self._enable_dual_write:
|
||||
try:
|
||||
self.sql_repository.save(execution)
|
||||
logger.debug("Dual-write: saved node execution to SQL database: id=%s", execution.id)
|
||||
except Exception:
|
||||
logger.exception("Failed to dual-write node execution to SQL database: id=%s", execution.id)
|
||||
# Don't raise - LogStore write succeeded, SQL is just a backup
|
||||
|
||||
def save_execution_data(self, execution: WorkflowNodeExecution) -> None:
|
||||
"""
|
||||
Save or update the inputs, process_data, or outputs associated with a specific
|
||||
        node_execution record.

        For the LogStore implementation, this is similar to save() since we always write
        complete records. We append a new record with the updated data fields.

        Args:
            execution: The NodeExecution instance with data to save
        """
        logger.debug("save_execution_data: id=%s, node_execution_id=%s", execution.id, execution.node_execution_id)

        # In LogStore, we simply write a new complete record with the data.
        # The log_version timestamp ensures this is treated as the latest version.
        self.save(execution)

    def get_by_workflow_run(
        self,
        workflow_run_id: str,
        order_config: OrderConfig | None = None,
    ) -> Sequence[WorkflowNodeExecution]:
        """
        Retrieve all NodeExecution instances for a specific workflow run.

        Uses a LogStore SQL query with a finished_at IS NOT NULL filter for deduplication.
        This ensures we only get the final version of each node execution.

        Args:
            workflow_run_id: The workflow run ID
            order_config: Optional configuration for ordering results
                order_config.order_by: List of fields to order by (e.g., ["index", "created_at"])
                order_config.order_direction: Direction to order ("asc" or "desc")

        Returns:
            A list of NodeExecution instances

        Note:
            This method filters by finished_at IS NOT NULL to avoid duplicates from
            version updates. For complete history including intermediate states,
            a different query strategy would be needed.
        """
        logger.debug("get_by_workflow_run: workflow_run_id=%s, order_config=%s", workflow_run_id, order_config)

        # Build SQL query with deduplication using finished_at IS NOT NULL.
        # This optimization avoids window functions for the common case where we only
        # want the final state of each node execution.

        # Build ORDER BY clause
        order_clause = ""
        if order_config and order_config.order_by:
            order_fields = []
            for field in order_config.order_by:
                # Map domain field names to logstore field names if needed
                field_name = field
                if order_config.order_direction == "desc":
                    order_fields.append(f"{field_name} DESC")
                else:
                    order_fields.append(f"{field_name} ASC")
            if order_fields:
                order_clause = "ORDER BY " + ", ".join(order_fields)

        sql = f"""
            SELECT *
            FROM {AliyunLogStore.workflow_node_execution_logstore}
            WHERE workflow_run_id='{workflow_run_id}'
              AND tenant_id='{self._tenant_id}'
              AND finished_at IS NOT NULL
        """

        if self._app_id:
            sql += f" AND app_id='{self._app_id}'"

        if order_clause:
            sql += f" {order_clause}"

        try:
            # Execute SQL query
            results = self.logstore_client.execute_sql(
                sql=sql,
                query="*",
                logstore=AliyunLogStore.workflow_node_execution_logstore,
            )

            # Convert LogStore results to WorkflowNodeExecution domain models
            executions = []
            for row in results:
                try:
                    execution = _dict_to_workflow_node_execution(row)
                    executions.append(execution)
                except Exception as e:
                    logger.warning("Failed to convert row to WorkflowNodeExecution: %s, row=%s", e, row)
                    continue

            return executions

        except Exception:
            logger.exception("Failed to retrieve node executions from LogStore: workflow_run_id=%s", workflow_run_id)
            raise
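The write path above is append-only: each save() emits a complete record and relies on the log_version timestamp to mark the newest copy, while readers deduplicate by filtering finished_at IS NOT NULL. Below is a minimal sketch of that append pattern written directly against aliyun-log-python-sdk; the endpoint, credentials, project name, and field handling are illustrative assumptions, not the wiring this PR actually uses (which goes through AliyunLogStore).

import time

from aliyun.log import LogClient, LogItem, PutLogsRequest

# Illustrative client setup -- the real configuration comes from the ALIYUN_SLS_* variables.
client = LogClient("cn-hangzhou.log.aliyuncs.com", "<access_key_id>", "<access_key_secret>")
PROJECT = "dify-workflow-logs"        # assumed project name
LOGSTORE = "workflow_node_execution"  # logstore name referenced in this PR


def append_node_execution(record: dict) -> None:
    """Append one complete node-execution record; earlier versions are never updated in place."""
    item = LogItem()
    item.set_time(int(time.time()))
    # A monotonically increasing log_version lets readers pick the latest copy of a record.
    contents = [("log_version", str(time.time_ns()))]
    contents.extend((k, "" if v is None else str(v)) for k, v in record.items())
    item.set_contents(contents)
    client.put_logs(PutLogsRequest(PROJECT, LOGSTORE, "", "", [item]))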
@@ -4,6 +4,7 @@ version = "1.11.1"
requires-python = ">=3.11,<3.13"

dependencies = [
    "aliyun-log-python-sdk~=0.9.37",
    "arize-phoenix-otel~=0.9.2",
    "azure-identity==1.16.1",
    "beautifulsoup4==4.12.2",
@@ -11,7 +12,7 @@ dependencies = [
    "bs4~=0.0.1",
    "cachetools~=5.3.0",
    "celery~=5.5.2",
    "charset-normalizer>=3.4.4",
    "chardet~=5.1.0",
    "flask~=3.1.2",
    "flask-compress>=1.17,<1.18",
    "flask-cors~=6.0.0",
@@ -91,7 +92,6 @@ dependencies = [
    "weaviate-client==4.17.0",
    "apscheduler>=3.11.0",
    "weave>=0.52.16",
    "jsonschema>=4.25.1",
]
# Before adding new dependency, consider place it in
# alphabet order (a-z) and suitable group.
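The only new dependency is aliyun-log-python-sdk~=0.9.37, the client library behind the SLS-backed logstore. A hedged sketch of the kind of idempotent project/logstore bootstrap a startup hook would need with this SDK follows; the description string, shard count, and the exact error-code strings are assumptions rather than values taken from this PR.

from aliyun.log import LogClient
from aliyun.log.logexception import LogException


def ensure_logstore(client: LogClient, project: str, logstore: str, ttl: int = 365) -> None:
    """Create the SLS project and logstore only if they do not already exist."""
    try:
        client.create_project(project, "Dify workflow execution logs")  # description is illustrative
    except LogException as e:
        if e.get_error_code() != "ProjectAlreadyExist":  # assumed error code
            raise
    try:
        client.create_logstore(project, logstore, ttl=ttl, shard_count=2)
    except LogException as e:
        if e.get_error_code() != "LogStoreAlreadyExist":  # assumed error code
            raise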
api/uv.lock (generated, 4651 lines changed): diff suppressed because it is too large.
@@ -1044,6 +1044,25 @@ WORKFLOW_LOG_RETENTION_DAYS=30
# Batch size for workflow log cleanup operations (default: 100)
WORKFLOW_LOG_CLEANUP_BATCH_SIZE=100

# Aliyun SLS Logstore Configuration
# Aliyun Access Key ID
ALIYUN_SLS_ACCESS_KEY_ID=
# Aliyun Access Key Secret
ALIYUN_SLS_ACCESS_KEY_SECRET=
# Aliyun SLS Endpoint (e.g., cn-hangzhou.log.aliyuncs.com)
ALIYUN_SLS_ENDPOINT=
# Aliyun SLS Region (e.g., cn-hangzhou)
ALIYUN_SLS_REGION=
# Aliyun SLS Project Name
ALIYUN_SLS_PROJECT_NAME=
# Number of days to retain workflow run logs (default: 365 days, 3650 for permanent storage)
ALIYUN_SLS_LOGSTORE_TTL=365
# Enable dual-write to both SLS LogStore and SQL database (default: false)
LOGSTORE_DUAL_WRITE_ENABLED=false
# Enable dual-read fallback to SQL database when LogStore returns no results (default: true)
# Useful for migration scenarios where historical data exists only in SQL database
LOGSTORE_DUAL_READ_ENABLED=true

# HTTP request node in workflow configuration
HTTP_REQUEST_NODE_MAX_BINARY_SIZE=10485760
HTTP_REQUEST_NODE_MAX_TEXT_SIZE=1048576
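The LOGSTORE_DUAL_WRITE_ENABLED and LOGSTORE_DUAL_READ_ENABLED flags above describe a migration posture: new executions can be written to both stores, and reads fall back to the SQL database when LogStore returns nothing for a run. A minimal sketch of that read-side fallback is shown next; logstore_repo and sql_repo are hypothetical repository objects sharing a get_by_workflow_run interface, not names from this PR.

import os
from collections.abc import Sequence


def get_node_executions(workflow_run_id: str, logstore_repo, sql_repo) -> Sequence:
    """Read from LogStore first; fall back to SQL when dual-read is enabled and LogStore is empty."""
    dual_read = os.environ.get("LOGSTORE_DUAL_READ_ENABLED", "true").lower() == "true"
    results = logstore_repo.get_by_workflow_run(workflow_run_id)
    if not results and dual_read:
        # Historical runs created before the migration may exist only in the SQL database.
        results = sql_repo.get_by_workflow_run(workflow_run_id)
    return results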
@@ -455,6 +455,14 @@ x-shared-env: &shared-api-worker-env
  WORKFLOW_LOG_CLEANUP_ENABLED: ${WORKFLOW_LOG_CLEANUP_ENABLED:-false}
  WORKFLOW_LOG_RETENTION_DAYS: ${WORKFLOW_LOG_RETENTION_DAYS:-30}
  WORKFLOW_LOG_CLEANUP_BATCH_SIZE: ${WORKFLOW_LOG_CLEANUP_BATCH_SIZE:-100}
  ALIYUN_SLS_ACCESS_KEY_ID: ${ALIYUN_SLS_ACCESS_KEY_ID:-}
  ALIYUN_SLS_ACCESS_KEY_SECRET: ${ALIYUN_SLS_ACCESS_KEY_SECRET:-}
  ALIYUN_SLS_ENDPOINT: ${ALIYUN_SLS_ENDPOINT:-}
  ALIYUN_SLS_REGION: ${ALIYUN_SLS_REGION:-}
  ALIYUN_SLS_PROJECT_NAME: ${ALIYUN_SLS_PROJECT_NAME:-}
  ALIYUN_SLS_LOGSTORE_TTL: ${ALIYUN_SLS_LOGSTORE_TTL:-365}
  LOGSTORE_DUAL_WRITE_ENABLED: ${LOGSTORE_DUAL_WRITE_ENABLED:-false}
  LOGSTORE_DUAL_READ_ENABLED: ${LOGSTORE_DUAL_READ_ENABLED:-true}
  HTTP_REQUEST_NODE_MAX_BINARY_SIZE: ${HTTP_REQUEST_NODE_MAX_BINARY_SIZE:-10485760}
  HTTP_REQUEST_NODE_MAX_TEXT_SIZE: ${HTTP_REQUEST_NODE_MAX_TEXT_SIZE:-1048576}
  HTTP_REQUEST_NODE_SSL_VERIFY: ${HTTP_REQUEST_NODE_SSL_VERIFY:-True}
@@ -213,3 +213,24 @@ PLUGIN_VOLCENGINE_TOS_ENDPOINT=
PLUGIN_VOLCENGINE_TOS_ACCESS_KEY=
PLUGIN_VOLCENGINE_TOS_SECRET_KEY=
PLUGIN_VOLCENGINE_TOS_REGION=

# ------------------------------
# Environment Variables for Aliyun SLS (Simple Log Service)
# ------------------------------
# Aliyun SLS Access Key ID
ALIYUN_SLS_ACCESS_KEY_ID=
# Aliyun SLS Access Key Secret
ALIYUN_SLS_ACCESS_KEY_SECRET=
# Aliyun SLS Endpoint (e.g., cn-hangzhou.log.aliyuncs.com)
ALIYUN_SLS_ENDPOINT=
# Aliyun SLS Region (e.g., cn-hangzhou)
ALIYUN_SLS_REGION=
# Aliyun SLS Project Name
ALIYUN_SLS_PROJECT_NAME=
# Aliyun SLS Logstore TTL (default: 365 days, 3650 for permanent storage)
ALIYUN_SLS_LOGSTORE_TTL=365
# Enable dual-write to both LogStore and SQL database (default: true)
LOGSTORE_DUAL_WRITE_ENABLED=true
# Enable dual-read fallback to SQL database when LogStore returns no results (default: true)
# Useful for migration scenarios where historical data exists only in SQL database
LOGSTORE_DUAL_READ_ENABLED=true