Excel & CSV query runner (#2478)

* Excel query runner * Param handling for read_excel * CSV query runner * Fix wrong module name * Use yaml as query language * Use yaml as query language for CSV * Added icon and required modules * Local address filtering * Fix syntax error
2025-12-25 01:03:20 -05:00 · 2021-07-28 05:27:09 +09:00
parent ff7c5e8367
commit b9cb8191f5
5 changed files with 201 additions and 1 deletions
--- a/client/app/assets/images/db-logos/excel.png
+++ b/client/app/assets/images/db-logos/excel.png
--- a/redash/query_runner/csv.py
+++ b/redash/query_runner/csv.py
@@ -0,0 +1,100 @@
+import logging
+import yaml
+import requests
+import io
+
+from redash import settings
+from redash.query_runner import *
+from redash.utils import json_dumps
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# pandas and numpy are optional dependencies: `enabled` records whether both
+# could be imported, and CSV.enabled() returns that flag.
+try:
+    import pandas as pd
+    import numpy as np
+    enabled = True
+except ImportError:
+    enabled = False
+
+
+class CSV(BaseQueryRunner):
+    """Query runner that downloads a CSV file over HTTP and returns its rows.
+
+    The query text is a YAML mapping. ``url`` (required) is the address of
+    the file and ``user-agent`` (optional) is sent as the ``User-agent``
+    request header. Every other key is forwarded to ``pandas.read_csv`` as a
+    keyword argument.
+    """
+
+    should_annotate_query = False
+
+    @classmethod
+    def name(cls):
+        """Return the name of this query runner."""
+        return "CSV"
+
+    @classmethod
+    def enabled(cls):
+        """Return True when the optional pandas/numpy imports succeeded."""
+        return enabled
+
+    @classmethod
+    def configuration_schema(cls):
+        """Return the configuration schema; this runner has no settings."""
+        return {
+            'type': 'object',
+            'properties': {},
+        }
+
+    def __init__(self, configuration):
+        super(CSV, self).__init__(configuration)
+        self.syntax = "yaml"
+
+    def test_connection(self):
+        """Do nothing: the target URL is only known once a query is run."""
+        pass
+
+    @staticmethod
+    def _to_redash_data(df):
+        """Convert a DataFrame into a ``{'columns': [...], 'rows': [...]}`` dict.
+
+        Columns whose dtype matches none of the conversions below are left
+        out of the result.
+        """
+        conversions = [
+            {'pandas_type': np.integer, 'redash_type': 'integer'},
+            {'pandas_type': np.inexact, 'redash_type': 'float'},
+            {
+                'pandas_type': np.datetime64,
+                'redash_type': 'datetime',
+                # NaT has no strftime(); map missing timestamps to None.
+                'to_redash': lambda x: None if pd.isnull(x) else x.strftime('%Y-%m-%d %H:%M:%S'),
+            },
+            {'pandas_type': np.bool_, 'redash_type': 'boolean'},
+            # np.object_ rather than the `np.object` alias, which was
+            # removed in NumPy 1.24.
+            {'pandas_type': np.object_, 'redash_type': 'string'},
+        ]
+
+        data = {'columns': [], 'rows': []}
+        labels = []
+        for dtype, label in zip(df.dtypes, df.columns):
+            for conversion in conversions:
+                if issubclass(dtype.type, conversion['pandas_type']):
+                    data['columns'].append({'name': label, 'friendly_name': label, 'type': conversion['redash_type']})
+                    labels.append(label)
+                    func = conversion.get('to_redash')
+                    if func:
+                        df[label] = df[label].apply(func)
+                    break
+        data['rows'] = df[labels].replace({np.nan: None}).to_dict(orient='records')
+        return data
+
+    def run_query(self, query, user):
+        """Fetch the CSV file described by ``query`` and convert it to a result.
+
+        :param query: YAML text; see the class docstring for the keys.
+        :param user: unused by this runner.
+        :return: ``(json_data, error)``; ``json_data`` is None on failure and
+            ``error`` is None on success.
+        """
+        path = ""
+        try:
+            args = yaml.safe_load(query)
+            if not isinstance(args, dict):
+                raise ValueError("The query must be a YAML mapping with a 'url' key.")
+
+            path = args.pop('url', None) or ""
+            if not path:
+                raise ValueError("The query is missing the 'url' key.")
+
+            # 'user-agent' is optional; an absent or empty value sends an
+            # empty header.
+            ua = args.pop('user-agent', None)
+            ua = "" if ua is None else str(ua)
+
+            # This check lives in the same try block as the download so a
+            # failure is reported to the caller instead of being swallowed.
+            # NOTE(review): is_private_address is expected to come from the
+            # star import of redash.query_runner -- confirm it is exported.
+            if settings.ENFORCE_PRIVATE_ADDRESS_BLOCK and is_private_address(path):
+                raise ValueError("Can't query private addresses.")
+
+            # Default separator; the query may override it with its own 'sep'.
+            args.setdefault('sep', ',')
+
+            response = requests.get(url=path, headers={"User-agent": ua})
+            # Fail on HTTP errors instead of parsing an error page as CSV.
+            response.raise_for_status()
+            df = pd.read_csv(io.BytesIO(response.content), **args)
+
+            json_data = json_dumps(self._to_redash_data(df))
+            error = None
+        except KeyboardInterrupt:
+            error = "Query cancelled by user."
+            json_data = None
+        except Exception as e:
+            error = "Error reading {0}. {1}".format(path, str(e))
+            json_data = None
+
+        return json_data, error
+
+    def get_schema(self):
+        """Schema browsing is not supported for this runner."""
+        raise NotSupported()
+
+# Pass the CSV runner class to `register` (star-imported from redash.query_runner).
+register(CSV)
--- a/redash/query_runner/excel.py
+++ b/redash/query_runner/excel.py
@@ -0,0 +1,96 @@
+import logging
+import yaml
+import requests
+
+from redash import settings
+from redash.query_runner import *
+from redash.utils import json_dumps
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# pandas, xlrd, openpyxl and numpy are optional dependencies: `enabled`
+# records whether all of them could be imported, and Excel.enabled() returns
+# that flag.
+try:
+    import pandas as pd
+    import xlrd
+    import openpyxl
+    import numpy as np
+    enabled = True
+except ImportError:
+    enabled = False
+
+class Excel(BaseQueryRunner):
+    """Query runner that downloads an Excel workbook over HTTP and returns its rows.
+
+    The query text is a YAML mapping. ``url`` (required) is the address of
+    the file and ``user-agent`` (optional) is sent as the ``User-agent``
+    request header. Every other key is forwarded to ``pandas.read_excel`` as
+    a keyword argument.
+    """
+
+    should_annotate_query = False
+
+    @classmethod
+    def enabled(cls):
+        """Return True when the optional pandas/xlrd/openpyxl/numpy imports succeeded."""
+        return enabled
+
+    @classmethod
+    def configuration_schema(cls):
+        """Return the configuration schema; this runner has no settings."""
+        return {
+            'type': 'object',
+            'properties': {},
+        }
+
+    def __init__(self, configuration):
+        super(Excel, self).__init__(configuration)
+        self.syntax = "yaml"
+
+    def test_connection(self):
+        """Do nothing: the target URL is only known once a query is run."""
+        pass
+
+    @staticmethod
+    def _to_redash_data(df):
+        """Convert a DataFrame into a ``{'columns': [...], 'rows': [...]}`` dict.
+
+        Columns whose dtype matches none of the conversions below are left
+        out of the result.
+        """
+        conversions = [
+            {'pandas_type': np.integer, 'redash_type': 'integer'},
+            {'pandas_type': np.inexact, 'redash_type': 'float'},
+            {
+                'pandas_type': np.datetime64,
+                'redash_type': 'datetime',
+                # NaT has no strftime(); map missing timestamps to None.
+                'to_redash': lambda x: None if pd.isnull(x) else x.strftime('%Y-%m-%d %H:%M:%S'),
+            },
+            {'pandas_type': np.bool_, 'redash_type': 'boolean'},
+            # np.object_ rather than the `np.object` alias, which was
+            # removed in NumPy 1.24.
+            {'pandas_type': np.object_, 'redash_type': 'string'},
+        ]
+
+        data = {'columns': [], 'rows': []}
+        labels = []
+        for dtype, label in zip(df.dtypes, df.columns):
+            for conversion in conversions:
+                if issubclass(dtype.type, conversion['pandas_type']):
+                    data['columns'].append({'name': label, 'friendly_name': label, 'type': conversion['redash_type']})
+                    labels.append(label)
+                    func = conversion.get('to_redash')
+                    if func:
+                        df[label] = df[label].apply(func)
+                    break
+        data['rows'] = df[labels].replace({np.nan: None}).to_dict(orient='records')
+        return data
+
+    def run_query(self, query, user):
+        """Fetch the workbook described by ``query`` and convert it to a result.
+
+        :param query: YAML text; see the class docstring for the keys.
+        :param user: unused by this runner.
+        :return: ``(json_data, error)``; ``json_data`` is None on failure and
+            ``error`` is None on success.
+        """
+        # Imported here because this module has no top-level `import io`.
+        import io
+
+        path = ""
+        try:
+            args = yaml.safe_load(query)
+            if not isinstance(args, dict):
+                raise ValueError("The query must be a YAML mapping with a 'url' key.")
+
+            path = args.pop('url', None) or ""
+            if not path:
+                raise ValueError("The query is missing the 'url' key.")
+
+            # 'user-agent' is optional; an absent or empty value sends an
+            # empty header.
+            ua = args.pop('user-agent', None)
+            ua = "" if ua is None else str(ua)
+
+            # This check lives in the same try block as the download so a
+            # failure is reported to the caller instead of being swallowed.
+            # NOTE(review): is_private_address is expected to come from the
+            # star import of redash.query_runner -- confirm it is exported.
+            if settings.ENFORCE_PRIVATE_ADDRESS_BLOCK and is_private_address(path):
+                raise ValueError("Can't query private addresses.")
+
+            response = requests.get(url=path, headers={"User-agent": ua})
+            # Fail on HTTP errors instead of parsing an error page as a workbook.
+            response.raise_for_status()
+            # Wrap the bytes in a file-like object; newer pandas versions
+            # deprecate passing raw bytes to read_excel.
+            df = pd.read_excel(io.BytesIO(response.content), **args)
+
+            json_data = json_dumps(self._to_redash_data(df))
+            error = None
+        except KeyboardInterrupt:
+            error = "Query cancelled by user."
+            json_data = None
+        except Exception as e:
+            error = "Error reading {0}. {1}".format(path, str(e))
+            json_data = None
+
+        return json_data, error
+
+    def get_schema(self):
+        """Schema browsing is not supported for this runner."""
+        raise NotSupported()
+
+# Pass the Excel runner class to `register` (star-imported from redash.query_runner).
+register(Excel)
--- a/redash/settings/__init__.py
+++ b/redash/settings/__init__.py
@@ -380,7 +380,9 @@ default_query_runners = [
    "redash.query_runner.cloudwatch",
    "redash.query_runner.cloudwatch_insights",
    "redash.query_runner.corporate_memory",
-    "redash.query_runner.sparql_endpoint"
+    "redash.query_runner.sparql_endpoint",
+    "redash.query_runner.excel",
+    "redash.query_runner.csv"
 ]

 enabled_query_runners = array_from_string(
--- a/requirements_all_ds.txt
+++ b/requirements_all_ds.txt
@@ -37,3 +37,5 @@ python-rapidjson==0.8.0
 pyodbc==4.0.28
 trino~=0.305
 cmem-cmempy==21.2.3
+xlrd==2.0.1
+openpyxl==3.0.7