66 lines
2.5 KiB
Python
Executable File
66 lines
2.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import json
|
|
import sys
|
|
from typing import Any
|
|
|
|
#
|
|
# record_obfuscator is a tiny script that:
|
|
# 1. reads JSON lines from stdin
|
|
# 2. obfuscates the data in AirbyteRecordMessage lines
|
|
# 3. spits obfuscated lines back to stdout
|
|
# All stdin lines that are not type: RECORD remain unchanged.
|
|
#
|
|
# It's used in live-tests (airbyte-ci/connectors/live-tests) to be able to dump raw data from a live connection
|
|
# without leaking actual sensitive production data.
|
|
#
|
|
# The script is copied over into the live tests runner dagger container.
|
|
#
|
|
|
|
|
|
def _generate_hash(value: Any) -> str:
    """Return the hex-encoded SHA-256 digest of ``str(value)``."""
    return hashlib.sha256(str(value).encode()).hexdigest()


def obfuscate(value: Any) -> Any:
    """Return an obfuscated version of the input value while retaining its type and length information.

    The result is always a string made of a type/length prefix followed by a
    SHA-256 hex digest derived from the value:

    * ``bool``  -> ``boolean_<hash>``
    * ``str``   -> ``string_len-<len(value)>_<hash>``
    * ``int``   -> ``integer_len-<len(str(value))><hash>``
    * ``float`` -> ``number_len-<len(str(value))><hash>``
    * ``None``  -> ``null_<hash>``
    * ``list``  -> ``array_len-<len(value)><hash>``
    * ``dict``  -> ``object_len-<number of keys><hash>``

    Raises:
        ValueError: if ``value`` is none of the types listed above.
    """
    # bool must be tested before int: bool is a subclass of int, so an
    # int-first check would label True/False as integers and make the
    # boolean branch unreachable.
    if isinstance(value, bool):
        return "boolean_" + _generate_hash(value)
    if isinstance(value, str):
        return f"string_len-{len(value)}_" + _generate_hash(value)
    if isinstance(value, int):
        return f"integer_len-{len(str(value))}" + _generate_hash(value)
    if isinstance(value, float):
        return f"number_len-{len(str(value))}" + _generate_hash(value)
    if value is None:
        return "null_" + _generate_hash(value)
    if isinstance(value, (list, dict)):
        # Keys are sorted so equal containers hash identically regardless of
        # key insertion order. The serialized JSON is passed as bytes, so the
        # helper hashes the str() form of that bytes object; this is kept
        # as-is so existing hashes for arrays/objects stay stable.
        serialized = json.dumps(value, sort_keys=True).encode()
        kind = "array" if isinstance(value, list) else "object"
        return f"{kind}_len-{len(value)}" + _generate_hash(serialized)
    raise ValueError(f"Unsupported data type: {type(value)}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Stream stdin line by line: every parseable line is re-serialized to
    # stdout, with the record data of RECORD messages replaced by obfuscated
    # values; unparseable lines are diverted to stderr.
    for line in sys.stdin:
        line = line.strip()
        try:
            data = json.loads(line)
        except (ValueError, RecursionError):
            # We don't expect invalid JSON, so if we see it, it goes to stderr.
            # json.JSONDecodeError is a ValueError subclass.
            sys.stderr.write(f"{line}\n")
        # try / except / else: the else block runs only if no exception was raised.
        else:
            # Valid JSON that is not an object (e.g. a bare list or number)
            # has no "type" field to inspect; pass it through instead of
            # crashing on a missing .get().
            if isinstance(data, dict) and data.get("type") == "RECORD":
                record_data = data["record"].get("data", {})
                data["record"]["data"] = {k: obfuscate(v) for k, v in record_data.items()}
            sys.stdout.write(f"{json.dumps(data)}\n")
|