43 lines
959 B
Python
43 lines
959 B
Python
#
|
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
#
|
|
|
|
|
|
import re
|
|
|
|
import unidecode
|
|
|
|
# Tokenizer for names. Each match is one of: an upper-case run optionally followed
# by lower-case letters, a lower-case run, a digit run, or (captured in the named
# group "NoToken") a run of characters that are neither ASCII letters nor digits.
TOKEN_PATTERN = re.compile(r"[A-Z]+[a-z]*|[a-z]+|\d+|(?P<NoToken>[^a-zA-Z\d]+)")
|
|
# String placed between tokens when a converted name is assembled.
DEFAULT_SEPARATOR = "_"
|
|
|
|
|
|
def name_conversion(text):
    """
    Normalize a name into a lower-case string of tokens joined by DEFAULT_SEPARATOR.

    The input is first passed through ``unidecode.unidecode`` and then split with
    TOKEN_PATTERN. Runs matched by the "NoToken" group act only as boundaries.

    Example: '1MyName' -> '_1_my_name'
    """
    ascii_text = unidecode.unidecode(text)

    # A "NoToken" match becomes an empty placeholder; every other match is kept verbatim.
    parts = [
        "" if match.group("NoToken") is not None else match.group(0)
        for match in TOKEN_PATTERN.finditer(ascii_text)
    ]

    # Only interior placeholders are dropped. The first and last entries are kept
    # even when empty, so a leading/trailing boundary still yields a separator.
    if len(parts) > 2:
        head, *middle, tail = parts
        parts = [head, *(part for part in middle if part), tail]

    # A leading all-digit token gets an empty token in front of it, which turns
    # into a leading separator once the parts are joined.
    if parts and parts[0].isdigit():
        parts = ["", *parts]

    return DEFAULT_SEPARATOR.join(parts).lower()
|
|
|
|
|
|
def safe_name_conversion(text):
    """
    Convert ``text`` with ``name_conversion`` and reject an empty result.

    :param text: the original name to convert
    :return: the converted, non-empty name
    :raises ValueError: if the conversion produces an empty string
    """
    new = name_conversion(text)
    if not new:
        # ValueError subclasses Exception, so callers that catch Exception keep working.
        raise ValueError(f"initial string '{text}' converted to empty")
    return new
|