IMPALA-6999: Upgrade to sqlparse-0.1.19 for Impala shell

sqlparse-0.1.19 is the last version of sqlparse that supports Python
2.6.

Testing:
- Ran all end-to-end tests

Change-Id: Ide51ef3ac52d25a96b0fa832e29b6535197d23cb
Reviewed-on: http://gerrit.cloudera.org:8080/10354
Reviewed-by: David Knupp <dknupp@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Author: Fredy Wijaya
Date: 2018-05-08 20:22:11 -07:00
Committed by: Impala Public Jenkins
Parent: 8668d18ebb
Commit: 49413d9c5b
61 changed files with 558 additions and 188 deletions


@@ -0,0 +1,78 @@
# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com
#
# This module is part of python-sqlparse and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php.
"""Parse SQL statements."""
__version__ = '0.1.19'
# Setup namespace
from sqlparse import engine
from sqlparse import filters
from sqlparse import formatter
# Deprecated in 0.1.5. Will be removed in 0.2.0
from sqlparse.exceptions import SQLParseError
def parse(sql, encoding=None):
"""Parse sql and return a list of statements.
:param sql: A string containing one or more SQL statements.
:param encoding: The encoding of the statement (optional).
:returns: A tuple of :class:`~sqlparse.sql.Statement` instances.
"""
return tuple(parsestream(sql, encoding))
def parsestream(stream, encoding=None):
"""Parses sql statements from file-like object.
:param stream: A file-like object.
:param encoding: The encoding of the stream contents (optional).
:returns: A generator of :class:`~sqlparse.sql.Statement` instances.
"""
stack = engine.FilterStack()
stack.full_analyze()
return stack.run(stream, encoding)
def format(sql, **options):
"""Format *sql* according to *options*.
Available options are documented in :ref:`formatting`.
In addition to the formatting options this function accepts the
keyword "encoding" which determines the encoding of the statement.
:returns: The formatted SQL statement as string.
"""
encoding = options.pop('encoding', None)
stack = engine.FilterStack()
options = formatter.validate_options(options)
stack = formatter.build_filter_stack(stack, options)
stack.postprocess.append(filters.SerializerUnicode())
return ''.join(stack.run(sql, encoding))
def split(sql, encoding=None):
"""Split *sql* into single statements.
:param sql: A string containing one or more SQL statements.
:param encoding: The encoding of the statement (optional).
:returns: A list of strings.
"""
stack = engine.FilterStack()
stack.split_statements = True
return [unicode(stmt).strip() for stmt in stack.run(sql, encoding)]
from sqlparse.engine.filter import StatementFilter
def split2(stream):
splitter = StatementFilter()
return list(splitter.process(None, stream))
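
A quick usage sketch of the public API defined above, assuming the vendored package is importable as sqlparse from the Impala shell's Python path; the query strings are made up for illustration:

import sqlparse

raw = "select col1, col2 from tbl where col1 = 1; select 42;"

# split() returns each statement as a plain string (two here).
print(len(sqlparse.split(raw)))

# parse() returns a tuple of grouped Statement token trees.
print(sqlparse.parse(raw)[0].__class__.__name__)   # Statement

# format() drives the filter stack assembled by sqlparse.formatter.
print(sqlparse.format(raw, reindent=True, keyword_case='upper'))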


@@ -0,0 +1,80 @@
# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com
#
# This module is part of python-sqlparse and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php.
"""filter"""
from sqlparse import lexer
from sqlparse.engine import grouping
from sqlparse.engine.filter import StatementFilter
# XXX remove this when cleanup is complete
Filter = object
class FilterStack(object):
def __init__(self):
self.preprocess = []
self.stmtprocess = []
self.postprocess = []
self.split_statements = False
self._grouping = False
def _flatten(self, stream):
for token in stream:
if token.is_group():
for t in self._flatten(token.tokens):
yield t
else:
yield token
def enable_grouping(self):
self._grouping = True
def full_analyze(self):
self.enable_grouping()
def run(self, sql, encoding=None):
stream = lexer.tokenize(sql, encoding)
# Process token stream
if self.preprocess:
for filter_ in self.preprocess:
stream = filter_.process(self, stream)
if (self.stmtprocess or self.postprocess or self.split_statements
or self._grouping):
splitter = StatementFilter()
stream = splitter.process(self, stream)
if self._grouping:
def _group(stream):
for stmt in stream:
grouping.group(stmt)
yield stmt
stream = _group(stream)
if self.stmtprocess:
def _run1(stream):
ret = []
for stmt in stream:
for filter_ in self.stmtprocess:
filter_.process(self, stmt)
ret.append(stmt)
return ret
stream = _run1(stream)
if self.postprocess:
def _run2(stream):
for stmt in stream:
stmt.tokens = list(self._flatten(stmt.tokens))
for filter_ in self.postprocess:
stmt = filter_.process(self, stmt)
yield stmt
stream = _run2(stream)
return stream
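
A minimal sketch of driving FilterStack directly, mirroring what parse() and format() do internally (the SQL text is hypothetical):

from sqlparse import engine, filters

stack = engine.FilterStack()
stack.full_analyze()   # alias for enable_grouping(), see above
# With SerializerUnicode in postprocess, run() yields one unicode string
# per statement instead of Statement token trees.
stack.postprocess.append(filters.SerializerUnicode())
for stmt in stack.run("select a from t; select b from u;"):
    print(stmt)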


@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
from sqlparse.sql import Statement, Token
from sqlparse import tokens as T
class StatementFilter:
"Filter that split stream at individual statements"
def __init__(self):
self._in_declare = False
self._in_dbldollar = False
self._is_create = False
self._begin_depth = 0
def _reset(self):
"Set the filter attributes to its default values"
self._in_declare = False
self._in_dbldollar = False
self._is_create = False
self._begin_depth = 0
def _change_splitlevel(self, ttype, value):
"Get the new split level (increase, decrease or remain equal)"
# PostgreSQL
if (ttype == T.Name.Builtin
and value.startswith('$') and value.endswith('$')):
if self._in_dbldollar:
self._in_dbldollar = False
return -1
else:
self._in_dbldollar = True
return 1
elif self._in_dbldollar:
return 0
# ANSI
if ttype not in T.Keyword:
return 0
unified = value.upper()
if unified == 'DECLARE' and self._is_create and self._begin_depth == 0:
self._in_declare = True
return 1
if unified == 'BEGIN':
self._begin_depth += 1
if self._in_declare or self._is_create:
# FIXME(andi): This makes no sense.
return 1
return 0
if unified in ('END IF', 'END FOR'):
return -1
if unified == 'END':
# Should this respect a preceding BEGIN?
# In CASE ... WHEN ... END this results in a split level -1.
self._begin_depth = max(0, self._begin_depth - 1)
return -1
if ttype is T.Keyword.DDL and unified.startswith('CREATE'):
self._is_create = True
return 0
if (unified in ('IF', 'FOR')
and self._is_create and self._begin_depth > 0):
return 1
# Default
return 0
def process(self, stack, stream):
"Process the stream"
consume_ws = False
splitlevel = 0
stmt = None
stmt_tokens = []
# Run over all stream tokens
for ttype, value in stream:
# Yield the finished statement once a non-whitespace, non-comment token follows it
if consume_ws and ttype not in (T.Whitespace, T.Comment.Single):
stmt.tokens = stmt_tokens
yield stmt
# Reset filter and prepare to process next statement
self._reset()
consume_ws = False
splitlevel = 0
stmt = None
# Create a new statement if we are not currently in one of them
if stmt is None:
stmt = Statement()
stmt_tokens = []
# Change current split level (increase, decrease or remain equal)
splitlevel += self._change_splitlevel(ttype, value)
# Append the token to the current statement
stmt_tokens.append(Token(ttype, value))
# Check whether we reached the end of a statement
if splitlevel <= 0 and ttype is T.Punctuation and value == ';':
consume_ws = True
# Yield pending statement (if any)
if stmt is not None:
stmt.tokens = stmt_tokens
yield stmt
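
A small sketch of the splitter in isolation: feed it the (ttype, value) pairs produced by the lexer and collect the Statement objects it yields. The procedure text below is hypothetical:

from sqlparse import lexer
from sqlparse.engine.filter import StatementFilter

sql = "create procedure p() begin select 1; select 2; end; select 3;"
splitter = StatementFilter()
stmts = list(splitter.process(None, lexer.tokenize(sql)))

# Because CREATE ... BEGIN raises the split level, the semicolons inside the
# procedure body do not end it, so two statements come back: the CREATE
# PROCEDURE block and the trailing SELECT.
print(len(stmts))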


@@ -0,0 +1,461 @@
# -*- coding: utf-8 -*-
import itertools
from sqlparse import sql
from sqlparse import tokens as T
try:
next
except NameError: # Python < 2.6
next = lambda i: i.next()
def _group_left_right(tlist, ttype, value, cls,
check_right=lambda t: True,
check_left=lambda t: True,
include_semicolon=False):
[_group_left_right(sgroup, ttype, value, cls, check_right, check_left,
include_semicolon) for sgroup in tlist.get_sublists()
if not isinstance(sgroup, cls)]
idx = 0
token = tlist.token_next_match(idx, ttype, value)
while token:
right = tlist.token_next(tlist.token_index(token))
left = tlist.token_prev(tlist.token_index(token))
if right is None or not check_right(right):
token = tlist.token_next_match(tlist.token_index(token) + 1,
ttype, value)
elif left is None or not check_left(left):
token = tlist.token_next_match(tlist.token_index(token) + 1,
ttype, value)
else:
if include_semicolon:
sright = tlist.token_next_match(tlist.token_index(right),
T.Punctuation, ';')
if sright is not None:
# only overwrite "right" if a semicolon is actually
# present.
right = sright
tokens = tlist.tokens_between(left, right)[1:]
if not isinstance(left, cls):
new = cls([left])
new_idx = tlist.token_index(left)
tlist.tokens.remove(left)
tlist.tokens.insert(new_idx, new)
left = new
left.tokens.extend(tokens)
for t in tokens:
tlist.tokens.remove(t)
token = tlist.token_next_match(tlist.token_index(left) + 1,
ttype, value)
def _find_matching(idx, tlist, start_ttype, start_value, end_ttype, end_value):
depth = 1
for tok in tlist.tokens[idx:]:
if tok.match(start_ttype, start_value):
depth += 1
elif tok.match(end_ttype, end_value):
depth -= 1
if depth == 1:
return tok
return None
def _group_matching(tlist, start_ttype, start_value, end_ttype, end_value,
cls, include_semicolon=False, recurse=False):
[_group_matching(sgroup, start_ttype, start_value, end_ttype, end_value,
cls, include_semicolon) for sgroup in tlist.get_sublists()
if recurse]
if isinstance(tlist, cls):
idx = 1
else:
idx = 0
token = tlist.token_next_match(idx, start_ttype, start_value)
while token:
tidx = tlist.token_index(token)
end = _find_matching(tidx, tlist, start_ttype, start_value,
end_ttype, end_value)
if end is None:
idx = tidx + 1
else:
if include_semicolon:
next_ = tlist.token_next(tlist.token_index(end))
if next_ and next_.match(T.Punctuation, ';'):
end = next_
group = tlist.group_tokens(cls, tlist.tokens_between(token, end))
_group_matching(group, start_ttype, start_value,
end_ttype, end_value, cls, include_semicolon)
idx = tlist.token_index(group) + 1
token = tlist.token_next_match(idx, start_ttype, start_value)
def group_if(tlist):
_group_matching(tlist, T.Keyword, 'IF', T.Keyword, 'END IF', sql.If, True)
def group_for(tlist):
_group_matching(tlist, T.Keyword, 'FOR', T.Keyword, 'END LOOP',
sql.For, True)
def group_foreach(tlist):
_group_matching(tlist, T.Keyword, 'FOREACH', T.Keyword, 'END LOOP',
sql.For, True)
def group_begin(tlist):
_group_matching(tlist, T.Keyword, 'BEGIN', T.Keyword, 'END',
sql.Begin, True)
def group_as(tlist):
def _right_valid(token):
# Currently limited to DML/DDL. Maybe additional non-SQL reserved
# keywords should be added here (see issue8).
return not token.ttype in (T.DML, T.DDL)
def _left_valid(token):
if token.ttype is T.Keyword and token.value in ('NULL',):
return True
return token.ttype is not T.Keyword
_group_left_right(tlist, T.Keyword, 'AS', sql.Identifier,
check_right=_right_valid,
check_left=_left_valid)
def group_assignment(tlist):
_group_left_right(tlist, T.Assignment, ':=', sql.Assignment,
include_semicolon=True)
def group_comparison(tlist):
def _parts_valid(token):
return (token.ttype in (T.String.Symbol, T.String.Single,
T.Name, T.Number, T.Number.Float,
T.Number.Integer, T.Literal,
T.Literal.Number.Integer, T.Name.Placeholder)
or isinstance(token, (sql.Identifier, sql.Parenthesis))
or (token.ttype is T.Keyword
and token.value.upper() in ['NULL', ]))
_group_left_right(tlist, T.Operator.Comparison, None, sql.Comparison,
check_left=_parts_valid, check_right=_parts_valid)
def group_case(tlist):
_group_matching(tlist, T.Keyword, 'CASE', T.Keyword, 'END', sql.Case,
include_semicolon=True, recurse=True)
def group_identifier(tlist):
def _consume_cycle(tl, i):
# TODO: Usage of the Wildcard token is ambiguous here.
x = itertools.cycle((
lambda y: (y.match(T.Punctuation, '.')
or y.ttype in (T.Operator,
T.Wildcard,
T.Name)
or isinstance(y, sql.SquareBrackets)),
lambda y: (y.ttype in (T.String.Symbol,
T.Name,
T.Wildcard,
T.Literal.String.Single,
T.Literal.Number.Integer,
T.Literal.Number.Float)
or isinstance(y, (sql.Parenthesis,
sql.SquareBrackets,
sql.Function)))))
for t in tl.tokens[i:]:
# Don't take whitespaces into account.
if t.ttype is T.Whitespace:
yield t
continue
if next(x)(t):
yield t
else:
if isinstance(t, sql.Comment) and t.is_multiline():
yield t
return
def _next_token(tl, i):
# Chooses the next token. If two tokens are found, the one that
# occurs first is returned.
t1 = tl.token_next_by_type(
i, (T.String.Symbol, T.Name, T.Literal.Number.Integer,
T.Literal.Number.Float))
i1 = tl.token_index(t1, start=i) if t1 else None
t2_end = None if i1 is None else i1 + 1
t2 = tl.token_next_by_instance(i, (sql.Function, sql.Parenthesis), end=t2_end)
if t1 and t2:
i2 = tl.token_index(t2, start=i)
if i1 > i2:
return t2
else:
return t1
elif t1:
return t1
else:
return t2
# bottom up approach: group subgroups first
[group_identifier(sgroup) for sgroup in tlist.get_sublists()
if not isinstance(sgroup, sql.Identifier)]
# real processing
idx = 0
token = _next_token(tlist, idx)
while token:
identifier_tokens = [token] + list(
_consume_cycle(tlist,
tlist.token_index(token, start=idx) + 1))
# remove trailing whitespace
if identifier_tokens and identifier_tokens[-1].ttype is T.Whitespace:
identifier_tokens = identifier_tokens[:-1]
if not (len(identifier_tokens) == 1
and (isinstance(identifier_tokens[0], (sql.Function, sql.Parenthesis))
or identifier_tokens[0].ttype in (T.Literal.Number.Integer,
T.Literal.Number.Float))):
group = tlist.group_tokens(sql.Identifier, identifier_tokens)
idx = tlist.token_index(group, start=idx) + 1
else:
idx += 1
token = _next_token(tlist, idx)
def group_identifier_list(tlist):
[group_identifier_list(sgroup) for sgroup in tlist.get_sublists()
if not isinstance(sgroup, sql.IdentifierList)]
# Allowed list items
fend1_funcs = [lambda t: isinstance(t, (sql.Identifier, sql.Function,
sql.Case)),
lambda t: t.is_whitespace(),
lambda t: t.ttype == T.Name,
lambda t: t.ttype == T.Wildcard,
lambda t: t.match(T.Keyword, 'null'),
lambda t: t.match(T.Keyword, 'role'),
lambda t: t.ttype == T.Number.Integer,
lambda t: t.ttype == T.String.Single,
lambda t: t.ttype == T.Name.Placeholder,
lambda t: t.ttype == T.Keyword,
lambda t: isinstance(t, sql.Comparison),
lambda t: isinstance(t, sql.Comment),
lambda t: t.ttype == T.Comment.Multiline,
]
tcomma = tlist.token_next_match(0, T.Punctuation, ',')
start = None
while tcomma is not None:
# Go back one idx to make sure to find the correct tcomma
idx = tlist.token_index(tcomma)
before = tlist.token_prev(idx)
after = tlist.token_next(idx)
# Check if the tokens around tcomma belong to a list
bpassed = apassed = False
for func in fend1_funcs:
if before is not None and func(before):
bpassed = True
if after is not None and func(after):
apassed = True
if not bpassed or not apassed:
# Something's wrong here, skip ahead to next ","
start = None
tcomma = tlist.token_next_match(idx + 1,
T.Punctuation, ',')
else:
if start is None:
start = before
after_idx = tlist.token_index(after, start=idx)
next_ = tlist.token_next(after_idx)
if next_ is None or not next_.match(T.Punctuation, ','):
# Reached the end of the list
tokens = tlist.tokens_between(start, after)
group = tlist.group_tokens(sql.IdentifierList, tokens)
start = None
tcomma = tlist.token_next_match(tlist.token_index(group) + 1,
T.Punctuation, ',')
else:
tcomma = next_
def group_brackets(tlist):
"""Group parentheses () or square brackets []
This is just like _group_matching, but complicated by the fact that
round brackets can contain square bracket groups and vice versa
"""
if isinstance(tlist, (sql.Parenthesis, sql.SquareBrackets)):
idx = 1
else:
idx = 0
# Find the first opening bracket
token = tlist.token_next_match(idx, T.Punctuation, ['(', '['])
while token:
start_val = token.value # either '(' or '['
if start_val == '(':
end_val = ')'
group_class = sql.Parenthesis
else:
end_val = ']'
group_class = sql.SquareBrackets
tidx = tlist.token_index(token)
# Find the corresponding closing bracket
end = _find_matching(tidx, tlist, T.Punctuation, start_val,
T.Punctuation, end_val)
if end is None:
idx = tidx + 1
else:
group = tlist.group_tokens(group_class,
tlist.tokens_between(token, end))
# Check for nested bracket groups within this group
group_brackets(group)
idx = tlist.token_index(group) + 1
# Find the next opening bracket
token = tlist.token_next_match(idx, T.Punctuation, ['(', '['])
def group_comments(tlist):
[group_comments(sgroup) for sgroup in tlist.get_sublists()
if not isinstance(sgroup, sql.Comment)]
idx = 0
token = tlist.token_next_by_type(idx, T.Comment)
while token:
tidx = tlist.token_index(token)
end = tlist.token_not_matching(tidx + 1,
[lambda t: t.ttype in T.Comment,
lambda t: t.is_whitespace()])
if end is None:
idx = tidx + 1
else:
eidx = tlist.token_index(end)
grp_tokens = tlist.tokens_between(token,
tlist.token_prev(eidx, False))
group = tlist.group_tokens(sql.Comment, grp_tokens)
idx = tlist.token_index(group)
token = tlist.token_next_by_type(idx, T.Comment)
def group_where(tlist):
[group_where(sgroup) for sgroup in tlist.get_sublists()
if not isinstance(sgroup, sql.Where)]
idx = 0
token = tlist.token_next_match(idx, T.Keyword, 'WHERE')
stopwords = ('ORDER', 'GROUP', 'LIMIT', 'UNION', 'EXCEPT', 'HAVING')
while token:
tidx = tlist.token_index(token)
end = tlist.token_next_match(tidx + 1, T.Keyword, stopwords)
if end is None:
end = tlist._groupable_tokens[-1]
else:
end = tlist.tokens[tlist.token_index(end) - 1]
group = tlist.group_tokens(sql.Where,
tlist.tokens_between(token, end),
ignore_ws=True)
idx = tlist.token_index(group)
token = tlist.token_next_match(idx, T.Keyword, 'WHERE')
def group_aliased(tlist):
clss = (sql.Identifier, sql.Function, sql.Case)
[group_aliased(sgroup) for sgroup in tlist.get_sublists()
if not isinstance(sgroup, clss)]
idx = 0
token = tlist.token_next_by_instance(idx, clss)
while token:
next_ = tlist.token_next(tlist.token_index(token))
if next_ is not None and isinstance(next_, clss):
if not next_.value.upper().startswith('VARCHAR'):
grp = tlist.tokens_between(token, next_)[1:]
token.tokens.extend(grp)
for t in grp:
tlist.tokens.remove(t)
idx = tlist.token_index(token) + 1
token = tlist.token_next_by_instance(idx, clss)
def group_typecasts(tlist):
_group_left_right(tlist, T.Punctuation, '::', sql.Identifier)
def group_functions(tlist):
[group_functions(sgroup) for sgroup in tlist.get_sublists()
if not isinstance(sgroup, sql.Function)]
idx = 0
token = tlist.token_next_by_type(idx, T.Name)
while token:
next_ = tlist.token_next(token)
if not isinstance(next_, sql.Parenthesis):
idx = tlist.token_index(token) + 1
else:
func = tlist.group_tokens(sql.Function,
tlist.tokens_between(token, next_))
idx = tlist.token_index(func) + 1
token = tlist.token_next_by_type(idx, T.Name)
def group_order(tlist):
idx = 0
token = tlist.token_next_by_type(idx, T.Keyword.Order)
while token:
prev = tlist.token_prev(token)
if isinstance(prev, sql.Identifier):
ido = tlist.group_tokens(sql.Identifier,
tlist.tokens_between(prev, token))
idx = tlist.token_index(ido) + 1
else:
idx = tlist.token_index(token) + 1
token = tlist.token_next_by_type(idx, T.Keyword.Order)
def align_comments(tlist):
[align_comments(sgroup) for sgroup in tlist.get_sublists()]
idx = 0
token = tlist.token_next_by_instance(idx, sql.Comment)
while token:
before = tlist.token_prev(tlist.token_index(token))
if isinstance(before, sql.TokenList):
grp = tlist.tokens_between(before, token)[1:]
before.tokens.extend(grp)
for t in grp:
tlist.tokens.remove(t)
idx = tlist.token_index(before) + 1
else:
idx = tlist.token_index(token) + 1
token = tlist.token_next_by_instance(idx, sql.Comment)
def group(tlist):
for func in [
group_comments,
group_brackets,
group_functions,
group_where,
group_case,
group_identifier,
group_order,
group_typecasts,
group_as,
group_aliased,
group_assignment,
group_comparison,
align_comments,
group_identifier_list,
group_if,
group_for,
group_foreach,
group_begin,
]:
func(tlist)
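
A short sketch of what group() produces. sqlparse.parse() runs the lexer, the statement splitter and then group() from this module, so the grouped node classes from sqlparse.sql show up as direct children of the statement (the query is made up):

import sqlparse
from sqlparse import sql

stmt = sqlparse.parse("select f.a, f.b from foo f where f.a > 10")[0]

# The WHERE clause becomes one sql.Where subtree and the select list becomes
# one sql.IdentifierList subtree instead of a flat run of tokens.
print(any(isinstance(t, sql.Where) for t in stmt.tokens))
print(any(isinstance(t, sql.IdentifierList) for t in stmt.tokens))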


@@ -0,0 +1,10 @@
# Copyright (C) 2012 Andi Albrecht, albrecht.andi@gmail.com
#
# This module is part of python-sqlparse and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php.
"""Exceptions used in this package."""
class SQLParseError(Exception):
"""Base class for exceptions in this module."""


@@ -0,0 +1,728 @@
# -*- coding: utf-8 -*-
import re
from os.path import abspath, join
from sqlparse import sql, tokens as T
from sqlparse.engine import FilterStack
from sqlparse.lexer import tokenize
from sqlparse.pipeline import Pipeline
from sqlparse.tokens import (Comment, Comparison, Keyword, Name, Punctuation,
String, Whitespace)
from sqlparse.utils import memoize_generator
from sqlparse.utils import split_unquoted_newlines
# --------------------------
# token process
class _CaseFilter:
ttype = None
def __init__(self, case=None):
if case is None:
case = 'upper'
assert case in ['lower', 'upper', 'capitalize']
self.convert = getattr(unicode, case)
def process(self, stack, stream):
for ttype, value in stream:
if ttype in self.ttype:
value = self.convert(value)
yield ttype, value
class KeywordCaseFilter(_CaseFilter):
ttype = T.Keyword
class IdentifierCaseFilter(_CaseFilter):
ttype = (T.Name, T.String.Symbol)
def process(self, stack, stream):
for ttype, value in stream:
if ttype in self.ttype and not value.strip()[0] == '"':
value = self.convert(value)
yield ttype, value
class TruncateStringFilter:
def __init__(self, width, char):
self.width = max(width, 1)
self.char = unicode(char)
def process(self, stack, stream):
for ttype, value in stream:
if ttype is T.Literal.String.Single:
if value[:2] == '\'\'':
inner = value[2:-2]
quote = u'\'\''
else:
inner = value[1:-1]
quote = u'\''
if len(inner) > self.width:
value = u''.join((quote, inner[:self.width], self.char,
quote))
yield ttype, value
class GetComments:
"""Get the comments from a stack"""
def process(self, stack, stream):
for token_type, value in stream:
if token_type in Comment:
yield token_type, value
class StripComments:
"""Strip the comments from a stack"""
def process(self, stack, stream):
for token_type, value in stream:
if token_type not in Comment:
yield token_type, value
def StripWhitespace(stream):
"Strip the useless whitespaces from a stream leaving only the minimal ones"
last_type = None
has_space = False
ignore_group = frozenset((Comparison, Punctuation))
for token_type, value in stream:
# We got a previous token (not empty first ones)
if last_type:
if token_type in Whitespace:
has_space = True
continue
# Ignore first empty spaces and dot-commas
elif token_type in (Whitespace, Whitespace.Newline, ignore_group):
continue
# Yield a whitespace if it can't be ignored
if has_space:
if not ignore_group.intersection((last_type, token_type)):
yield Whitespace, ' '
has_space = False
# Yield the token and set its type for checking with the next one
yield token_type, value
last_type = token_type
class IncludeStatement:
"""Filter that enable a INCLUDE statement"""
def __init__(self, dirpath=".", maxrecursive=10, raiseexceptions=False):
if maxrecursive <= 0:
raise ValueError('Max recursion limit reached')
self.dirpath = abspath(dirpath)
self.maxRecursive = maxrecursive
self.raiseexceptions = raiseexceptions
self.detected = False
@memoize_generator
def process(self, stack, stream):
# Run over all tokens in the stream
for token_type, value in stream:
# INCLUDE statement found, set detected mode
if token_type in Name and value.upper() == 'INCLUDE':
self.detected = True
continue
# INCLUDE statement was found, parse it
elif self.detected:
# Omit whitespaces
if token_type in Whitespace:
continue
# Found file path to include
if token_type in String.Symbol:
# if token_type in tokens.String.Symbol:
# Get path of file to include
path = join(self.dirpath, value[1:-1])
try:
f = open(path)
raw_sql = f.read()
f.close()
# There was a problem loading the include file
except IOError, err:
# Raise the exception to the interpreter
if self.raiseexceptions:
raise
# Put the exception as a comment on the SQL code
yield Comment, u'-- IOError: %s\n' % err
else:
# Create a new FilterStack to parse the included file
# and add all its tokens to the main stack recursively
try:
filtr = IncludeStatement(self.dirpath,
self.maxRecursive - 1,
self.raiseexceptions)
# Max recursion limit reached
except ValueError, err:
# Raise the exception to the interpreter
if self.raiseexceptions:
raise
# Put the exception as a comment on the SQL code
yield Comment, u'-- ValueError: %s\n' % err
stack = FilterStack()
stack.preprocess.append(filtr)
for tv in stack.run(raw_sql):
yield tv
# Set normal mode
self.detected = False
# Don't include any token while in detected mode
continue
# Normal token
yield token_type, value
# ----------------------
# statement process
class StripCommentsFilter:
def _get_next_comment(self, tlist):
# TODO(andi) Comment types should be unified, see related issue38
token = tlist.token_next_by_instance(0, sql.Comment)
if token is None:
token = tlist.token_next_by_type(0, T.Comment)
return token
def _process(self, tlist):
token = self._get_next_comment(tlist)
while token:
tidx = tlist.token_index(token)
prev = tlist.token_prev(tidx, False)
next_ = tlist.token_next(tidx, False)
# Replace the comment with whitespace if prev and next exist and are
# not whitespace themselves. This doesn't apply if prev or next is a parenthesis.
if (prev is not None and next_ is not None
and not prev.is_whitespace() and not next_.is_whitespace()
and not (prev.match(T.Punctuation, '(')
or next_.match(T.Punctuation, ')'))):
tlist.tokens[tidx] = sql.Token(T.Whitespace, ' ')
else:
tlist.tokens.pop(tidx)
token = self._get_next_comment(tlist)
def process(self, stack, stmt):
[self.process(stack, sgroup) for sgroup in stmt.get_sublists()]
self._process(stmt)
class StripWhitespaceFilter:
def _stripws(self, tlist):
func_name = '_stripws_%s' % tlist.__class__.__name__.lower()
func = getattr(self, func_name, self._stripws_default)
func(tlist)
def _stripws_default(self, tlist):
last_was_ws = False
for token in tlist.tokens:
if token.is_whitespace():
if last_was_ws:
token.value = ''
else:
token.value = ' '
last_was_ws = token.is_whitespace()
def _stripws_identifierlist(self, tlist):
# Removes newlines before commas, see issue140
last_nl = None
for token in tlist.tokens[:]:
if (token.ttype is T.Punctuation
and token.value == ','
and last_nl is not None):
tlist.tokens.remove(last_nl)
if token.is_whitespace():
last_nl = token
else:
last_nl = None
return self._stripws_default(tlist)
def _stripws_parenthesis(self, tlist):
if tlist.tokens[1].is_whitespace():
tlist.tokens.pop(1)
if tlist.tokens[-2].is_whitespace():
tlist.tokens.pop(-2)
self._stripws_default(tlist)
def process(self, stack, stmt, depth=0):
[self.process(stack, sgroup, depth + 1)
for sgroup in stmt.get_sublists()]
self._stripws(stmt)
if (
depth == 0
and stmt.tokens
and stmt.tokens[-1].is_whitespace()
):
stmt.tokens.pop(-1)
class ReindentFilter:
def __init__(self, width=2, char=' ', line_width=None):
self.width = width
self.char = char
self.indent = 0
self.offset = 0
self.line_width = line_width
self._curr_stmt = None
self._last_stmt = None
def _flatten_up_to_token(self, token):
"""Yields all tokens up to token plus the next one."""
# helper for _get_offset
iterator = self._curr_stmt.flatten()
for t in iterator:
yield t
if t == token:
raise StopIteration
def _get_offset(self, token):
raw = ''.join(map(unicode, self._flatten_up_to_token(token)))
line = raw.splitlines()[-1]
# Now take current offset into account and return relative offset.
full_offset = len(line) - len(self.char * (self.width * self.indent))
return full_offset - self.offset
def nl(self):
# TODO: newline character should be configurable
space = (self.char * ((self.indent * self.width) + self.offset))
# Detect runaway indenting due to parsing errors
if len(space) > 200:
# something seems to be wrong, flip back
self.indent = self.offset = 0
space = (self.char * ((self.indent * self.width) + self.offset))
ws = '\n' + space
return sql.Token(T.Whitespace, ws)
def _split_kwds(self, tlist):
split_words = ('FROM', 'STRAIGHT_JOIN$', 'JOIN$', 'AND', 'OR',
'GROUP', 'ORDER', 'UNION', 'VALUES',
'SET', 'BETWEEN', 'EXCEPT', 'HAVING')
def _next_token(i):
t = tlist.token_next_match(i, T.Keyword, split_words,
regex=True)
if t and t.value.upper() == 'BETWEEN':
t = _next_token(tlist.token_index(t) + 1)
if t and t.value.upper() == 'AND':
t = _next_token(tlist.token_index(t) + 1)
return t
idx = 0
token = _next_token(idx)
added = set()
while token:
prev = tlist.token_prev(tlist.token_index(token), False)
offset = 1
if prev and prev.is_whitespace() and prev not in added:
tlist.tokens.pop(tlist.token_index(prev))
offset += 1
uprev = unicode(prev)
if (prev and (uprev.endswith('\n') or uprev.endswith('\r'))):
nl = tlist.token_next(token)
else:
nl = self.nl()
added.add(nl)
tlist.insert_before(token, nl)
offset += 1
token = _next_token(tlist.token_index(nl) + offset)
def _split_statements(self, tlist):
idx = 0
token = tlist.token_next_by_type(idx, (T.Keyword.DDL, T.Keyword.DML))
while token:
prev = tlist.token_prev(tlist.token_index(token), False)
if prev and prev.is_whitespace():
tlist.tokens.pop(tlist.token_index(prev))
# only break if it's not the first token
if prev:
nl = self.nl()
tlist.insert_before(token, nl)
token = tlist.token_next_by_type(tlist.token_index(token) + 1,
(T.Keyword.DDL, T.Keyword.DML))
def _process(self, tlist):
func_name = '_process_%s' % tlist.__class__.__name__.lower()
func = getattr(self, func_name, self._process_default)
func(tlist)
def _process_where(self, tlist):
token = tlist.token_next_match(0, T.Keyword, 'WHERE')
try:
tlist.insert_before(token, self.nl())
except ValueError: # issue121, errors in statement
pass
self.indent += 1
self._process_default(tlist)
self.indent -= 1
def _process_having(self, tlist):
token = tlist.token_next_match(0, T.Keyword, 'HAVING')
try:
tlist.insert_before(token, self.nl())
except ValueError: # issue121, errors in statement
pass
self.indent += 1
self._process_default(tlist)
self.indent -= 1
def _process_parenthesis(self, tlist):
first = tlist.token_next(0)
indented = False
if first and first.ttype in (T.Keyword.DML, T.Keyword.DDL):
self.indent += 1
tlist.tokens.insert(0, self.nl())
indented = True
num_offset = self._get_offset(
tlist.token_next_match(0, T.Punctuation, '('))
self.offset += num_offset
self._process_default(tlist, stmts=not indented)
if indented:
self.indent -= 1
self.offset -= num_offset
def _process_identifierlist(self, tlist):
identifiers = list(tlist.get_identifiers())
if len(identifiers) > 1 and not tlist.within(sql.Function):
first = list(identifiers[0].flatten())[0]
if self.char == '\t':
# when using tabs we don't count the actual word length
# in spaces.
num_offset = 1
else:
num_offset = self._get_offset(first) - len(first.value)
self.offset += num_offset
for token in identifiers[1:]:
tlist.insert_before(token, self.nl())
self.offset -= num_offset
self._process_default(tlist)
def _process_case(self, tlist):
is_first = True
num_offset = None
case = tlist.tokens[0]
outer_offset = self._get_offset(case) - len(case.value)
self.offset += outer_offset
for cond, value in tlist.get_cases():
if is_first:
tcond = list(cond[0].flatten())[0]
is_first = False
num_offset = self._get_offset(tcond) - len(tcond.value)
self.offset += num_offset
continue
if cond is None:
token = value[0]
else:
token = cond[0]
tlist.insert_before(token, self.nl())
# Line breaks on group level are done. Now let's add an offset of
# 5 (=length of "when", "then", "else") and process subgroups.
self.offset += 5
self._process_default(tlist)
self.offset -= 5
if num_offset is not None:
self.offset -= num_offset
end = tlist.token_next_match(0, T.Keyword, 'END')
tlist.insert_before(end, self.nl())
self.offset -= outer_offset
def _process_default(self, tlist, stmts=True, kwds=True):
if stmts:
self._split_statements(tlist)
if kwds:
self._split_kwds(tlist)
[self._process(sgroup) for sgroup in tlist.get_sublists()]
def process(self, stack, stmt):
if isinstance(stmt, sql.Statement):
self._curr_stmt = stmt
self._process(stmt)
if isinstance(stmt, sql.Statement):
if self._last_stmt is not None:
if unicode(self._last_stmt).endswith('\n'):
nl = '\n'
else:
nl = '\n\n'
stmt.tokens.insert(
0, sql.Token(T.Whitespace, nl))
if self._last_stmt != stmt:
self._last_stmt = stmt
# FIXME: Doesn't work ;)
class RightMarginFilter:
keep_together = (
# sql.TypeCast, sql.Identifier, sql.Alias,
)
def __init__(self, width=79):
self.width = width
self.line = ''
def _process(self, stack, group, stream):
for token in stream:
if token.is_whitespace() and '\n' in token.value:
if token.value.endswith('\n'):
self.line = ''
else:
self.line = token.value.splitlines()[-1]
elif (token.is_group()
and not token.__class__ in self.keep_together):
token.tokens = self._process(stack, token, token.tokens)
else:
val = unicode(token)
if len(self.line) + len(val) > self.width:
match = re.search('^ +', self.line)
if match is not None:
indent = match.group()
else:
indent = ''
yield sql.Token(T.Whitespace, '\n%s' % indent)
self.line = indent
self.line += val
yield token
def process(self, stack, group):
return
group.tokens = self._process(stack, group, group.tokens)
class ColumnsSelect:
"""Get the columns names of a SELECT query"""
def process(self, stack, stream):
mode = 0
oldValue = ""
parenthesis = 0
for token_type, value in stream:
# Ignore comments
if token_type in Comment:
continue
# We have not detected a SELECT statement
if mode == 0:
if token_type in Keyword and value == 'SELECT':
mode = 1
# We have detected a SELECT statement
elif mode == 1:
if value == 'FROM':
if oldValue:
yield oldValue
mode = 3 # Columns have been checked
elif value == 'AS':
oldValue = ""
mode = 2
elif (token_type == Punctuation
and value == ',' and not parenthesis):
if oldValue:
yield oldValue
oldValue = ""
elif token_type not in Whitespace:
if value == '(':
parenthesis += 1
elif value == ')':
parenthesis -= 1
oldValue += value
# We are processing an AS keyword
elif mode == 2:
# We also check for Keywords because of a bug in SQLParse
if token_type == Name or token_type == Keyword:
yield value
mode = 1
# ---------------------------
# postprocess
class SerializerUnicode:
def process(self, stack, stmt):
raw = unicode(stmt)
lines = split_unquoted_newlines(raw)
res = '\n'.join(line.rstrip() for line in lines)
return res
def Tokens2Unicode(stream):
result = ""
for _, value in stream:
result += unicode(value)
return result
class OutputFilter:
varname_prefix = ''
def __init__(self, varname='sql'):
self.varname = self.varname_prefix + varname
self.count = 0
def _process(self, stream, varname, has_nl):
raise NotImplementedError
def process(self, stack, stmt):
self.count += 1
if self.count > 1:
varname = '%s%d' % (self.varname, self.count)
else:
varname = self.varname
has_nl = len(unicode(stmt).strip().splitlines()) > 1
stmt.tokens = self._process(stmt.tokens, varname, has_nl)
return stmt
class OutputPythonFilter(OutputFilter):
def _process(self, stream, varname, has_nl):
# SQL query assignment to varname
if self.count > 1:
yield sql.Token(T.Whitespace, '\n')
yield sql.Token(T.Name, varname)
yield sql.Token(T.Whitespace, ' ')
yield sql.Token(T.Operator, '=')
yield sql.Token(T.Whitespace, ' ')
if has_nl:
yield sql.Token(T.Operator, '(')
yield sql.Token(T.Text, "'")
# Print the tokens on the quote
for token in stream:
# Token is a new line separator
if token.is_whitespace() and '\n' in token.value:
# Close quote and add a new line
yield sql.Token(T.Text, " '")
yield sql.Token(T.Whitespace, '\n')
# Quote header on secondary lines
yield sql.Token(T.Whitespace, ' ' * (len(varname) + 4))
yield sql.Token(T.Text, "'")
# Indentation
after_lb = token.value.split('\n', 1)[1]
if after_lb:
yield sql.Token(T.Whitespace, after_lb)
continue
# Token has escape chars
elif "'" in token.value:
token.value = token.value.replace("'", "\\'")
# Put the token
yield sql.Token(T.Text, token.value)
# Close quote
yield sql.Token(T.Text, "'")
if has_nl:
yield sql.Token(T.Operator, ')')
class OutputPHPFilter(OutputFilter):
varname_prefix = '$'
def _process(self, stream, varname, has_nl):
# SQL query assignment to varname (quote header)
if self.count > 1:
yield sql.Token(T.Whitespace, '\n')
yield sql.Token(T.Name, varname)
yield sql.Token(T.Whitespace, ' ')
if has_nl:
yield sql.Token(T.Whitespace, ' ')
yield sql.Token(T.Operator, '=')
yield sql.Token(T.Whitespace, ' ')
yield sql.Token(T.Text, '"')
# Print the tokens on the quote
for token in stream:
# Token is a new line separator
if token.is_whitespace() and '\n' in token.value:
# Close quote and add a new line
yield sql.Token(T.Text, ' ";')
yield sql.Token(T.Whitespace, '\n')
# Quote header on secondary lines
yield sql.Token(T.Name, varname)
yield sql.Token(T.Whitespace, ' ')
yield sql.Token(T.Operator, '.=')
yield sql.Token(T.Whitespace, ' ')
yield sql.Token(T.Text, '"')
# Indentation
after_lb = token.value.split('\n', 1)[1]
if after_lb:
yield sql.Token(T.Whitespace, after_lb)
continue
# Token has escape chars
elif '"' in token.value:
token.value = token.value.replace('"', '\\"')
# Put the token
yield sql.Token(T.Text, token.value)
# Close quote
yield sql.Token(T.Text, '"')
yield sql.Token(T.Punctuation, ';')
class Limit:
"""Get the LIMIT of a query.
If not defined, return -1 (SQL specification for no LIMIT query)
"""
def process(self, stack, stream):
index = 7
stream = list(stream)
stream.reverse()
# Run over all tokens in the stream from the end
for token_type, value in stream:
index -= 1
# if index and token_type in Keyword:
if index and token_type in Keyword and value == 'LIMIT':
return stream[4 - index][1]
return -1
def compact(stream):
"""Function that return a compacted version of the stream"""
pipe = Pipeline()
pipe.append(StripComments())
pipe.append(StripWhitespace)
return pipe(stream)
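
The filters above are normally assembled by sqlparse.format() rather than used one by one; a minimal sketch with a made-up query:

import sqlparse

messy = "select a ,  b   from t -- pick two columns\nwhere a = 1"

# keyword_case='upper' adds KeywordCaseFilter to the token stage,
# strip_comments adds StripCommentsFilter, reindent adds
# StripWhitespaceFilter plus ReindentFilter, and SerializerUnicode turns the
# processed statement back into text.
print(sqlparse.format(messy, keyword_case='upper', strip_comments=True,
                      reindent=True))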


@@ -0,0 +1,137 @@
# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com
#
# This module is part of python-sqlparse and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php.
"""SQL formatter"""
from sqlparse import filters
from sqlparse.exceptions import SQLParseError
def validate_options(options):
"""Validates options."""
kwcase = options.get('keyword_case', None)
if kwcase not in [None, 'upper', 'lower', 'capitalize']:
raise SQLParseError('Invalid value for keyword_case: %r' % kwcase)
idcase = options.get('identifier_case', None)
if idcase not in [None, 'upper', 'lower', 'capitalize']:
raise SQLParseError('Invalid value for identifier_case: %r' % idcase)
ofrmt = options.get('output_format', None)
if ofrmt not in [None, 'sql', 'python', 'php']:
raise SQLParseError('Unknown output format: %r' % ofrmt)
strip_comments = options.get('strip_comments', False)
if strip_comments not in [True, False]:
raise SQLParseError('Invalid value for strip_comments: %r'
% strip_comments)
strip_ws = options.get('strip_whitespace', False)
if strip_ws not in [True, False]:
raise SQLParseError('Invalid value for strip_whitespace: %r'
% strip_ws)
truncate_strings = options.get('truncate_strings', None)
if truncate_strings is not None:
try:
truncate_strings = int(truncate_strings)
except (ValueError, TypeError):
raise SQLParseError('Invalid value for truncate_strings: %r'
% truncate_strings)
if truncate_strings <= 1:
raise SQLParseError('Invalid value for truncate_strings: %r'
% truncate_strings)
options['truncate_strings'] = truncate_strings
options['truncate_char'] = options.get('truncate_char', '[...]')
reindent = options.get('reindent', False)
if reindent not in [True, False]:
raise SQLParseError('Invalid value for reindent: %r'
% reindent)
elif reindent:
options['strip_whitespace'] = True
indent_tabs = options.get('indent_tabs', False)
if indent_tabs not in [True, False]:
raise SQLParseError('Invalid value for indent_tabs: %r' % indent_tabs)
elif indent_tabs:
options['indent_char'] = '\t'
else:
options['indent_char'] = ' '
indent_width = options.get('indent_width', 2)
try:
indent_width = int(indent_width)
except (TypeError, ValueError):
raise SQLParseError('indent_width requires an integer')
if indent_width < 1:
raise SQLParseError('indent_width requires a positive integer')
options['indent_width'] = indent_width
right_margin = options.get('right_margin', None)
if right_margin is not None:
try:
right_margin = int(right_margin)
except (TypeError, ValueError):
raise SQLParseError('right_margin requires an integer')
if right_margin < 10:
raise SQLParseError('right_margin requires an integer > 10')
options['right_margin'] = right_margin
return options
def build_filter_stack(stack, options):
"""Setup and return a filter stack.
Args:
stack: :class:`~sqlparse.filters.FilterStack` instance
options: Dictionary with options validated by validate_options.
"""
# Token filter
if options.get('keyword_case', None):
stack.preprocess.append(
filters.KeywordCaseFilter(options['keyword_case']))
if options.get('identifier_case', None):
stack.preprocess.append(
filters.IdentifierCaseFilter(options['identifier_case']))
if options.get('truncate_strings', None) is not None:
stack.preprocess.append(filters.TruncateStringFilter(
width=options['truncate_strings'], char=options['truncate_char']))
# After grouping
if options.get('strip_comments', False):
stack.enable_grouping()
stack.stmtprocess.append(filters.StripCommentsFilter())
if (options.get('strip_whitespace', False)
or options.get('reindent', False)):
stack.enable_grouping()
stack.stmtprocess.append(filters.StripWhitespaceFilter())
if options.get('reindent', False):
stack.enable_grouping()
stack.stmtprocess.append(
filters.ReindentFilter(char=options['indent_char'],
width=options['indent_width']))
if options.get('right_margin', False):
stack.enable_grouping()
stack.stmtprocess.append(
filters.RightMarginFilter(width=options['right_margin']))
# Serializer
if options.get('output_format'):
frmt = options['output_format']
if frmt.lower() == 'php':
fltr = filters.OutputPHPFilter()
elif frmt.lower() == 'python':
fltr = filters.OutputPythonFilter()
else:
fltr = None
if fltr is not None:
stack.postprocess.append(fltr)
return stack
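
A small sketch of validate_options() on its own, showing the defaulting and error behaviour implemented above (option values chosen for illustration):

from sqlparse import formatter
from sqlparse.exceptions import SQLParseError

opts = formatter.validate_options({'reindent': True, 'indent_tabs': True})
print(opts['strip_whitespace'])   # reindent implies strip_whitespace
print(repr(opts['indent_char']))  # indent_tabs switches the indent char to '\t'
print(opts['indent_width'])       # defaults to 2

try:
    formatter.validate_options({'indent_width': 0})
except SQLParseError as err:
    print(err)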


@@ -0,0 +1,44 @@
'''
Created on 17/05/2012
@author: piranna
Several utility functions to extract info from SQL statements
'''
from sqlparse.filters import ColumnsSelect, Limit
from sqlparse.pipeline import Pipeline
from sqlparse.tokens import Keyword, Whitespace
def getlimit(stream):
"""Function that return the LIMIT of a input SQL """
pipe = Pipeline()
pipe.append(Limit())
result = pipe(stream)
try:
return int(result)
except ValueError:
return result
def getcolumns(stream):
"""Function that return the colums of a SELECT query"""
pipe = Pipeline()
pipe.append(ColumnsSelect())
return pipe(stream)
class IsType(object):
"""Functor that return is the statement is of a specific type"""
def __init__(self, type):
self.type = type
def __call__(self, stream):
for token_type, value in stream:
if token_type not in Whitespace:
return token_type in Keyword and value == self.type
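
A usage sketch for these helpers, assuming this module keeps its upstream name sqlparse.functions (the query string is made up). Note that ColumnsSelect, Limit and IsType compare keyword text literally, so the SQL is written in upper case:

from sqlparse.functions import getcolumns, getlimit, IsType
from sqlparse.lexer import tokenize

sql = "SELECT id, name FROM users LIMIT 5"

print(list(getcolumns(tokenize(sql))))   # the two selected column names
print(getlimit(tokenize(sql)))           # 5
print(IsType('SELECT')(tokenize(sql)))   # True for a SELECT statement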


@@ -0,0 +1,574 @@
from sqlparse import tokens
KEYWORDS = {
'ABORT': tokens.Keyword,
'ABS': tokens.Keyword,
'ABSOLUTE': tokens.Keyword,
'ACCESS': tokens.Keyword,
'ADA': tokens.Keyword,
'ADD': tokens.Keyword,
'ADMIN': tokens.Keyword,
'AFTER': tokens.Keyword,
'AGGREGATE': tokens.Keyword,
'ALIAS': tokens.Keyword,
'ALL': tokens.Keyword,
'ALLOCATE': tokens.Keyword,
'ANALYSE': tokens.Keyword,
'ANALYZE': tokens.Keyword,
'ANY': tokens.Keyword,
'ARE': tokens.Keyword,
'ASC': tokens.Keyword.Order,
'ASENSITIVE': tokens.Keyword,
'ASSERTION': tokens.Keyword,
'ASSIGNMENT': tokens.Keyword,
'ASYMMETRIC': tokens.Keyword,
'AT': tokens.Keyword,
'ATOMIC': tokens.Keyword,
'AUTHORIZATION': tokens.Keyword,
'AVG': tokens.Keyword,
'BACKWARD': tokens.Keyword,
'BEFORE': tokens.Keyword,
'BEGIN': tokens.Keyword,
'BETWEEN': tokens.Keyword,
'BITVAR': tokens.Keyword,
'BIT_LENGTH': tokens.Keyword,
'BOTH': tokens.Keyword,
'BREADTH': tokens.Keyword,
# 'C': tokens.Keyword, # most likely this is an alias
'CACHE': tokens.Keyword,
'CALL': tokens.Keyword,
'CALLED': tokens.Keyword,
'CARDINALITY': tokens.Keyword,
'CASCADE': tokens.Keyword,
'CASCADED': tokens.Keyword,
'CAST': tokens.Keyword,
'CATALOG': tokens.Keyword,
'CATALOG_NAME': tokens.Keyword,
'CHAIN': tokens.Keyword,
'CHARACTERISTICS': tokens.Keyword,
'CHARACTER_LENGTH': tokens.Keyword,
'CHARACTER_SET_CATALOG': tokens.Keyword,
'CHARACTER_SET_NAME': tokens.Keyword,
'CHARACTER_SET_SCHEMA': tokens.Keyword,
'CHAR_LENGTH': tokens.Keyword,
'CHECK': tokens.Keyword,
'CHECKED': tokens.Keyword,
'CHECKPOINT': tokens.Keyword,
'CLASS': tokens.Keyword,
'CLASS_ORIGIN': tokens.Keyword,
'CLOB': tokens.Keyword,
'CLOSE': tokens.Keyword,
'CLUSTER': tokens.Keyword,
'COALESCE': tokens.Keyword,
'COBOL': tokens.Keyword,
'COLLATE': tokens.Keyword,
'COLLATION': tokens.Keyword,
'COLLATION_CATALOG': tokens.Keyword,
'COLLATION_NAME': tokens.Keyword,
'COLLATION_SCHEMA': tokens.Keyword,
'COLLECT': tokens.Keyword,
'COLUMN': tokens.Keyword,
'COLUMN_NAME': tokens.Keyword,
'COMMAND_FUNCTION': tokens.Keyword,
'COMMAND_FUNCTION_CODE': tokens.Keyword,
'COMMENT': tokens.Keyword,
'COMMIT': tokens.Keyword.DML,
'COMMITTED': tokens.Keyword,
'COMPLETION': tokens.Keyword,
'CONDITION_NUMBER': tokens.Keyword,
'CONNECT': tokens.Keyword,
'CONNECTION': tokens.Keyword,
'CONNECTION_NAME': tokens.Keyword,
'CONSTRAINT': tokens.Keyword,
'CONSTRAINTS': tokens.Keyword,
'CONSTRAINT_CATALOG': tokens.Keyword,
'CONSTRAINT_NAME': tokens.Keyword,
'CONSTRAINT_SCHEMA': tokens.Keyword,
'CONSTRUCTOR': tokens.Keyword,
'CONTAINS': tokens.Keyword,
'CONTINUE': tokens.Keyword,
'CONVERSION': tokens.Keyword,
'CONVERT': tokens.Keyword,
'COPY': tokens.Keyword,
'CORRESPONTING': tokens.Keyword,
'COUNT': tokens.Keyword,
'CREATEDB': tokens.Keyword,
'CREATEUSER': tokens.Keyword,
'CROSS': tokens.Keyword,
'CUBE': tokens.Keyword,
'CURRENT': tokens.Keyword,
'CURRENT_DATE': tokens.Keyword,
'CURRENT_PATH': tokens.Keyword,
'CURRENT_ROLE': tokens.Keyword,
'CURRENT_TIME': tokens.Keyword,
'CURRENT_TIMESTAMP': tokens.Keyword,
'CURRENT_USER': tokens.Keyword,
'CURSOR': tokens.Keyword,
'CURSOR_NAME': tokens.Keyword,
'CYCLE': tokens.Keyword,
'DATA': tokens.Keyword,
'DATABASE': tokens.Keyword,
'DATETIME_INTERVAL_CODE': tokens.Keyword,
'DATETIME_INTERVAL_PRECISION': tokens.Keyword,
'DAY': tokens.Keyword,
'DEALLOCATE': tokens.Keyword,
'DECLARE': tokens.Keyword,
'DEFAULT': tokens.Keyword,
'DEFAULTS': tokens.Keyword,
'DEFERRABLE': tokens.Keyword,
'DEFERRED': tokens.Keyword,
'DEFINED': tokens.Keyword,
'DEFINER': tokens.Keyword,
'DELIMITER': tokens.Keyword,
'DELIMITERS': tokens.Keyword,
'DEREF': tokens.Keyword,
'DESC': tokens.Keyword.Order,
'DESCRIBE': tokens.Keyword,
'DESCRIPTOR': tokens.Keyword,
'DESTROY': tokens.Keyword,
'DESTRUCTOR': tokens.Keyword,
'DETERMINISTIC': tokens.Keyword,
'DIAGNOSTICS': tokens.Keyword,
'DICTIONARY': tokens.Keyword,
'DISCONNECT': tokens.Keyword,
'DISPATCH': tokens.Keyword,
'DO': tokens.Keyword,
'DOMAIN': tokens.Keyword,
'DYNAMIC': tokens.Keyword,
'DYNAMIC_FUNCTION': tokens.Keyword,
'DYNAMIC_FUNCTION_CODE': tokens.Keyword,
'EACH': tokens.Keyword,
'ENCODING': tokens.Keyword,
'ENCRYPTED': tokens.Keyword,
'END-EXEC': tokens.Keyword,
'EQUALS': tokens.Keyword,
'ESCAPE': tokens.Keyword,
'EVERY': tokens.Keyword,
'EXCEPT': tokens.Keyword,
'ESCEPTION': tokens.Keyword,
'EXCLUDING': tokens.Keyword,
'EXCLUSIVE': tokens.Keyword,
'EXEC': tokens.Keyword,
'EXECUTE': tokens.Keyword,
'EXISTING': tokens.Keyword,
'EXISTS': tokens.Keyword,
'EXTERNAL': tokens.Keyword,
'EXTRACT': tokens.Keyword,
'FALSE': tokens.Keyword,
'FETCH': tokens.Keyword,
'FINAL': tokens.Keyword,
'FIRST': tokens.Keyword,
'FORCE': tokens.Keyword,
'FOREACH': tokens.Keyword,
'FOREIGN': tokens.Keyword,
'FORTRAN': tokens.Keyword,
'FORWARD': tokens.Keyword,
'FOUND': tokens.Keyword,
'FREE': tokens.Keyword,
'FREEZE': tokens.Keyword,
'FULL': tokens.Keyword,
'FUNCTION': tokens.Keyword,
# 'G': tokens.Keyword,
'GENERAL': tokens.Keyword,
'GENERATED': tokens.Keyword,
'GET': tokens.Keyword,
'GLOBAL': tokens.Keyword,
'GO': tokens.Keyword,
'GOTO': tokens.Keyword,
'GRANT': tokens.Keyword,
'GRANTED': tokens.Keyword,
'GROUPING': tokens.Keyword,
'HANDLER': tokens.Keyword,
'HAVING': tokens.Keyword,
'HIERARCHY': tokens.Keyword,
'HOLD': tokens.Keyword,
'HOST': tokens.Keyword,
'IDENTITY': tokens.Keyword,
'IGNORE': tokens.Keyword,
'ILIKE': tokens.Keyword,
'IMMEDIATE': tokens.Keyword,
'IMMUTABLE': tokens.Keyword,
'IMPLEMENTATION': tokens.Keyword,
'IMPLICIT': tokens.Keyword,
'INCLUDING': tokens.Keyword,
'INCREMENT': tokens.Keyword,
'INDEX': tokens.Keyword,
'INDITCATOR': tokens.Keyword,
'INFIX': tokens.Keyword,
'INHERITS': tokens.Keyword,
'INITIALIZE': tokens.Keyword,
'INITIALLY': tokens.Keyword,
'INOUT': tokens.Keyword,
'INPUT': tokens.Keyword,
'INSENSITIVE': tokens.Keyword,
'INSTANTIABLE': tokens.Keyword,
'INSTEAD': tokens.Keyword,
'INTERSECT': tokens.Keyword,
'INTO': tokens.Keyword,
'INVOKER': tokens.Keyword,
'IS': tokens.Keyword,
'ISNULL': tokens.Keyword,
'ISOLATION': tokens.Keyword,
'ITERATE': tokens.Keyword,
# 'K': tokens.Keyword,
'KEY': tokens.Keyword,
'KEY_MEMBER': tokens.Keyword,
'KEY_TYPE': tokens.Keyword,
'LANCOMPILER': tokens.Keyword,
'LANGUAGE': tokens.Keyword,
'LARGE': tokens.Keyword,
'LAST': tokens.Keyword,
'LATERAL': tokens.Keyword,
'LEADING': tokens.Keyword,
'LENGTH': tokens.Keyword,
'LESS': tokens.Keyword,
'LEVEL': tokens.Keyword,
'LIMIT': tokens.Keyword,
'LISTEN': tokens.Keyword,
'LOAD': tokens.Keyword,
'LOCAL': tokens.Keyword,
'LOCALTIME': tokens.Keyword,
'LOCALTIMESTAMP': tokens.Keyword,
'LOCATION': tokens.Keyword,
'LOCATOR': tokens.Keyword,
'LOCK': tokens.Keyword,
'LOWER': tokens.Keyword,
# 'M': tokens.Keyword,
'MAP': tokens.Keyword,
'MATCH': tokens.Keyword,
'MAXVALUE': tokens.Keyword,
'MESSAGE_LENGTH': tokens.Keyword,
'MESSAGE_OCTET_LENGTH': tokens.Keyword,
'MESSAGE_TEXT': tokens.Keyword,
'METHOD': tokens.Keyword,
'MINUTE': tokens.Keyword,
'MINVALUE': tokens.Keyword,
'MOD': tokens.Keyword,
'MODE': tokens.Keyword,
'MODIFIES': tokens.Keyword,
'MODIFY': tokens.Keyword,
'MONTH': tokens.Keyword,
'MORE': tokens.Keyword,
'MOVE': tokens.Keyword,
'MUMPS': tokens.Keyword,
'NAMES': tokens.Keyword,
'NATIONAL': tokens.Keyword,
'NATURAL': tokens.Keyword,
'NCHAR': tokens.Keyword,
'NCLOB': tokens.Keyword,
'NEW': tokens.Keyword,
'NEXT': tokens.Keyword,
'NO': tokens.Keyword,
'NOCREATEDB': tokens.Keyword,
'NOCREATEUSER': tokens.Keyword,
'NONE': tokens.Keyword,
'NOT': tokens.Keyword,
'NOTHING': tokens.Keyword,
'NOTIFY': tokens.Keyword,
'NOTNULL': tokens.Keyword,
'NULL': tokens.Keyword,
'NULLABLE': tokens.Keyword,
'NULLIF': tokens.Keyword,
'OBJECT': tokens.Keyword,
'OCTET_LENGTH': tokens.Keyword,
'OF': tokens.Keyword,
'OFF': tokens.Keyword,
'OFFSET': tokens.Keyword,
'OIDS': tokens.Keyword,
'OLD': tokens.Keyword,
'ONLY': tokens.Keyword,
'OPEN': tokens.Keyword,
'OPERATION': tokens.Keyword,
'OPERATOR': tokens.Keyword,
'OPTION': tokens.Keyword,
'OPTIONS': tokens.Keyword,
'ORDINALITY': tokens.Keyword,
'OUT': tokens.Keyword,
'OUTPUT': tokens.Keyword,
'OVERLAPS': tokens.Keyword,
'OVERLAY': tokens.Keyword,
'OVERRIDING': tokens.Keyword,
'OWNER': tokens.Keyword,
'PAD': tokens.Keyword,
'PARAMETER': tokens.Keyword,
'PARAMETERS': tokens.Keyword,
'PARAMETER_MODE': tokens.Keyword,
'PARAMATER_NAME': tokens.Keyword,
'PARAMATER_ORDINAL_POSITION': tokens.Keyword,
'PARAMETER_SPECIFIC_CATALOG': tokens.Keyword,
'PARAMETER_SPECIFIC_NAME': tokens.Keyword,
'PARAMATER_SPECIFIC_SCHEMA': tokens.Keyword,
'PARTIAL': tokens.Keyword,
'PASCAL': tokens.Keyword,
'PENDANT': tokens.Keyword,
'PLACING': tokens.Keyword,
'PLI': tokens.Keyword,
'POSITION': tokens.Keyword,
'POSTFIX': tokens.Keyword,
'PRECISION': tokens.Keyword,
'PREFIX': tokens.Keyword,
'PREORDER': tokens.Keyword,
'PREPARE': tokens.Keyword,
'PRESERVE': tokens.Keyword,
'PRIMARY': tokens.Keyword,
'PRIOR': tokens.Keyword,
'PRIVILEGES': tokens.Keyword,
'PROCEDURAL': tokens.Keyword,
'PROCEDURE': tokens.Keyword,
'PUBLIC': tokens.Keyword,
'RAISE': tokens.Keyword,
'READ': tokens.Keyword,
'READS': tokens.Keyword,
'RECHECK': tokens.Keyword,
'RECURSIVE': tokens.Keyword,
'REF': tokens.Keyword,
'REFERENCES': tokens.Keyword,
'REFERENCING': tokens.Keyword,
'REINDEX': tokens.Keyword,
'RELATIVE': tokens.Keyword,
'RENAME': tokens.Keyword,
'REPEATABLE': tokens.Keyword,
'RESET': tokens.Keyword,
'RESTART': tokens.Keyword,
'RESTRICT': tokens.Keyword,
'RESULT': tokens.Keyword,
'RETURN': tokens.Keyword,
'RETURNED_LENGTH': tokens.Keyword,
'RETURNED_OCTET_LENGTH': tokens.Keyword,
'RETURNED_SQLSTATE': tokens.Keyword,
'RETURNS': tokens.Keyword,
'REVOKE': tokens.Keyword,
'RIGHT': tokens.Keyword,
'ROLE': tokens.Keyword,
'ROLLBACK': tokens.Keyword.DML,
'ROLLUP': tokens.Keyword,
'ROUTINE': tokens.Keyword,
'ROUTINE_CATALOG': tokens.Keyword,
'ROUTINE_NAME': tokens.Keyword,
'ROUTINE_SCHEMA': tokens.Keyword,
'ROW': tokens.Keyword,
'ROWS': tokens.Keyword,
'ROW_COUNT': tokens.Keyword,
'RULE': tokens.Keyword,
'SAVE_POINT': tokens.Keyword,
'SCALE': tokens.Keyword,
'SCHEMA': tokens.Keyword,
'SCHEMA_NAME': tokens.Keyword,
'SCOPE': tokens.Keyword,
'SCROLL': tokens.Keyword,
'SEARCH': tokens.Keyword,
'SECOND': tokens.Keyword,
'SECURITY': tokens.Keyword,
'SELF': tokens.Keyword,
'SENSITIVE': tokens.Keyword,
'SERIALIZABLE': tokens.Keyword,
'SERVER_NAME': tokens.Keyword,
'SESSION': tokens.Keyword,
'SESSION_USER': tokens.Keyword,
'SETOF': tokens.Keyword,
'SETS': tokens.Keyword,
'SHARE': tokens.Keyword,
'SHOW': tokens.Keyword,
'SIMILAR': tokens.Keyword,
'SIMPLE': tokens.Keyword,
'SIZE': tokens.Keyword,
'SOME': tokens.Keyword,
'SOURCE': tokens.Keyword,
'SPACE': tokens.Keyword,
'SPECIFIC': tokens.Keyword,
'SPECIFICTYPE': tokens.Keyword,
'SPECIFIC_NAME': tokens.Keyword,
'SQL': tokens.Keyword,
'SQLCODE': tokens.Keyword,
'SQLERROR': tokens.Keyword,
'SQLEXCEPTION': tokens.Keyword,
'SQLSTATE': tokens.Keyword,
'SQLWARNING': tokens.Keyword,
'STABLE': tokens.Keyword,
'START': tokens.Keyword.DML,
'STATE': tokens.Keyword,
'STATEMENT': tokens.Keyword,
'STATIC': tokens.Keyword,
'STATISTICS': tokens.Keyword,
'STDIN': tokens.Keyword,
'STDOUT': tokens.Keyword,
'STORAGE': tokens.Keyword,
'STRICT': tokens.Keyword,
'STRUCTURE': tokens.Keyword,
'STYPE': tokens.Keyword,
'SUBCLASS_ORIGIN': tokens.Keyword,
'SUBLIST': tokens.Keyword,
'SUBSTRING': tokens.Keyword,
'SUM': tokens.Keyword,
'SYMMETRIC': tokens.Keyword,
'SYSID': tokens.Keyword,
'SYSTEM': tokens.Keyword,
'SYSTEM_USER': tokens.Keyword,
'TABLE': tokens.Keyword,
'TABLE_NAME': tokens.Keyword,
'TEMP': tokens.Keyword,
'TEMPLATE': tokens.Keyword,
'TEMPORARY': tokens.Keyword,
'TERMINATE': tokens.Keyword,
'THAN': tokens.Keyword,
'TIMESTAMP': tokens.Keyword,
'TIMEZONE_HOUR': tokens.Keyword,
'TIMEZONE_MINUTE': tokens.Keyword,
'TO': tokens.Keyword,
'TOAST': tokens.Keyword,
'TRAILING': tokens.Keyword,
'TRANSATION': tokens.Keyword,
'TRANSACTIONS_COMMITTED': tokens.Keyword,
'TRANSACTIONS_ROLLED_BACK': tokens.Keyword,
'TRANSATION_ACTIVE': tokens.Keyword,
'TRANSFORM': tokens.Keyword,
'TRANSFORMS': tokens.Keyword,
'TRANSLATE': tokens.Keyword,
'TRANSLATION': tokens.Keyword,
'TREAT': tokens.Keyword,
'TRIGGER': tokens.Keyword,
'TRIGGER_CATALOG': tokens.Keyword,
'TRIGGER_NAME': tokens.Keyword,
'TRIGGER_SCHEMA': tokens.Keyword,
'TRIM': tokens.Keyword,
'TRUE': tokens.Keyword,
'TRUNCATE': tokens.Keyword,
'TRUSTED': tokens.Keyword,
'TYPE': tokens.Keyword,
'UNCOMMITTED': tokens.Keyword,
'UNDER': tokens.Keyword,
'UNENCRYPTED': tokens.Keyword,
'UNION': tokens.Keyword,
'UNIQUE': tokens.Keyword,
'UNKNOWN': tokens.Keyword,
'UNLISTEN': tokens.Keyword,
'UNNAMED': tokens.Keyword,
'UNNEST': tokens.Keyword,
'UNTIL': tokens.Keyword,
'UPPER': tokens.Keyword,
'USAGE': tokens.Keyword,
'USE': tokens.Keyword,
'USER': tokens.Keyword,
'USER_DEFINED_TYPE_CATALOG': tokens.Keyword,
'USER_DEFINED_TYPE_NAME': tokens.Keyword,
'USER_DEFINED_TYPE_SCHEMA': tokens.Keyword,
'USING': tokens.Keyword,
'VACUUM': tokens.Keyword,
'VALID': tokens.Keyword,
'VALIDATOR': tokens.Keyword,
'VALUES': tokens.Keyword,
'VARIABLE': tokens.Keyword,
'VERBOSE': tokens.Keyword,
'VERSION': tokens.Keyword,
'VIEW': tokens.Keyword,
'VOLATILE': tokens.Keyword,
'WHENEVER': tokens.Keyword,
'WITH': tokens.Keyword,
'WITHOUT': tokens.Keyword,
'WORK': tokens.Keyword,
'WRITE': tokens.Keyword,
'YEAR': tokens.Keyword,
'ZONE': tokens.Keyword,
# Name.Builtin
'ARRAY': tokens.Name.Builtin,
'BIGINT': tokens.Name.Builtin,
'BINARY': tokens.Name.Builtin,
'BIT': tokens.Name.Builtin,
'BLOB': tokens.Name.Builtin,
'BOOLEAN': tokens.Name.Builtin,
'CHAR': tokens.Name.Builtin,
'CHARACTER': tokens.Name.Builtin,
'DATE': tokens.Name.Builtin,
'DEC': tokens.Name.Builtin,
'DECIMAL': tokens.Name.Builtin,
'FLOAT': tokens.Name.Builtin,
'INT': tokens.Name.Builtin,
'INT8': tokens.Name.Builtin,
'INTEGER': tokens.Name.Builtin,
'INTERVAL': tokens.Name.Builtin,
'LONG': tokens.Name.Builtin,
'NUMBER': tokens.Name.Builtin,
'NUMERIC': tokens.Name.Builtin,
'REAL': tokens.Name.Builtin,
'SERIAL': tokens.Name.Builtin,
'SERIAL8': tokens.Name.Builtin,
'SIGNED': tokens.Name.Builtin,
'SMALLINT': tokens.Name.Builtin,
'TEXT': tokens.Name.Builtin,
'TINYINT': tokens.Name.Builtin,
'UNSIGNED': tokens.Name.Builtin,
'VARCHAR': tokens.Name.Builtin,
'VARCHAR2': tokens.Name.Builtin,
'VARYING': tokens.Name.Builtin,
}
KEYWORDS_COMMON = {
'SELECT': tokens.Keyword.DML,
'INSERT': tokens.Keyword.DML,
'DELETE': tokens.Keyword.DML,
'UPDATE': tokens.Keyword.DML,
'REPLACE': tokens.Keyword.DML,
'MERGE': tokens.Keyword.DML,
'DROP': tokens.Keyword.DDL,
'CREATE': tokens.Keyword.DDL,
'ALTER': tokens.Keyword.DDL,
'WHERE': tokens.Keyword,
'FROM': tokens.Keyword,
'INNER': tokens.Keyword,
'JOIN': tokens.Keyword,
'STRAIGHT_JOIN': tokens.Keyword,
'AND': tokens.Keyword,
'OR': tokens.Keyword,
'LIKE': tokens.Keyword,
'ON': tokens.Keyword,
'IN': tokens.Keyword,
'SET': tokens.Keyword,
'BY': tokens.Keyword,
'GROUP': tokens.Keyword,
'ORDER': tokens.Keyword,
'LEFT': tokens.Keyword,
'OUTER': tokens.Keyword,
'FULL': tokens.Keyword,
'IF': tokens.Keyword,
'END': tokens.Keyword,
'THEN': tokens.Keyword,
'LOOP': tokens.Keyword,
'AS': tokens.Keyword,
'ELSE': tokens.Keyword,
'FOR': tokens.Keyword,
'CASE': tokens.Keyword,
'WHEN': tokens.Keyword,
'MIN': tokens.Keyword,
'MAX': tokens.Keyword,
'DISTINCT': tokens.Keyword,
}
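The lexer below resolves bare words through these two dictionaries: its is_keyword() callback checks KEYWORDS_COMMON first, then KEYWORDS, and falls back to tokens.Name for anything unknown. A minimal sketch of that lookup order against this vendored Python 2 copy (the sample words are arbitrary):

    from sqlparse import tokens
    from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON

    def classify(word):
        # Mirrors is_keyword() in lexer.py: common keywords win, then the
        # long keyword list, otherwise the word is treated as a plain name.
        upper = word.upper()
        return KEYWORDS_COMMON.get(upper, KEYWORDS.get(upper, tokens.Name))

    print classify('select')    # Token.Keyword.DML
    print classify('truncate')  # Token.Keyword
    print classify('foo')       # Token.Name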

View File

@@ -0,0 +1,362 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com
#
# This module is part of python-sqlparse and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php.
"""SQL Lexer"""
# This code is based on the SqlLexer in pygments.
# http://pygments.org/
# It's separated from the rest of pygments to increase performance
# and to allow some customizations.
import re
import sys
from sqlparse import tokens
from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON
from cStringIO import StringIO
class include(str):
pass
class combined(tuple):
"""Indicates a state combined from multiple states."""
def __new__(cls, *args):
return tuple.__new__(cls, args)
def __init__(self, *args):
# tuple.__init__ doesn't do anything
pass
def is_keyword(value):
test = value.upper()
return KEYWORDS_COMMON.get(test, KEYWORDS.get(test, tokens.Name)), value
def apply_filters(stream, filters, lexer=None):
"""
Use this method to apply an iterable of filters to
a stream. If lexer is given it's forwarded to the
filter, otherwise the filter receives `None`.
"""
def _apply(filter_, stream):
for token in filter_.filter(lexer, stream):
yield token
for filter_ in filters:
stream = _apply(filter_, stream)
return stream
class LexerMeta(type):
"""
Metaclass for Lexer, creates the self._tokens attribute from
self.tokens on the first instantiation.
"""
def _process_state(cls, unprocessed, processed, state):
assert type(state) is str, "wrong state name %r" % state
assert state[0] != '#', "invalid state name %r" % state
if state in processed:
return processed[state]
tokenlist = processed[state] = []
rflags = cls.flags
for tdef in unprocessed[state]:
if isinstance(tdef, include):
# it's a state reference
assert tdef != state, "circular state reference %r" % state
tokenlist.extend(cls._process_state(
unprocessed, processed, str(tdef)))
continue
assert type(tdef) is tuple, "wrong rule def %r" % tdef
try:
rex = re.compile(tdef[0], rflags).match
except Exception, err:
raise ValueError(("uncompilable regex %r in state"
" %r of %r: %s"
% (tdef[0], state, cls, err)))
assert type(tdef[1]) is tokens._TokenType or callable(tdef[1]), \
('token type must be simple type or callable, not %r'
% (tdef[1],))
if len(tdef) == 2:
new_state = None
else:
tdef2 = tdef[2]
if isinstance(tdef2, str):
# an existing state
if tdef2 == '#pop':
new_state = -1
elif tdef2 in unprocessed:
new_state = (tdef2,)
elif tdef2 == '#push':
new_state = tdef2
elif tdef2[:5] == '#pop:':
new_state = -int(tdef2[5:])
else:
assert False, 'unknown new state %r' % tdef2
elif isinstance(tdef2, combined):
# combine a new state from existing ones
new_state = '_tmp_%d' % cls._tmpname
cls._tmpname += 1
itokens = []
for istate in tdef2:
assert istate != state, \
'circular state ref %r' % istate
itokens.extend(cls._process_state(unprocessed,
processed, istate))
processed[new_state] = itokens
new_state = (new_state,)
elif isinstance(tdef2, tuple):
# push more than one state
for state in tdef2:
assert (state in unprocessed or
state in ('#pop', '#push')), \
'unknown new state ' + state
new_state = tdef2
else:
assert False, 'unknown new state def %r' % tdef2
tokenlist.append((rex, tdef[1], new_state))
return tokenlist
def process_tokendef(cls):
cls._all_tokens = {}
cls._tmpname = 0
processed = cls._all_tokens[cls.__name__] = {}
#tokendefs = tokendefs or cls.tokens[name]
for state in cls.tokens.keys():
cls._process_state(cls.tokens, processed, state)
return processed
def __call__(cls, *args, **kwds):
if not hasattr(cls, '_tokens'):
cls._all_tokens = {}
cls._tmpname = 0
if hasattr(cls, 'token_variants') and cls.token_variants:
# don't process yet
pass
else:
cls._tokens = cls.process_tokendef()
return type.__call__(cls, *args, **kwds)
class Lexer(object):
__metaclass__ = LexerMeta
encoding = 'utf-8'
stripall = False
stripnl = False
tabsize = 0
flags = re.IGNORECASE | re.UNICODE
tokens = {
'root': [
(r'(--|# ).*?(\r\n|\r|\n)', tokens.Comment.Single),
# $ matches *before* newline, therefore we have two patterns
# to match Comment.Single
(r'(--|# ).*?$', tokens.Comment.Single),
(r'(\r\n|\r|\n)', tokens.Newline),
(r'\s+', tokens.Whitespace),
(r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
(r':=', tokens.Assignment),
(r'::', tokens.Punctuation),
(r'[*]', tokens.Wildcard),
(r'CASE\b', tokens.Keyword), # extended CASE(foo)
(r"`(``|[^`])*`", tokens.Name),
(r"´(´´|[^´])*´", tokens.Name),
(r'\$([^\W\d]\w*)?\$', tokens.Name.Builtin),
(r'\?{1}', tokens.Name.Placeholder),
(r'%\(\w+\)s', tokens.Name.Placeholder),
(r'%s', tokens.Name.Placeholder),
(r'[$:?]\w+', tokens.Name.Placeholder),
# FIXME(andi): VALUES shouldn't be listed here
# see https://github.com/andialbrecht/sqlparse/pull/64
(r'VALUES', tokens.Keyword),
(r'(@|##|#)[^\W\d_]\w+', tokens.Name),
# IN is special, it may be followed by a parenthesis, but
# is never a function, see issue183
(r'in\b(?=[ (])?', tokens.Keyword),
(r'[^\W\d_]\w*(?=[.(])', tokens.Name), # see issue39
(r'[-]?0x[0-9a-fA-F]+', tokens.Number.Hexadecimal),
(r'[-]?[0-9]*(\.[0-9]+)?[eE][-]?[0-9]+', tokens.Number.Float),
(r'[-]?[0-9]*\.[0-9]+', tokens.Number.Float),
(r'[-]?[0-9]+', tokens.Number.Integer),
(r"'(''|\\\\|\\'|[^'])*'", tokens.String.Single),
# not a real string literal in ANSI SQL:
(r'(""|".*?[^\\]")', tokens.String.Symbol),
# sqlite names can be escaped with [square brackets]. left bracket
# cannot be preceded by word character or a right bracket --
# otherwise it's probably an array index
(r'(?<![\w\])])(\[[^\]]+\])', tokens.Name),
(r'((LEFT\s+|RIGHT\s+|FULL\s+)?(INNER\s+|OUTER\s+|STRAIGHT\s+)?|(CROSS\s+|NATURAL\s+)?)?JOIN\b', tokens.Keyword),
(r'END(\s+IF|\s+LOOP)?\b', tokens.Keyword),
(r'NOT NULL\b', tokens.Keyword),
(r'CREATE(\s+OR\s+REPLACE)?\b', tokens.Keyword.DDL),
(r'DOUBLE\s+PRECISION\b', tokens.Name.Builtin),
(r'(?<=\.)[^\W\d_]\w*', tokens.Name),
(r'[^\W\d]\w*', is_keyword),
(r'[;:()\[\],\.]', tokens.Punctuation),
(r'[<>=~!]+', tokens.Operator.Comparison),
(r'[+/@#%^&|`?^-]+', tokens.Operator),
],
'multiline-comments': [
(r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
(r'\*/', tokens.Comment.Multiline, '#pop'),
(r'[^/\*]+', tokens.Comment.Multiline),
(r'[/*]', tokens.Comment.Multiline),
]}
def __init__(self):
self.filters = []
def add_filter(self, filter_, **options):
from sqlparse.filters import Filter
if not isinstance(filter_, Filter):
filter_ = filter_(**options)
self.filters.append(filter_)
def _decode(self, text):
if sys.version_info[0] == 3:
if isinstance(text, str):
return text
if self.encoding == 'guess':
try:
text = text.decode('utf-8')
if text.startswith(u'\ufeff'):
text = text[len(u'\ufeff'):]
except UnicodeDecodeError:
text = text.decode('latin1')
else:
try:
text = text.decode(self.encoding)
except UnicodeDecodeError:
text = text.decode('unicode-escape')
if self.tabsize > 0:
text = text.expandtabs(self.tabsize)
return text
def get_tokens(self, text, unfiltered=False):
"""
Return an iterable of (tokentype, value) pairs generated from
`text`. If `unfiltered` is set to `True`, the filtering mechanism
is bypassed even if filters are defined.
Also preprocesses the text, i.e. expands tabs and strips it if
wanted, and applies registered filters.
"""
if isinstance(text, basestring):
if self.stripall:
text = text.strip()
elif self.stripnl:
text = text.strip('\n')
if sys.version_info[0] < 3 and isinstance(text, unicode):
text = StringIO(text.encode('utf-8'))
self.encoding = 'utf-8'
else:
text = StringIO(text)
def streamer():
for i, t, v in self.get_tokens_unprocessed(text):
yield t, v
stream = streamer()
if not unfiltered:
stream = apply_filters(stream, self.filters, self)
return stream
def get_tokens_unprocessed(self, stream, stack=('root',)):
"""
Split ``text`` into (tokentype, text) pairs.
``stack`` is the initial stack (default: ``['root']``)
"""
pos = 0
tokendefs = self._tokens # see __call__, pylint:disable=E1101
statestack = list(stack)
statetokens = tokendefs[statestack[-1]]
known_names = {}
text = stream.read()
text = self._decode(text)
while 1:
for rexmatch, action, new_state in statetokens:
m = rexmatch(text, pos)
if m:
value = m.group()
if value in known_names:
yield pos, known_names[value], value
elif type(action) is tokens._TokenType:
yield pos, action, value
elif hasattr(action, '__call__'):
ttype, value = action(value)
known_names[value] = ttype
yield pos, ttype, value
else:
for item in action(self, m):
yield item
pos = m.end()
if new_state is not None:
# state transition
if isinstance(new_state, tuple):
for state in new_state:
if state == '#pop':
statestack.pop()
elif state == '#push':
statestack.append(statestack[-1])
elif (
# Ugly hack - multiline-comments
# are not stackable
state != 'multiline-comments'
or not statestack
or statestack[-1] != 'multiline-comments'
):
statestack.append(state)
elif isinstance(new_state, int):
# pop
del statestack[new_state:]
elif new_state == '#push':
statestack.append(statestack[-1])
else:
assert False, "wrong state def: %r" % new_state
statetokens = tokendefs[statestack[-1]]
break
else:
try:
if text[pos] == '\n':
# at EOL, reset state to "root"
pos += 1
statestack = ['root']
statetokens = tokendefs['root']
yield pos, tokens.Text, u'\n'
continue
yield pos, tokens.Error, text[pos]
pos += 1
except IndexError:
break
def tokenize(sql, encoding=None):
"""Tokenize sql.
Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream
of ``(token type, value)`` items.
"""
lexer = Lexer()
if encoding is not None:
lexer.encoding = encoding
return lexer.get_tokens(sql)
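For reference, a small usage sketch of the tokenize() helper defined above, run against this vendored Python 2 copy; the SQL text is arbitrary and the commented token types are approximate:

    from sqlparse.lexer import tokenize

    for ttype, value in tokenize('SELECT 1 FROM t;'):
        print ttype, repr(value)
    # Roughly: Token.Keyword.DML 'SELECT', whitespace, Token.Literal.Number.Integer '1',
    # Token.Keyword 'FROM', Token.Name 't', Token.Punctuation ';'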

View File

@@ -0,0 +1,31 @@
# Copyright (C) 2011 Jesus Leganes "piranna", piranna@gmail.com
#
# This module is part of python-sqlparse and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php.
from types import GeneratorType
class Pipeline(list):
"""Pipeline to process filters sequentially"""
def __call__(self, stream):
"""Run the pipeline
Return a static (non-generator) version of the result
"""
# Run the stream over all the filters on the pipeline
for filter in self:
# Functions and callable objects (objects with '__call__' method)
if callable(filter):
stream = filter(stream)
# Normal filters (objects with 'process' method)
else:
stream = filter.process(None, stream)
# If the last filter returned a generator, materialize it into a list
if isinstance(stream, GeneratorType):
return list(stream)
return stream
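A hedged sketch of wiring a Pipeline together; tokenize() comes from the lexer module above, while upper_keywords() is a made-up stage added only for illustration:

    from sqlparse import tokens
    from sqlparse.lexer import tokenize
    from sqlparse.pipeline import Pipeline

    def upper_keywords(stream):
        # Plain callable stage: upper-case every keyword value.
        for ttype, value in stream:
            if ttype in tokens.Keyword:
                value = value.upper()
            yield ttype, value

    pipe = Pipeline()
    pipe.append(tokenize)        # callable producing (ttype, value) pairs
    pipe.append(upper_keywords)  # callable transforming that stream

    result = pipe('select 1 from t')  # a list, since the last stage is a generator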

View File

@@ -0,0 +1,684 @@
# -*- coding: utf-8 -*-
"""This module contains classes representing syntactical elements of SQL."""
import re
import sys
from sqlparse import tokens as T
class Token(object):
"""Base class for all other classes in this module.
It represents a single token and has two instance attributes:
``value`` is the unchanged value of the token and ``ttype`` is
the type of the token.
"""
__slots__ = ('value', 'ttype', 'parent', 'normalized', 'is_keyword')
def __init__(self, ttype, value):
self.value = value
if ttype in T.Keyword:
self.normalized = value.upper()
else:
self.normalized = value
self.ttype = ttype
self.is_keyword = ttype in T.Keyword
self.parent = None
def __str__(self):
if sys.version_info[0] == 3:
return self.value
else:
return unicode(self).encode('utf-8')
def __repr__(self):
short = self._get_repr_value()
if sys.version_info[0] < 3:
short = short.encode('utf-8')
return '<%s \'%s\' at 0x%07x>' % (self._get_repr_name(),
short, id(self))
def __unicode__(self):
"""Returns a unicode representation of this object."""
return self.value or ''
def to_unicode(self):
"""Returns a unicode representation of this object.
.. deprecated:: 0.1.5
Use ``unicode(token)`` (for Python 3: ``str(token)``) instead.
"""
return unicode(self)
def _get_repr_name(self):
return str(self.ttype).split('.')[-1]
def _get_repr_value(self):
raw = unicode(self)
if len(raw) > 7:
raw = raw[:6] + u'...'
return re.sub('\s+', ' ', raw)
def flatten(self):
"""Resolve subgroups."""
yield self
def match(self, ttype, values, regex=False):
"""Checks whether the token matches the given arguments.
*ttype* is a token type. If this token doesn't match the given token
type, ``False`` is returned.
*values* is a list of possible values for this token. The values
are OR'ed together so if only one of the values matches ``True``
is returned. Except for keyword tokens the comparison is
case-sensitive. For convenience it's ok to pass in a single string.
If *regex* is ``True`` (default is ``False``) the given values are
treated as regular expressions.
"""
type_matched = self.ttype is ttype
if not type_matched or values is None:
return type_matched
if regex:
if isinstance(values, basestring):
values = set([values])
if self.ttype is T.Keyword:
values = set(re.compile(v, re.IGNORECASE) for v in values)
else:
values = set(re.compile(v) for v in values)
for pattern in values:
if pattern.search(self.value):
return True
return False
if isinstance(values, basestring):
if self.is_keyword:
return values.upper() == self.normalized
return values == self.value
if self.is_keyword:
for v in values:
if v.upper() == self.normalized:
return True
return False
return self.value in values
def is_group(self):
"""Returns ``True`` if this object has children."""
return False
def is_whitespace(self):
"""Return ``True`` if this token is a whitespace token."""
return self.ttype and self.ttype in T.Whitespace
def within(self, group_cls):
"""Returns ``True`` if this token is within *group_cls*.
Use this method for example to check if an identifier is within
a function: ``t.within(sql.Function)``.
"""
parent = self.parent
while parent:
if isinstance(parent, group_cls):
return True
parent = parent.parent
return False
def is_child_of(self, other):
"""Returns ``True`` if this token is a direct child of *other*."""
return self.parent == other
def has_ancestor(self, other):
"""Returns ``True`` if *other* is in this tokens ancestry."""
parent = self.parent
while parent:
if parent == other:
return True
parent = parent.parent
return False
class TokenList(Token):
"""A group of tokens.
It has an additional instance attribute ``tokens`` which holds a
list of child-tokens.
"""
__slots__ = ('value', 'ttype', 'tokens')
def __init__(self, tokens=None):
if tokens is None:
tokens = []
self.tokens = tokens
Token.__init__(self, None, self._to_string())
def __unicode__(self):
return self._to_string()
def __str__(self):
str_ = self._to_string()
if sys.version_info[0] < 3:
str_ = str_.encode('utf-8')
return str_
def _to_string(self):
if sys.version_info[0] == 3:
return ''.join(x.value for x in self.flatten())
else:
return ''.join(unicode(x) for x in self.flatten())
def _get_repr_name(self):
return self.__class__.__name__
def _pprint_tree(self, max_depth=None, depth=0):
"""Pretty-print the object tree."""
indent = ' ' * (depth * 2)
for idx, token in enumerate(self.tokens):
if token.is_group():
pre = ' +-'
else:
pre = ' | '
print '%s%s%d %s \'%s\'' % (indent, pre, idx,
token._get_repr_name(),
token._get_repr_value())
if (token.is_group() and (max_depth is None or depth < max_depth)):
token._pprint_tree(max_depth, depth + 1)
def _remove_quotes(self, val):
"""Helper that removes surrounding quotes from strings."""
if not val:
return val
if val[0] in ('"', '\'') and val[-1] == val[0]:
val = val[1:-1]
return val
def get_token_at_offset(self, offset):
"""Returns the token that is on position offset."""
idx = 0
for token in self.flatten():
end = idx + len(token.value)
if idx <= offset <= end:
return token
idx = end
def flatten(self):
"""Generator yielding ungrouped tokens.
This method is recursively called for all child tokens.
"""
for token in self.tokens:
if isinstance(token, TokenList):
for item in token.flatten():
yield item
else:
yield token
# def __iter__(self):
# return self
#
# def next(self):
# for token in self.tokens:
# yield token
def is_group(self):
return True
def get_sublists(self):
# return [x for x in self.tokens if isinstance(x, TokenList)]
for x in self.tokens:
if isinstance(x, TokenList):
yield x
@property
def _groupable_tokens(self):
return self.tokens
def token_first(self, ignore_whitespace=True, ignore_comments=False):
"""Returns the first child token.
If *ignore_whitespace* is ``True`` (the default), whitespace
tokens are ignored.
If *ignore_comments* is ``True`` (default: ``False``), comments are
ignored too.
"""
for token in self.tokens:
if ignore_whitespace and token.is_whitespace():
continue
if ignore_comments and isinstance(token, Comment):
continue
return token
def token_next_by_instance(self, idx, clss, end=None):
"""Returns the next token matching a class.
*idx* is where to start searching in the list of child tokens.
*clss* is a list of classes the token should be an instance of.
If no matching token can be found ``None`` is returned.
"""
if not isinstance(clss, (list, tuple)):
clss = (clss,)
for token in self.tokens[idx:end]:
if isinstance(token, clss):
return token
def token_next_by_type(self, idx, ttypes):
"""Returns the next matching token by its token type."""
if not isinstance(ttypes, (list, tuple)):
ttypes = [ttypes]
for token in self.tokens[idx:]:
if token.ttype in ttypes:
return token
def token_next_match(self, idx, ttype, value, regex=False):
"""Returns the next token whose ``match`` method returns ``True``."""
if not isinstance(idx, int):
idx = self.token_index(idx)
for n in xrange(idx, len(self.tokens)):
token = self.tokens[n]
if token.match(ttype, value, regex):
return token
def token_not_matching(self, idx, funcs):
for token in self.tokens[idx:]:
passed = False
for func in funcs:
if func(token):
passed = True
break
if not passed:
return token
def token_matching(self, idx, funcs):
for token in self.tokens[idx:]:
for func in funcs:
if func(token):
return token
def token_prev(self, idx, skip_ws=True):
"""Returns the previous token relative to *idx*.
If *skip_ws* is ``True`` (the default) whitespace tokens are ignored.
``None`` is returned if there's no previous token.
"""
if idx is None:
return None
if not isinstance(idx, int):
idx = self.token_index(idx)
while idx:
idx -= 1
if self.tokens[idx].is_whitespace() and skip_ws:
continue
return self.tokens[idx]
def token_next(self, idx, skip_ws=True):
"""Returns the next token relative to *idx*.
If *skip_ws* is ``True`` (the default) whitespace tokens are ignored.
``None`` is returned if there's no next token.
"""
if idx is None:
return None
if not isinstance(idx, int):
idx = self.token_index(idx)
while idx < len(self.tokens) - 1:
idx += 1
if self.tokens[idx].is_whitespace() and skip_ws:
continue
return self.tokens[idx]
def token_index(self, token, start=0):
"""Return list index of token."""
if start > 0:
# Performing `index` manually is much faster when starting in the middle
# of the list of tokens and expecting to find the token near to the starting
# index.
for i in xrange(start, len(self.tokens)):
if self.tokens[i] == token:
return i
return -1
return self.tokens.index(token)
def tokens_between(self, start, end, exclude_end=False):
"""Return all tokens between (and including) start and end.
The end token is included unless *exclude_end* is ``True``
(default: ``False``).
"""
# FIXME(andi): rename exclude_end to include_end
if exclude_end:
offset = 0
else:
offset = 1
end_idx = self.token_index(end) + offset
start_idx = self.token_index(start)
return self.tokens[start_idx:end_idx]
def group_tokens(self, grp_cls, tokens, ignore_ws=False):
"""Replace tokens by an instance of *grp_cls*."""
idx = self.token_index(tokens[0])
if ignore_ws:
while tokens and tokens[-1].is_whitespace():
tokens = tokens[:-1]
for t in tokens:
self.tokens.remove(t)
grp = grp_cls(tokens)
for token in tokens:
token.parent = grp
grp.parent = self
self.tokens.insert(idx, grp)
return grp
def insert_before(self, where, token):
"""Inserts *token* before *where*."""
self.tokens.insert(self.token_index(where), token)
def insert_after(self, where, token, skip_ws=True):
"""Inserts *token* after *where*."""
next_token = self.token_next(where, skip_ws=skip_ws)
if next_token is None:
self.tokens.append(token)
else:
self.tokens.insert(self.token_index(next_token), token)
def has_alias(self):
"""Returns ``True`` if an alias is present."""
return self.get_alias() is not None
def get_alias(self):
"""Returns the alias for this identifier or ``None``."""
# "name AS alias"
kw = self.token_next_match(0, T.Keyword, 'AS')
if kw is not None:
return self._get_first_name(kw, keywords=True)
# "name alias" or "complicated column expression alias"
if len(self.tokens) > 2 \
and self.token_next_by_type(0, T.Whitespace) is not None:
return self._get_first_name(reverse=True)
return None
def get_name(self):
"""Returns the name of this identifier.
This is either its alias or its real name. The returned value can
be considered the name under which the object corresponding to
this identifier is known within the current statement.
"""
alias = self.get_alias()
if alias is not None:
return alias
return self.get_real_name()
def get_real_name(self):
"""Returns the real name (object name) of this identifier."""
# a.b
dot = self.token_next_match(0, T.Punctuation, '.')
if dot is not None:
return self._get_first_name(self.token_index(dot))
return self._get_first_name()
def get_parent_name(self):
"""Return name of the parent object if any.
A parent object is identified by the first occurring dot.
"""
dot = self.token_next_match(0, T.Punctuation, '.')
if dot is None:
return None
prev_ = self.token_prev(self.token_index(dot))
if prev_ is None: # something must be verry wrong here..
return None
return self._remove_quotes(prev_.value)
def _get_first_name(self, idx=None, reverse=False, keywords=False):
"""Returns the name of the first token with a name"""
if idx and not isinstance(idx, int):
idx = self.token_index(idx) + 1
tokens = self.tokens[idx:] if idx else self.tokens
tokens = reversed(tokens) if reverse else tokens
types = [T.Name, T.Wildcard, T.String.Symbol]
if keywords:
types.append(T.Keyword)
for tok in tokens:
if tok.ttype in types:
return self._remove_quotes(tok.value)
elif isinstance(tok, Identifier) or isinstance(tok, Function):
return tok.get_name()
return None
class Statement(TokenList):
"""Represents a SQL statement."""
__slots__ = ('value', 'ttype', 'tokens')
def get_type(self):
"""Returns the type of a statement.
The returned value is a string holding an upper-cased reprint of
the first DML or DDL keyword. If the first token in this group
isn't a DML or DDL keyword "UNKNOWN" is returned.
Whitespaces and comments at the beginning of the statement
are ignored.
"""
first_token = self.token_first(ignore_comments=True)
if first_token is None:
# An "empty" statement that either has no tokens at all
# or only whitespace tokens.
return 'UNKNOWN'
elif first_token.ttype in (T.Keyword.DML, T.Keyword.DDL):
return first_token.normalized
return 'UNKNOWN'
class Identifier(TokenList):
"""Represents an identifier.
Identifiers may have aliases or typecasts.
"""
__slots__ = ('value', 'ttype', 'tokens')
def is_wildcard(self):
"""Return ``True`` if this identifier contains a wildcard."""
token = self.token_next_by_type(0, T.Wildcard)
return token is not None
def get_typecast(self):
"""Returns the typecast of this object as a string, or ``None``."""
marker = self.token_next_match(0, T.Punctuation, '::')
if marker is None:
return None
next_ = self.token_next(self.token_index(marker), False)
if next_ is None:
return None
return unicode(next_)
def get_ordering(self):
"""Returns the ordering or ``None`` as uppercase string."""
ordering = self.token_next_by_type(0, T.Keyword.Order)
if ordering is None:
return None
return ordering.value.upper()
def get_array_indices(self):
"""Returns an iterator of index token lists"""
for tok in self.tokens:
if isinstance(tok, SquareBrackets):
# Use [1:-1] index to discard the square brackets
yield tok.tokens[1:-1]
class IdentifierList(TokenList):
"""A list of :class:`~sqlparse.sql.Identifier`\'s."""
__slots__ = ('value', 'ttype', 'tokens')
def get_identifiers(self):
"""Returns the identifiers.
Whitespaces and punctuations are not included in this generator.
"""
for x in self.tokens:
if not x.is_whitespace() and not x.match(T.Punctuation, ','):
yield x
class Parenthesis(TokenList):
"""Tokens between parenthesis."""
__slots__ = ('value', 'ttype', 'tokens')
@property
def _groupable_tokens(self):
return self.tokens[1:-1]
class SquareBrackets(TokenList):
"""Tokens between square brackets"""
__slots__ = ('value', 'ttype', 'tokens')
@property
def _groupable_tokens(self):
return self.tokens[1:-1]
class Assignment(TokenList):
"""An assignment like 'var := val;'"""
__slots__ = ('value', 'ttype', 'tokens')
class If(TokenList):
"""An 'if' clause with possible 'else if' or 'else' parts."""
__slots__ = ('value', 'ttype', 'tokens')
class For(TokenList):
"""A 'FOR' loop."""
__slots__ = ('value', 'ttype', 'tokens')
class Comparison(TokenList):
"""A comparison used for example in WHERE clauses."""
__slots__ = ('value', 'ttype', 'tokens')
@property
def left(self):
return self.tokens[0]
@property
def right(self):
return self.tokens[-1]
class Comment(TokenList):
"""A comment."""
__slots__ = ('value', 'ttype', 'tokens')
def is_multiline(self):
return self.tokens and self.tokens[0].ttype == T.Comment.Multiline
class Where(TokenList):
"""A WHERE clause."""
__slots__ = ('value', 'ttype', 'tokens')
class Case(TokenList):
"""A CASE statement with one or more WHEN and possibly an ELSE part."""
__slots__ = ('value', 'ttype', 'tokens')
def get_cases(self):
"""Returns a list of 2-tuples (condition, value).
If an ELSE clause exists, its condition is None.
"""
CONDITION = 1
VALUE = 2
ret = []
mode = CONDITION
for token in self.tokens:
# Set mode from the current statement
if token.match(T.Keyword, 'CASE'):
continue
elif token.match(T.Keyword, 'WHEN'):
ret.append(([], []))
mode = CONDITION
elif token.match(T.Keyword, 'THEN'):
mode = VALUE
elif token.match(T.Keyword, 'ELSE'):
ret.append((None, []))
mode = VALUE
elif token.match(T.Keyword, 'END'):
mode = None
# First condition without preceding WHEN
if mode and not ret:
ret.append(([], []))
# Append token depending on the current mode
if mode == CONDITION:
ret[-1][0].append(token)
elif mode == VALUE:
ret[-1][1].append(token)
# Return cases list
return ret
class Function(TokenList):
"""A function or procedure call."""
__slots__ = ('value', 'ttype', 'tokens')
def get_parameters(self):
"""Return a list of parameters."""
parenthesis = self.tokens[-1]
for t in parenthesis.tokens:
if isinstance(t, IdentifierList):
return t.get_identifiers()
elif isinstance(t, Identifier) or \
isinstance(t, Function) or \
t.ttype in T.Literal:
return [t,]
return []
class Begin(TokenList):
"""A BEGIN/END block."""
__slots__ = ('value', 'ttype', 'tokens')
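A short sketch of navigating the resulting tree through the public sqlparse.parse() entry point; the statement text is arbitrary and the commented results reflect how this 0.1.19 copy is expected to group tokens:

    import sqlparse
    from sqlparse import sql, tokens as T

    stmt = sqlparse.parse('select a as col_a from tbl where a > 1')[0]

    print stmt.get_type()                        # 'SELECT'
    first = stmt.token_first()
    print first.match(T.Keyword.DML, 'select')   # True: keyword matching is case-insensitive

    ident = stmt.token_next_by_instance(0, sql.Identifier)
    print ident.get_real_name(), ident.get_alias()   # a col_a

    where = stmt.token_next_by_instance(0, sql.Where)
    print where is not None                      # True: the WHERE clause becomes a Where group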

View File

@@ -0,0 +1,83 @@
# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com
#
# This module is part of python-sqlparse and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php.
# The Token implementation is based on pygment's token system written
# by Georg Brandl.
# http://pygments.org/
"""Tokens"""
class _TokenType(tuple):
parent = None
def split(self):
buf = []
node = self
while node is not None:
buf.append(node)
node = node.parent
buf.reverse()
return buf
def __contains__(self, val):
return val is not None and (self is val or val[:len(self)] == self)
def __getattr__(self, val):
if not val or not val[0].isupper():
return tuple.__getattribute__(self, val)
new = _TokenType(self + (val,))
setattr(self, val, new)
new.parent = self
return new
def __hash__(self):
return hash(tuple(self))
def __repr__(self):
return 'Token' + (self and '.' or '') + '.'.join(self)
Token = _TokenType()
# Special token types
Text = Token.Text
Whitespace = Text.Whitespace
Newline = Whitespace.Newline
Error = Token.Error
# Text that doesn't belong to this lexer (e.g. HTML in PHP)
Other = Token.Other
# Common token types for source code
Keyword = Token.Keyword
Name = Token.Name
Literal = Token.Literal
String = Literal.String
Number = Literal.Number
Punctuation = Token.Punctuation
Operator = Token.Operator
Comparison = Operator.Comparison
Wildcard = Token.Wildcard
Comment = Token.Comment
Assignment = Token.Assignement
# Generic types for non-source code
Generic = Token.Generic
# String and some others are not direct children of Token.
# alias them:
Token.Token = Token
Token.String = String
Token.Number = Number
# SQL specific tokens
DML = Keyword.DML
DDL = Keyword.DDL
Command = Keyword.Command
Group = Token.Group
Group.Parenthesis = Token.Group.Parenthesis
Group.Comment = Token.Group.Comment
Group.Where = Token.Group.Where
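The token types defined here form a hierarchy: a child type tests as being 'in' each of its ancestors, which is what checks such as ttype in T.Keyword elsewhere in the package rely on. A brief illustration:

    from sqlparse import tokens

    print tokens.Keyword.DML in tokens.Keyword    # True: DML is a sub-type of Keyword
    print tokens.Keyword in tokens.Keyword.DML    # False: containment is not symmetric
    print tokens.Name.Builtin in tokens.Keyword   # False: a different branch of the tree
    print tokens.Keyword.DML.split()              # [Token, Token.Keyword, Token.Keyword.DML]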

View File

@@ -0,0 +1,137 @@
'''
Created on 17/05/2012
@author: piranna
'''
import re
try:
from collections import OrderedDict
except ImportError:
OrderedDict = None
if OrderedDict:
class Cache(OrderedDict):
"""Cache with LRU algorithm using an OrderedDict as basis
"""
def __init__(self, maxsize=100):
OrderedDict.__init__(self)
self._maxsize = maxsize
def __getitem__(self, key, *args, **kwargs):
# Get the key and remove it from the cache, or raise KeyError
value = OrderedDict.__getitem__(self, key)
del self[key]
# Insert the (key, value) pair on the front of the cache
OrderedDict.__setitem__(self, key, value)
# Return the value from the cache
return value
def __setitem__(self, key, value, *args, **kwargs):
# Key was inserted before, remove it so we put it at front later
if key in self:
del self[key]
# Too many items in the cache, remove the least recently used
elif len(self) >= self._maxsize:
self.popitem(False)
# Insert the (key, value) pair on the front of the cache
OrderedDict.__setitem__(self, key, value, *args, **kwargs)
else:
class Cache(dict):
"""Cache that resets when it gets full
"""
def __init__(self, maxsize=100):
dict.__init__(self)
self._maxsize = maxsize
def __setitem__(self, key, value, *args, **kwargs):
# Reset the cache if we have too many cached entries and start over
if len(self) >= self._maxsize:
self.clear()
# Insert the (key, value) pair on the front of the cache
dict.__setitem__(self, key, value, *args, **kwargs)
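# Usage sketch (illustrative only, not from the upstream module): the
# OrderedDict-based Cache above evicts the least recently used entry,
# while this dict-based fallback simply clears itself when full.
#
#     c = Cache(maxsize=2)
#     c['a'] = 1
#     c['b'] = 2
#     c['a']            # touching 'a' makes it the most recently used
#     c['c'] = 3        # the OrderedDict variant evicts 'b', the least recently used
#     print 'b' in c    # False with the OrderedDict variant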
def memoize_generator(func):
"""Memoize decorator for generators
Store `func` results in a cache keyed by their arguments, as 'memoize'
does, but for generator functions instead of regular functions.
Obviously, this is only useful if the generator always yields the same
values for the same parameters...
"""
cache = Cache()
def wrapped_func(*args, **kwargs):
# params = (args, kwargs)
params = (args, tuple(sorted(kwargs.items())))
# Look if cached
try:
cached = cache[params]
# Not cached, exec and store it
except KeyError:
cached = []
for item in func(*args, **kwargs):
cached.append(item)
yield item
cache[params] = cached
# Cached, yield its items
else:
for item in cached:
yield item
return wrapped_func
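# Usage sketch (illustrative only; the decorated generator is made up):
#
#     @memoize_generator
#     def numbers(n):
#         print 'computing', n    # runs only on the first call for a given n
#         for i in xrange(n):
#             yield i
#
#     list(numbers(3))    # computes and caches [0, 1, 2]
#     list(numbers(3))    # replayed from the Cache; no 'computing' line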
# This regular expression replaces the home-cooked parser that was here before.
# It is much faster, but requires an extra post-processing step to get the
# desired results (that are compatible with what you would expect from the
# str.splitlines() method).
#
# It matches groups of characters: newlines, quoted strings, or unquoted text,
# and splits on that basis. The post-processing step puts those back together
# into the actual lines of SQL.
SPLIT_REGEX = re.compile(r"""
(
(?: # Start of non-capturing group
(?:\r\n|\r|\n) | # Match any single newline, or
[^\r\n'"]+ | # Match any character series without quotes or
# newlines, or
"(?:[^"\\]|\\.)*" | # Match double-quoted strings, or
'(?:[^'\\]|\\.)*' # Match single quoted strings
)
)
""", re.VERBOSE)
LINE_MATCH = re.compile(r'(\r\n|\r|\n)')
def split_unquoted_newlines(text):
"""Split a string on all unquoted newlines.
Unlike str.splitlines(), this will ignore CR/LF/CR+LF if the requisite
character is inside of a string."""
lines = SPLIT_REGEX.split(text)
outputlines = ['']
for line in lines:
if not line:
continue
elif LINE_MATCH.match(line):
outputlines.append('')
else:
outputlines[-1] += line
return outputlines
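Finally, a short usage sketch of split_unquoted_newlines(); the input string is an arbitrary example containing a newline inside a string literal:

    from sqlparse.utils import split_unquoted_newlines

    text = "select 'a\nb' as v\nfrom t"
    print split_unquoted_newlines(text)
    # ["select 'a\nb' as v", 'from t']  -- the quoted newline is preserved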