import string
import warnings
from json import loads
from jmespath.exceptions import LexerError, EmptyExpressionError
class Lexer(object):
    """Tokenizer for JMESPath expressions.

    ``tokenize`` yields one dict per token with the keys ``type``,
    ``value``, ``start`` and ``end``.  ``start`` and ``end`` are character
    offsets into the expression; ``end`` is exclusive, so
    ``expression[start:end]`` is the source text of the token.  The token
    stream is always terminated by an ``eof`` token.
    """

    START_IDENTIFIER = set(string.ascii_letters + '_')
    VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_')
    VALID_NUMBER = set(string.digits)
    WHITESPACE = set(" \t\n\r")
    # Single characters that always map to exactly one token type.
    SIMPLE_TOKENS = {
        '.': 'dot',
        '*': 'star',
        ']': 'rbracket',
        ',': 'comma',
        ':': 'colon',
        '@': 'current',
        '(': 'lparen',
        ')': 'rparen',
        '{': 'lbrace',
        '}': 'rbrace',
    }

    def tokenize(self, expression):
        """Lazily yield the tokens found in ``expression``.

        This is a generator, so nothing (including input validation)
        happens until it is iterated.

        :param expression: The expression string to tokenize.
        :returns: An iterator of token dicts, ending with an ``eof`` token.
        :raises EmptyExpressionError: If ``expression`` is empty.
        :raises LexerError: On an unknown character, an unclosed
            delimiter, or an invalid literal / quoted identifier.
        """
        self._initialize_for_expression(expression)
        while self._current is not None:
            if self._current in self.SIMPLE_TOKENS:
                yield {'type': self.SIMPLE_TOKENS[self._current],
                       'value': self._current,
                       'start': self._position, 'end': self._position + 1}
                self._next()
            elif self._current in self.START_IDENTIFIER:
                start = self._position
                buff = self._current
                while self._next() in self.VALID_IDENTIFIER:
                    buff += self._current
                yield {'type': 'unquoted_identifier', 'value': buff,
                       'start': start, 'end': start + len(buff)}
            elif self._current in self.WHITESPACE:
                self._next()
            elif self._current == '[':
                start = self._position
                next_char = self._next()
                if next_char == ']':
                    self._next()
                    yield {'type': 'flatten', 'value': '[]',
                           'start': start, 'end': start + 2}
                elif next_char == '?':
                    self._next()
                    yield {'type': 'filter', 'value': '[?',
                           'start': start, 'end': start + 2}
                else:
                    yield {'type': 'lbracket', 'value': '[',
                           'start': start, 'end': start + 1}
            elif self._current == "'":
                yield self._consume_raw_string_literal()
            elif self._current == '|':
                yield self._match_or_else('|', 'or', 'pipe')
            elif self._current == '&':
                yield self._match_or_else('&', 'and', 'expref')
            elif self._current == '`':
                yield self._consume_literal()
            elif self._current in self.VALID_NUMBER:
                start = self._position
                buff = self._consume_number()
                yield {'type': 'number', 'value': int(buff),
                       'start': start, 'end': start + len(buff)}
            elif self._current == '-':
                # Negative number.
                start = self._position
                buff = self._consume_number()
                if len(buff) > 1:
                    yield {'type': 'number', 'value': int(buff),
                           'start': start, 'end': start + len(buff)}
                else:
                    # A lone '-' with no digits after it is not a token.
                    raise LexerError(lexer_position=start,
                                     lexer_value=buff,
                                     message="Unknown token '%s'" % buff)
            elif self._current == '"':
                yield self._consume_quoted_identifier()
            elif self._current == '<':
                yield self._match_or_else('=', 'lte', 'lt')
            elif self._current == '>':
                yield self._match_or_else('=', 'gte', 'gt')
            elif self._current == '!':
                yield self._match_or_else('=', 'ne', 'not')
            elif self._current == '=':
                # Remember where the first '=' is: ``_next`` does not
                # advance ``_position`` at EOF, so it cannot be recovered
                # reliably from ``_position`` afterwards.
                start = self._position
                if self._next() == '=':
                    yield {'type': 'eq', 'value': '==',
                           'start': start, 'end': start + 2}
                    self._next()
                else:
                    # A single '=' is not a valid token.
                    raise LexerError(
                        lexer_position=start,
                        lexer_value='=',
                        message="Unknown token '='")
            else:
                raise LexerError(lexer_position=self._position,
                                 lexer_value=self._current,
                                 message="Unknown token %s" % self._current)
        yield {'type': 'eof', 'value': '',
               'start': self._length, 'end': self._length}

    def _consume_number(self):
        """Return the current character plus all digits that follow it.

        The current character is taken unconditionally, which lets the
        caller use this for a leading ``-`` as well as for a digit.
        """
        buff = self._current
        while self._next() in self.VALID_NUMBER:
            buff += self._current
        return buff

    def _initialize_for_expression(self, expression):
        """Reset the lexer state so it points at the first character.

        :raises EmptyExpressionError: If ``expression`` is falsy.
        """
        if not expression:
            raise EmptyExpressionError()
        self._position = 0
        self._expression = expression
        self._chars = list(self._expression)
        self._current = self._chars[self._position]
        self._length = len(self._expression)

    def _next(self):
        """Advance one character and return it, or ``None`` at EOF.

        When the input is exhausted ``_current`` becomes ``None`` but
        ``_position`` stays on the last character.
        """
        if self._position == self._length - 1:
            self._current = None
        else:
            self._position += 1
            self._current = self._chars[self._position]
        return self._current

    def _token_end(self):
        """Return the exclusive end offset of the token just consumed.

        Must be called right after the token's last character has been
        stepped over.  ``_next`` does not move ``_position`` past the last
        character, so at EOF the end is the expression length instead.
        """
        if self._current is None:
            return self._length
        return self._position

    def _consume_until(self, delimiter):
        """Return the text up to the next unescaped ``delimiter``.

        The lexer must be positioned on the opening delimiter.  Escape
        sequences are kept verbatim (backslash included) in the returned
        text.  On return the lexer is positioned just past the closing
        delimiter.

        :raises LexerError: If EOF is reached before the closing delimiter.
        """
        # Consume until the delimiter is reached,
        # allowing for the delimiter to be escaped with "\".
        start = self._position
        buff = ''
        self._next()
        while self._current != delimiter:
            if self._current == '\\':
                buff += '\\'
                self._next()
            if self._current is None:
                # We're at the EOF.
                raise LexerError(lexer_position=start,
                                 lexer_value=self._expression[start:],
                                 message="Unclosed %s delimiter" % delimiter)
            buff += self._current
            self._next()
        # Skip the closing delimiter.
        self._next()
        return buff

    def _consume_literal(self):
        """Consume a backtick-delimited JSON literal and return its token.

        :raises LexerError: If the literal is unclosed, or its contents
            are neither valid JSON nor usable as a bare JSON string.
        """
        start = self._position
        lexeme = self._consume_until('`').replace('\\`', '`')
        try:
            # Assume it is valid JSON and attempt to parse.
            parsed_json = loads(lexeme)
        except ValueError:
            try:
                # Invalid JSON values should be converted to quoted
                # JSON strings during the JEP-12 deprecation period.
                parsed_json = loads('"%s"' % lexeme.lstrip())
                warnings.warn("deprecated string literal syntax",
                              PendingDeprecationWarning)
            except ValueError:
                raise LexerError(lexer_position=start,
                                 lexer_value=self._expression[start:],
                                 message="Bad token %s" % lexeme)
        return {'type': 'literal', 'value': parsed_json,
                'start': start, 'end': self._token_end()}

    def _consume_quoted_identifier(self):
        """Consume a double-quoted identifier and return its token.

        The quoted text is decoded as a JSON string.

        :raises LexerError: If the identifier is unclosed or is not a
            valid JSON string.
        """
        start = self._position
        lexeme = '"' + self._consume_until('"') + '"'
        try:
            value = loads(lexeme)
        except ValueError as e:
            error_message = str(e).split(':')[0]
            raise LexerError(lexer_position=start,
                             lexer_value=lexeme,
                             message=error_message)
        return {'type': 'quoted_identifier', 'value': value,
                'start': start, 'end': self._token_end()}

    def _consume_raw_string_literal(self):
        """Consume a single-quoted raw string and return its token.

        Only ``\\'`` is unescaped; every other character is kept as is.

        :raises LexerError: If the string is unclosed.
        """
        start = self._position
        lexeme = self._consume_until("'").replace("\\'", "'")
        return {'type': 'literal', 'value': lexeme,
                'start': start, 'end': self._token_end()}

    def _match_or_else(self, expected, match_type, else_type):
        """Lex a one- or two-character operator starting at the cursor.

        If the character after the current one equals ``expected``, both
        are consumed and a ``match_type`` token is returned; otherwise only
        the current character is consumed and an ``else_type`` token is
        returned.
        """
        start = self._position
        current = self._current
        next_char = self._next()
        if next_char == expected:
            self._next()
            return {'type': match_type, 'value': current + next_char,
                    'start': start, 'end': start + 2}
        return {'type': else_type, 'value': current,
                'start': start, 'end': start + 1}