# Simple Python-like lexer built on rply.
from rply import LexerGenerator, LexingError
|
|
from rply.token import Token
|
|
|
|
# attempts at writing a simple Python-like lexer

# A tab advances the indentation column to the next multiple of this
# value (see compute_position_of_newline).
tabsize = 4
|
|
|
|
|
|
def _retarget_whitespace_token(token, start, new_name):
    """Rewrite a NewlineAndWhitespace *token* in place into *new_name*.

    Drops the first *start* characters (the leading line break) from the
    token's value and advances the source position onto the new line:
    idx moves past the break, lineno increases by one, colno resets to 0.
    Returns the same (mutated) token object.
    """
    assert token.name == "NewlineAndWhitespace"
    token.name = new_name
    token.value = token.value[start:]
    token.source_pos.idx += start
    token.source_pos.lineno += 1
    token.source_pos.colno = 0
    return token


def make_indent_token(token, start):
    """Turn a NewlineAndWhitespace token into an Indent token (in place)."""
    return _retarget_whitespace_token(token, start, "Indent")


def make_dedent_token(token, start):
    """Turn a NewlineAndWhitespace token into a Dedent token (in place)."""
    return _retarget_whitespace_token(token, start, "Dedent")
|
|
|
|
|
|
# split the token in two: one for the newline and one for the
# in/dedent
# the NewlineAndWhitespace token looks like this: \r?\n[ \f\t]*
def compute_position_of_newline(token):
    """Return ``(start, column)`` for a NewlineAndWhitespace token.

    ``start`` is the length of the leading line break (1 for LF,
    2 for CRLF); ``column`` is the indentation depth of the trailing
    whitespace, with tabs expanding to the next multiple of ``tabsize``
    and a form feed resetting the column to zero.
    """
    assert token.name == "NewlineAndWhitespace"
    text = token.value
    # the token always begins with "\n" (one char) or "\r\n" (two chars)
    start = 1 if text[0] == '\n' else 2
    column = 0
    # count the indentation depth of the whitespace after the break
    for ch in text[start:]:
        if ch == ' ':
            column += 1
        elif ch == '\t':
            column = (column // tabsize + 1) * tabsize
        elif ch == '\f':
            column = 0
    return start, column
|
|
|
|
|
|
def compute_indent_or_dedent(token, indentation_levels, output_tokens):
    """Emit Newline plus Indent/Dedent tokens for one NewlineAndWhitespace token.

    Mutates *indentation_levels* (a stack of open indentation columns,
    bottom entry 0) and appends to *output_tokens*.  The incoming
    *token* object itself is reused — renamed and re-sliced in place —
    as the Indent token or the final Dedent token.
    """
    start, column = compute_position_of_newline(token)
    # before start: new line token
    output_tokens.append(Token("Newline", token.value[:start], token.source_pos))
    # after start: deal with white spaces (create indent or dedent token)
    if column > indentation_levels[-1]: # count indents or dedents
        # deeper than the current level: open a new indentation block
        indentation_levels.append(column)
        token = make_indent_token(token, start)
        output_tokens.append(token)
    else:
        dedented = False
        # pop every level deeper than the new column; one Dedent each
        while column < indentation_levels[-1]:
            dedented = True
            indentation_levels.pop()
            output_tokens.append(Token("Dedent", "",
                                       token.source_pos))
        if dedented:
            # replace the last placeholder Dedent with one carrying the
            # actual whitespace text and an adjusted source position
            token = make_dedent_token(token, start)
            output_tokens[-1] = token
        # NOTE(review): a dedent column that matches no remaining level
        # (inconsistent indentation) is silently accepted here rather
        # than reported — confirm this is intended.
|
|
|
|
|
|
# input: lexer token stream
# output: modified token stream
def postprocess(tokens, source):
    """Rewrite the raw token stream into one with explicit structure.

    Drops Ignore tokens, converts NewlineAndWhitespace tokens into
    Newline plus Indent/Dedent tokens (outside brackets only), keeps
    only the last newline of a blank-line run, and appends a final EOF
    token.  Raises LexingError on a close bracket with no matching
    open bracket.
    """
    bracket_depth = 0
    indentation_levels = [0]
    result = []
    significant = [tok for tok in tokens if tok.name != "Ignore"]
    last_token = None
    for index, tok in enumerate(significant):
        last_token = tok
        # never create indent/dedent token between brackets
        if tok.name == "OpenBracket":
            bracket_depth += 1
            result.append(tok)
        elif tok.name == "CloseBracket":
            bracket_depth -= 1
            if bracket_depth < 0:
                raise LexingError("unmatched parenthesis", tok.source_pos)
            result.append(tok)
        elif tok.name == "NewlineAndWhitespace":
            nxt = significant[index + 1] if index + 1 < len(significant) else None
            if nxt is not None and nxt.name == "NewlineAndWhitespace":
                # blank line: only the last newline of a run matters
                continue
            if bracket_depth == 0:
                compute_indent_or_dedent(tok, indentation_levels, result)
            # else: implicit line-continuation within brackets — drop it
        else:
            # something else: e.g. name, keyword, etc...
            result.append(tok)
    if last_token is not None:
        result.append(Token("EOF", "", last_token.source_pos))
    return result
|
|
|
|
|
|
# RPython reimplementation
def group(*choices, **namegroup):
    """Join regex alternatives into one group: group('a', 'b') -> '(a|b)'.

    ``namegroup`` is accepted for call-site compatibility but unused.
    """
    return '(' + '|'.join(choices) + ')'


# RPython reimplementation
def any(*choices):
    """Zero-or-more repetitions of any alternative: any('a') -> '(a)*'.

    NOTE: intentionally shadows the ``any`` builtin within this module.
    """
    return group(*choices) + '*'
|
|
|
|
|
|
# ' or " string. eg. 'hello' or "hello"
def make_single_string(delim):
    """Build a regex matching a string literal quoted by *delim*.

    Ordinary characters exclude newline, backslash and the delimiter;
    backslash escapes may introduce further ordinary runs.
    """
    ordinary = r"[^\n\%s]*" % (delim,)
    escaped_runs = any(r"\\." + ordinary)
    return delim + ordinary + escaped_runs + delim
|
|
|
|
|
|
# ____________________________________________________________
# Literals

# optionally signed integer with no leading zeros, or a bare 0
# NOTE(review): the optional sign means e.g. "a+1" may lex "+1" as one
# Number instead of Plus then Number — confirm this is intended.
Number = r'(([+-])?[1-9][0-9]*)|0'

# ____________________________________________________________
# Ignored

Whitespace = r'[ \f\t]'          # intra-line whitespace (no line break)
Newline = r'\r?\n'               # LF or CRLF
Linecontinue = r'\\' + Newline   # backslash at end of line
Comment = r'#[^\r\n]*'           # '#' to end of line
# a line break plus the next line's leading whitespace; split into
# Newline + Indent/Dedent tokens by postprocess()
NewlineAndWhitespace = Newline + any(Whitespace)
# tokens removed entirely before postprocessing
Ignore = group(Whitespace + '+', Linecontinue, Comment)
|
|
|
|
# ____________________________________________________________
# Identifier

Name = r'[a-zA-Z_][a-zA-Z0-9_]*'
PrimitiveName = '\\$' + Name     # '$'-prefixed identifier

# ____________________________________________________________
# Symbols

Colon = r'\:'
Comma = r'\,'
Assign = r'\='

OpenBracket = r'[\[\(\{]'        # any of ( [ {
CloseBracket = r'[\]\)\}]'       # any of ) ] }
|
|
|
|
# ____________________________________________________________
# Project

# NOTE(review): keyword-like patterns (Boolean, GC, If, ...) are plain
# prefix matches tried before Name, so an identifier such as "truest"
# would lex as Boolean "true" followed by Name "st" — confirm intended.
Boolean = r"true|false"
String = group(make_single_string(r"\'"), make_single_string(r'\"'))

_sign = r"([+-])?"
_int = r"(([1-9][0-9]*)|0)"
# NOTE(review): _dec forbids trailing zeros, so "1.50" will not match
# as a single Double token — confirm intended.
_dec = r"(([0-9]*[1-9])|0)"
Double = group(_sign + group(_int, r"") + r"\." + _dec, # 0.1 / .1
               _sign + _int + r"\." + group(_dec, r"")) # 1.0 / 1.

# arithmetic operators
Plus = r'\+'
Minus = r'-'
Multiply = r'\*'
Divide = r'/'
Modulo = r'%'

# increment/decrement and in-place assignment operators
Increment = r'\+\+'
Decrement = r'--'
PlusInplace = r'\+='
MinusInplace = r'-='
MultiplyInplace = r'\*='
DivideInplace = r'/='

# comparison operators
Less = r'<'
LessEqual = r'<='
Greater = r'>'
GreaterEqual = r'>='
Equal = r'=='
NotEqual = r'!='

# boolean operators
And = r'&&'
Or = r'\|\|'
Not = r'!'

GC = r'gc'
|
|
|
|
# ____________________________________________________________
# Keywords

If = r'if'
Else = r'else'
While = r'while'
Def = r'def'
Object = r'object'

# Order is match priority for the generated lexer: multi-character
# operators precede their single-character prefixes (LessEqual before
# Less, Equal before Assign, the *Inplace forms before Plus/Minus/...),
# and keywords precede Name.
tokens = ["If", "Else", "While", "Def", "Object", "Ignore",
          "String", "Boolean", "Double",
          "Number",
          "GC",
          "NewlineAndWhitespace", "OpenBracket", "CloseBracket", "Comma", "Colon",
          "And", "Or", "LessEqual", "Less", "GreaterEqual", "Greater", "Equal", "NotEqual",
          "Decrement", "PlusInplace", "MinusInplace", "MultiplyInplace", "DivideInplace",
          "Increment", "Plus", "Minus", "Multiply", "Divide", "Modulo",
          "Assign", "Not",
          "Name", "PrimitiveName"]
|
|
|
|
|
|
def make_lexer():
    """Build the rply lexer: one rule per name in ``tokens``.

    Each rule's pattern is the module-level constant of the same name;
    list order in ``tokens`` is the matching priority.
    """
    generator = LexerGenerator()
    module_ns = globals()
    for rule_name in tokens:
        # e.g. (Name, r'[a-zA-Z_][a-zA-Z0-9_]*')
        generator.add(rule_name, module_ns[rule_name])
    return generator.build()


lexer = make_lexer()
|
|
|
|
|
|
# s is the simple program code
def lex(s):
    """Tokenize program text *s* and return the postprocessed token list.

    A trailing newline is appended if missing so the final line still
    produces its Newline/Dedent tokens.
    """
    source = s if s.endswith('\n') else s + '\n'
    return list(postprocess(lexer.lex(source), source))