1
Files
lecture-interpreters/de.churl.simple/simplelexer.py
Christoph 81e2dd0745 cleanup
2021-09-02 18:26:59 +02:00

235 lines
6.3 KiB
Python

from rply import LexerGenerator, LexingError
from rply.token import Token
# attempts at writing a simple Python-like lexer
# Width used when expanding tabs in indentation: a tab advances the column
# to the next multiple of this value (see compute_position_of_newline).
tabsize = 4
def make_indent_token(token, start):
    """Rewrite a NewlineAndWhitespace token in place as an Indent token.

    The first `start` characters (the line break) are dropped from the
    value and the source position is moved past them onto the new line.
    Returns the same (mutated) token object.
    """
    assert token.name == "NewlineAndWhitespace"
    pos = token.source_pos
    pos.idx += start
    pos.lineno += 1
    pos.colno = 0
    token.value = token.value[start:]
    token.name = "Indent"
    return token
def make_dedent_token(token, start):
    """Rewrite a NewlineAndWhitespace token in place as a Dedent token.

    Strips the leading line break (the first `start` characters) from the
    value and advances the source position to the start of the new line.
    Returns the same (mutated) token object.
    """
    assert token.name == "NewlineAndWhitespace"
    token.name = "Dedent"
    token.value = token.value[start:]
    source_pos = token.source_pos
    source_pos.idx += start
    source_pos.lineno += 1
    source_pos.colno = 0
    return token
# split the token in two: one for the newline and one for the in/dedent;
# the NewlineAndWhitespace token value looks like this: \r?\n[ \f\t]*
def compute_position_of_newline(token):
    """Return (start, column) for a NewlineAndWhitespace token.

    `start` is the index just past the line break ('\\n' is one character,
    '\\r\\n' is two) and `column` is the indentation depth of the trailing
    whitespace: spaces count 1, a tab advances to the next multiple of
    `tabsize`, and a form feed resets the column to 0.
    """
    assert token.name == "NewlineAndWhitespace"
    text = token.value
    # '\n' -> break is one char; otherwise it must be '\r\n' -> two chars
    start = 1 if text[0] == '\n' else 2
    column = 0
    for ch in text[start:]:  # measure the indentation depth
        if ch == ' ':
            column += 1
        elif ch == '\t':
            column = (column // tabsize + 1) * tabsize
        elif ch == '\f':
            column = 0
    return start, column
def compute_indent_or_dedent(token, indentation_levels, output_tokens):
    """Expand one NewlineAndWhitespace token into a Newline token plus
    Indent/Dedent tokens, appending them to output_tokens.

    indentation_levels is the stack of currently open indentation columns
    (its last entry is the current level); it is mutated in place.
    """
    start, column = compute_position_of_newline(token)
    # before start: new line token
    output_tokens.append(Token("Newline", token.value[:start], token.source_pos))
    # after start: deal with white spaces (create indent or dedent token)
    if column > indentation_levels[-1]: # count indents or dedents
        indentation_levels.append(column)
        token = make_indent_token(token, start)
        output_tokens.append(token)
    else:
        dedented = False
        # pop one level per closed block, emitting an empty Dedent each time
        while column < indentation_levels[-1]:
            dedented = True
            indentation_levels.pop()
            output_tokens.append(Token("Dedent", "",
                                       token.source_pos))
        # replace the last placeholder Dedent with one carrying the actual
        # whitespace text and the adjusted source position
        if dedented:
            token = make_dedent_token(token, start)
            output_tokens[-1] = token
        # NOTE(review): a column that does not exactly match a remaining
        # entry in indentation_levels is accepted silently (no equivalent
        # of Python's IndentationError) — confirm this is intended.
# input: lexer token stream
# output: modified token stream
def postprocess(tokens, source):
    """Turn the raw token stream into one with explicit Newline, Indent and
    Dedent tokens, no Ignore tokens, and a trailing EOF token.

    Raises LexingError on a closing bracket with no matching opener.
    """
    bracket_depth = 0
    indentation_levels = [0]
    result = []
    stream = [tok for tok in tokens if tok.name != "Ignore"]
    token = None
    for index, token in enumerate(stream):
        name = token.name
        # never create indent/dedent tokens between brackets
        if name == "OpenBracket":
            bracket_depth += 1
            result.append(token)
        elif name == "CloseBracket":
            bracket_depth -= 1
            if bracket_depth < 0:
                raise LexingError("unmatched parenthesis", token.source_pos)
            result.append(token)
        elif name == "NewlineAndWhitespace":
            # collapse a run of blank lines: only the run's last newline
            # token produces output
            if index + 1 < len(stream) and stream[index + 1].name == "NewlineAndWhitespace":
                continue
            if bracket_depth == 0:
                compute_indent_or_dedent(token, indentation_levels, result)
            # inside brackets: implicit line continuation, token dropped
        else:
            # something else: e.g. name, keyword, etc...
            result.append(token)
    if token is not None:
        result.append(Token("EOF", "", token.source_pos))
    return result
# RPython reimplementation
def group(*choices, **namegroup):
    """Join regex alternatives into a single group: '(a|b|...)'.

    `namegroup` is accepted for signature compatibility but ignored.
    (The previous `choices = list(choices)` copy was dead code: str.join
    accepts the tuple directly.)
    """
    return '(' + '|'.join(choices) + ')'
# RPython reimplementation
# NOTE(review): this shadows the builtin `any` at module level; renaming
# would touch every caller, so the original name is kept.
def any(*choices):
    """Return a regex matching zero or more repetitions of the choices."""
    return group(*choices) + '*'
# ' or " string. eg. 'hello' or "hello"
def make_single_string(delim):
    """Build a regex for a single-line string quoted by `delim`.

    `delim` is an already-escaped quote character (e.g. r"\\'"). Backslash
    escapes inside the string are allowed; literal newlines are not.
    """
    # run of chars that cannot terminate or escape the string
    normal_chars = r"[^\n\%s]*" % (delim,)
    pieces = [delim, normal_chars, any(r"\\." + normal_chars), delim]
    return "".join(pieces)
# ____________________________________________________________
# Literals
# Integer literal: optionally signed non-zero number, or a bare 0.
# NOTE(review): the sign only applies to the non-zero branch, so "-0"
# does not match as a single Number — confirm this is intended.
Number = r'(([+-])?[1-9][0-9]*)|0'
# ____________________________________________________________
# Ignored
Whitespace = r'[ \f\t]'
Newline = r'\r?\n'
Linecontinue = r'\\' + Newline  # backslash at end of line
Comment = r'#[^\r\n]*'
# a line break followed by the indentation of the next line
NewlineAndWhitespace = Newline + any(Whitespace)
Ignore = group(Whitespace + '+', Linecontinue, Comment)
# ____________________________________________________________
# Identifier
Name = r'[a-zA-Z_][a-zA-Z0-9_]*'
PrimitiveName = '\\$' + Name  # identifier prefixed with '$'
# ____________________________________________________________
# Symbols
Colon = r'\:'
Comma = r'\,'
Assign = r'\='
OpenBracket = r'[\[\(\{]'
CloseBracket = r'[\]\)\}]'
# ____________________________________________________________
# Project
Boolean = r"true|false"
String = group(make_single_string(r"\'"), make_single_string(r'\"'))
_sign = r"([+-])?"
_int = r"(([1-9][0-9]*)|0)"
_dec = r"(([0-9]*[1-9])|0)"
Double = group(_sign + group(_int, r"") + r"\." + _dec, # 0.1 / .1
_sign + _int + r"\." + group(_dec, r"")) # 1.0 / 1.
Plus = r'\+'
Minus = r'-'
Multiply = r'\*'
Divide = r'/'
Modulo = r'%'
Increment = r'\+\+'
Decrement = r'--'
PlusInplace = r'\+='
MinusInplace = r'-='
MultiplyInplace = r'\*='
DivideInplace = r'/='
Less = r'<'
LessEqual = r'<='
Greater = r'>'
GreaterEqual = r'>='
Equal = r'=='
NotEqual = r'!='
And = r'&&'
Or = r'\|\|'
Not = r'!'
GC = r'gc'
# ____________________________________________________________
# Keywords
If = r'if'
Else = r'else'
While = r'while'
Def = r'def'
Object = r'object'
# Rule names registered with the lexer in this order (see make_lexer).
# Multi-character operators are listed before their one-character prefixes
# (e.g. "LessEqual" before "Less", "Equal" before "Assign"), and keywords
# before Name.
# NOTE(review): the keyword patterns carry no \b anchor, so an identifier
# like "iffy" can match the If rule on its "if" prefix — confirm this is
# the intended behavior.
tokens = ["If", "Else", "While", "Def", "Object", "Ignore",
"String", "Boolean", "Double",
"Number",
"GC",
"NewlineAndWhitespace", "OpenBracket", "CloseBracket", "Comma", "Colon",
"And", "Or", "LessEqual", "Less", "GreaterEqual", "Greater", "Equal", "NotEqual",
"Decrement", "PlusInplace", "MinusInplace", "MultiplyInplace", "DivideInplace",
"Increment", "Plus", "Minus", "Multiply", "Divide", "Modulo",
"Assign", "Not",
"Name", "PrimitiveName"]
def make_lexer():
    """Build the rply lexer, registering one rule per entry of `tokens`,
    each paired with the module-level regex of the same name."""
    generator = LexerGenerator()
    for rule_name in tokens:
        # e.g. ("Name", r'[a-zA-Z_][a-zA-Z0-9_]*')
        generator.add(rule_name, globals()[rule_name])
    return generator.build()
# single module-level lexer instance, shared by lex()
lexer = make_lexer()
# s is the simple program code
def lex(s):
    """Tokenize the simple-language source string `s`.

    A trailing newline is appended when missing, so the final logical line
    still yields its Newline/Dedent tokens before EOF.
    """
    source = s if s.endswith('\n') else s + '\n'
    return list(postprocess(lexer.lex(source), source))