1
Files
lecture-interpreters/de.churl.simple/simplelexer.py
2021-09-01 22:49:06 +02:00

236 lines
6.4 KiB
Python

from rply import LexerGenerator, LexingError
from rply.token import Token
# attempts at writing a simple Python-like lexer

# Tab stops occur every `tabsize` columns: a '\t' advances the indentation
# column to the next multiple of tabsize (see compute_position_of_newline).
tabsize = 4
def make_indent_token(token, start):
    """Convert a NewlineAndWhitespace token (in place) into an Indent token.

    The newline prefix (everything before `start`) is stripped from the
    value, and the source position is advanced past it onto the start of
    the following line.  Returns the same (mutated) token.
    """
    assert token.name == "NewlineAndWhitespace"
    remainder = token.value[start:]
    token.name = "Indent"
    token.value = remainder
    position = token.source_pos
    position.idx = position.idx + start
    position.lineno = position.lineno + 1
    position.colno = 0
    return token
def make_dedent_token(token, start):
    """Convert a NewlineAndWhitespace token (in place) into a Dedent token.

    Mirrors make_indent_token: drops the newline prefix (before `start`)
    from the value and moves the source position onto the next line.
    Returns the same (mutated) token.
    """
    assert token.name == "NewlineAndWhitespace"
    remainder = token.value[start:]
    token.name = "Dedent"
    token.value = remainder
    position = token.source_pos
    position.idx = position.idx + start
    position.lineno = position.lineno + 1
    position.colno = 0
    return token
# split the token in two: one for the newline and one for the
# in/dedent
# the NewlineAndWhitespace token looks like this: \r?\n[ \f\t]*
def compute_position_of_newline(token):
    """Return (start, column) for a NewlineAndWhitespace token.

    `start` is the index just past the newline ('\\n' is one char,
    '\\r\\n' is two).  `column` is the indentation depth of the trailing
    whitespace: spaces count 1, tabs round up to the next `tabsize`
    stop, and a form feed resets the column to 0.
    """
    assert token.name == "NewlineAndWhitespace"
    text = token.value
    start = 1 if text[0] == '\n' else 2
    column = 0
    for ch in text[start:]:
        if ch == ' ':
            column += 1
        elif ch == '\t':
            column = (column // tabsize + 1) * tabsize
        elif ch == '\f':
            column = 0
    return start, column
def compute_indent_or_dedent(token, indentation_levels, output_tokens):
    """Append a Newline token plus the Indent/Dedent tokens it implies.

    `indentation_levels` is the stack of open indentation columns (it is
    mutated in place); `output_tokens` receives the generated tokens.
    """
    start, column = compute_position_of_newline(token)
    # everything before `start` becomes a plain Newline token
    output_tokens.append(Token("Newline", token.value[:start], token.source_pos))
    # the whitespace after `start` determines indent/dedent
    if column > indentation_levels[-1]:
        # deeper than the current level: open exactly one new level
        indentation_levels.append(column)
        output_tokens.append(make_indent_token(token, start))
    else:
        popped = 0
        while column < indentation_levels[-1]:
            indentation_levels.pop()
            popped += 1
            output_tokens.append(Token("Dedent", "", token.source_pos))
        if popped:
            # replace the last placeholder so the final Dedent carries the
            # whitespace text and the position adjusted past the newline
            output_tokens[-1] = make_dedent_token(token, start)
# input: lexer token stream
# output: modified token stream
def postprocess(tokens, source):
    """Rewrite the raw rply token stream into the final stream.

    Drops Ignore tokens, collapses blank lines, turns newline+whitespace
    runs into Newline/Indent/Dedent tokens (but not inside brackets,
    where newlines are implicit line continuations), and appends an EOF
    token.  `source` is accepted but currently unused.

    Raises LexingError on an unmatched closing bracket.
    """
    depth = 0  # current bracket nesting level
    indentation_levels = [0]
    output_tokens = []
    significant = [tok for tok in tokens if tok.name != "Ignore"]
    last = None
    for index, tok in enumerate(significant):
        last = tok
        name = tok.name
        # never create indent/dedent tokens between brackets
        if name == "OpenBracket":
            depth += 1
            output_tokens.append(tok)
        elif name == "CloseBracket":
            depth -= 1
            if depth < 0:
                raise LexingError("unmatched parenthesis", tok.source_pos)
            output_tokens.append(tok)
        elif name == "NewlineAndWhitespace":
            following = significant[index + 1] if index + 1 < len(significant) else None
            if following is not None and following.name == "NewlineAndWhitespace":
                continue  # blank line: only the last newline run counts
            if depth == 0:
                compute_indent_or_dedent(tok, indentation_levels, output_tokens)
            # inside brackets: drop the newline entirely (continuation)
        else:
            # anything else (name, keyword, literal, operator) passes through
            output_tokens.append(tok)
    if last is not None:
        output_tokens.append(Token("EOF", "", last.source_pos))
    return output_tokens
# RPython reimplementation
def group(*choices, **namegroup):
choices = list(choices)
return '(' + '|'.join(choices) + ')'
# RPython reimplementation
def any(*choices):
result = group(*choices) + '*'
return result
# ' or " string. eg. 'hello' or "hello"
def make_single_string(delim):
normal_chars = r"[^\n\%s]*" % (delim,)
return "".join([delim, normal_chars,
any(r"\\." + normal_chars), delim])
# ____________________________________________________________
# Literals
Number = r'(([+-])?[1-9][0-9]*)|0'
# ____________________________________________________________
# Ignored
Whitespace = r'[ \f\t]'
Newline = r'\r?\n'
Linecontinue = r'\\' + Newline
Comment = r'#[^\r\n]*'
NewlineAndWhitespace = Newline + any(Whitespace)
Ignore = group(Whitespace + '+', Linecontinue, Comment)
# ____________________________________________________________
# Identifier
Name = r'[a-zA-Z_][a-zA-Z0-9_]*'
PrimitiveName = '\\$' + Name
# ____________________________________________________________
# Symbols
Colon = r'\:'
Comma = r'\,'
Assign = r'\='
OpenBracket = r'[\[\(\{]'
CloseBracket = r'[\]\)\}]'
# ____________________________________________________________
# Project
Boolean = r"true|false"
String = group(make_single_string(r"\'"), make_single_string(r'\"'))
_sign = r"([+-])?"
_int = r"(([1-9][0-9]*)|0)"
_dec = r"(([0-9]*[1-9])|0)"
Double = group(_sign + group(_int, r"") + r"\." + _dec, # 0.1 / .1
_sign + _int + r"\." + group(_dec, r"")) # 1.0 / 1.
Plus = r'\+'
Minus = r'-'
Multiply = r'\*'
Divide = r'/'
Increment = r'\+\+'
Decrement = r'--'
Modulo = r'%'
PlusInplace = r'\+='
MinusInplace = r'-='
MultiplyInplace = r'\*='
DivideInplace = r'/='
Less = r'<'
LessEqual = r'<='
Greater = r'>'
GreaterEqual = r'>='
Equal = r'=='
NotEqual = r'!='
And = r'&&'
Or = r'\|\|'
Not = r'!'
GC = r'gc'
# ____________________________________________________________
# Keywords
If = r'if'
Else = r'else'
While = r'while'
Def = r'def'
Object = r'object'
tokens = ["If", "Else", "While", "Def", "Object", "Ignore",
"String", "Boolean", "Double",
"Number", # after Double
"GC",
"NewlineAndWhitespace", "OpenBracket", "CloseBracket", "Comma", "Colon",
"And", "Or", "LessEqual", "Less", "GreaterEqual", "Greater", "Equal", "NotEqual",
"Decrement", "PlusInplace", "MinusInplace", "MultiplyInplace", "DivideInplace",
"Increment", "Plus", "Minus", "Multiply", "Divide", "Modulo",
"Assign", "Not",
"Name", "PrimitiveName",
]
def make_lexer():
lg = LexerGenerator()
for token in tokens:
# e.g. (Name, r'[a-zA-Z_][a-zA-Z0-9_]*')
lg.add(token, globals()[token])
return lg.build()
lexer = make_lexer()
# s is the simple program code
def lex(s):
if not s.endswith('\n'):
s += '\n'
return list(postprocess(lexer.lex(s), s))