from rply import LexerGenerator, LexingError
from rply.token import Token, SourcePosition

# an attempt at writing a simple Python-like lexer

tabsize = 4


def make_indent_or_dedent_token(token, name, start):
    # build an Indent or Dedent token from the whitespace part of a
    # NewlineAndWhitespace token; a fresh SourcePosition is constructed
    # so the Newline token, which shares the original position object,
    # is not mutated
    assert token.name == "NewlineAndWhitespace"
    pos = token.source_pos
    new_pos = SourcePosition(pos.idx + start, pos.lineno + 1, 0)
    return Token(name, token.value[start:], new_pos)


# the NewlineAndWhitespace token looks like this: \r?\n[ \f\t]*
# it is later split in two: one token for the newline and one for the
# indent or dedent
def compute_position_of_newline(token):
    assert token.name == "NewlineAndWhitespace"
    s = token.value
    length = len(s)
    column = 0
    if s[0] == '\n':
        start = 1
    else:
        start = 2  # the token starts with "\r\n"
    pos = start
    # count the indentation depth of the whitespace
    while pos < length:
        c = s[pos]
        if c == ' ':
            column = column + 1
        elif c == '\t':
            # round up to the next multiple of tabsize
            column = (column // tabsize + 1) * tabsize
        elif c == '\f':
            column = 0
        pos = pos + 1
    return start, column


def compute_indent_or_dedent(token, indentation_levels, output_tokens):
    start, column = compute_position_of_newline(token)
    # before start: the newline itself
    output_tokens.append(Token("Newline", token.value[:start], token.source_pos))
    # after start: the whitespace becomes an Indent or Dedent token
    if column > indentation_levels[-1]:
        indentation_levels.append(column)
        output_tokens.append(make_indent_or_dedent_token(token, "Indent", start))
    else:
        dedented = False
        while column < indentation_levels[-1]:
            dedented = True
            indentation_levels.pop()
            output_tokens.append(Token("Dedent", "", token.source_pos))
        if dedented:
            # let the last Dedent carry the remaining whitespace
            output_tokens[-1] = make_indent_or_dedent_token(token, "Dedent", start)
            if column != indentation_levels[-1]:
                # a dedent must return to an enclosing indentation level
                raise LexingError("inconsistent dedent", token.source_pos)


# input: lexer token stream
# output: token stream with Newline/Indent/Dedent/EOF tokens added
def postprocess(tokens):
    parenthesis_level = 0
    indentation_levels = [0]
    output_tokens = []
    tokens = [token for token in tokens if token.name != "Ignore"]
    token = None
    for i, token in enumerate(tokens):
        # never create indent/dedent tokens between brackets; note that
        # only the nesting depth is checked, not that bracket kinds match
        if token.name == "OpenBracket":
            parenthesis_level += 1
            output_tokens.append(token)
        elif token.name == "CloseBracket":
            parenthesis_level -= 1
            if parenthesis_level < 0:
                raise LexingError("unmatched parenthesis", token.source_pos)
            output_tokens.append(token)
        elif token.name == "NewlineAndWhitespace":
            # skip whitespace-only lines: with Ignore tokens filtered out,
            # they show up as consecutive NewlineAndWhitespace tokens
            if i + 1 < len(tokens) and tokens[i + 1].name == "NewlineAndWhitespace":
                continue
            if parenthesis_level == 0:
                compute_indent_or_dedent(token, indentation_levels, output_tokens)
            # otherwise: implicit line continuation within brackets
        else:
            # anything else: names, keywords, literals, operators, ...
            output_tokens.append(token)
    if token is not None:
        output_tokens.append(Token("EOF", "", token.source_pos))
    return output_tokens


# RPython reimplementation of the helpers in CPython's tokenize module
def group(*choices):
    choices = list(choices)  # RPython's join only accepts lists
    return '(' + '|'.join(choices) + ')'


# RPython reimplementation; shadows the builtin any(), just like
# CPython's tokenize module does
def any(*choices):
    return group(*choices) + '*'


# regex for a single- or double-quoted string, e.g. 'hello' or "hello";
# escape sequences are allowed, raw newlines are not
def make_single_string(delim):
    normal_chars = r"[^\n\%s]*" % (delim,)
    return "".join([delim, normal_chars,
                    any(r"\\." + normal_chars), delim])
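# For reference, make_single_string(r"\'") expands to
#     \'[^\n\\']*(\\.[^\n\\']*)*\'
# i.e. the opening quote, a run of ordinary characters, then any number
# of escape-sequence-plus-run groups, then the closing quote.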
# ____________________________________________________________
# Literals

Number = r'(([+-])?[1-9][0-9]*)|0'

# ____________________________________________________________
# Ignored

Whitespace = r'[ \f\t]'
Newline = r'\r?\n'
Linecontinue = r'\\' + Newline
Comment = r'#[^\r\n]*'
NewlineAndWhitespace = Newline + any(Whitespace)
Ignore = group(Whitespace + '+', Linecontinue, Comment)

# ____________________________________________________________
# Identifier

Name = r'[a-zA-Z_][a-zA-Z0-9_]*'
PrimitiveName = r'\$' + Name

# ____________________________________________________________
# Symbols

Colon = r'\:'
Comma = r'\,'
Assign = r'\='
OpenBracket = r'[\[\(\{]'
CloseBracket = r'[\]\)\}]'

# ____________________________________________________________
# Project

# \b keeps keyword-like rules from matching the prefix of an
# identifier such as "truey" or "gcd"
Boolean = r'(true|false)\b'
String = group(make_single_string(r"\'"), make_single_string(r'\"'))

_sign = r"([+-])?"
_int = r"(([1-9][0-9]*)|0)"
_dec = r"(([0-9]*[1-9])|0)"
Double = group(_sign + group(_int, r"") + r"\." + _dec,    # 0.1 / .1
               _sign + _int + r"\." + group(_dec, r""))    # 1.0 / 1.

Plus = r'\+'
Minus = r'-'
Multiply = r'\*'
Divide = r'/'
Increment = r'\+\+'
Decrement = r'--'
Modulo = r'%'
PlusInplace = r'\+='
MinusInplace = r'-='
MultiplyInplace = r'\*='
DivideInplace = r'/='
Less = r'<'
LessEqual = r'<='
Greater = r'>'
GreaterEqual = r'>='
Equal = r'=='
NotEqual = r'!='
And = r'&&'
Or = r'\|\|'
Not = r'!'
GC = r'gc\b'

# ____________________________________________________________
# Keywords

If = r'if\b'
Else = r'else\b'
While = r'while\b'
Def = r'def\b'
Object = r'object\b'

# the order matters: rply tries the rules one by one and keeps the
# first match, so longer tokens must come before their prefixes
# (e.g. LessEqual before Less) and keywords before Name
tokens = ["If", "Else", "While", "Def", "Object",
          "Ignore", "String", "Boolean",
          "Double", "Number",  # Number must come after Double
          "GC", "NewlineAndWhitespace",
          "OpenBracket", "CloseBracket", "Comma", "Colon",
          "And", "Or",
          "LessEqual", "Less", "GreaterEqual", "Greater",
          "Equal", "NotEqual",
          "Decrement", "PlusInplace", "MinusInplace",
          "MultiplyInplace", "DivideInplace",
          "Increment", "Plus", "Minus", "Multiply", "Divide",
          "Modulo", "Assign", "Not",
          "Name", "PrimitiveName",
          ]


def make_lexer():
    lg = LexerGenerator()
    for token in tokens:
        # e.g. lg.add("Name", r'[a-zA-Z_][a-zA-Z0-9_]*')
        lg.add(token, globals()[token])
    return lg.build()

lexer = make_lexer()


# s is the source code of the simple program
def lex(s):
    if not s.endswith('\n'):
        s += '\n'
    return list(postprocess(lexer.lex(s)))
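# A minimal usage sketch (illustrative; the sample program below is made
# up for demonstration and is not part of the module proper):
if __name__ == "__main__":
    # prints If, Boolean, Colon, then the synthesized Newline, Indent,
    # GC, Newline, Dedent and EOF tokens
    for token in lex("if true:\n    gc"):
        print(token)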