From 07e37a1bb02292f01b666a512b459ae94a52b4f0 Mon Sep 17 00:00:00 2001 From: ChUrl Date: Mon, 20 Mar 2023 20:00:34 +0100 Subject: [PATCH] Implement a simple lexer --- src/lexer/Lexer.cpp | 148 ++++++++++++++++++++++++++++++++++++++++++++ src/lexer/Lexer.h | 46 ++++++++++++++ src/lexer/Token.cpp | 28 +++++++++ src/lexer/Token.h | 58 +++++++++++++++++ 4 files changed, 280 insertions(+) create mode 100644 src/lexer/Lexer.cpp create mode 100644 src/lexer/Lexer.h create mode 100644 src/lexer/Token.cpp create mode 100644 src/lexer/Token.h diff --git a/src/lexer/Lexer.cpp b/src/lexer/Lexer.cpp new file mode 100644 index 0000000..78d1d98 --- /dev/null +++ b/src/lexer/Lexer.cpp @@ -0,0 +1,148 @@ +// +// Created by christoph on 20.03.23. +// + +#include +#include +#include "Lexer.h" + +// ! Helper Functions + +auto is_whitespace(const char character) -> bool { + const auto ascii_value = static_cast(character); + const uint8_t ascii_tab = 9; + const uint8_t ascii_cr = 13; + const uint8_t ascii_space = 32; + + return (ascii_value >= ascii_tab && ascii_value <= ascii_cr) + || ascii_value == ascii_space; +} + +auto is_ignored(const char character) -> bool { + // TODO: Any other ignored characters that could happen in the program? 
+ return character == ','; +} + +auto is_numeric(const char character) -> bool { + const auto ascii_value = static_cast(character); + const uint8_t ascii_zero = 48; + const uint8_t ascii_nine = 57; + + return ascii_value >= ascii_zero && ascii_value <= ascii_nine; +} + +auto is_alphabetical(const char character) -> bool { + const auto ascii_value = static_cast(character); + const uint8_t ascii_a = 97; + const uint8_t ascii_A = 65; + const uint8_t ascii_z = 122; + const uint8_t ascii_Z = 90; + const uint8_t ascii_underscore = 95; + + return (ascii_value >= ascii_a && ascii_value <= ascii_z) + || (ascii_value >= ascii_A && ascii_value <= ascii_Z) + || ascii_value == ascii_underscore; +} + +auto is_mnemonic(const Token &token) -> bool { + // TODO: Add other mnemonics + const std::vector mnemonics = {"MOV", + "ADD"}; + + return std::find(mnemonics.begin(), mnemonics.end(), static_cast(token)) + != mnemonics.end(); +} + +// ! Public Functions + +Lexer::Lexer(std::string_view input_string) + : input_string(input_string), position(input_string.begin()) {} + +auto Lexer::next() -> Token { + // Skip past everything that doesn't contain program information + while (is_whitespace(peek()) || peek() == ',' || peek() == '#') { + if (peek() == '#') { + // Eat whole comment, we can't decide if sth is a comment after eating # + comment(); + } else { + get(); + } + } + + if (position >= input_string.end()) { + return static_cast(Token::END); + } + + if (is_numeric(peek())) { + return number(); + } + + if (peek() == '[') { + return address(); + } + + if (is_alphabetical(peek())) { + const Token token = identifier_or_mnemonic(); + if (is_mnemonic(token)) { + return {Token::MNEMONIC, static_cast(token)}; + } + return {Token::IDENTIFIER, static_cast(token)}; + } + + return {Token::UNEXPECTED, std::string_view(position, position + 1)}; +} + +// ! 
Private Functions + +auto Lexer::peek() const -> char { + return *position; +} + +auto Lexer::get() -> char { + return *(position++); +} + +auto Lexer::identifier_or_mnemonic() -> Token { + const std::string_view::const_iterator begin = position; + while (peek() != ' ' && (is_alphabetical(peek()) || is_numeric(peek()))) { + get(); + } + const std::string_view::const_iterator end = position; + + // We don't know the type yet, so use UNEXPECTED + return {Token::UNEXPECTED, std::string_view(begin, end)}; +} + +auto Lexer::number() -> Token { + const std::string_view::const_iterator begin = position; + while (is_numeric(peek())) { + get(); + } + const std::string_view::const_iterator end = position; + + return {Token::NUMBER, std::string_view(begin, end)}; +} + +auto Lexer::address() -> Token { + get(); // Eat '[' + + const std::string_view::const_iterator begin = position; + while (is_numeric(peek())) { + get(); // Eat the address number + } + const std::string_view::const_iterator end = position; + + if (peek() != ']') { + throw "Lexer Error: Expected ']'!"; + } + get(); // Eat ']' + + return {Token::ADDRESS, std::string_view(begin, end)}; +} + +void Lexer::comment() { + // Eat the whole line + while (peek() != '\n') { + get(); + } +} diff --git a/src/lexer/Lexer.h b/src/lexer/Lexer.h new file mode 100644 index 0000000..78ea93d --- /dev/null +++ b/src/lexer/Lexer.h @@ -0,0 +1,46 @@ +// +// Created by christoph on 20.03.23. 
+// + +#ifndef LOGISIMASSEMBLER_LEXER_H +#define LOGISIMASSEMBLER_LEXER_H + +#include +#include +#include "Token.h" + +class Lexer { +public: + explicit Lexer(std::string_view input_string); + + Lexer(const Lexer ©) = delete; + + auto operator=(const Lexer ©) -> Lexer & = delete; + + Lexer(Lexer &&move) = delete; + + auto operator=(Lexer &&move) -> Lexer & = delete; + + ~Lexer() = default; + + auto next() -> Token; + +private: + [[nodiscard]] auto peek() const -> char; + + auto get() -> char; + + auto identifier_or_mnemonic() -> Token; + + auto number() -> Token; + + auto address() -> Token; + + void comment(); + +private: + std::string_view input_string; + std::string_view::const_iterator position; +}; + +#endif //LOGISIMASSEMBLER_LEXER_H diff --git a/src/lexer/Token.cpp b/src/lexer/Token.cpp new file mode 100644 index 0000000..01712e3 --- /dev/null +++ b/src/lexer/Token.cpp @@ -0,0 +1,28 @@ +// +// Created by christoph on 20.03.23. +// + +#include +#include +#include "Token.h" + +Token::Token(Token::Type type) : type(type) {} + +Token::Token(Token::Type type, std::string_view lexeme) : type(type), lexeme(lexeme) {} + +auto Token::getType() const -> Token::Type { + return type; +} + +auto Token::getTypeName() const -> std::string { + return std::array {"MNEMONIC", + "IDENTIFIER", + "NUMBER", + "ADDRESS", + "END", + "UNEXPECTED"}[type]; +} + +Token::operator std::string_view() const { + return lexeme; +} diff --git a/src/lexer/Token.h b/src/lexer/Token.h new file mode 100644 index 0000000..2996589 --- /dev/null +++ b/src/lexer/Token.h @@ -0,0 +1,58 @@ +// +// Created by christoph on 20.03.23. 
//

#ifndef LOGISIMASSEMBLER_TOKEN_H
#define LOGISIMASSEMBLER_TOKEN_H

#include <cstdint>
#include <string>
#include <string_view>

// A single lexical unit: its type plus the text (lexeme) it was read from.
class Token {
public:
    // Unscoped on purpose: the enumerators are used as Token::NUMBER etc.
    enum Type : uint8_t {
        MNEMONIC,
        IDENTIFIER,
        NUMBER,
        ADDRESS, // Using []

        // TODO: Inline calculations
        // PLUS,
        // MINUS,
        // ASTERISK,
        // SLASH,

        END,
        UNEXPECTED
    };

public:
    // Token without a lexeme.
    explicit Token(Type type);

    // Token with a lexeme.
    Token(Type type, std::string_view lexeme);

    Token(const Token &copy) = default;

    auto operator=(const Token &copy) -> Token & = default;

    Token(Token &&move) = default;

    auto operator=(Token &&move) -> Token & = default;

    ~Token() = default;

    [[nodiscard]] auto getType() const -> Type;

    // Name of the token's type as text.
    [[nodiscard]] auto getTypeName() const -> std::string;

    // Access to the lexeme. Explicit, so a Token never silently decays to text.
    explicit operator std::string_view() const;

private:
    Type type;

    // ! The lexeme is stored as a std::string, i.e. the Token owns its text.
    // ! A std::string_view obtained from the Token refers to this member and
    // ! must not be used after the Token is gone.
    std::string lexeme;
};

#endif //LOGISIMASSEMBLER_TOKEN_H