From 07e37a1bb02292f01b666a512b459ae94a52b4f0 Mon Sep 17 00:00:00 2001
From: ChUrl <christoph.urlacher@protonmail.com>
Date: Mon, 20 Mar 2023 20:00:34 +0100
Subject: [PATCH] Implement a simple lexer

---
 src/lexer/Lexer.cpp | 148 ++++++++++++++++++++++++++++++++++++++++++++
 src/lexer/Lexer.h   |  46 ++++++++++++++
 src/lexer/Token.cpp |  28 +++++++++
 src/lexer/Token.h   |  58 +++++++++++++++++
 4 files changed, 280 insertions(+)
 create mode 100644 src/lexer/Lexer.cpp
 create mode 100644 src/lexer/Lexer.h
 create mode 100644 src/lexer/Token.cpp
 create mode 100644 src/lexer/Token.h
diff --git a/src/lexer/Lexer.cpp b/src/lexer/Lexer.cpp
new file mode 100644
index 0000000..78d1d98
--- /dev/null
+++ b/src/lexer/Lexer.cpp
@@ -0,0 +1,148 @@
+//
+// Created by christoph on 20.03.23.
+//
+
+#include <algorithm>
+#include <vector>
+#include "Lexer.h"
+
+// ! Helper Functions
+
+// True for the ASCII whitespace characters: '\t', '\n', '\v', '\f', '\r' (codes 9-13) and ' ' (32).
+auto is_whitespace(const char character) -> bool {
+    switch (character) {
+        case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
+            return true;
+        default:
+            return false;
+    }
+}
+
+auto is_ignored(const char character) -> bool {
+    // Only the comma is skipped so far. TODO: are there other characters that should be ignored?
+    constexpr char separator = ',';
+    return separator == character;
+}
+
+// True if the character is one of the ASCII decimal digits '0' through '9'.
+auto is_numeric(const char character) -> bool {
+    if (character < '0') {
+        return false;
+    }
+    return character <= '9';
+}
+
+// True for ASCII letters (either case) and the underscore.
+auto is_alphabetical(const char character) -> bool {
+    if (character == '_') {
+        return true;
+    }
+
+    // Test the two letter ranges separately, there are other characters in between
+    const bool lowercase = character >= 'a' && character <= 'z';
+    const bool uppercase = character >= 'A' && character <= 'Z';
+
+    return lowercase || uppercase;
+}
+
+// True if the token's lexeme is exactly one of the known instruction mnemonics (case-sensitive).
+auto is_mnemonic(const Token &token) -> bool {
+    // TODO: Add other mnemonics
+    // Static table of views: nothing is allocated or copied per call
+    static constexpr std::string_view mnemonics[] = {"MOV", "ADD"};
+    return std::find(std::begin(mnemonics), std::end(mnemonics), static_cast<std::string_view>(token))
+           != std::end(mnemonics);
+}
+
+// ! Public Functions
+
+Lexer::Lexer(std::string_view input_string)
+        : input_string(input_string), position(input_string.begin()) {}
+
+// Returns the next token. Whitespace, ignored separators and '#' comments are skipped;
+// Token::END is returned once the input is exhausted.
+auto Lexer::next() -> Token {
+    // Skip everything without program information; test bounds first, never deref the end
+    while (position != input_string.end()
+           && (is_whitespace(peek()) || is_ignored(peek()) || peek() == '#')) {
+        if (peek() == '#') {
+            // Eat whole comment, we can't decide if sth is a comment after eating #
+            comment();
+        } else {
+            get();
+        }
+    }
+
+    if (position >= input_string.end()) {
+        return static_cast<Token>(Token::END);
+    }
+    if (is_numeric(peek())) {
+        return number();
+    }
+    if (peek() == '[') {
+        return address();
+    }
+    if (is_alphabetical(peek())) {
+        const Token token = identifier_or_mnemonic();
+        const Token::Type type = is_mnemonic(token) ? Token::MNEMONIC : Token::IDENTIFIER;
+        return {type, static_cast<std::string_view>(token)};
+    }
+
+    // Consume the unexpected character so that repeated calls make progress
+    const std::string_view::const_iterator begin = position++;
+    return {Token::UNEXPECTED, std::string_view(begin, position)};
+}
+
+// ! Private Functions
+
+auto Lexer::peek() const -> char { // Current character, or '\0' once the input is exhausted
+    return position != input_string.end() ? *position : '\0';
+}
+
+auto Lexer::get() -> char { // Consume the current character; at end of input: '\0', no advance
+    return position != input_string.end() ? *(position++) : '\0';
+}
+
+// Consumes a run of letters, underscores and digits starting at the current position.
+auto Lexer::identifier_or_mnemonic() -> Token {
+    const std::string_view::const_iterator begin = position;
+    // Stop at the end of the input; a space ends the run anyway (neither letter nor digit)
+    while (position != input_string.end() && (is_alphabetical(peek()) || is_numeric(peek()))) {
+        get();
+    }
+    // We don't know the type yet, so use UNEXPECTED
+    return {Token::UNEXPECTED, std::string_view(begin, position)};
+}
+
+// Consumes a run of decimal digits and returns it as a NUMBER token.
+auto Lexer::number() -> Token {
+    const std::string_view::const_iterator begin = position;
+    // Check bounds first: a number may be the very last thing in the input
+    while (position != input_string.end() && is_numeric(peek())) {
+        get();
+    }
+    return {Token::NUMBER, std::string_view(begin, position)};
+}
+
+// Lexes an address of the form "[<digits>]"; the lexeme is the digits without the brackets.
+// Throws a string literal (const char *) if the closing ']' is missing.
+auto Lexer::address() -> Token {
+    get(); // Eat '['
+    const std::string_view::const_iterator begin = position;
+    while (position != input_string.end() && is_numeric(peek())) {
+        get(); // Eat the address number
+    }
+    const std::string_view::const_iterator end = position;
+    // Running out of input counts as a missing bracket; never dereference the end iterator
+    if (end == input_string.end() || peek() != ']') {
+        throw "Lexer Error: Expected ']'!";
+    }
+    get(); // Eat ']'
+    return {Token::ADDRESS, std::string_view(begin, end)};
+}
+
+void Lexer::comment() {
+    // Eat up to (not including) '\n'; the last line may lack one, so also stop at end of input
+    while (position != input_string.end() && peek() != '\n') {
+        get();
+    }
+}
diff --git a/src/lexer/Lexer.h b/src/lexer/Lexer.h
new file mode 100644
index 0000000..78ea93d
--- /dev/null
+++ b/src/lexer/Lexer.h
@@ -0,0 +1,46 @@
+//
+// Created by christoph on 20.03.23.
+//
+
+#ifndef LOGISIMASSEMBLER_LEXER_H
+#define LOGISIMASSEMBLER_LEXER_H
+
+#include <string>
+#include <string_view>
+#include "Token.h"
+
+// Splits the input text into Tokens, one per call to next().
+class Lexer {
+public:
+    // The Lexer only views the text, so the underlying string must outlive it
+    explicit Lexer(std::string_view input_string);
+
+    // Neither copyable nor movable
+    Lexer(const Lexer &) = delete;
+    Lexer(Lexer &&) = delete;
+    auto operator=(const Lexer &) -> Lexer & = delete;
+    auto operator=(Lexer &&) -> Lexer & = delete;
+    ~Lexer() = default;
+
+    // Lexes and returns the next token of the input
+    auto next() -> Token;
+
+private:
+    // Look at the current character without consuming it
+    [[nodiscard]] auto peek() const -> char;
+
+    // Return the current character and advance past it
+    auto get() -> char;
+
+    // Sub-lexers, one per token kind
+    auto identifier_or_mnemonic() -> Token;
+    auto number() -> Token;
+    auto address() -> Token;
+
+    void comment(); // Skips a '#' line comment
+
+    std::string_view input_string;             // Text being lexed
+    std::string_view::const_iterator position; // Current read position within input_string
+};
+
+#endif //LOGISIMASSEMBLER_LEXER_H
diff --git a/src/lexer/Token.cpp b/src/lexer/Token.cpp
new file mode 100644
index 0000000..01712e3
--- /dev/null
+++ b/src/lexer/Token.cpp
@@ -0,0 +1,28 @@
+//
+// Created by christoph on 20.03.23.
+//
+
+#include <array>
+#include <string>
+#include "Token.h"
+
+Token::Token(Token::Type type) : type(type) {}
+
+Token::Token(Token::Type type, std::string_view lexeme) : type(type), lexeme(lexeme) {}
+
+auto Token::getType() const -> Type { // Type tag this token was constructed with
+    return this->type;
+}
+
+// Returns the name of the token's type as text ("UNKNOWN" for values outside the enum).
+auto Token::getTypeName() const -> std::string {
+    // Static table (built once, not per call); order must match the Type enum in Token.h
+    static constexpr std::array<const char *, 6> names {"MNEMONIC", "IDENTIFIER", "NUMBER",
+                                                        "ADDRESS", "END", "UNEXPECTED"};
+    const auto index = static_cast<std::size_t>(type); // A Type forged by a cast may be out of range
+    return index < names.size() ? names[index] : "UNKNOWN";
+}
+
+Token::operator std::string_view() const { // View of the lexeme stored in this token
+    return {lexeme.data(), lexeme.size()};
+}
diff --git a/src/lexer/Token.h b/src/lexer/Token.h
new file mode 100644
index 0000000..2996589
--- /dev/null
+++ b/src/lexer/Token.h
@@ -0,0 +1,58 @@
+//
+// Created by christoph on 20.03.23.
+//
+
+#ifndef LOGISIMASSEMBLER_TOKEN_H
+#define LOGISIMASSEMBLER_TOKEN_H
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+// A lexed unit of the input: a Type tag plus the text (lexeme) it was read from.
+class Token {
+public:
+    // Note: getTypeName() in Token.cpp indexes a name table by these values, keep both in sync
+    enum Type : std::uint8_t {
+        MNEMONIC,
+        IDENTIFIER,
+        NUMBER,
+        ADDRESS, // Using []
+
+        // TODO: Inline calculations
+        // PLUS,
+        // MINUS,
+        // ASTERISK,
+        // SLASH,
+
+        END,
+        UNEXPECTED
+    };
+
+    // Token without text (the lexeme stays empty)
+    explicit Token(Type type);
+
+    // Token that stores a copy of the given text
+    Token(Type type, std::string_view lexeme);
+
+    // Freely copyable and movable
+    Token(const Token &copy) = default;
+    Token(Token &&move) = default;
+    auto operator=(const Token &copy) -> Token & = default;
+    auto operator=(Token &&move) -> Token & = default;
+    ~Token() = default;
+
+    [[nodiscard]] auto getType() const -> Type;
+    // Name of the type as text
+    [[nodiscard]] auto getTypeName() const -> std::string;
+
+    // View of the lexeme; it refers to this Token's storage and is only valid while the Token lives
+    explicit operator std::string_view() const;
+
+private:
+    Type type;
+    // Owning copy of the lexed text. Since it is a std::string (not a view), the Token
+    // stays valid even after the Lexer or its input are gone.
+    std::string lexeme;
+};
+
+#endif //LOGISIMASSEMBLER_TOKEN_H