
Implement a simple lexer

2023-03-20 20:00:34 +01:00
parent e532be7970
commit 07e37a1bb0
4 changed files with 280 additions and 0 deletions

src/lexer/Lexer.cpp Normal file (148 additions)

@@ -0,0 +1,148 @@
//
// Created by christoph on 20.03.23.
//
#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>
#include "Lexer.h"
// ! Helper Functions
auto is_whitespace(const char character) -> bool {
const auto ascii_value = static_cast<uint8_t>(character);
const uint8_t ascii_tab = 9;
const uint8_t ascii_cr = 13;
const uint8_t ascii_space = 32;
return (ascii_value >= ascii_tab && ascii_value <= ascii_cr)
|| ascii_value == ascii_space;
}
auto is_ignored(const char character) -> bool {
// TODO: Any other ignored characters that could happen in the program?
return character == ',';
}
auto is_numeric(const char character) -> bool {
const auto ascii_value = static_cast<uint8_t>(character);
const uint8_t ascii_zero = 48;
const uint8_t ascii_nine = 57;
return ascii_value >= ascii_zero && ascii_value <= ascii_nine;
}
auto is_alphabetical(const char character) -> bool {
const auto ascii_value = static_cast<uint8_t>(character);
const uint8_t ascii_a = 97;
const uint8_t ascii_A = 65;
const uint8_t ascii_z = 122;
const uint8_t ascii_Z = 90;
const uint8_t ascii_underscore = 95;
return (ascii_value >= ascii_a && ascii_value <= ascii_z)
|| (ascii_value >= ascii_A && ascii_value <= ascii_Z)
|| ascii_value == ascii_underscore;
}
auto is_mnemonic(const Token &token) -> bool {
// TODO: Add other mnemonics
const std::vector<std::string> mnemonics = {"MOV",
"ADD"};
return std::find(mnemonics.begin(), mnemonics.end(), static_cast<std::string_view>(token))
!= mnemonics.end();
}
// ! Public Functions
Lexer::Lexer(std::string_view input_string)
: input_string(input_string), position(input_string.begin()) {}
auto Lexer::next() -> Token {
// Skip past everything that doesn't contain program information
while (is_whitespace(peek()) || is_ignored(peek()) || peek() == '#') {
if (peek() == '#') {
// Eat the whole comment here; once the '#' is consumed we could no longer tell that the rest of the line is a comment
comment();
} else {
get();
}
}
if (position >= input_string.end()) {
return Token(Token::END);
}
if (is_numeric(peek())) {
return number();
}
if (peek() == '[') {
return address();
}
if (is_alphabetical(peek())) {
const Token token = identifier_or_mnemonic();
if (is_mnemonic(token)) {
return {Token::MNEMONIC, static_cast<std::string_view>(token)};
}
return {Token::IDENTIFIER, static_cast<std::string_view>(token)};
}
// Consume the unexpected character so repeated next() calls make progress
const std::string_view::const_iterator unexpected = position;
get();
return {Token::UNEXPECTED, std::string_view(unexpected, position)};
}
// ! Private Functions
auto Lexer::peek() const -> char {
// Return NUL at the end of the input so callers never read past the end
if (position >= input_string.end()) {
return '\0';
}
return *position;
}
auto Lexer::get() -> char {
return *(position++);
}
auto Lexer::identifier_or_mnemonic() -> Token {
const std::string_view::const_iterator begin = position;
while (peek() != ' ' && (is_alphabetical(peek()) || is_numeric(peek()))) {
get();
}
const std::string_view::const_iterator end = position;
// We don't know the type yet, so use UNEXPECTED
return {Token::UNEXPECTED, std::string_view(begin, end)};
}
auto Lexer::number() -> Token {
const std::string_view::const_iterator begin = position;
while (is_numeric(peek())) {
get();
}
const std::string_view::const_iterator end = position;
return {Token::NUMBER, std::string_view(begin, end)};
}
auto Lexer::address() -> Token {
get(); // Eat '['
const std::string_view::const_iterator begin = position;
while (is_numeric(peek())) {
get(); // Eat the address number
}
const std::string_view::const_iterator end = position;
if (peek() != ']') {
throw std::runtime_error("Lexer Error: Expected ']'!");
}
get(); // Eat ']'
return {Token::ADDRESS, std::string_view(begin, end)};
}
void Lexer::comment() {
// Eat the rest of the line; also stop at the end of the input
while (peek() != '\n' && peek() != '\0') {
get();
}
}
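
A minimal sketch of how this lexer could be driven once it is compiled together with the Token class further down; the sample program text and the main() function are illustrative only and not part of this commit:

#include <iostream>
#include <string>
#include <string_view>
#include "Lexer.h"

int main() {
    // Hypothetical input; MOV and ADD are the only mnemonics the lexer knows so far
    const std::string program = "MOV r0, [42]  # load from address 42\nADD r0, 1\n";
    Lexer lexer(program);
    // Pull tokens until END is reached; the input string must outlive the tokens
    for (Token token = lexer.next(); token.getType() != Token::END; token = lexer.next()) {
        std::cout << token.getTypeName() << ": "
                  << static_cast<std::string_view>(token) << '\n';
    }
    return 0;
}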

src/lexer/Lexer.h Normal file (46 additions)

@@ -0,0 +1,46 @@
//
// Created by christoph on 20.03.23.
//
#ifndef LOGISIMASSEMBLER_LEXER_H
#define LOGISIMASSEMBLER_LEXER_H
#include <string>
#include <string_view>
#include "Token.h"
class Lexer {
public:
explicit Lexer(std::string_view input_string);
Lexer(const Lexer &copy) = delete;
auto operator=(const Lexer &copy) -> Lexer & = delete;
Lexer(Lexer &&move) = delete;
auto operator=(Lexer &&move) -> Lexer & = delete;
~Lexer() = default;
auto next() -> Token;
private:
[[nodiscard]] auto peek() const -> char;
auto get() -> char;
auto identifier_or_mnemonic() -> Token;
auto number() -> Token;
auto address() -> Token;
void comment();
private:
std::string_view input_string;
std::string_view::const_iterator position;
};
#endif //LOGISIMASSEMBLER_LEXER_H

src/lexer/Token.cpp Normal file (28 additions)

@@ -0,0 +1,28 @@
//
// Created by christoph on 20.03.23.
//
#include <array>
#include <string>
#include "Token.h"
Token::Token(Token::Type type) : type(type) {}
Token::Token(Token::Type type, std::string_view lexeme) : type(type), lexeme(lexeme) {}
auto Token::getType() const -> Token::Type {
return type;
}
auto Token::getTypeName() const -> std::string {
return std::array<std::string, 6> {"MNEMONIC",
"IDENTIFIER",
"NUMBER",
"ADDRESS",
"END",
"UNEXPECTED"}[type];
}
Token::operator std::string_view() const {
return lexeme;
}

src/lexer/Token.h Normal file (58 additions)

@@ -0,0 +1,58 @@
//
// Created by christoph on 20.03.23.
//
#ifndef LOGISIMASSEMBLER_TOKEN_H
#define LOGISIMASSEMBLER_TOKEN_H
#include <cstdint>
#include <string>
#include <string_view>
class Token {
public:
enum Type : uint8_t {
MNEMONIC,
IDENTIFIER,
NUMBER,
ADDRESS, // Using []
// TODO: Inline calculations
// PLUS,
// MINUS,
// ASTERISK,
// SLASH,
END,
UNEXPECTED
};
public:
explicit Token(Type type);
Token(Type type, std::string_view lexeme);
Token(const Token &copy) = default;
auto operator=(const Token &copy) -> Token & = default;
Token(Token &&move) = default;
auto operator=(Token &&move) -> Token & = default;
~Token() = default;
[[nodiscard]] auto getType() const -> Type;
[[nodiscard]] auto getTypeName() const -> std::string;
explicit operator std::string_view() const;
private:
Type type;
// ! The Token's lexeme is only a view into the input text handed to the Lexer.
// ! If that string is gone, all Tokens taken from it are invalid!
std::string_view lexeme;
};
#endif //LOGISIMASSEMBLER_TOKEN_H
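
Because the lexeme is only a view into the input text (see the comment above), the string handed to the Lexer has to outlive every Token taken from it. A small sketch of the pitfall, with a hypothetical function name:

#include <string>
#include "Lexer.h"

auto first_token_dangling() -> Token {
    const std::string source = "MOV r1, 5";  // local buffer
    Lexer lexer(source);
    return lexer.next();  // the returned Token's lexeme points into 'source', which is destroyed here
}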