Implement a simple lexer
This commit is contained in:
148
src/lexer/Lexer.cpp
Normal file
148
src/lexer/Lexer.cpp
Normal file
@ -0,0 +1,148 @@
|
||||
//
|
||||
// Created by christoph on 20.03.23.
|
||||
//
|
||||
|
||||
#include <algorithm>
#include <array>
#include <string_view>
#include <vector>
#include "Lexer.h"
|
||||
|
||||
// ! Helper Functions
|
||||
|
||||
// Reports whether `character` is an ASCII whitespace character: the space
// itself, or anything in the contiguous range from horizontal tab ('\t', 9)
// up to carriage return ('\r', 13), which also covers '\n', '\v' and '\f'.
auto is_whitespace(const char character) -> bool {
    if (character == ' ') {
        return true;
    }

    return character >= '\t' && character <= '\r';
}
|
||||
|
||||
// Reports whether `character` carries no program information and may be
// skipped by the lexer. Currently only the comma qualifies.
auto is_ignored(const char character) -> bool {
    // TODO: Any other ignored characters that could happen in the program?
    constexpr char comma = ',';

    return comma == character;
}
|
||||
|
||||
// Reports whether `character` is one of the ASCII decimal digits '0'..'9'.
auto is_numeric(const char character) -> bool {
    const bool below_zero = character < '0';
    const bool above_nine = character > '9';

    return !(below_zero || above_nine);
}
|
||||
|
||||
// Reports whether `character` may appear in a name: an ASCII letter of
// either case, or the underscore.
auto is_alphabetical(const char character) -> bool {
    if (character == '_') {
        return true;
    }

    const bool is_lowercase = character >= 'a' && character <= 'z';
    const bool is_uppercase = character >= 'A' && character <= 'Z';

    return is_lowercase || is_uppercase;
}
|
||||
|
||||
auto is_mnemonic(const Token &token) -> bool {
|
||||
// TODO: Add other mnemonics
|
||||
const std::vector<std::string> mnemonics = {"MOV",
|
||||
"ADD"};
|
||||
|
||||
return std::find(mnemonics.begin(), mnemonics.end(), static_cast<std::string_view>(token))
|
||||
!= mnemonics.end();
|
||||
}
|
||||
|
||||
// ! Public Functions
|
||||
|
||||
Lexer::Lexer(std::string_view input_string)
|
||||
: input_string(input_string), position(input_string.begin()) {}
|
||||
|
||||
// Produces the next token from the input.
//
// Whitespace, ignored characters (see is_ignored) and '#' comments are
// skipped first. Afterwards the token kind is chosen from the first
// remaining character:
//   - nothing left            -> END
//   - digit                   -> NUMBER
//   - '['                     -> ADDRESS (may throw, see address())
//   - letter or underscore    -> MNEMONIC or IDENTIFIER
//   - anything else           -> UNEXPECTED, holding that single character
//
// An UNEXPECTED character is consumed, so calling next() again continues
// with the character after it instead of reporting the same one forever.
auto Lexer::next() -> Token {
    // Skip past everything that doesn't contain program information.
    // The bounds check comes first so the end iterator is never dereferenced.
    while (position < input_string.end()
           && (is_whitespace(peek()) || is_ignored(peek()) || peek() == '#')) {
        if (peek() == '#') {
            // Eat the whole comment here: once the '#' has been consumed we
            // could no longer tell that the rest of the line is a comment.
            comment();
        } else {
            get();
        }
    }

    if (position >= input_string.end()) {
        return Token(Token::END);
    }

    const char current = peek();

    if (is_numeric(current)) {
        return number();
    }

    if (current == '[') {
        return address();
    }

    if (is_alphabetical(current)) {
        // The helper can't know the final type, so re-tag its lexeme here.
        const Token token = identifier_or_mnemonic();
        const Token::Type type = is_mnemonic(token) ? Token::MNEMONIC : Token::IDENTIFIER;
        return {type, static_cast<std::string_view>(token)};
    }

    // Consume the offending character so the lexer always makes progress.
    const std::string_view::const_iterator begin = position;
    ++position;
    return {Token::UNEXPECTED, std::string_view(begin, position)};
}
|
||||
|
||||
// ! Private Functions
|
||||
|
||||
auto Lexer::peek() const -> char {
|
||||
return *position;
|
||||
}
|
||||
|
||||
auto Lexer::get() -> char {
|
||||
return *(position++);
|
||||
}
|
||||
|
||||
// Consumes a run of letters, underscores and digits starting at the current
// position and returns it as a token.
//
// The returned token is tagged UNEXPECTED because this function cannot tell
// identifiers and mnemonics apart; the caller assigns the final type.
auto Lexer::identifier_or_mnemonic() -> Token {
    const std::string_view::const_iterator begin = position;

    // Stop at the end of the input before looking at the next character.
    // No separate check for ' ' is needed: a space is neither alphabetical
    // nor numeric, so it already ends the run.
    while (position < input_string.end()
           && (is_alphabetical(peek()) || is_numeric(peek()))) {
        get();
    }

    const std::string_view::const_iterator end = position;

    // We don't know the type yet, so use UNEXPECTED
    return {Token::UNEXPECTED, std::string_view(begin, end)};
}
|
||||
|
||||
// Consumes a run of decimal digits starting at the current position and
// returns it as a NUMBER token whose lexeme is exactly those digits.
auto Lexer::number() -> Token {
    const std::string_view::const_iterator begin = position;

    // Stop at the end of the input before looking at the next character.
    while (position < input_string.end() && is_numeric(peek())) {
        get();
    }

    const std::string_view::const_iterator end = position;

    return {Token::NUMBER, std::string_view(begin, end)};
}
|
||||
|
||||
// Consumes an address of the form '[' digits ']' and returns an ADDRESS
// token whose lexeme is the digits only (the brackets are not included).
// Must be called with the current position on the opening '['.
//
// Throws a string literal (const char *) if the digits are not followed by
// ']', which includes the case where the input ends first.
// NOTE(review): throwing a type derived from std::exception would be more
// conventional, but would change what callers have to catch.
auto Lexer::address() -> Token {
    get(); // Eat '['

    const std::string_view::const_iterator begin = position;

    // Stop at the end of the input before looking at the next character.
    while (position < input_string.end() && is_numeric(peek())) {
        get(); // Eat the address number
    }

    const std::string_view::const_iterator end = position;

    // An input that ends before the closing bracket is the same error.
    if (position >= input_string.end() || peek() != ']') {
        throw "Lexer Error: Expected ']'!";
    }

    get(); // Eat ']'

    return {Token::ADDRESS, std::string_view(begin, end)};
}
|
||||
|
||||
void Lexer::comment() {
|
||||
// Eat the whole line
|
||||
while (peek() != '\n') {
|
||||
get();
|
||||
}
|
||||
}
|
46
src/lexer/Lexer.h
Normal file
46
src/lexer/Lexer.h
Normal file
@ -0,0 +1,46 @@
|
||||
//
|
||||
// Created by christoph on 20.03.23.
|
||||
//
|
||||
|
||||
#ifndef LOGISIMASSEMBLER_LEXER_H
|
||||
#define LOGISIMASSEMBLER_LEXER_H
|
||||
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include "Token.h"
|
||||
|
||||
class Lexer {
|
||||
public:
|
||||
explicit Lexer(std::string_view input_string);
|
||||
|
||||
Lexer(const Lexer ©) = delete;
|
||||
|
||||
auto operator=(const Lexer ©) -> Lexer & = delete;
|
||||
|
||||
Lexer(Lexer &&move) = delete;
|
||||
|
||||
auto operator=(Lexer &&move) -> Lexer & = delete;
|
||||
|
||||
~Lexer() = default;
|
||||
|
||||
auto next() -> Token;
|
||||
|
||||
private:
|
||||
[[nodiscard]] auto peek() const -> char;
|
||||
|
||||
auto get() -> char;
|
||||
|
||||
auto identifier_or_mnemonic() -> Token;
|
||||
|
||||
auto number() -> Token;
|
||||
|
||||
auto address() -> Token;
|
||||
|
||||
void comment();
|
||||
|
||||
private:
|
||||
std::string_view input_string;
|
||||
std::string_view::const_iterator position;
|
||||
};
|
||||
|
||||
#endif //LOGISIMASSEMBLER_LEXER_H
|
28
src/lexer/Token.cpp
Normal file
28
src/lexer/Token.cpp
Normal file
@ -0,0 +1,28 @@
|
||||
//
|
||||
// Created by christoph on 20.03.23.
|
||||
//
|
||||
|
||||
#include <array>
|
||||
#include <string>
|
||||
#include "Token.h"
|
||||
|
||||
Token::Token(Token::Type type) : type(type) {}
|
||||
|
||||
Token::Token(Token::Type type, std::string_view lexeme) : type(type), lexeme(lexeme) {}
|
||||
|
||||
// Returns the type this token was constructed with.
auto Token::getType() const -> Token::Type {
    return this->type;
}
|
||||
|
||||
// Returns the name of this token's type as text, e.g. "NUMBER".
//
// The names live in a compile-time table indexed by the numeric value of
// Token::Type, so the entries must stay in the same order as the
// enumerators. Only the requested name is turned into a std::string.
auto Token::getTypeName() const -> std::string {
    static constexpr std::array<std::string_view, 6> type_names {"MNEMONIC",
                                                                 "IDENTIFIER",
                                                                 "NUMBER",
                                                                 "ADDRESS",
                                                                 "END",
                                                                 "UNEXPECTED"};

    // UNEXPECTED is the last enumerator; fail the build if the enum grows
    // without the table being extended.
    static_assert(type_names.size() == static_cast<std::size_t>(Token::UNEXPECTED) + 1,
                  "type_names must have one entry per Token::Type enumerator");

    return std::string(type_names[type]);
}
|
||||
|
||||
// Exposes the lexeme as a non-owning view into this token's own string.
Token::operator std::string_view() const {
    return std::string_view(lexeme.data(), lexeme.size());
}
|
58
src/lexer/Token.h
Normal file
58
src/lexer/Token.h
Normal file
@ -0,0 +1,58 @@
|
||||
//
|
||||
// Created by christoph on 20.03.23.
|
||||
//
|
||||
|
||||
#ifndef LOGISIMASSEMBLER_TOKEN_H
|
||||
#define LOGISIMASSEMBLER_TOKEN_H
|
||||
|
||||
#include <cstdint>
|
||||
#include <string_view>
|
||||
|
||||
// A single lexical unit: a type tag plus the text (lexeme) it was made from.
//
// NOTE(review): this class uses std::string, but the header only includes
// <string_view>; consider including <string> so the header is self-contained.
class Token {
public:
    // The numeric values are used as indices by getTypeName(), so the order
    // of the enumerators matters.
    enum Type : uint8_t {
        MNEMONIC,
        IDENTIFIER,
        NUMBER,
        ADDRESS, // Using []

        // TODO: Inline calculations
        // PLUS,
        // MINUS,
        // ASTERISK,
        // SLASH,

        END,
        UNEXPECTED
    };

public:
    // Creates a token without a lexeme.
    explicit Token(Type type);

    // Creates a token that stores its own copy of `lexeme`.
    Token(Type type, std::string_view lexeme);

    Token(const Token &copy) = default;

    auto operator=(const Token &copy) -> Token & = default;

    Token(Token &&move) = default;

    auto operator=(Token &&move) -> Token & = default;

    ~Token() = default;

    // Returns the type tag.
    [[nodiscard]] auto getType() const -> Type;

    // Returns the type tag as text.
    [[nodiscard]] auto getTypeName() const -> std::string;

    // Returns a view of the lexeme. The view points into this Token, so it
    // must not be used after the Token has been destroyed or assigned to.
    explicit operator std::string_view() const;

private:
    Type type;

    // ! The Token owns a copy of its lexeme, so it stays valid independently
    // ! of the Lexer and of the input string it was created from.
    std::string lexeme;
};
|
||||
|
||||
#endif //LOGISIMASSEMBLER_TOKEN_H
|
Reference in New Issue
Block a user