diff options
Diffstat (limited to 'src/core/tokenizer.c')
-rw-r--r-- | src/core/tokenizer.c | 279 |
1 file changed, 279 insertions, 0 deletions
diff --git a/src/core/tokenizer.c b/src/core/tokenizer.c new file mode 100644 index 0000000..2fc366b --- /dev/null +++ b/src/core/tokenizer.c @@ -0,0 +1,279 @@ +#include "tokenizer.h" + +#include <assert.h> +#include <stdbool.h> +#include <string.h> + +SandTokenizer sand_create_tokenizer(const char *source, size_t source_length, const char *filename) { + return (SandTokenizer) { + .filename = filename, + + .start = source, + .start_line = 0, + .start_column = 0, + + .current = source, + .current_line = 0, + .current_column = 0, + }; +} + +static bool is_at_end(const SandTokenizer *tokenizer) { + // FIXME: This seems bad. In principle the NUL character should not be special. We're leaking our implementation language. + return *tokenizer->current == '\0'; +} + +static SandToken make_token(const SandTokenizer *tokenizer, SandTokenKind kind) { + SandLocation location = (SandLocation) { + .filename = tokenizer->filename, + .start_column = tokenizer->start_column, + .start_line = tokenizer->start_line, + .end_column = tokenizer->current_column, + .end_line = tokenizer->current_line, + }; + + return (SandToken) { + .kind = kind, + .content = tokenizer->start, + .content_length = tokenizer->current - tokenizer->start, + .location = location, + }; +} + +static SandToken make_error_token(const SandTokenizer *tokenizer, const char *message) { + SandLocation location = (SandLocation) { + .filename = tokenizer->filename, + .start_column = tokenizer->start_column, + .start_line = tokenizer->start_line, + .end_column = tokenizer->current_column, + .end_line = tokenizer->current_line, + }; + + return (SandToken) { + .kind = SAND_TOKEN_ERROR, + .content = message, + .content_length = strlen(message), + .location = location, + }; +} + +static char peek(const SandTokenizer *tokenizer) { + return *tokenizer->current; +} + +static char peek_next(const SandTokenizer *tokenizer) { + if (is_at_end(tokenizer)) { + return '\0'; + } else { + return tokenizer->current[1]; + } +} + +static 
char advance(SandTokenizer *tokenizer) { + assert(!is_at_end(tokenizer)); + tokenizer->current += 1; + tokenizer->current_column += 1; + return tokenizer->current[-1]; +} + +static bool match(SandTokenizer *tokenizer, char expected) { + if (is_at_end(tokenizer)) { + return false; + } else if (peek(tokenizer) != expected) { + return false; + } else { + advance(tokenizer); + return true; + } +} + +static void skip_whitespace(SandTokenizer *tokenizer) { + while (true) { + char c = peek(tokenizer); + switch (c) { + case '\n': + tokenizer->current_line += 1; + tokenizer->current_column = 0; + /* fallthrough */ + case ' ': + case '\t': + case '\r': + advance(tokenizer); + break; + case '/': + if (peek_next(tokenizer) == '/') { + // We use peek to stop right before the newline, so the next iteration of the loop will handle it properly. + while (peek(tokenizer) != '\n' && !is_at_end(tokenizer)) { + advance(tokenizer); + } + } else { + return; + } + break; + default: // We've reached a non-whitespace character (or EOF) so we're done. + return; + } + } +} + +static SandToken parse_string(SandTokenizer *tokenizer) { + while (peek(tokenizer) != '"' && peek(tokenizer) != '\n' && !is_at_end(tokenizer)) { + if (peek(tokenizer) == '\\') { + switch (peek_next(tokenizer)) { + case 'n': + case 'r': + case 't': + case '\\': + case '"': + advance(tokenizer); + break; + case '\n': + case '\0': + advance(tokenizer); // Eat dangling \ so it doesn't mess up the next token. + return make_error_token(tokenizer, "Unfinished escape inside string literal"); + default: + advance(tokenizer); // Eat whatever invalid character was there, so it + // becomes part of the "erroneous" input pointed + // to by the error token. 
+ return make_error_token(tokenizer, "Invalid escape inside string literal"); + } + } + advance(tokenizer); + } + + if (peek(tokenizer) == '\n') { + return make_error_token(tokenizer, "Unexpected end-of-line inside string literal"); + } + if (is_at_end(tokenizer)) { + return make_error_token(tokenizer, "Unexpected end-of-file inside string literal"); + } + + advance(tokenizer); // Eat closing ". + return make_token(tokenizer, SAND_TOKEN_STRING); +} + +static bool is_digit(char c) { + return c >= '0' && c <= '9'; +} + +static SandToken parse_number(SandTokenizer *tokenizer) { + while (is_digit(peek(tokenizer))) { + advance(tokenizer); + } + + // Optional fractional part. + if (peek(tokenizer) == '.' && is_digit(peek_next(tokenizer))) { + advance(tokenizer); // Eat decimal separator. + while (is_digit(peek(tokenizer))) { + advance(tokenizer); + } + } + + return make_token(tokenizer, SAND_TOKEN_NUMBER); +} + +static bool is_alpha(char c) { + return (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + c == '_'; +} + +static SandTokenKind check_keyword(SandTokenizer *tokenizer, size_t start, size_t length, + const char *rest, SandTokenKind kind) { + if (tokenizer->current - tokenizer->start == start + length && + memcmp(tokenizer->start + start, rest, length) == 0) { + return kind; + } else { + return SAND_TOKEN_IDENTIFIER; + } +} + +// Called after we have consumed an identifier (i.e. the start and current +// fields of the tokenizer delimit the keyword/identifier) and returns either +// the corrent keyword type or SAND_TOKEN_IDENTIFIER otherwise. +// TODO: Benchmark this rather involved manual trie with the simpler linear table-scan approach. See if it's worth it. 
+static SandTokenKind identifier_type(SandTokenizer *tokenizer) { + switch (tokenizer->start[0]) { + case 'a': return check_keyword(tokenizer, 1, 2, "nd", SAND_TOKEN_AND); + case 'e': return check_keyword(tokenizer, 1, 3, "lse", SAND_TOKEN_ELSE); + case 'f': + if (tokenizer->current - tokenizer->start > 1) { + switch (tokenizer->start[1]) { + case 'a': return check_keyword(tokenizer, 2, 3, "lse", SAND_TOKEN_FALSE); + case 'o': return check_keyword(tokenizer, 2, 1, "r", SAND_TOKEN_FOR); + case 'u': return check_keyword(tokenizer, 2, 1, "n", SAND_TOKEN_FUN); + } + } + break; + case 'i': return check_keyword(tokenizer, 1, 1, "f", SAND_TOKEN_IF); + case 'o': return check_keyword(tokenizer, 1, 1, "r", SAND_TOKEN_OR); + case 'n': return check_keyword(tokenizer, 1, 2, "il", SAND_TOKEN_NIL); + case 'p': return check_keyword(tokenizer, 1, 4, "rint", SAND_TOKEN_PRINT); + case 'r': return check_keyword(tokenizer, 1, 5, "eturn", SAND_TOKEN_RETURN); + case 't': return check_keyword(tokenizer, 1, 3, "rue", SAND_TOKEN_TRUE); + case 'v': return check_keyword(tokenizer, 1, 2, "ar", SAND_TOKEN_VAR); + case 'w': return check_keyword(tokenizer, 1, 4, "hile", SAND_TOKEN_WHILE); + } + + return SAND_TOKEN_IDENTIFIER; +} + +static SandToken parse_identifier_or_keyword(SandTokenizer *tokenizer) { + while (is_alpha(peek(tokenizer)) || is_digit(peek(tokenizer))) { + advance(tokenizer); + } + + return make_token(tokenizer, identifier_type(tokenizer)); +} + +SandToken sand_get_next_token(SandTokenizer *tokenizer) { + skip_whitespace(tokenizer); + + // The new token starts at the end of the previous one. 
+ tokenizer->start = tokenizer->current; + tokenizer->start_column = tokenizer->current_column; + tokenizer->start_line = tokenizer->current_line; + + if (is_at_end(tokenizer)) { + return make_token(tokenizer, SAND_TOKEN_EOF); + } + + char c = advance(tokenizer); + switch (c) { + // Single-character tokens + case '(': return make_token(tokenizer, SAND_TOKEN_LEFT_PAREN); + case ')': return make_token(tokenizer, SAND_TOKEN_RIGHT_PAREN); + case '{': return make_token(tokenizer, SAND_TOKEN_LEFT_BRACE); + case '}': return make_token(tokenizer, SAND_TOKEN_RIGHT_BRACE); + case ';': return make_token(tokenizer, SAND_TOKEN_SEMICOLON); + case ',': return make_token(tokenizer, SAND_TOKEN_COMMA); + case '.': return make_token(tokenizer, SAND_TOKEN_DOT); + case '-': return make_token(tokenizer, SAND_TOKEN_MINUS); + case '+': return make_token(tokenizer, SAND_TOKEN_PLUS); + case '/': return make_token(tokenizer, SAND_TOKEN_SLASH); + case '*': return make_token(tokenizer, SAND_TOKEN_STAR); + // One or two character tokens + case '!': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_BANG_EQUAL : SAND_TOKEN_BANG); + case '=': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_EQUAL_EQUAL : SAND_TOKEN_EQUAL); + case '<': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_LESS_EQUAL : SAND_TOKEN_LESS); + case '>': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_GREATER_EQUAL : SAND_TOKEN_GREATER); + // Literals + case '"': return parse_string(tokenizer); + case '0' ... '9': return parse_number(tokenizer); + default: + if (is_alpha(c)) { // Identifiers can contain alphanumeric characters, but can only start with alpha chars. 
+ return parse_identifier_or_keyword(tokenizer); + } else { + printf("Unepected: '%c'\n", c); + return make_error_token(tokenizer, "Unexpected character."); + } + } +} + +const char *sand_token_kind_to_string(SandTokenKind kind) { + switch (kind) { +#define RETURN_AS_STR(TOK) case TOK: return #TOK; +SAND_EACH_TOKEN(RETURN_AS_STR) +#undef RETURN_AS_STR + } +} |