summaryrefslogtreecommitdiff
path: root/src/core/tokenizer.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/tokenizer.c')
-rw-r--r--src/core/tokenizer.c279
1 files changed, 279 insertions, 0 deletions
diff --git a/src/core/tokenizer.c b/src/core/tokenizer.c
new file mode 100644
index 0000000..2fc366b
--- /dev/null
+++ b/src/core/tokenizer.c
@@ -0,0 +1,279 @@
+#include "tokenizer.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+
+SandTokenizer sand_create_tokenizer(const char *source, size_t source_length, const char *filename) {
+ return (SandTokenizer) {
+ .filename = filename,
+
+ .start = source,
+ .start_line = 0,
+ .start_column = 0,
+
+ .current = source,
+ .current_line = 0,
+ .current_column = 0,
+ };
+}
+
+static bool is_at_end(const SandTokenizer *tokenizer) {
+ // FIXME: This seems bad. In principle the NUL character should not be special. We're leaking our implementation language.
+ return *tokenizer->current == '\0';
+}
+
+static SandToken make_token(const SandTokenizer *tokenizer, SandTokenKind kind) {
+ SandLocation location = (SandLocation) {
+ .filename = tokenizer->filename,
+ .start_column = tokenizer->start_column,
+ .start_line = tokenizer->start_line,
+ .end_column = tokenizer->current_column,
+ .end_line = tokenizer->current_line,
+ };
+
+ return (SandToken) {
+ .kind = kind,
+ .content = tokenizer->start,
+ .content_length = tokenizer->current - tokenizer->start,
+ .location = location,
+ };
+}
+
+static SandToken make_error_token(const SandTokenizer *tokenizer, const char *message) {
+ SandLocation location = (SandLocation) {
+ .filename = tokenizer->filename,
+ .start_column = tokenizer->start_column,
+ .start_line = tokenizer->start_line,
+ .end_column = tokenizer->current_column,
+ .end_line = tokenizer->current_line,
+ };
+
+ return (SandToken) {
+ .kind = SAND_TOKEN_ERROR,
+ .content = message,
+ .content_length = strlen(message),
+ .location = location,
+ };
+}
+
+static char peek(const SandTokenizer *tokenizer) {
+ return *tokenizer->current;
+}
+
+static char peek_next(const SandTokenizer *tokenizer) {
+ if (is_at_end(tokenizer)) {
+ return '\0';
+ } else {
+ return tokenizer->current[1];
+ }
+}
+
+static char advance(SandTokenizer *tokenizer) {
+ assert(!is_at_end(tokenizer));
+ tokenizer->current += 1;
+ tokenizer->current_column += 1;
+ return tokenizer->current[-1];
+}
+
+static bool match(SandTokenizer *tokenizer, char expected) {
+ if (is_at_end(tokenizer)) {
+ return false;
+ } else if (peek(tokenizer) != expected) {
+ return false;
+ } else {
+ advance(tokenizer);
+ return true;
+ }
+}
+
+static void skip_whitespace(SandTokenizer *tokenizer) {
+ while (true) {
+ char c = peek(tokenizer);
+ switch (c) {
+ case '\n':
+ tokenizer->current_line += 1;
+ tokenizer->current_column = 0;
+ /* fallthrough */
+ case ' ':
+ case '\t':
+ case '\r':
+ advance(tokenizer);
+ break;
+ case '/':
+ if (peek_next(tokenizer) == '/') {
+ // We use peek to stop right before the newline, so the next iteration of the loop will handle it properly.
+ while (peek(tokenizer) != '\n' && !is_at_end(tokenizer)) {
+ advance(tokenizer);
+ }
+ } else {
+ return;
+ }
+ break;
+ default: // We've reached a non-whitespace character (or EOF) so we're done.
+ return;
+ }
+ }
+}
+
// Scan the remainder of a string literal; the opening '"' has already been
// consumed. On success the returned SAND_TOKEN_STRING spans the whole
// literal including both quotes, with escape sequences left unprocessed in
// the raw source text. Strings may not span lines; a bare newline, EOF, or
// a bad/dangling escape yields an error token instead.
static SandToken parse_string(SandTokenizer *tokenizer) {
    while (peek(tokenizer) != '"' && peek(tokenizer) != '\n' && !is_at_end(tokenizer)) {
        if (peek(tokenizer) == '\\') {
            // Decide how to treat the backslash by looking at the character
            // that follows it, WITHOUT consuming either one yet.
            switch (peek_next(tokenizer)) {
            case 'n':
            case 'r':
            case 't':
            case '\\':
            case '"':
                // Recognized escape: consume the '\\' here; the escaped
                // character itself is consumed by the advance() below, so an
                // escaped '"' does not terminate the loop.
                advance(tokenizer);
                break;
            case '\n':
            case '\0':
                advance(tokenizer); // Eat dangling \ so it doesn't mess up the next token.
                return make_error_token(tokenizer, "Unfinished escape inside string literal");
            default:
                advance(tokenizer); // Eat whatever invalid character was there, so it
                                    // becomes part of the "erroneous" input pointed
                                    // to by the error token.
                return make_error_token(tokenizer, "Invalid escape inside string literal");
            }
        }
        advance(tokenizer);
    }

    // The loop stopped on '"', '\n', or EOF; only '"' is a valid ending.
    if (peek(tokenizer) == '\n') {
        return make_error_token(tokenizer, "Unexpected end-of-line inside string literal");
    }
    if (is_at_end(tokenizer)) {
        return make_error_token(tokenizer, "Unexpected end-of-file inside string literal");
    }

    advance(tokenizer); // Eat closing ".
    return make_token(tokenizer, SAND_TOKEN_STRING);
}
+
// ASCII decimal digit test (locale-independent, unlike isdigit).
static bool is_digit(char c) {
    return '0' <= c && c <= '9';
}
+
+static SandToken parse_number(SandTokenizer *tokenizer) {
+ while (is_digit(peek(tokenizer))) {
+ advance(tokenizer);
+ }
+
+ // Optional fractional part.
+ if (peek(tokenizer) == '.' && is_digit(peek_next(tokenizer))) {
+ advance(tokenizer); // Eat decimal separator.
+ while (is_digit(peek(tokenizer))) {
+ advance(tokenizer);
+ }
+ }
+
+ return make_token(tokenizer, SAND_TOKEN_NUMBER);
+}
+
// ASCII letter or underscore — the characters an identifier may start with.
static bool is_alpha(char c) {
    if (c == '_') {
        return true;
    }
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
}
+
+static SandTokenKind check_keyword(SandTokenizer *tokenizer, size_t start, size_t length,
+ const char *rest, SandTokenKind kind) {
+ if (tokenizer->current - tokenizer->start == start + length &&
+ memcmp(tokenizer->start + start, rest, length) == 0) {
+ return kind;
+ } else {
+ return SAND_TOKEN_IDENTIFIER;
+ }
+}
+
// Called after we have consumed an identifier (i.e. the start and current
// fields of the tokenizer delimit the keyword/identifier) and returns either
// the correct keyword kind or SAND_TOKEN_IDENTIFIER otherwise.
// This is a hand-rolled trie: branch on the first (and where needed, second)
// character, then verify the remaining suffix with a single memcmp via
// check_keyword.
// TODO: Benchmark this rather involved manual trie with the simpler linear table-scan approach. See if it's worth it.
static SandTokenKind identifier_type(SandTokenizer *tokenizer) {
    switch (tokenizer->start[0]) {
        case 'a': return check_keyword(tokenizer, 1, 2, "nd", SAND_TOKEN_AND);
        case 'e': return check_keyword(tokenizer, 1, 3, "lse", SAND_TOKEN_ELSE);
        case 'f':
            // Three keywords share the 'f' prefix; only look at the second
            // character when the lexeme actually has one.
            if (tokenizer->current - tokenizer->start > 1) {
                switch (tokenizer->start[1]) {
                    case 'a': return check_keyword(tokenizer, 2, 3, "lse", SAND_TOKEN_FALSE);
                    case 'o': return check_keyword(tokenizer, 2, 1, "r", SAND_TOKEN_FOR);
                    case 'u': return check_keyword(tokenizer, 2, 1, "n", SAND_TOKEN_FUN);
                }
            }
            break;
        case 'i': return check_keyword(tokenizer, 1, 1, "f", SAND_TOKEN_IF);
        case 'o': return check_keyword(tokenizer, 1, 1, "r", SAND_TOKEN_OR);
        case 'n': return check_keyword(tokenizer, 1, 2, "il", SAND_TOKEN_NIL);
        case 'p': return check_keyword(tokenizer, 1, 4, "rint", SAND_TOKEN_PRINT);
        case 'r': return check_keyword(tokenizer, 1, 5, "eturn", SAND_TOKEN_RETURN);
        case 't': return check_keyword(tokenizer, 1, 3, "rue", SAND_TOKEN_TRUE);
        case 'v': return check_keyword(tokenizer, 1, 2, "ar", SAND_TOKEN_VAR);
        case 'w': return check_keyword(tokenizer, 1, 4, "hile", SAND_TOKEN_WHILE);
    }

    return SAND_TOKEN_IDENTIFIER;
}
+
+static SandToken parse_identifier_or_keyword(SandTokenizer *tokenizer) {
+ while (is_alpha(peek(tokenizer)) || is_digit(peek(tokenizer))) {
+ advance(tokenizer);
+ }
+
+ return make_token(tokenizer, identifier_type(tokenizer));
+}
+
+SandToken sand_get_next_token(SandTokenizer *tokenizer) {
+ skip_whitespace(tokenizer);
+
+ // The new token starts at the end of the previous one.
+ tokenizer->start = tokenizer->current;
+ tokenizer->start_column = tokenizer->current_column;
+ tokenizer->start_line = tokenizer->current_line;
+
+ if (is_at_end(tokenizer)) {
+ return make_token(tokenizer, SAND_TOKEN_EOF);
+ }
+
+ char c = advance(tokenizer);
+ switch (c) {
+ // Single-character tokens
+ case '(': return make_token(tokenizer, SAND_TOKEN_LEFT_PAREN);
+ case ')': return make_token(tokenizer, SAND_TOKEN_RIGHT_PAREN);
+ case '{': return make_token(tokenizer, SAND_TOKEN_LEFT_BRACE);
+ case '}': return make_token(tokenizer, SAND_TOKEN_RIGHT_BRACE);
+ case ';': return make_token(tokenizer, SAND_TOKEN_SEMICOLON);
+ case ',': return make_token(tokenizer, SAND_TOKEN_COMMA);
+ case '.': return make_token(tokenizer, SAND_TOKEN_DOT);
+ case '-': return make_token(tokenizer, SAND_TOKEN_MINUS);
+ case '+': return make_token(tokenizer, SAND_TOKEN_PLUS);
+ case '/': return make_token(tokenizer, SAND_TOKEN_SLASH);
+ case '*': return make_token(tokenizer, SAND_TOKEN_STAR);
+ // One or two character tokens
+ case '!': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_BANG_EQUAL : SAND_TOKEN_BANG);
+ case '=': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_EQUAL_EQUAL : SAND_TOKEN_EQUAL);
+ case '<': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_LESS_EQUAL : SAND_TOKEN_LESS);
+ case '>': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_GREATER_EQUAL : SAND_TOKEN_GREATER);
+ // Literals
+ case '"': return parse_string(tokenizer);
+ case '0' ... '9': return parse_number(tokenizer);
+ default:
+ if (is_alpha(c)) { // Identifiers can contain alphanumeric characters, but can only start with alpha chars.
+ return parse_identifier_or_keyword(tokenizer);
+ } else {
+ printf("Unepected: '%c'\n", c);
+ return make_error_token(tokenizer, "Unexpected character.");
+ }
+ }
+}
+
+const char *sand_token_kind_to_string(SandTokenKind kind) {
+ switch (kind) {
+#define RETURN_AS_STR(TOK) case TOK: return #TOK;
+SAND_EACH_TOKEN(RETURN_AS_STR)
+#undef RETURN_AS_STR
+ }
+}