author    Linnnus <[email protected]>  2025-04-10 05:21:36 +0000
committer Linnnus <[email protected]>  2025-04-15 00:54:43 +0000
commit    7bd2461c849b4626653a4744427c904b87354bd7
tree      38ee0e0ac9408c989f3c50c0c6ff28492faf497d
parent    8dd0c4f27aae02dd60f029db4cf03f9902cba26f
feat(core): Add tokenizer
-rw-r--r--  src/core/location.c          5
-rw-r--r--  src/core/location.h         33
-rw-r--r--  src/core/tokenizer.c       281
-rw-r--r--  src/core/tokenizer.h        93
-rw-r--r--  src/unit/main.c              2
-rw-r--r--  src/unit/test_tokenizer.c  233
6 files changed, 647 insertions, 0 deletions
diff --git a/src/core/location.c b/src/core/location.c
new file mode 100644
index 0000000..9f8de09
--- /dev/null
+++ b/src/core/location.c
@@ -0,0 +1,5 @@
+#include "location.h"
+
+void sand_print_location(FILE *stream, const SandLocation *location) {
+ fprintf(stream, "%s:%u:%u", location->filename, location->start_line + 1, location->start_column + 1);
+}
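
A minimal sketch of the resulting output (not part of this commit), assuming a
hypothetical file name "main.sand":

    #include "location.h"
    #include <stdio.h>

    int main(void) {
        // Internally 0-indexed; printed 1-indexed, so this prints "main.sand:3:7".
        SandLocation loc = {
            .filename = "main.sand",
            .start_line = 2,
            .start_column = 6,
            .end_line = 2,
            .end_column = 10,
        };
        sand_print_location(stderr, &loc);
        return 0;
    }
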
diff --git a/src/core/location.h b/src/core/location.h
new file mode 100644
index 0000000..c1ca3b8
--- /dev/null
+++ b/src/core/location.h
@@ -0,0 +1,33 @@
+#ifndef SAND_LOCATION_H
+#define SAND_LOCATION_H
+
+// This module defines the location type, which represents a reference to a
+// span of source text. Locations are carried throughout the compiler so error
+// messages can make useful references to the user's source code.
+
+#include <stddef.h>
+#include <stdio.h>
+
+// Uniquely among the types in this codebase, the fields of this struct should
+// be considered public, though still read-only.
+//
+// Internally, regardless of presentation, line and column numbers are
+// 0-indexed for consistency.
+typedef struct {
+ const char *filename;
+ unsigned start_line;
+ unsigned start_column;
+ unsigned end_line;
+ unsigned end_column;
+} SandLocation;
+
+// Construct a new location which minimally encompasses both `a` and `b`.
+// `a` must start before `b` ends.
+// `a` and `b` must have the same `filename` (i.e. they obviously can't cross file boundaries).
+SandLocation sand_location_encompassing(const SandLocation *a, const SandLocation *b);
+
+// Print the location to the given stream.
+// The output will not contain newlines.
+void sand_print_location(FILE *, const SandLocation *);
+
+#endif
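
Note that sand_location_encompassing is declared above but not defined anywhere
in this commit. A minimal sketch of a definition consistent with the documented
contract (same filename; minimal span covering both inputs) might look like:

    #include "location.h"

    #include <assert.h>
    #include <stdbool.h>
    #include <string.h>

    // Returns true if position (l1, c1) comes strictly before (l2, c2).
    static bool pos_before(unsigned l1, unsigned c1, unsigned l2, unsigned c2) {
        return l1 < l2 || (l1 == l2 && c1 < c2);
    }

    SandLocation sand_location_encompassing(const SandLocation *a, const SandLocation *b) {
        assert(strcmp(a->filename, b->filename) == 0); // Documented precondition.
        SandLocation result = { .filename = a->filename };
        // Take the earlier start...
        if (pos_before(a->start_line, a->start_column, b->start_line, b->start_column)) {
            result.start_line = a->start_line;
            result.start_column = a->start_column;
        } else {
            result.start_line = b->start_line;
            result.start_column = b->start_column;
        }
        // ...and the later end.
        if (pos_before(a->end_line, a->end_column, b->end_line, b->end_column)) {
            result.end_line = b->end_line;
            result.end_column = b->end_column;
        } else {
            result.end_line = a->end_line;
            result.end_column = a->end_column;
        }
        return result;
    }
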
diff --git a/src/core/tokenizer.c b/src/core/tokenizer.c
new file mode 100644
index 0000000..2fc366b
--- /dev/null
+++ b/src/core/tokenizer.c
@@ -0,0 +1,281 @@
+#include "tokenizer.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+
+SandTokenizer sand_create_tokenizer(const char *source, size_t source_length, const char *filename) {
+ return (SandTokenizer) {
+ .filename = filename,
+
+ .start = source,
+ .start_line = 0,
+ .start_column = 0,
+
+ .current = source,
+ .current_line = 0,
+ .current_column = 0,
+ };
+}
+
+static bool is_at_end(const SandTokenizer *tokenizer) {
+ // FIXME: This seems bad. In principle the NUL character should not be special. We're leaking our implementation language.
+ return *tokenizer->current == '\0';
+}
+
+static SandToken make_token(const SandTokenizer *tokenizer, SandTokenKind kind) {
+ SandLocation location = (SandLocation) {
+ .filename = tokenizer->filename,
+ .start_column = tokenizer->start_column,
+ .start_line = tokenizer->start_line,
+ .end_column = tokenizer->current_column,
+ .end_line = tokenizer->current_line,
+ };
+
+ return (SandToken) {
+ .kind = kind,
+ .content = tokenizer->start,
+ .content_length = tokenizer->current - tokenizer->start,
+ .location = location,
+ };
+}
+
+static SandToken make_error_token(const SandTokenizer *tokenizer, const char *message) {
+ SandLocation location = (SandLocation) {
+ .filename = tokenizer->filename,
+ .start_column = tokenizer->start_column,
+ .start_line = tokenizer->start_line,
+ .end_column = tokenizer->current_column,
+ .end_line = tokenizer->current_line,
+ };
+
+ return (SandToken) {
+ .kind = SAND_TOKEN_ERROR,
+ .content = message,
+ .content_length = strlen(message),
+ .location = location,
+ };
+}
+
+static char peek(const SandTokenizer *tokenizer) {
+ return *tokenizer->current;
+}
+
+static char peek_next(const SandTokenizer *tokenizer) {
+ if (is_at_end(tokenizer)) {
+ return '\0';
+ } else {
+ return tokenizer->current[1];
+ }
+}
+
+static char advance(SandTokenizer *tokenizer) {
+ assert(!is_at_end(tokenizer));
+ tokenizer->current += 1;
+ tokenizer->current_column += 1;
+ return tokenizer->current[-1];
+}
+
+static bool match(SandTokenizer *tokenizer, char expected) {
+ if (is_at_end(tokenizer)) {
+ return false;
+ } else if (peek(tokenizer) != expected) {
+ return false;
+ } else {
+ advance(tokenizer);
+ return true;
+ }
+}
+
+static void skip_whitespace(SandTokenizer *tokenizer) {
+ while (true) {
+ char c = peek(tokenizer);
+ switch (c) {
+ case '\n':
+ advance(tokenizer);
+ tokenizer->current_line += 1;
+ tokenizer->current_column = 0; // Reset *after* advance(), which increments the column.
+ break;
+ case ' ':
+ case '\t':
+ case '\r':
+ advance(tokenizer);
+ break;
+ case '/':
+ if (peek_next(tokenizer) == '/') {
+ // We use peek to stop right before the newline, so the next iteration of the loop will handle it properly.
+ while (peek(tokenizer) != '\n' && !is_at_end(tokenizer)) {
+ advance(tokenizer);
+ }
+ } else {
+ return;
+ }
+ break;
+ default: // We've reached a non-whitespace character (or EOF) so we're done.
+ return;
+ }
+ }
+}
+
+static SandToken parse_string(SandTokenizer *tokenizer) {
+ while (peek(tokenizer) != '"' && peek(tokenizer) != '\n' && !is_at_end(tokenizer)) {
+ if (peek(tokenizer) == '\\') {
+ switch (peek_next(tokenizer)) {
+ case 'n':
+ case 'r':
+ case 't':
+ case '\\':
+ case '"':
+ advance(tokenizer);
+ break;
+ case '\n':
+ case '\0':
+ advance(tokenizer); // Eat dangling \ so it doesn't mess up the next token.
+ return make_error_token(tokenizer, "Unfinished escape inside string literal");
+ default:
+ advance(tokenizer); // Eat whatever invalid character was there, so it
+ // becomes part of the "erroneous" input pointed
+ // to by the error token.
+ return make_error_token(tokenizer, "Invalid escape inside string literal");
+ }
+ }
+ advance(tokenizer);
+ }
+
+ if (peek(tokenizer) == '\n') {
+ return make_error_token(tokenizer, "Unexpected end-of-line inside string literal");
+ }
+ if (is_at_end(tokenizer)) {
+ return make_error_token(tokenizer, "Unexpected end-of-file inside string literal");
+ }
+
+ advance(tokenizer); // Eat closing ".
+ return make_token(tokenizer, SAND_TOKEN_STRING);
+}
+
+static bool is_digit(char c) {
+ return c >= '0' && c <= '9';
+}
+
+static SandToken parse_number(SandTokenizer *tokenizer) {
+ while (is_digit(peek(tokenizer))) {
+ advance(tokenizer);
+ }
+
+ // Optional fractional part.
+ if (peek(tokenizer) == '.' && is_digit(peek_next(tokenizer))) {
+ advance(tokenizer); // Eat decimal separator.
+ while (is_digit(peek(tokenizer))) {
+ advance(tokenizer);
+ }
+ }
+
+ return make_token(tokenizer, SAND_TOKEN_NUMBER);
+}
+
+static bool is_alpha(char c) {
+ return (c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ c == '_';
+}
+
+static SandTokenKind check_keyword(SandTokenizer *tokenizer, size_t start, size_t length,
+ const char *rest, SandTokenKind kind) {
+ if (tokenizer->current - tokenizer->start == start + length &&
+ memcmp(tokenizer->start + start, rest, length) == 0) {
+ return kind;
+ } else {
+ return SAND_TOKEN_IDENTIFIER;
+ }
+}
+
+// Called after we have consumed an identifier (i.e. the start and current
+// fields of the tokenizer delimit the keyword/identifier). Returns the
+// matching keyword kind, or SAND_TOKEN_IDENTIFIER if it is not a keyword.
+// TODO: Benchmark this rather involved manual trie with the simpler linear table-scan approach. See if it's worth it.
+static SandTokenKind identifier_type(SandTokenizer *tokenizer) {
+ switch (tokenizer->start[0]) {
+ case 'a': return check_keyword(tokenizer, 1, 2, "nd", SAND_TOKEN_AND);
+ case 'e': return check_keyword(tokenizer, 1, 3, "lse", SAND_TOKEN_ELSE);
+ case 'f':
+ if (tokenizer->current - tokenizer->start > 1) {
+ switch (tokenizer->start[1]) {
+ case 'a': return check_keyword(tokenizer, 2, 3, "lse", SAND_TOKEN_FALSE);
+ case 'o': return check_keyword(tokenizer, 2, 1, "r", SAND_TOKEN_FOR);
+ case 'u': return check_keyword(tokenizer, 2, 1, "n", SAND_TOKEN_FUN);
+ }
+ }
+ break;
+ case 'i': return check_keyword(tokenizer, 1, 1, "f", SAND_TOKEN_IF);
+ case 'o': return check_keyword(tokenizer, 1, 1, "r", SAND_TOKEN_OR);
+ case 'n': return check_keyword(tokenizer, 1, 2, "il", SAND_TOKEN_NIL);
+ case 'p': return check_keyword(tokenizer, 1, 4, "rint", SAND_TOKEN_PRINT);
+ case 'r': return check_keyword(tokenizer, 1, 5, "eturn", SAND_TOKEN_RETURN);
+ case 't': return check_keyword(tokenizer, 1, 3, "rue", SAND_TOKEN_TRUE);
+ case 'v': return check_keyword(tokenizer, 1, 2, "ar", SAND_TOKEN_VAR);
+ case 'w': return check_keyword(tokenizer, 1, 4, "hile", SAND_TOKEN_WHILE);
+ }
+
+ return SAND_TOKEN_IDENTIFIER;
+}
+
+static SandToken parse_identifier_or_keyword(SandTokenizer *tokenizer) {
+ while (is_alpha(peek(tokenizer)) || is_digit(peek(tokenizer))) {
+ advance(tokenizer);
+ }
+
+ return make_token(tokenizer, identifier_type(tokenizer));
+}
+
+SandToken sand_get_next_token(SandTokenizer *tokenizer) {
+ skip_whitespace(tokenizer);
+
+ // The new token starts at the end of the previous one.
+ tokenizer->start = tokenizer->current;
+ tokenizer->start_column = tokenizer->current_column;
+ tokenizer->start_line = tokenizer->current_line;
+
+ if (is_at_end(tokenizer)) {
+ return make_token(tokenizer, SAND_TOKEN_EOF);
+ }
+
+ char c = advance(tokenizer);
+ switch (c) {
+ // Single-character tokens
+ case '(': return make_token(tokenizer, SAND_TOKEN_LEFT_PAREN);
+ case ')': return make_token(tokenizer, SAND_TOKEN_RIGHT_PAREN);
+ case '{': return make_token(tokenizer, SAND_TOKEN_LEFT_BRACE);
+ case '}': return make_token(tokenizer, SAND_TOKEN_RIGHT_BRACE);
+ case ';': return make_token(tokenizer, SAND_TOKEN_SEMICOLON);
+ case ',': return make_token(tokenizer, SAND_TOKEN_COMMA);
+ case '.': return make_token(tokenizer, SAND_TOKEN_DOT);
+ case '-': return make_token(tokenizer, SAND_TOKEN_MINUS);
+ case '+': return make_token(tokenizer, SAND_TOKEN_PLUS);
+ case '/': return make_token(tokenizer, SAND_TOKEN_SLASH);
+ case '*': return make_token(tokenizer, SAND_TOKEN_STAR);
+ // One or two character tokens
+ case '!': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_BANG_EQUAL : SAND_TOKEN_BANG);
+ case '=': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_EQUAL_EQUAL : SAND_TOKEN_EQUAL);
+ case '<': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_LESS_EQUAL : SAND_TOKEN_LESS);
+ case '>': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_GREATER_EQUAL : SAND_TOKEN_GREATER);
+ // Literals
+ case '"': return parse_string(tokenizer);
+ case '0' ... '9': return parse_number(tokenizer);
+ default:
+ if (is_alpha(c)) { // Identifiers can contain alphanumeric characters, but can only start with alpha chars.
+ return parse_identifier_or_keyword(tokenizer);
+ } else {
+ printf("Unexpected: '%c'\n", c);
+ return make_error_token(tokenizer, "Unexpected character.");
+ }
+ }
+}
+
+const char *sand_token_kind_to_string(SandTokenKind kind) {
+ switch (kind) {
+#define RETURN_AS_STR(TOK) case TOK: return #TOK;
+SAND_EACH_TOKEN(RETURN_AS_STR)
+#undef RETURN_AS_STR
+ }
+ return "<invalid token kind>"; // Unreachable for valid kinds; avoids falling off a non-void function.
+}
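
The TODO on identifier_type above mentions a simpler linear table-scan; a
minimal sketch of that alternative (not part of this commit, with a
hypothetical name identifier_type_scan) that the trie could be benchmarked
against:

    #include "tokenizer.h"
    #include <string.h>

    static const struct { const char *word; SandTokenKind kind; } keywords[] = {
        {"and", SAND_TOKEN_AND},       {"else", SAND_TOKEN_ELSE},
        {"false", SAND_TOKEN_FALSE},   {"for", SAND_TOKEN_FOR},
        {"fun", SAND_TOKEN_FUN},       {"if", SAND_TOKEN_IF},
        {"nil", SAND_TOKEN_NIL},       {"or", SAND_TOKEN_OR},
        {"print", SAND_TOKEN_PRINT},   {"return", SAND_TOKEN_RETURN},
        {"true", SAND_TOKEN_TRUE},     {"var", SAND_TOKEN_VAR},
        {"while", SAND_TOKEN_WHILE},
    };

    static SandTokenKind identifier_type_scan(const SandTokenizer *tokenizer) {
        size_t length = (size_t)(tokenizer->current - tokenizer->start);
        for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
            // A keyword matches only if both length and contents agree.
            if (strlen(keywords[i].word) == length &&
                memcmp(tokenizer->start, keywords[i].word, length) == 0) {
                return keywords[i].kind;
            }
        }
        return SAND_TOKEN_IDENTIFIER;
    }
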
diff --git a/src/core/tokenizer.h b/src/core/tokenizer.h
new file mode 100644
index 0000000..6105c3e
--- /dev/null
+++ b/src/core/tokenizer.h
@@ -0,0 +1,93 @@
+#ifndef SAND_TOKENIZER_H
+#define SAND_TOKENIZER_H
+
+// This module defines the tokenizer. It takes in the raw source text and splits
+// it into tokens. Tokens (a.k.a. lexemes) are the smallest meaningful units of
+// the Sand language.
+//
+// The tokenizer should not need to do any allocation. The source and filename
+// just have to remain valid for as long as the tokens are in use.
+
+#include "location.h"
+
+#include <stddef.h>
+
+#define SAND_EACH_TOKEN(DO) \
+ /* Single-character tokens */ \
+ DO(SAND_TOKEN_LEFT_PAREN) \
+ DO(SAND_TOKEN_RIGHT_PAREN) \
+ DO(SAND_TOKEN_LEFT_BRACE) \
+ DO(SAND_TOKEN_RIGHT_BRACE) \
+ DO(SAND_TOKEN_COMMA) \
+ DO(SAND_TOKEN_DOT) \
+ DO(SAND_TOKEN_MINUS) \
+ DO(SAND_TOKEN_PLUS) \
+ DO(SAND_TOKEN_SEMICOLON) \
+ DO(SAND_TOKEN_SLASH) \
+ DO(SAND_TOKEN_STAR) \
+ /* One or two character tokens. */ \
+ DO(SAND_TOKEN_BANG) \
+ DO(SAND_TOKEN_BANG_EQUAL) \
+ DO(SAND_TOKEN_EQUAL) \
+ DO(SAND_TOKEN_EQUAL_EQUAL) \
+ DO(SAND_TOKEN_GREATER) \
+ DO(SAND_TOKEN_GREATER_EQUAL) \
+ DO(SAND_TOKEN_LESS) \
+ DO(SAND_TOKEN_LESS_EQUAL) \
+ /* Literals */ \
+ DO(SAND_TOKEN_IDENTIFIER) \
+ DO(SAND_TOKEN_STRING) \
+ DO(SAND_TOKEN_NUMBER) \
+ /* Keywords */ \
+ DO(SAND_TOKEN_AND) \
+ DO(SAND_TOKEN_ELSE) \
+ DO(SAND_TOKEN_FALSE) \
+ DO(SAND_TOKEN_FOR) \
+ DO(SAND_TOKEN_FUN) \
+ DO(SAND_TOKEN_IF) \
+ DO(SAND_TOKEN_NIL) \
+ DO(SAND_TOKEN_OR) \
+ DO(SAND_TOKEN_PRINT) \
+ DO(SAND_TOKEN_RETURN) \
+ DO(SAND_TOKEN_TRUE) \
+ DO(SAND_TOKEN_VAR) \
+ DO(SAND_TOKEN_WHILE) \
+ /* Special tokens */ \
+ DO(SAND_TOKEN_ERROR) \
+ DO(SAND_TOKEN_EOF)
+
+
+typedef enum {
+#define X(n) n,
+SAND_EACH_TOKEN(X)
+#undef X
+} SandTokenKind;
+
+typedef struct {
+ SandTokenKind kind;
+ const char *content;
+ size_t content_length;
+ SandLocation location;
+} SandToken;
+
+typedef struct {
+ const char *const filename;
+
+ const char *start;
+ unsigned start_line;
+ unsigned start_column;
+
+ const char *current;
+ unsigned current_line;
+ unsigned current_column;
+} SandTokenizer;
+
+// There is no corresponding destructor, as a tokenizer does not own any resources.
+SandTokenizer sand_create_tokenizer(const char *source, size_t source_length, const char *filename);
+
+SandToken sand_get_next_token(SandTokenizer *);
+
+// Returns the string representation of the token kind.
+const char *sand_token_kind_to_string(SandTokenKind);
+
+#endif
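
A minimal sketch of the intended usage (not part of this commit), pumping
tokens from a hypothetical source string until EOF:

    #include "tokenizer.h"

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const char *source = "var answer = 42;"; // Hypothetical input.
        SandTokenizer tokenizer = sand_create_tokenizer(source, strlen(source), "<example>");

        for (;;) {
            SandToken token = sand_get_next_token(&tokenizer);
            // Token content is a non-NUL-terminated slice of `source`, hence %.*s.
            printf("%-25s '%.*s'\n", sand_token_kind_to_string(token.kind),
                   (int)token.content_length, token.content);
            if (token.kind == SAND_TOKEN_EOF) {
                break;
            }
        }
        return 0;
    }
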
diff --git a/src/unit/main.c b/src/unit/main.c
index 32df795..2b5c1e1 100644
--- a/src/unit/main.c
+++ b/src/unit/main.c
@@ -3,6 +3,7 @@
SUITE_EXTERN(std_allocator);
SUITE_EXTERN(arena_allocator);
SUITE_EXTERN(page_allocator);
+SUITE_EXTERN(tokenizer);
GREATEST_MAIN_DEFS();
@@ -12,6 +13,7 @@ int main(int argc, char **argv) {
RUN_SUITE(std_allocator);
RUN_SUITE(arena_allocator);
RUN_SUITE(page_allocator);
+ RUN_SUITE(tokenizer);
GREATEST_MAIN_END(); /* display results */
}
diff --git a/src/unit/test_tokenizer.c b/src/unit/test_tokenizer.c
new file mode 100644
index 0000000..e3861b8
--- /dev/null
+++ b/src/unit/test_tokenizer.c
@@ -0,0 +1,233 @@
+#include "../core/tokenizer.h"
+
+#include "greatest.h"
+#include <string.h>
+
+#define STR(x) _STR(x)
+#define _STR(x) #x
+#define CREATE_TOKENIZER(src) sand_create_tokenizer(src, strlen(src), "<dummy file at " __FILE__ ":" STR(__LINE__) ">")
+
+TEST empty_source_gives_eof(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ // It should still be the case for subsequent calls.
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ PASS();
+}
+
+TEST single_char_tokens(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("( ) { } , . - + ; / *");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LEFT_PAREN);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_RIGHT_PAREN);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LEFT_BRACE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_RIGHT_BRACE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_COMMA);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_MINUS);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_PLUS);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_SEMICOLON);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_SLASH);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_STAR);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ PASS();
+}
+
+TEST one_or_two_char_tokens(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("! != = == > >= < <=");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EQUAL_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_GREATER);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_GREATER_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LESS);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LESS_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ PASS();
+}
+
+TEST comments_are_ignored(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER(".// This is a comment\n.");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+ // No comment node here!
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ PASS();
+}
+
+TEST literal_string(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"abc\\\"def\\nghi\"");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_STRING);
+ ASSERT_EQ(token.content_length, 15);
+ ASSERT_STRN_EQ(token.content, "\"abc\\\"def\\nghi\"", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST unfinished_literal_string_eof(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"abc");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+ ASSERT_STR_EQ(token.content, "Unexpected end-of-file inside string literal");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST unfinished_literal_string_eol(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"abc\n!");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+ ASSERT_STR_EQ(token.content, "Unexpected end-of-line inside string literal");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST unfinished_literal_string_escape_eol(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"\\\n!");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+ ASSERT_STR_EQ(token.content, "Unfinished escape inside string literal");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST unfinished_literal_string_escape_eof(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"\\");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+ ASSERT_STR_EQ(token.content, "Unfinished escape inside string literal");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST literal_number(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("123");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+ ASSERT_EQ(token.content_length, 3);
+ ASSERT_STRN_EQ(token.content, "123", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST fractional_literal_number(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("123.00000001");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+ ASSERT_EQ(token.content_length, 12);
+ ASSERT_STRN_EQ(token.content, "123.00000001", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST literal_number_followed_by_dot(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("123.");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+ ASSERT_EQ(token.content_length, 3);
+ ASSERT_STRN_EQ(token.content, "123", token.content_length);
+
+ ASSERT_EQm("Dot should not have been consumed", sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST identifiers(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_IDENTIFIER);
+ ASSERT_EQ(token.content_length, 62);
+ ASSERT_MEM_EQ(token.content, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST identifiers_cannot_start_with_number(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("123abc");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+ ASSERT_EQ(token.content_length, 3);
+ ASSERT_MEM_EQ(token.content, "123", token.content_length);
+
+ token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_IDENTIFIER);
+ ASSERT_EQ(token.content_length, 3);
+ ASSERT_MEM_EQ(token.content, "abc", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST keywords(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("and else false for fun "
+ "if nil or print return "
+ "true var while ");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_AND);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_ELSE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_FALSE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_FOR);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_FUN);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_IF);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_NIL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_OR);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_PRINT);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_RETURN);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_TRUE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_VAR);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_WHILE);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+SUITE(tokenizer) {
+ RUN_TEST(empty_source_gives_eof);
+ RUN_TEST(single_char_tokens);
+ RUN_TEST(one_or_two_char_tokens);
+ RUN_TEST(comments_are_ignored);
+ RUN_TEST(literal_string);
+ RUN_TEST(unfinished_literal_string_eof);
+ RUN_TEST(unfinished_literal_string_eol);
+ RUN_TEST(unfinished_literal_string_escape_eol);
+ RUN_TEST(unfinished_literal_string_escape_eof);
+ RUN_TEST(literal_number);
+ RUN_TEST(fractional_literal_number);
+ RUN_TEST(literal_number_followed_by_dot);
+ RUN_TEST(identifiers);
+ RUN_TEST(identifiers_cannot_start_with_number);
+ RUN_TEST(keywords);
+}