author    Linnnus <[email protected]>  2025-04-10 05:21:36 +0000
committer Linnnus <[email protected]>  2025-04-15 00:54:43 +0000
commit    7bd2461c849b4626653a4744427c904b87354bd7
tree      38ee0e0ac9408c989f3c50c0c6ff28492faf497d
parent    8dd0c4f27aae02dd60f029db4cf03f9902cba26f
feat(core): Add tokenizer
-rw-r--r--  src/core/location.c          5
-rw-r--r--  src/core/location.h         33
-rw-r--r--  src/core/tokenizer.c       281
-rw-r--r--  src/core/tokenizer.h        93
-rw-r--r--  src/unit/main.c              2
-rw-r--r--  src/unit/test_tokenizer.c  233
6 files changed, 647 insertions, 0 deletions
diff --git a/src/core/location.c b/src/core/location.c
new file mode 100644
index 0000000..9f8de09
--- /dev/null
+++ b/src/core/location.c
@@ -0,0 +1,5 @@
+#include "location.h"
+
+void sand_print_location(FILE *stream, const SandLocation *location) {
+ fprintf(stream, "%s:%u:%u", location->filename, location->start_line + 1, location->start_column + 1);
+}
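
A minimal sketch of the resulting output (not part of this commit), assuming a
hypothetical file name "main.sand":

    #include "location.h"
    #include <stdio.h>

    int main(void) {
        // Internally 0-indexed; printed 1-indexed, so this prints "main.sand:3:7".
        SandLocation loc = {
            .filename = "main.sand",
            .start_line = 2,
            .start_column = 6,
            .end_line = 2,
            .end_column = 10,
        };
        sand_print_location(stderr, &loc);
        return 0;
    }
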
diff --git a/src/core/location.h b/src/core/location.h
new file mode 100644
index 0000000..c1ca3b8
--- /dev/null
+++ b/src/core/location.h
@@ -0,0 +1,33 @@
+#ifndef SAND_LOCATION_H
+#define SAND_LOCATION_H
+
+// This module defines the location type, which represents a reference to a
+// span of source text. Locations are carried throughout the compiler so error
+// messages can make useful references to the user's source code.
+
+#include <stddef.h>
+#include <stdio.h>
+
+// Uniquely among the types in this codebase, the fields of this struct should
+// be considered public, though still read-only.
+//
+// Internally, regardless of presentation, line and column numbers are
+// 0-indexed for consistency.
+typedef struct {
+ const char *filename;
+ unsigned start_line;
+ unsigned start_column;
+ unsigned end_line;
+ unsigned end_column;
+} SandLocation;
+
+// Construct a new location which minimally encompasses both `a` and `b`.
+// `a` must start before `b` ends.
+// `a` and `b` must have the same `filename` (i.e. they obviously can't cross file boundaries).
+SandLocation sand_location_encompassing(const SandLocation *a, const SandLocation *b);
+
+// Print the location to the given stream.
+// The output will not contain newlines.
+void sand_print_location(FILE *, const SandLocation *);
+
+#endif
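
Note that sand_location_encompassing is declared above but not defined anywhere
in this commit. A minimal sketch of a definition consistent with the documented
contract (same filename; minimal span covering both inputs) might look like:

    #include "location.h"

    #include <assert.h>
    #include <stdbool.h>
    #include <string.h>

    // Returns true if position (l1, c1) comes strictly before (l2, c2).
    static bool pos_before(unsigned l1, unsigned c1, unsigned l2, unsigned c2) {
        return l1 < l2 || (l1 == l2 && c1 < c2);
    }

    SandLocation sand_location_encompassing(const SandLocation *a, const SandLocation *b) {
        assert(strcmp(a->filename, b->filename) == 0); // Documented precondition.
        SandLocation result = { .filename = a->filename };
        // Take the earlier start...
        if (pos_before(a->start_line, a->start_column, b->start_line, b->start_column)) {
            result.start_line = a->start_line;
            result.start_column = a->start_column;
        } else {
            result.start_line = b->start_line;
            result.start_column = b->start_column;
        }
        // ...and the later end.
        if (pos_before(a->end_line, a->end_column, b->end_line, b->end_column)) {
            result.end_line = b->end_line;
            result.end_column = b->end_column;
        } else {
            result.end_line = a->end_line;
            result.end_column = a->end_column;
        }
        return result;
    }
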
diff --git a/src/core/tokenizer.c b/src/core/tokenizer.c
new file mode 100644
index 0000000..2fc366b
--- /dev/null
+++ b/src/core/tokenizer.c
@@ -0,0 +1,281 @@
+#include "tokenizer.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+
+SandTokenizer sand_create_tokenizer(const char *source, size_t source_length, const char *filename) {
+ return (SandTokenizer) {
+ .filename = filename,
+
+ .start = source,
+ .start_line = 0,
+ .start_column = 0,
+
+ .current = source,
+ .current_line = 0,
+ .current_column = 0,
+ };
+}
+
+static bool is_at_end(const SandTokenizer *tokenizer) {
+ // FIXME: This seems bad. In principle the NUL character should not be special. We're leaking our implementation language.
+ return *tokenizer->current == '\0';
+}
+
+static SandToken make_token(const SandTokenizer *tokenizer, SandTokenKind kind) {
+ SandLocation location = (SandLocation) {
+ .filename = tokenizer->filename,
+ .start_column = tokenizer->start_column,
+ .start_line = tokenizer->start_line,
+ .end_column = tokenizer->current_column,
+ .end_line = tokenizer->current_line,
+ };
+
+ return (SandToken) {
+ .kind = kind,
+ .content = tokenizer->start,
+ .content_length = tokenizer->current - tokenizer->start,
+ .location = location,
+ };
+}
+
+static SandToken make_error_token(const SandTokenizer *tokenizer, const char *message) {
+ SandLocation location = (SandLocation) {
+ .filename = tokenizer->filename,
+ .start_column = tokenizer->start_column,
+ .start_line = tokenizer->start_line,
+ .end_column = tokenizer->current_column,
+ .end_line = tokenizer->current_line,
+ };
+
+ return (SandToken) {
+ .kind = SAND_TOKEN_ERROR,
+ .content = message,
+ .content_length = strlen(message),
+ .location = location,
+ };
+}
+
+static char peek(const SandTokenizer *tokenizer) {
+ return *tokenizer->current;
+}
+
+static char peek_next(const SandTokenizer *tokenizer) {
+ if (is_at_end(tokenizer)) {
+ return '\0';
+ } else {
+ return tokenizer->current[1];
+ }
+}
+
+static char advance(SandTokenizer *tokenizer) {
+ assert(!is_at_end(tokenizer));
+ tokenizer->current += 1;
+ tokenizer->current_column += 1;
+ return tokenizer->current[-1];
+}
+
+static bool match(SandTokenizer *tokenizer, char expected) {
+ if (is_at_end(tokenizer)) {
+ return false;
+ } else if (peek(tokenizer) != expected) {
+ return false;
+ } else {
+ advance(tokenizer);
+ return true;
+ }
+}
+
+static void skip_whitespace(SandTokenizer *tokenizer) {
+ while (true) {
+ char c = peek(tokenizer);
+ switch (c) {
+ case '\n':
+ advance(tokenizer);
+ tokenizer->current_line += 1;
+ tokenizer->current_column = 0; // Reset *after* advance(), which increments the column.
+ break;
+ case ' ':
+ case '\t':
+ case '\r':
+ advance(tokenizer);
+ break;
+ case '/':
+ if (peek_next(tokenizer) == '/') {
+ // We use peek to stop right before the newline, so the next iteration of the loop will handle it properly.
+ while (peek(tokenizer) != '\n' && !is_at_end(tokenizer)) {
+ advance(tokenizer);
+ }
+ } else {
+ return;
+ }
+ break;
+ default: // We've reached a non-whitespace character (or EOF) so we're done.
+ return;
+ }
+ }
+}
+
+static SandToken parse_string(SandTokenizer *tokenizer) {
+ while (peek(tokenizer) != '"' && peek(tokenizer) != '\n' && !is_at_end(tokenizer)) {
+ if (peek(tokenizer) == '\\') {
+ switch (peek_next(tokenizer)) {
+ case 'n':
+ case 'r':
+ case 't':
+ case '\\':
+ case '"':
+ advance(tokenizer);
+ break;
+ case '\n':
+ case '\0':
+ advance(tokenizer); // Eat dangling \ so it doesn't mess up the next token.
+ return make_error_token(tokenizer, "Unfinished escape inside string literal");
+ default:
+ advance(tokenizer); // Eat whatever invalid character was there, so it
+ // becomes part of the "erroneous" input pointed
+ // to by the error token.
+ return make_error_token(tokenizer, "Invalid escape inside string literal");
+ }
+ }
+ advance(tokenizer);
+ }
+
+ if (peek(tokenizer) == '\n') {
+ return make_error_token(tokenizer, "Unexpected end-of-line inside string literal");
+ }
+ if (is_at_end(tokenizer)) {
+ return make_error_token(tokenizer, "Unexpected end-of-file inside string literal");
+ }
+
+ advance(tokenizer); // Eat closing ".
+ return make_token(tokenizer, SAND_TOKEN_STRING);
+}
+
+static bool is_digit(char c) {
+ return c >= '0' && c <= '9';
+}
+
+static SandToken parse_number(SandTokenizer *tokenizer) {
+ while (is_digit(peek(tokenizer))) {
+ advance(tokenizer);
+ }
+
+ // Optional fractional part.
+ if (peek(tokenizer) == '.' && is_digit(peek_next(tokenizer))) {
+ advance(tokenizer); // Eat decimal separator.
+ while (is_digit(peek(tokenizer))) {
+ advance(tokenizer);
+ }
+ }
+
+ return make_token(tokenizer, SAND_TOKEN_NUMBER);
+}
+
+static bool is_alpha(char c) {
+ return (c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ c == '_';
+}
+
+static SandTokenKind check_keyword(SandTokenizer *tokenizer, size_t start, size_t length,
+ const char *rest, SandTokenKind kind) {
+ if (tokenizer->current - tokenizer->start == start + length &&
+ memcmp(tokenizer->start + start, rest, length) == 0) {
+ return kind;
+ } else {
+ return SAND_TOKEN_IDENTIFIER;
+ }
+}
+
+// Called after we have consumed an identifier (i.e. the start and current
+// fields of the tokenizer delimit the keyword/identifier). Returns the
+// matching keyword kind, or SAND_TOKEN_IDENTIFIER if it is not a keyword.
+// TODO: Benchmark this rather involved manual trie with the simpler linear table-scan approach. See if it's worth it.
+static SandTokenKind identifier_type(SandTokenizer *tokenizer) {
+ switch (tokenizer->start[0]) {
+ case 'a': return check_keyword(tokenizer, 1, 2, "nd", SAND_TOKEN_AND);
+ case 'e': return check_keyword(tokenizer, 1, 3, "lse", SAND_TOKEN_ELSE);
+ case 'f':
+ if (tokenizer->current - tokenizer->start > 1) {
+ switch (tokenizer->start[1]) {
+ case 'a': return check_keyword(tokenizer, 2, 3, "lse", SAND_TOKEN_FALSE);
+ case 'o': return check_keyword(tokenizer, 2, 1, "r", SAND_TOKEN_FOR);
+ case 'u': return check_keyword(tokenizer, 2, 1, "n", SAND_TOKEN_FUN);
+ }
+ }
+ break;
+ case 'i': return check_keyword(tokenizer, 1, 1, "f", SAND_TOKEN_IF);
+ case 'o': return check_keyword(tokenizer, 1, 1, "r", SAND_TOKEN_OR);
+ case 'n': return check_keyword(tokenizer, 1, 2, "il", SAND_TOKEN_NIL);
+ case 'p': return check_keyword(tokenizer, 1, 4, "rint", SAND_TOKEN_PRINT);
+ case 'r': return check_keyword(tokenizer, 1, 5, "eturn", SAND_TOKEN_RETURN);
+ case 't': return check_keyword(tokenizer, 1, 3, "rue", SAND_TOKEN_TRUE);
+ case 'v': return check_keyword(tokenizer, 1, 2, "ar", SAND_TOKEN_VAR);
+ case 'w': return check_keyword(tokenizer, 1, 4, "hile", SAND_TOKEN_WHILE);
+ }
+
+ return SAND_TOKEN_IDENTIFIER;
+}
+
+static SandToken parse_identifier_or_keyword(SandTokenizer *tokenizer) {
+ while (is_alpha(peek(tokenizer)) || is_digit(peek(tokenizer))) {
+ advance(tokenizer);
+ }
+
+ return make_token(tokenizer, identifier_type(tokenizer));
+}
+
+SandToken sand_get_next_token(SandTokenizer *tokenizer) {
+ skip_whitespace(tokenizer);
+
+ // The new token starts at the end of the previous one.
+ tokenizer->start = tokenizer->current;
+ tokenizer->start_column = tokenizer->current_column;
+ tokenizer->start_line = tokenizer->current_line;
+
+ if (is_at_end(tokenizer)) {
+ return make_token(tokenizer, SAND_TOKEN_EOF);
+ }
+
+ char c = advance(tokenizer);
+ switch (c) {
+ // Single-character tokens
+ case '(': return make_token(tokenizer, SAND_TOKEN_LEFT_PAREN);
+ case ')': return make_token(tokenizer, SAND_TOKEN_RIGHT_PAREN);
+ case '{': return make_token(tokenizer, SAND_TOKEN_LEFT_BRACE);
+ case '}': return make_token(tokenizer, SAND_TOKEN_RIGHT_BRACE);
+ case ';': return make_token(tokenizer, SAND_TOKEN_SEMICOLON);
+ case ',': return make_token(tokenizer, SAND_TOKEN_COMMA);
+ case '.': return make_token(tokenizer, SAND_TOKEN_DOT);
+ case '-': return make_token(tokenizer, SAND_TOKEN_MINUS);
+ case '+': return make_token(tokenizer, SAND_TOKEN_PLUS);
+ case '/': return make_token(tokenizer, SAND_TOKEN_SLASH);
+ case '*': return make_token(tokenizer, SAND_TOKEN_STAR);
+ // One or two character tokens
+ case '!': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_BANG_EQUAL : SAND_TOKEN_BANG);
+ case '=': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_EQUAL_EQUAL : SAND_TOKEN_EQUAL);
+ case '<': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_LESS_EQUAL : SAND_TOKEN_LESS);
+ case '>': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_GREATER_EQUAL : SAND_TOKEN_GREATER);
+ // Literals
+ case '"': return parse_string(tokenizer);
+ case '0' ... '9': return parse_number(tokenizer);
+ default:
+ if (is_alpha(c)) { // Identifiers can contain alphanumeric characters, but can only start with alpha chars.
+ return parse_identifier_or_keyword(tokenizer);
+ } else {
+ printf("Unexpected: '%c'\n", c);
+ return make_error_token(tokenizer, "Unexpected character.");
+ }
+ }
+}
+
+const char *sand_token_kind_to_string(SandTokenKind kind) {
+ switch (kind) {
+#define RETURN_AS_STR(TOK) case TOK: return #TOK;
+SAND_EACH_TOKEN(RETURN_AS_STR)
+#undef RETURN_AS_STR
+ }
+ return "<invalid token kind>"; // Unreachable for valid kinds; avoids falling off a non-void function.
+}
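
The TODO on identifier_type above mentions a simpler linear table-scan; a
minimal sketch of that alternative (not part of this commit, with a
hypothetical name identifier_type_scan) that the trie could be benchmarked
against:

    #include "tokenizer.h"
    #include <string.h>

    static const struct { const char *word; SandTokenKind kind; } keywords[] = {
        {"and", SAND_TOKEN_AND},       {"else", SAND_TOKEN_ELSE},
        {"false", SAND_TOKEN_FALSE},   {"for", SAND_TOKEN_FOR},
        {"fun", SAND_TOKEN_FUN},       {"if", SAND_TOKEN_IF},
        {"nil", SAND_TOKEN_NIL},       {"or", SAND_TOKEN_OR},
        {"print", SAND_TOKEN_PRINT},   {"return", SAND_TOKEN_RETURN},
        {"true", SAND_TOKEN_TRUE},     {"var", SAND_TOKEN_VAR},
        {"while", SAND_TOKEN_WHILE},
    };

    static SandTokenKind identifier_type_scan(const SandTokenizer *tokenizer) {
        size_t length = (size_t)(tokenizer->current - tokenizer->start);
        for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
            // A keyword matches only if both length and contents agree.
            if (strlen(keywords[i].word) == length &&
                memcmp(tokenizer->start, keywords[i].word, length) == 0) {
                return keywords[i].kind;
            }
        }
        return SAND_TOKEN_IDENTIFIER;
    }
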
diff --git a/src/core/tokenizer.h b/src/core/tokenizer.h
new file mode 100644
index 0000000..6105c3e
--- /dev/null
+++ b/src/core/tokenizer.h
@@ -0,0 +1,93 @@
+#ifndef SAND_TOKENIZER_H
+#define SAND_TOKENIZER_H
+
+// This module defines the tokenizer. It takes in the raw source text and splits
+// it into tokens. Tokens (a.k.a. lexemes) are the smallest meaningful units of
+// the Sand language.
+//
+// The tokenizer should not need to do any allocation. The source and filename
+// just have to remain valid for as long as the tokens are in use.
+
+#include "location.h"
+
+#include <stddef.h>
+
+#define SAND_EACH_TOKEN(DO) \
+ /* Single-character tokens */ \
+ DO(SAND_TOKEN_LEFT_PAREN) \
+ DO(SAND_TOKEN_RIGHT_PAREN) \
+ DO(SAND_TOKEN_LEFT_BRACE) \
+ DO(SAND_TOKEN_RIGHT_BRACE) \
+ DO(SAND_TOKEN_COMMA) \
+ DO(SAND_TOKEN_DOT) \
+ DO(SAND_TOKEN_MINUS) \
+ DO(SAND_TOKEN_PLUS) \
+ DO(SAND_TOKEN_SEMICOLON) \
+ DO(SAND_TOKEN_SLASH) \
+ DO(SAND_TOKEN_STAR) \
+ /* One or two character tokens. */ \
+ DO(SAND_TOKEN_BANG) \
+ DO(SAND_TOKEN_BANG_EQUAL) \
+ DO(SAND_TOKEN_EQUAL) \
+ DO(SAND_TOKEN_EQUAL_EQUAL) \
+ DO(SAND_TOKEN_GREATER) \
+ DO(SAND_TOKEN_GREATER_EQUAL) \
+ DO(SAND_TOKEN_LESS) \
+ DO(SAND_TOKEN_LESS_EQUAL) \
+ /* Literals */ \
+ DO(SAND_TOKEN_IDENTIFIER) \
+ DO(SAND_TOKEN_STRING) \
+ DO(SAND_TOKEN_NUMBER) \
+ /* Keywords */ \
+ DO(SAND_TOKEN_AND) \
+ DO(SAND_TOKEN_ELSE) \
+ DO(SAND_TOKEN_FALSE) \
+ DO(SAND_TOKEN_FOR) \
+ DO(SAND_TOKEN_FUN) \
+ DO(SAND_TOKEN_IF) \
+ DO(SAND_TOKEN_NIL) \
+ DO(SAND_TOKEN_OR) \
+ DO(SAND_TOKEN_PRINT) \
+ DO(SAND_TOKEN_RETURN) \
+ DO(SAND_TOKEN_TRUE) \
+ DO(SAND_TOKEN_VAR) \
+ DO(SAND_TOKEN_WHILE) \
+ /* Special tokens */ \
+ DO(SAND_TOKEN_ERROR) \
+ DO(SAND_TOKEN_EOF)
+
+
+typedef enum {
+#define X(n) n,
+SAND_EACH_TOKEN(X)
+#undef X
+} SandTokenKind;
+
+typedef struct {
+ SandTokenKind kind;
+ const char *content;
+ size_t content_length;
+ SandLocation location;
+} SandToken;
+
+typedef struct {
+ const char *const filename;
+
+ const char *start;
+ unsigned start_line;
+ unsigned start_column;
+
+ const char *current;
+ unsigned current_line;
+ unsigned current_column;
+} SandTokenizer;
+
+// There is no corresponding destructor, as a tokenizer does not own any resources.
+SandTokenizer sand_create_tokenizer(const char *source, size_t source_length, const char *filename);
+
+SandToken sand_get_next_token(SandTokenizer *);
+
+// Returns the string representation of the token kind.
+const char *sand_token_kind_to_string(SandTokenKind);
+
+#endif
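
A minimal sketch of the intended usage (not part of this commit), pumping
tokens from a hypothetical source string until EOF:

    #include "tokenizer.h"

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const char *source = "var answer = 42;"; // Hypothetical input.
        SandTokenizer tokenizer = sand_create_tokenizer(source, strlen(source), "<example>");

        for (;;) {
            SandToken token = sand_get_next_token(&tokenizer);
            // Token content is a non-NUL-terminated slice of `source`, hence %.*s.
            printf("%-25s '%.*s'\n", sand_token_kind_to_string(token.kind),
                   (int)token.content_length, token.content);
            if (token.kind == SAND_TOKEN_EOF) {
                break;
            }
        }
        return 0;
    }
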
diff --git a/src/unit/main.c b/src/unit/main.c
index 32df795..2b5c1e1 100644
--- a/src/unit/main.c
+++ b/src/unit/main.c
@@ -3,6 +3,7 @@
SUITE_EXTERN(std_allocator);
SUITE_EXTERN(arena_allocator);
SUITE_EXTERN(page_allocator);
+SUITE_EXTERN(tokenizer);
GREATEST_MAIN_DEFS();
@@ -12,6 +13,7 @@ int main(int argc, char **argv) {
RUN_SUITE(std_allocator);
RUN_SUITE(arena_allocator);
RUN_SUITE(page_allocator);
+ RUN_SUITE(tokenizer);
GREATEST_MAIN_END(); /* display results */
}
diff --git a/src/unit/test_tokenizer.c b/src/unit/test_tokenizer.c
new file mode 100644
index 0000000..e3861b8
--- /dev/null
+++ b/src/unit/test_tokenizer.c
@@ -0,0 +1,233 @@
+#include "../core/tokenizer.h"
+
+#include "greatest.h"
+#include <string.h>
+
+#define STR(x) _STR(x)
+#define _STR(x) #x
+#define CREATE_TOKENIZER(src) sand_create_tokenizer(src, strlen(src), "<dummy file at " __FILE__ ":" STR(__LINE__) ">")
+
+TEST empty_source_gives_eof(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ // It should still be the case for subsequent calls.
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ PASS();
+}
+
+TEST single_char_tokens(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("( ) { } , . - + ; / *");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LEFT_PAREN);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_RIGHT_PAREN);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LEFT_BRACE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_RIGHT_BRACE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_COMMA);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_MINUS);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_PLUS);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_SEMICOLON);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_SLASH);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_STAR);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ PASS();
+}
+
+TEST one_or_two_char_tokens(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("! != = == > >= < <=");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EQUAL_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_GREATER);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_GREATER_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LESS);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LESS_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ PASS();
+}
+
+TEST comments_are_ignored(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER(".// This is a comment\n.");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+ // No comment node here!
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ PASS();
+}
+
+TEST literal_string(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"abc\\\"def\\nghi\"");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_STRING);
+ ASSERT_EQ(token.content_length, 15);
+ ASSERT_STRN_EQ(token.content, "\"abc\\\"def\\nghi\"", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST unfinished_literal_string_eof(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"abc");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+ ASSERT_STR_EQ(token.content, "Unexpected end-of-file inside string literal");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST unfinished_literal_string_eol(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"abc\n!");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+ ASSERT_STR_EQ(token.content, "Unexpected end-of-line inside string literal");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST unfinished_literal_string_escape_eol(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"\\\n!");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+ ASSERT_STR_EQ(token.content, "Unfinished escape inside string literal");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST unfinished_literal_string_escape_eof(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"\\");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+ ASSERT_STR_EQ(token.content, "Unfinished escape inside string literal");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST literal_number(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("123");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+ ASSERT_EQ(token.content_length, 3);
+ ASSERT_STRN_EQ(token.content, "123", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST fractional_literal_number(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("123.00000001");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+ ASSERT_EQ(token.content_length, 12);
+ ASSERT_STRN_EQ(token.content, "123.00000001", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST literal_number_followed_by_dot(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("123.");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+ ASSERT_EQ(token.content_length, 3);
+ ASSERT_STRN_EQ(token.content, "123", token.content_length);
+
+ ASSERT_EQm("Dot should not have been consumed", sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST identifiers(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_IDENTIFIER);
+ ASSERT_EQ(token.content_length, 62);
+ ASSERT_MEM_EQ(token.content, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST identifiers_cannot_start_with_number(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("123abc");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+ ASSERT_EQ(token.content_length, 3);
+ ASSERT_MEM_EQ(token.content, "123", token.content_length);
+
+ token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_IDENTIFIER);
+ ASSERT_EQ(token.content_length, 3);
+ ASSERT_MEM_EQ(token.content, "abc", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST keywords(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("and else false for fun "
+ "if nil or print return "
+ "true var while ");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_AND);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_ELSE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_FALSE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_FOR);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_FUN);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_IF);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_NIL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_OR);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_PRINT);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_RETURN);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_TRUE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_VAR);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_WHILE);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+SUITE(tokenizer) {
+ RUN_TEST(empty_source_gives_eof);
+ RUN_TEST(single_char_tokens);
+ RUN_TEST(one_or_two_char_tokens);
+ RUN_TEST(comments_are_ignored);
+ RUN_TEST(literal_string);
+ RUN_TEST(unfinished_literal_string_eof);
+ RUN_TEST(unfinished_literal_string_eol);
+ RUN_TEST(unfinished_literal_string_escape_eol);
+ RUN_TEST(unfinished_literal_string_escape_eof);
+ RUN_TEST(literal_number);
+ RUN_TEST(fractional_literal_number);
+ RUN_TEST(literal_number_followed_by_dot);
+ RUN_TEST(identifiers);
+ RUN_TEST(identifiers_cannot_start_with_number);
+ RUN_TEST(keywords);
+}