author    Linnnus <[email protected]>    2025-04-10 05:21:36 +0000
committer Linnnus <[email protected]>    2025-04-15 00:54:43 +0000
commit    7bd2461c849b4626653a4744427c904b87354bd7 (patch)
tree      38ee0e0ac9408c989f3c50c0c6ff28492faf497d
parent    8dd0c4f27aae02dd60f029db4cf03f9902cba26f (diff)
feat(core): Add tokenizer
-rw-r--r--  src/core/location.c       |   5
-rw-r--r--  src/core/location.h       |  33
-rw-r--r--  src/core/tokenizer.c      | 279
-rw-r--r--  src/core/tokenizer.h      |  93
-rw-r--r--  src/unit/main.c           |   2
-rw-r--r--  src/unit/test_tokenizer.c | 233
6 files changed, 645 insertions, 0 deletions
diff --git a/src/core/location.c b/src/core/location.c
new file mode 100644
index 0000000..9f8de09
--- /dev/null
+++ b/src/core/location.c
@@ -0,0 +1,5 @@
+#include "location.h"
+
+void sand_print_location(FILE *stream, const SandLocation *location) {
+    fprintf(stream, "%s:%u:%u", location->filename, location->start_line + 1, location->start_column + 1);
+}
diff --git a/src/core/location.h b/src/core/location.h
new file mode 100644
index 0000000..c1ca3b8
--- /dev/null
+++ b/src/core/location.h
@@ -0,0 +1,33 @@
+#ifndef SAND_LOCATION_H
+#define SAND_LOCATION_H
+
+// This module defines the location type. It represents a reference to a span
+// of source text. Locations are carried throughout the compiler so error
+// messages can make useful references to the user's source code.
+
+#include <stddef.h>
+#include <stdio.h>
+
+// Uniquely, the fields of this struct should be considered public, though
+// still read-only.
+//
+// Internally, regardless of presentation, line and column numbers are
+// 0-indexed for consistency.
+typedef struct {
+    const char *filename;
+    unsigned start_line;
+    unsigned start_column;
+    unsigned end_line;
+    unsigned end_column;
+} SandLocation;
+
+// Construct a new location which minimally encompasses both `a` and `b`.
+// `a` must start before `b` ends.
+// `a` and `b` must have the same `filename` (i.e. they obviously can't cross file boundaries).
+SandLocation sand_location_encompassing(const SandLocation *a, const SandLocation *b);
+
+// Print the location to the given stream. The line and column are converted to 1-indexed form for display.
+// The output will not contain newlines.
+void sand_print_location(FILE *, const SandLocation *);
+
+#endif
diff --git a/src/core/tokenizer.c b/src/core/tokenizer.c
new file mode 100644
index 0000000..2fc366b
--- /dev/null
+++ b/src/core/tokenizer.c
@@ -0,0 +1,279 @@
+#include "tokenizer.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+
+SandTokenizer sand_create_tokenizer(const char *source, size_t source_length, const char *filename) {
+    return (SandTokenizer) {
+        .filename = filename,
+
+        .start = source,
+        .start_line = 0,
+        .start_column = 0,
+
+        .current = source,
+        .current_line = 0,
+        .current_column = 0,
+    };
+}
+
+static bool is_at_end(const SandTokenizer *tokenizer) {
+    // FIXME: This seems bad. In principle the NUL character should not be special. We're leaking our implementation language.
+    return *tokenizer->current == '\0';
+}
+
+static SandToken make_token(const SandTokenizer *tokenizer, SandTokenKind kind) {
+    SandLocation location = (SandLocation) {
+        .filename = tokenizer->filename,
+        .start_column = tokenizer->start_column,
+        .start_line = tokenizer->start_line,
+        .end_column = tokenizer->current_column,
+        .end_line = tokenizer->current_line,
+    };
+
+    return (SandToken) {
+        .kind = kind,
+        .content = tokenizer->start,
+        .content_length = tokenizer->current - tokenizer->start,
+        .location = location,
+    };
+}
+
+static SandToken make_error_token(const SandTokenizer *tokenizer, const char *message) {
+    SandLocation location = (SandLocation) {
+        .filename = tokenizer->filename,
+        .start_column = tokenizer->start_column,
+        .start_line = tokenizer->start_line,
+        .end_column = tokenizer->current_column,
+        .end_line = tokenizer->current_line,
+    };
+
+    return (SandToken) {
+        .kind = SAND_TOKEN_ERROR,
+        .content = message,
+        .content_length = strlen(message),
+        .location = location,
+    };
+}
+
+static char peek(const SandTokenizer *tokenizer) {
+    return *tokenizer->current;
+}
+
+static char peek_next(const SandTokenizer *tokenizer) {
+    if (is_at_end(tokenizer)) {
+        return '\0';
+    } else {
+        return tokenizer->current[1];
+    }
+}
+
+static char advance(SandTokenizer *tokenizer) {
+    assert(!is_at_end(tokenizer));
+    tokenizer->current += 1;
+    tokenizer->current_column += 1;
+    return tokenizer->current[-1];
+}
+
+static bool match(SandTokenizer *tokenizer, char expected) {
+    if (is_at_end(tokenizer)) {
+        return false;
+    } else if (peek(tokenizer) != expected) {
+        return false;
+    } else {
+        advance(tokenizer);
+        return true;
+    }
+}
+
+static void skip_whitespace(SandTokenizer *tokenizer) {
+    while (true) {
+        char c = peek(tokenizer);
+        switch (c) {
+        case '\n':
+        case ' ':
+        case '\t':
+        case '\r':
+            if (advance(tokenizer) == '\n') { // Reset the position *after* advancing, as advance() bumps the column.
+                tokenizer->current_line += 1;
+                tokenizer->current_column = 0;
+            }
+            break;
+        case '/':
+            if (peek_next(tokenizer) == '/') {
+                // We use peek to stop right before the newline, so the next iteration of the loop will handle it properly.
+                while (peek(tokenizer) != '\n' && !is_at_end(tokenizer)) {
+                    advance(tokenizer);
+                }
+            } else {
+                return;
+            }
+            break;
+        default: // We've reached a non-whitespace character (or EOF) so we're done.
+            return;
+        }
+    }
+}
+
+static SandToken parse_string(SandTokenizer *tokenizer) {
+    while (peek(tokenizer) != '"' && peek(tokenizer) != '\n' && !is_at_end(tokenizer)) {
+        if (peek(tokenizer) == '\\') {
+            switch (peek_next(tokenizer)) {
+            case 'n':
+            case 'r':
+            case 't':
+            case '\\':
+            case '"':
+                advance(tokenizer);
+                break;
+            case '\n':
+            case '\0':
+                advance(tokenizer); // Eat dangling \ so it doesn't mess up the next token.
+                return make_error_token(tokenizer, "Unfinished escape inside string literal");
+            default:
+                advance(tokenizer); // Eat whatever invalid character was there, so it
+                                    // becomes part of the "erroneous" input pointed
+                                    // to by the error token.
+                return make_error_token(tokenizer, "Invalid escape inside string literal");
+            }
+        }
+        advance(tokenizer);
+    }
+
+    if (peek(tokenizer) == '\n') {
+        return make_error_token(tokenizer, "Unexpected end-of-line inside string literal");
+    }
+    if (is_at_end(tokenizer)) {
+        return make_error_token(tokenizer, "Unexpected end-of-file inside string literal");
+    }
+
+    advance(tokenizer); // Eat closing ".
+    return make_token(tokenizer, SAND_TOKEN_STRING);
+}
+
+static bool is_digit(char c) {
+    return c >= '0' && c <= '9';
+}
+
+static SandToken parse_number(SandTokenizer *tokenizer) {
+    while (is_digit(peek(tokenizer))) {
+        advance(tokenizer);
+    }
+
+    // Optional fractional part.
+    if (peek(tokenizer) == '.' && is_digit(peek_next(tokenizer))) {
+        advance(tokenizer); // Eat decimal separator.
+        while (is_digit(peek(tokenizer))) {
+            advance(tokenizer);
+        }
+    }
+
+    return make_token(tokenizer, SAND_TOKEN_NUMBER);
+}
+
+static bool is_alpha(char c) {
+    return (c >= 'a' && c <= 'z') ||
+           (c >= 'A' && c <= 'Z') ||
+           c == '_';
+}
+
+static SandTokenKind check_keyword(SandTokenizer *tokenizer, size_t start, size_t length,
+                                   const char *rest, SandTokenKind kind) {
+    if (tokenizer->current - tokenizer->start == start + length &&
+        memcmp(tokenizer->start + start, rest, length) == 0) {
+        return kind;
+    } else {
+        return SAND_TOKEN_IDENTIFIER;
+    }
+}
+
+// Called after we have consumed an identifier (i.e. the start and current
+// fields of the tokenizer delimit the keyword/identifier). Returns the
+// matching keyword kind, or SAND_TOKEN_IDENTIFIER if it is not a keyword.
+// TODO: Benchmark this rather involved manual trie against the simpler linear table-scan approach. See if it's worth it.
+static SandTokenKind identifier_type(SandTokenizer *tokenizer) {
+    switch (tokenizer->start[0]) {
+    case 'a': return check_keyword(tokenizer, 1, 2, "nd", SAND_TOKEN_AND);
+    case 'e': return check_keyword(tokenizer, 1, 3, "lse", SAND_TOKEN_ELSE);
+    case 'f':
+        if (tokenizer->current - tokenizer->start > 1) {
+            switch (tokenizer->start[1]) {
+            case 'a': return check_keyword(tokenizer, 2, 3, "lse", SAND_TOKEN_FALSE);
+            case 'o': return check_keyword(tokenizer, 2, 1, "r", SAND_TOKEN_FOR);
+            case 'u': return check_keyword(tokenizer, 2, 1, "n", SAND_TOKEN_FUN);
+            }
+        }
+        break;
+    case 'i': return check_keyword(tokenizer, 1, 1, "f", SAND_TOKEN_IF);
+    case 'o': return check_keyword(tokenizer, 1, 1, "r", SAND_TOKEN_OR);
+    case 'n': return check_keyword(tokenizer, 1, 2, "il", SAND_TOKEN_NIL);
+    case 'p': return check_keyword(tokenizer, 1, 4, "rint", SAND_TOKEN_PRINT);
+    case 'r': return check_keyword(tokenizer, 1, 5, "eturn", SAND_TOKEN_RETURN);
+    case 't': return check_keyword(tokenizer, 1, 3, "rue", SAND_TOKEN_TRUE);
+    case 'v': return check_keyword(tokenizer, 1, 2, "ar", SAND_TOKEN_VAR);
+    case 'w': return check_keyword(tokenizer, 1, 4, "hile", SAND_TOKEN_WHILE);
+    }
+
+    return SAND_TOKEN_IDENTIFIER;
+}
+
+static SandToken parse_identifier_or_keyword(SandTokenizer *tokenizer) {
+    while (is_alpha(peek(tokenizer)) || is_digit(peek(tokenizer))) {
+        advance(tokenizer);
+    }
+
+    return make_token(tokenizer, identifier_type(tokenizer));
+}
+
+SandToken sand_get_next_token(SandTokenizer *tokenizer) {
+    skip_whitespace(tokenizer);
+
+    // The new token starts at the end of the previous one.
+    tokenizer->start = tokenizer->current;
+    tokenizer->start_column = tokenizer->current_column;
+    tokenizer->start_line = tokenizer->current_line;
+
+    if (is_at_end(tokenizer)) {
+        return make_token(tokenizer, SAND_TOKEN_EOF);
+    }
+
+    char c = advance(tokenizer);
+    switch (c) {
+    // Single-character tokens
+    case '(': return make_token(tokenizer, SAND_TOKEN_LEFT_PAREN);
+    case ')': return make_token(tokenizer, SAND_TOKEN_RIGHT_PAREN);
+    case '{': return make_token(tokenizer, SAND_TOKEN_LEFT_BRACE);
+    case '}': return make_token(tokenizer, SAND_TOKEN_RIGHT_BRACE);
+    case ';': return make_token(tokenizer, SAND_TOKEN_SEMICOLON);
+    case ',': return make_token(tokenizer, SAND_TOKEN_COMMA);
+    case '.': return make_token(tokenizer, SAND_TOKEN_DOT);
+    case '-': return make_token(tokenizer, SAND_TOKEN_MINUS);
+    case '+': return make_token(tokenizer, SAND_TOKEN_PLUS);
+    case '/': return make_token(tokenizer, SAND_TOKEN_SLASH);
+    case '*': return make_token(tokenizer, SAND_TOKEN_STAR);
+    // One or two character tokens
+    case '!': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_BANG_EQUAL : SAND_TOKEN_BANG);
+    case '=': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_EQUAL_EQUAL : SAND_TOKEN_EQUAL);
+    case '<': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_LESS_EQUAL : SAND_TOKEN_LESS);
+    case '>': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_GREATER_EQUAL : SAND_TOKEN_GREATER);
+    // Literals
+    case '"': return parse_string(tokenizer);
+    default:
+        if (is_digit(c)) { // Numbers are handled here rather than with a GNU-style case range, to stay within standard C.
+            return parse_number(tokenizer);
+        } else if (is_alpha(c)) { // Identifiers can contain alphanumeric characters, but can only start with alpha chars.
+            return parse_identifier_or_keyword(tokenizer);
+        } else {
+            return make_error_token(tokenizer, "Unexpected character.");
+        }
+    }
+}
+
+const char *sand_token_kind_to_string(SandTokenKind kind) {
+    switch (kind) {
+#define RETURN_AS_STR(TOK) case TOK: return #TOK;
+SAND_EACH_TOKEN(RETURN_AS_STR)
+#undef RETURN_AS_STR
+    }
+}
diff --git a/src/core/tokenizer.h b/src/core/tokenizer.h
new file mode 100644
index 0000000..6105c3e
--- /dev/null
+++ b/src/core/tokenizer.h
@@ -0,0 +1,93 @@
+#ifndef SAND_TOKENIZER_H
+#define SAND_TOKENIZER_H
+
+// This module defines the tokenizer. It takes in the raw source text and splits
+// it into tokens. Tokens (a.k.a. lexemes) are the smallest meaningful units of
+// the Sand language.
+//
+// The tokenizer should not need to do any allocation. The source and filename
+// just have to remain valid for as long as the tokens are in use.
+
+#include "location.h"
+
+#include <stddef.h>
+
+#define SAND_EACH_TOKEN(DO) \
+    /* Single-character tokens */ \
+    DO(SAND_TOKEN_LEFT_PAREN) \
+    DO(SAND_TOKEN_RIGHT_PAREN) \
+    DO(SAND_TOKEN_LEFT_BRACE) \
+    DO(SAND_TOKEN_RIGHT_BRACE) \
+    DO(SAND_TOKEN_COMMA) \
+    DO(SAND_TOKEN_DOT) \
+    DO(SAND_TOKEN_MINUS) \
+    DO(SAND_TOKEN_PLUS) \
+    DO(SAND_TOKEN_SEMICOLON) \
+    DO(SAND_TOKEN_SLASH) \
+    DO(SAND_TOKEN_STAR) \
+    /* One or two character tokens */ \
+    DO(SAND_TOKEN_BANG) \
+    DO(SAND_TOKEN_BANG_EQUAL) \
+    DO(SAND_TOKEN_EQUAL) \
+    DO(SAND_TOKEN_EQUAL_EQUAL) \
+    DO(SAND_TOKEN_GREATER) \
+    DO(SAND_TOKEN_GREATER_EQUAL) \
+    DO(SAND_TOKEN_LESS) \
+    DO(SAND_TOKEN_LESS_EQUAL) \
+    /* Literals */ \
+    DO(SAND_TOKEN_IDENTIFIER) \
+    DO(SAND_TOKEN_STRING) \
+    DO(SAND_TOKEN_NUMBER) \
+    /* Keywords */ \
+    DO(SAND_TOKEN_AND) \
+    DO(SAND_TOKEN_ELSE) \
+    DO(SAND_TOKEN_FALSE) \
+    DO(SAND_TOKEN_FOR) \
+    DO(SAND_TOKEN_FUN) \
+    DO(SAND_TOKEN_IF) \
+    DO(SAND_TOKEN_NIL) \
+    DO(SAND_TOKEN_OR) \
+    DO(SAND_TOKEN_PRINT) \
+    DO(SAND_TOKEN_RETURN) \
+    DO(SAND_TOKEN_TRUE) \
+    DO(SAND_TOKEN_VAR) \
+    DO(SAND_TOKEN_WHILE) \
+    /* Special tokens */ \
+    DO(SAND_TOKEN_ERROR) \
+    DO(SAND_TOKEN_EOF)
+
+
+typedef enum {
+#define X(n) n,
+SAND_EACH_TOKEN(X)
+#undef X
+} SandTokenKind;
+
+typedef struct {
+    SandTokenKind kind;
+    const char *content;
+    size_t content_length;
+    SandLocation location;
+} SandToken;
+
+typedef struct {
+    const char *const filename;
+
+    const char *start;
+    unsigned start_line;
+    unsigned start_column;
+
+    const char *current;
+    unsigned current_line;
+    unsigned current_column;
+} SandTokenizer;
+
+// There is no corresponding destructor, as a tokenizer does not own any resources.
+SandTokenizer sand_create_tokenizer(const char *source, size_t source_length, const char *filename);
+
+SandToken sand_get_next_token(SandTokenizer *);
+
+// Returns the string representation of the token kind.
+const char *sand_token_kind_to_string(SandTokenKind);
+
+#endif
diff --git a/src/unit/main.c b/src/unit/main.c
index 32df795..2b5c1e1 100644
--- a/src/unit/main.c
+++ b/src/unit/main.c
@@ -3,6 +3,7 @@
 SUITE_EXTERN(std_allocator);
 SUITE_EXTERN(arena_allocator);
 SUITE_EXTERN(page_allocator);
+SUITE_EXTERN(tokenizer);
 
 GREATEST_MAIN_DEFS();
 
@@ -12,6 +13,7 @@ int main(int argc, char **argv) {
     RUN_SUITE(std_allocator);
     RUN_SUITE(arena_allocator);
     RUN_SUITE(page_allocator);
+    RUN_SUITE(tokenizer);
 
     GREATEST_MAIN_END(); /* display results */
 }
diff --git a/src/unit/test_tokenizer.c b/src/unit/test_tokenizer.c
new file mode 100644
index 0000000..e3861b8
--- /dev/null
+++ b/src/unit/test_tokenizer.c
@@ -0,0 +1,233 @@
+#include "../core/tokenizer.h"
+
+#include "greatest.h"
+#include <string.h>
+
+#define STR(x) _STR(x)
+#define _STR(x) #x
+#define CREATE_TOKENIZER(src) sand_create_tokenizer(src, strlen(src), "<dummy file at " __FILE__ ":" STR(__LINE__) ">")
+
+TEST empty_source_gives_eof(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("");
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+    // It should still be the case for subsequent calls.
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+    PASS();
+}
+
+TEST single_char_tokens(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("( ) { } , . - + ; / *");
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LEFT_PAREN);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_RIGHT_PAREN);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LEFT_BRACE);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_RIGHT_BRACE);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_COMMA);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_MINUS);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_PLUS);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_SEMICOLON);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_SLASH);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_STAR);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+    PASS();
+}
+
+TEST one_or_two_char_tokens(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("! != = == > >= < <=");
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG_EQUAL);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EQUAL);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EQUAL_EQUAL);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_GREATER);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_GREATER_EQUAL);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LESS);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LESS_EQUAL);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+    PASS();
+}
+
+TEST comments_are_ignored(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER(".// This is a comment\n.");
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+    // No comment token here!
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+    PASS();
+}
+
+TEST literal_string(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("\"abc\\\"def\\nghi\"");
+
+    SandToken token = sand_get_next_token(&tokenizer);
+    ASSERT_EQ(token.kind, SAND_TOKEN_STRING);
+    ASSERT_EQ(token.content_length, 15);
+    ASSERT_STRN_EQ(token.content, "\"abc\\\"def\\nghi\"", token.content_length);
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    PASS();
+}
+
+TEST unfinished_literal_string_eof(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("\"abc");
+
+    SandToken token = sand_get_next_token(&tokenizer);
+    ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+    ASSERT_STR_EQ(token.content, "Unexpected end-of-file inside string literal");
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    PASS();
+}
+
+TEST unfinished_literal_string_eol(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("\"abc\n!");
+
+    SandToken token = sand_get_next_token(&tokenizer);
+    ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+    ASSERT_STR_EQ(token.content, "Unexpected end-of-line inside string literal");
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    PASS();
+}
+
+TEST unfinished_literal_string_escape_eol(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("\"\\\n!");
+
+    SandToken token = sand_get_next_token(&tokenizer);
+    ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+    ASSERT_STR_EQ(token.content, "Unfinished escape inside string literal");
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    PASS();
+}
+
+TEST unfinished_literal_string_escape_eof(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("\"\\");
+
+    SandToken token = sand_get_next_token(&tokenizer);
+    ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+    ASSERT_STR_EQ(token.content, "Unfinished escape inside string literal");
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    PASS();
+}
+
+TEST literal_number(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("123");
+
+    SandToken token = sand_get_next_token(&tokenizer);
+    ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+    ASSERT_EQ(token.content_length, 3);
+    ASSERT_STRN_EQ(token.content, "123", token.content_length);
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    PASS();
+}
+
+TEST fractional_literal_number(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("123.00000001");
+
+    SandToken token = sand_get_next_token(&tokenizer);
+    ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+    ASSERT_EQ(token.content_length, 12);
+    ASSERT_STRN_EQ(token.content, "123.00000001", token.content_length);
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    PASS();
+}
+
+TEST literal_number_followed_by_dot(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("123.");
+
+    SandToken token = sand_get_next_token(&tokenizer);
+    ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+    ASSERT_EQ(token.content_length, 3);
+    ASSERT_STRN_EQ(token.content, "123", token.content_length);
+
+    ASSERT_EQm("Dot should not have been consumed", sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    PASS();
+}
+
+TEST identifiers(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ012456789");
+
+    SandToken token = sand_get_next_token(&tokenizer);
+    ASSERT_EQ(token.kind, SAND_TOKEN_IDENTIFIER);
+    ASSERT_EQ(token.content_length, 61);
+    ASSERT_MEM_EQ(token.content, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ012456789", token.content_length);
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    PASS();
+}
+
+TEST identifiers_cannot_start_with_number(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("123abc");
+
+    SandToken token = sand_get_next_token(&tokenizer);
+    ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+    ASSERT_EQ(token.content_length, 3);
+    ASSERT_MEM_EQ(token.content, "123", token.content_length);
+
+    token = sand_get_next_token(&tokenizer);
+    ASSERT_EQ(token.kind, SAND_TOKEN_IDENTIFIER);
+    ASSERT_EQ(token.content_length, 3);
+    ASSERT_MEM_EQ(token.content, "abc", token.content_length);
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    PASS();
+}
+
+TEST keywords(void) {
+    SandTokenizer tokenizer = CREATE_TOKENIZER("and else false for fun "
+                                               "if nil or print return "
+                                               "true var while ");
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_AND);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_ELSE);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_FALSE);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_FOR);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_FUN);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_IF);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_NIL);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_OR);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_PRINT);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_RETURN);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_TRUE);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_VAR);
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_WHILE);
+
+    ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+    PASS();
+}
+
+SUITE(tokenizer) {
+    RUN_TEST(empty_source_gives_eof);
+    RUN_TEST(single_char_tokens);
+    RUN_TEST(one_or_two_char_tokens);
+    RUN_TEST(comments_are_ignored);
+    RUN_TEST(literal_string);
+    RUN_TEST(unfinished_literal_string_eof);
+    RUN_TEST(unfinished_literal_string_eol);
+    RUN_TEST(unfinished_literal_string_escape_eol);
+    RUN_TEST(unfinished_literal_string_escape_eof);
+    RUN_TEST(literal_number);
+    RUN_TEST(fractional_literal_number);
+    RUN_TEST(literal_number_followed_by_dot);
+    RUN_TEST(identifiers);
+    RUN_TEST(identifiers_cannot_start_with_number);
+    RUN_TEST(keywords);
+}
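
For context, a minimal sketch of how the API added in this commit is meant to be driven. This driver is not part of the patch; the sample source string and the "<example>" filename are invented for illustration.

// Not part of the commit: a minimal driver for the new tokenizer API.
// Uses only declarations from tokenizer.h and location.h above.
#include "core/tokenizer.h"

#include <stdio.h>
#include <string.h>

int main(void) {
    const char *source = "var answer = 42; // trailing comment";
    SandTokenizer tokenizer = sand_create_tokenizer(source, strlen(source), "<example>");

    // sand_get_next_token keeps returning SAND_TOKEN_EOF once the input is
    // exhausted (see empty_source_gives_eof above), so this loop terminates.
    for (;;) {
        SandToken token = sand_get_next_token(&tokenizer);
        sand_print_location(stdout, &token.location);
        printf(": %s '%.*s'\n", sand_token_kind_to_string(token.kind),
               (int)token.content_length, token.content);
        if (token.kind == SAND_TOKEN_EOF) {
            break;
        }
    }
}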
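
Note that location.h declares sand_location_encompassing but this patch does not define it. Purely as an illustration of the documented contract (same filename, `a` starts before `b` ends), one possible implementation might look like the following; the helper position_before is invented here, and this is not the author's definition:

// Illustrative only: one way to satisfy the contract documented in location.h.
#include "location.h"

#include <assert.h>
#include <stdbool.h>
#include <string.h>

// Returns true if position (line_a, col_a) comes strictly before (line_b, col_b).
static bool position_before(unsigned line_a, unsigned col_a, unsigned line_b, unsigned col_b) {
    return line_a < line_b || (line_a == line_b && col_a < col_b);
}

SandLocation sand_location_encompassing(const SandLocation *a, const SandLocation *b) {
    assert(strcmp(a->filename, b->filename) == 0); // Locations can't cross file boundaries.

    SandLocation result = *a;
    // Take the earlier start of the two...
    if (position_before(b->start_line, b->start_column, result.start_line, result.start_column)) {
        result.start_line = b->start_line;
        result.start_column = b->start_column;
    }
    // ...and the later end, so the result minimally covers both spans.
    if (position_before(result.end_line, result.end_column, b->end_line, b->end_column)) {
        result.end_line = b->end_line;
        result.end_column = b->end_column;
    }
    return result;
}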