#include "tokenizer.h"

#include <assert.h>
#include <string.h>

// Creates a tokenizer positioned at the first character of `source`.
// `source` must be NUL-terminated and outlive the tokenizer; `filename` is
// only stored for use in token locations. Lines and columns are 0-based.
// NOTE(review): `source_length` is currently unused — termination is detected
// via the NUL byte (see the FIXME on is_at_end).
SandTokenizer sand_create_tokenizer(const char *source, size_t source_length, const char *filename) {
    (void) source_length;
    return (SandTokenizer) {
        .filename = filename,
        .start = source,
        .start_line = 0,
        .start_column = 0,
        .current = source,
        .current_line = 0,
        .current_column = 0,
    };
}

// True once every character of the source has been consumed.
// FIXME: This seems bad. In principle the NUL character should not be special.
// We're leaking our implementation language.
static bool is_at_end(const SandTokenizer *tokenizer) {
    return *tokenizer->current == '\0';
}

// Builds the location spanning the token currently being scanned
// (from the start marker up to, and excluding, the current position).
static SandLocation current_location(const SandTokenizer *tokenizer) {
    return (SandLocation) {
        .filename = tokenizer->filename,
        .start_column = tokenizer->start_column,
        .start_line = tokenizer->start_line,
        .end_column = tokenizer->current_column,
        .end_line = tokenizer->current_line,
    };
}

// Produces a token of the given kind whose content is the slice of source
// text between the tokenizer's start and current pointers.
static SandToken make_token(const SandTokenizer *tokenizer, SandTokenKind kind) {
    return (SandToken) {
        .kind = kind,
        .content = tokenizer->start,
        .content_length = (size_t) (tokenizer->current - tokenizer->start),
        .location = current_location(tokenizer),
    };
}

// Produces a SAND_TOKEN_ERROR token. Unlike regular tokens, its content
// points at the (static) error message rather than into the source text.
static SandToken make_error_token(const SandTokenizer *tokenizer, const char *message) {
    return (SandToken) {
        .kind = SAND_TOKEN_ERROR,
        .content = message,
        .content_length = strlen(message),
        .location = current_location(tokenizer),
    };
}

// Returns the next unconsumed character without consuming it
// ('\0' at end of input).
static char peek(const SandTokenizer *tokenizer) {
    return *tokenizer->current;
}

// Returns the character after the next one, or '\0' when that would read
// past the end of input.
static char peek_next(const SandTokenizer *tokenizer) {
    if (is_at_end(tokenizer)) {
        return '\0';
    } else {
        return tokenizer->current[1];
    }
}

// Consumes and returns one character, updating the column counter.
// Must not be called at end of input.
static char advance(SandTokenizer *tokenizer) {
    assert(!is_at_end(tokenizer));
    tokenizer->current += 1;
    tokenizer->current_column += 1;
    return tokenizer->current[-1];
}

// Consumes the next character only if it equals `expected`.
// Returns whether it did.
static bool match(SandTokenizer *tokenizer, char expected) {
    if (is_at_end(tokenizer)) {
        return false;
    } else if (peek(tokenizer) != expected) {
        return false;
    } else {
        advance(tokenizer);
        return true;
    }
}

// Skips whitespace and `//` line comments, keeping line/column bookkeeping
// in sync. Stops at the first significant character (or at end of input).
static void skip_whitespace(SandTokenizer *tokenizer) {
    while (true) {
        char c = peek(tokenizer);
        switch (c) {
        case '\n':
            // Consume the newline *before* resetting the position: advance()
            // bumps current_column, and doing it the other way around would
            // leave the new line starting at column 1 instead of column 0.
            advance(tokenizer);
            tokenizer->current_line += 1;
            tokenizer->current_column = 0;
            break;
        case ' ':
        case '\t':
        case '\r':
            advance(tokenizer);
            break;
        case '/':
            if (peek_next(tokenizer) == '/') {
                // We use peek to stop right before the newline, so the next
                // iteration of the loop will handle it properly.
                while (peek(tokenizer) != '\n' && !is_at_end(tokenizer)) {
                    advance(tokenizer);
                }
            } else {
                return;
            }
            break;
        default:
            // We've reached a non-whitespace character (or EOF) so we're done.
            return;
        }
    }
}

// Scans a string literal; the opening '"' has already been consumed.
// Recognizes the escapes \n \r \t \\ \" and reports errors for invalid or
// dangling escapes, unterminated strings, and embedded newlines.
static SandToken parse_string(SandTokenizer *tokenizer) {
    while (peek(tokenizer) != '"' && peek(tokenizer) != '\n' && !is_at_end(tokenizer)) {
        if (peek(tokenizer) == '\\') {
            switch (peek_next(tokenizer)) {
            case 'n':
            case 'r':
            case 't':
            case '\\':
            case '"':
                // Consume the backslash here; the escaped character is
                // consumed by the advance() at the bottom of the loop.
                advance(tokenizer);
                break;
            case '\n':
            case '\0':
                advance(tokenizer); // Eat dangling \ so it doesn't mess up the next token.
                return make_error_token(tokenizer, "Unfinished escape inside string literal");
            default:
                advance(tokenizer); // Eat whatever invalid character was there, so it
                                    // becomes part of the "erroneous" input pointed
                                    // to by the error token.
                return make_error_token(tokenizer, "Invalid escape inside string literal");
            }
        }
        advance(tokenizer);
    }
    if (peek(tokenizer) == '\n') {
        return make_error_token(tokenizer, "Unexpected end-of-line inside string literal");
    }
    if (is_at_end(tokenizer)) {
        return make_error_token(tokenizer, "Unexpected end-of-file inside string literal");
    }
    advance(tokenizer); // Eat closing ".
    return make_token(tokenizer, SAND_TOKEN_STRING);
}

static bool is_digit(char c) {
    return c >= '0' && c <= '9';
}

// Scans a number literal; the first digit has already been consumed.
// Accepts an optional fractional part, but only when at least one digit
// follows the decimal separator (so `1.` stays two tokens: NUMBER then DOT).
static SandToken parse_number(SandTokenizer *tokenizer) {
    while (is_digit(peek(tokenizer))) {
        advance(tokenizer);
    }
    // Optional fractional part.
    if (peek(tokenizer) == '.' && is_digit(peek_next(tokenizer))) {
        advance(tokenizer); // Eat decimal separator.
        while (is_digit(peek(tokenizer))) {
            advance(tokenizer);
        }
    }
    return make_token(tokenizer, SAND_TOKEN_NUMBER);
}

static bool is_alpha(char c) {
    return (c >= 'a' && c <= 'z')
        || (c >= 'A' && c <= 'Z')
        || c == '_';
}

// Compares the tail of the just-scanned identifier (from offset `start`,
// `length` bytes of `rest`) against a keyword suffix. Returns `kind` on an
// exact match, SAND_TOKEN_IDENTIFIER otherwise.
static SandTokenKind check_keyword(const SandTokenizer *tokenizer, size_t start, size_t length, const char *rest, SandTokenKind kind) {
    if ((size_t) (tokenizer->current - tokenizer->start) == start + length
        && memcmp(tokenizer->start + start, rest, length) == 0) {
        return kind;
    } else {
        return SAND_TOKEN_IDENTIFIER;
    }
}

// Called after we have consumed an identifier (i.e. the start and current
// fields of the tokenizer delimit the keyword/identifier) and returns either
// the corrent keyword type or SAND_TOKEN_IDENTIFIER otherwise.
// TODO: Benchmark this rather involved manual trie with the simpler linear
// table-scan approach. See if it's worth it.
static SandTokenKind identifier_type(const SandTokenizer *tokenizer) {
    switch (tokenizer->start[0]) {
    case 'a': return check_keyword(tokenizer, 1, 2, "nd", SAND_TOKEN_AND);
    case 'e': return check_keyword(tokenizer, 1, 3, "lse", SAND_TOKEN_ELSE);
    case 'f':
        if (tokenizer->current - tokenizer->start > 1) {
            switch (tokenizer->start[1]) {
            case 'a': return check_keyword(tokenizer, 2, 3, "lse", SAND_TOKEN_FALSE);
            case 'o': return check_keyword(tokenizer, 2, 1, "r", SAND_TOKEN_FOR);
            case 'u': return check_keyword(tokenizer, 2, 1, "n", SAND_TOKEN_FUN);
            }
        }
        break;
    case 'i': return check_keyword(tokenizer, 1, 1, "f", SAND_TOKEN_IF);
    case 'o': return check_keyword(tokenizer, 1, 1, "r", SAND_TOKEN_OR);
    case 'n': return check_keyword(tokenizer, 1, 2, "il", SAND_TOKEN_NIL);
    case 'p': return check_keyword(tokenizer, 1, 4, "rint", SAND_TOKEN_PRINT);
    case 'r': return check_keyword(tokenizer, 1, 5, "eturn", SAND_TOKEN_RETURN);
    case 't': return check_keyword(tokenizer, 1, 3, "rue", SAND_TOKEN_TRUE);
    case 'v': return check_keyword(tokenizer, 1, 2, "ar", SAND_TOKEN_VAR);
    case 'w': return check_keyword(tokenizer, 1, 4, "hile", SAND_TOKEN_WHILE);
    }
    return SAND_TOKEN_IDENTIFIER;
}

// Scans an identifier or keyword; the first (alpha) character has already
// been consumed.
static SandToken parse_identifier_or_keyword(SandTokenizer *tokenizer) {
    while (is_alpha(peek(tokenizer)) || is_digit(peek(tokenizer))) {
        advance(tokenizer);
    }
    return make_token(tokenizer, identifier_type(tokenizer));
}

// Scans and returns the next token. Returns SAND_TOKEN_EOF forever once the
// input is exhausted, and SAND_TOKEN_ERROR (with a message as content) on
// malformed input.
SandToken sand_get_next_token(SandTokenizer *tokenizer) {
    skip_whitespace(tokenizer);

    // The new token starts at the end of the previous one.
    tokenizer->start = tokenizer->current;
    tokenizer->start_column = tokenizer->current_column;
    tokenizer->start_line = tokenizer->current_line;

    if (is_at_end(tokenizer)) {
        return make_token(tokenizer, SAND_TOKEN_EOF);
    }

    char c = advance(tokenizer);
    switch (c) {
    // Single-character tokens
    case '(': return make_token(tokenizer, SAND_TOKEN_LEFT_PAREN);
    case ')': return make_token(tokenizer, SAND_TOKEN_RIGHT_PAREN);
    case '{': return make_token(tokenizer, SAND_TOKEN_LEFT_BRACE);
    case '}': return make_token(tokenizer, SAND_TOKEN_RIGHT_BRACE);
    case ';': return make_token(tokenizer, SAND_TOKEN_SEMICOLON);
    case ',': return make_token(tokenizer, SAND_TOKEN_COMMA);
    case '.': return make_token(tokenizer, SAND_TOKEN_DOT);
    case '-': return make_token(tokenizer, SAND_TOKEN_MINUS);
    case '+': return make_token(tokenizer, SAND_TOKEN_PLUS);
    case '/': return make_token(tokenizer, SAND_TOKEN_SLASH);
    case '*': return make_token(tokenizer, SAND_TOKEN_STAR);
    // One or two character tokens
    case '!': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_BANG_EQUAL : SAND_TOKEN_BANG);
    case '=': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_EQUAL_EQUAL : SAND_TOKEN_EQUAL);
    case '<': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_LESS_EQUAL : SAND_TOKEN_LESS);
    case '>': return make_token(tokenizer, match(tokenizer, '=') ? SAND_TOKEN_GREATER_EQUAL : SAND_TOKEN_GREATER);
    // Literals
    case '"': return parse_string(tokenizer);
    default:
        // Digits are handled here (instead of the GNU-only `case '0' ... '9':`
        // range extension) so the file compiles as standard C.
        if (is_digit(c)) {
            return parse_number(tokenizer);
        } else if (is_alpha(c)) {
            // Identifiers can contain alphanumeric characters, but can only
            // start with alpha chars.
            return parse_identifier_or_keyword(tokenizer);
        } else {
            return make_error_token(tokenizer, "Unexpected character.");
        }
    }
}

// Maps a token kind to its enumerator's name (e.g. "SAND_TOKEN_PLUS").
const char *sand_token_kind_to_string(SandTokenKind kind) {
    switch (kind) {
#define RETURN_AS_STR(TOK) case TOK: return #TOK;
    SAND_EACH_TOKEN(RETURN_AS_STR)
#undef RETURN_AS_STR
    }
    // Falling off the end of a value-returning function is undefined
    // behavior; cover out-of-range enum values explicitly.
    return "<invalid SandTokenKind>";
}