Diffstat (limited to 'src/unit/test_tokenizer.c')
-rw-r--r--   src/unit/test_tokenizer.c   233
1 file changed, 233 insertions(+), 0 deletions(-)
diff --git a/src/unit/test_tokenizer.c b/src/unit/test_tokenizer.c
new file mode 100644
index 0000000..e3861b8
--- /dev/null
+++ b/src/unit/test_tokenizer.c
@@ -0,0 +1,233 @@
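+// Unit tests for the Sand tokenizer: single- and two-character tokens,
+// comment skipping, string and number literals, identifiers, and keywords.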
+#include "../core/tokenizer.h"
+
+#include "greatest.h"
+#include <string.h>
+
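+// The STR() helper stringises the current line number so that CREATE_TOKENIZER
+// can label each tokenizer with a synthetic "<dummy file at FILE:LINE>" name,
+// making assertion failures point back at the test that created it.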
+#define STR(x) STR_IMPL(x)
+#define STR_IMPL(x) #x
+#define CREATE_TOKENIZER(src) sand_create_tokenizer(src, strlen(src), "<dummy file at " __FILE__ ":" STR(__LINE__) ">")
+
+TEST empty_source_gives_eof(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ // Subsequent calls should keep returning EOF.
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ PASS();
+}
+
+TEST single_char_tokens(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("( ) { } , . - + ; / *");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LEFT_PAREN);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_RIGHT_PAREN);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LEFT_BRACE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_RIGHT_BRACE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_COMMA);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_MINUS);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_PLUS);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_SEMICOLON);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_SLASH);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_STAR);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ PASS();
+}
+
+TEST one_or_two_char_tokens(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("! != = == > >= < <=");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EQUAL_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_GREATER);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_GREATER_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LESS);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_LESS_EQUAL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ PASS();
+}
+
+TEST comments_are_ignored(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER(".// This is a comment\n.");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+ // The comment should not produce a token of its own.
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+
+ PASS();
+}
+
+TEST literal_string(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"abc\\\"def\\nghi\"");
+
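+ // The token should hold the raw source text, escapes and quotes included:
+ // "abc\"def\nghi" is 15 characters.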
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_STRING);
+ ASSERT_EQ(token.content_length, 15);
+ ASSERT_STRN_EQ(token.content, "\"abc\\\"def\\nghi\"", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST unfinished_literal_string_eof(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"abc");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+ ASSERT_STR_EQ(token.content, "Unexpected end-of-file inside string literal");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST unfinished_literal_string_eol(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"abc\n!");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+ ASSERT_STR_EQ(token.content, "Unexpected end-of-line inside string literal");
+
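+ // The error should stop at the newline: the '!' on the next line is still tokenized.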
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST unfinished_literal_string_escape_eol(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"\\\n!");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+ ASSERT_STR_EQ(token.content, "Unfinished escape inside string literal");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_BANG);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST unfinished_literal_string_escape_eof(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("\"\\");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_ERROR);
+ ASSERT_STR_EQ(token.content, "Unfinished escape inside string literal");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST literal_number(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("123");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+ ASSERT_EQ(token.content_length, 3);
+ ASSERT_STRN_EQ(token.content, "123", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST fractional_literal_number(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("123.00000001");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+ ASSERT_EQ(token.content_length, 12);
+ ASSERT_STRN_EQ(token.content, "123.00000001", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST literal_number_followed_by_dot(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("123.");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+ ASSERT_EQ(token.content_length, 3);
+ ASSERT_STRN_EQ(token.content, "123", token.content_length);
+
+ ASSERT_EQm("Dot should not have been consumed", sand_get_next_token(&tokenizer).kind, SAND_TOKEN_DOT);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST identifiers(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_IDENTIFIER);
+ ASSERT_EQ(token.content_length, 62);
+ ASSERT_MEM_EQ(token.content, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST identifiers_cannot_start_with_number(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("123abc");
+
+ SandToken token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_NUMBER);
+ ASSERT_EQ(token.content_length, 3);
+ ASSERT_MEM_EQ(token.content, "123", token.content_length);
+
+ token = sand_get_next_token(&tokenizer);
+ ASSERT_EQ(token.kind, SAND_TOKEN_IDENTIFIER);
+ ASSERT_EQ(token.content_length, 3);
+ ASSERT_MEM_EQ(token.content, "abc", token.content_length);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+TEST keywords(void) {
+ SandTokenizer tokenizer = CREATE_TOKENIZER("and else false for fun "
+ "if nil or print return "
+ "true var while ");
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_AND);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_ELSE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_FALSE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_FOR);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_FUN);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_IF);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_NIL);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_OR);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_PRINT);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_RETURN);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_TRUE);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_VAR);
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_WHILE);
+
+ ASSERT_EQ(sand_get_next_token(&tokenizer).kind, SAND_TOKEN_EOF);
+ PASS();
+}
+
+SUITE(tokenizer) {
+ RUN_TEST(empty_source_gives_eof);
+ RUN_TEST(single_char_tokens);
+ RUN_TEST(one_or_two_char_tokens);
+ RUN_TEST(comments_are_ignored);
+ RUN_TEST(literal_string);
+ RUN_TEST(unfinished_literal_string_eof);
+ RUN_TEST(unfinished_literal_string_eol);
+ RUN_TEST(unfinished_literal_string_escape_eol);
+ RUN_TEST(unfinished_literal_string_escape_eof);
+ RUN_TEST(literal_number);
+ RUN_TEST(fractional_literal_number);
+ RUN_TEST(literal_number_followed_by_dot);
+ RUN_TEST(identifiers);
+ RUN_TEST(identifiers_cannot_start_with_number);
+ RUN_TEST(keywords);
+}