1 files changed, 93 insertions, 0 deletions
diff --git a/src/core/tokenizer.h b/src/core/tokenizer.h
new file mode 100644
index 0000000..6105c3e
--- /dev/null
+++ b/src/core/tokenizer.h
@@ -0,0 +1,93 @@
+#ifndef SAND_TOKENIZER_H
+#define SAND_TOKENIZER_H
+
+// This module defines the tokenizer. It takes in the raw source text and splits
+// it into tokens. Tokens (a.k.a. lexemes) are the smallest meaningful units of
+// the Sand language.
+//
+// The tokenizer should not need to do any allocation. The source and filename
+// just have to remain valid for as long as the tokens are in use.
+
+#include "location.h"
+
+#include <stddef.h>
+
+#define SAND_EACH_TOKEN(DO) /* X-macro: expands DO(kind) once per token kind, in the order listed */ \
+	/* Single-character tokens */       \
+	DO(SAND_TOKEN_LEFT_PAREN)           \
+	DO(SAND_TOKEN_RIGHT_PAREN)          \
+	DO(SAND_TOKEN_LEFT_BRACE)           \
+	DO(SAND_TOKEN_RIGHT_BRACE)          \
+	DO(SAND_TOKEN_COMMA)                \
+	DO(SAND_TOKEN_DOT)                  \
+	DO(SAND_TOKEN_MINUS)                \
+	DO(SAND_TOKEN_PLUS)                 \
+	DO(SAND_TOKEN_SEMICOLON)            \
+	DO(SAND_TOKEN_SLASH)                \
+	DO(SAND_TOKEN_STAR)                 \
+	/* One- or two-character tokens */  \
+	DO(SAND_TOKEN_BANG)                 \
+	DO(SAND_TOKEN_BANG_EQUAL)           \
+	DO(SAND_TOKEN_EQUAL)                \
+	DO(SAND_TOKEN_EQUAL_EQUAL)          \
+	DO(SAND_TOKEN_GREATER)              \
+	DO(SAND_TOKEN_GREATER_EQUAL)        \
+	DO(SAND_TOKEN_LESS)                 \
+	DO(SAND_TOKEN_LESS_EQUAL)           \
+	/* Literals */                      \
+	DO(SAND_TOKEN_IDENTIFIER)           \
+	DO(SAND_TOKEN_STRING)               \
+	DO(SAND_TOKEN_NUMBER)               \
+	/* Keywords */                      \
+	DO(SAND_TOKEN_AND)                  \
+	DO(SAND_TOKEN_ELSE)                 \
+	DO(SAND_TOKEN_FALSE)                \
+	DO(SAND_TOKEN_FOR)                  \
+	DO(SAND_TOKEN_FUN)                  \
+	DO(SAND_TOKEN_IF)                   \
+	DO(SAND_TOKEN_NIL)                  \
+	DO(SAND_TOKEN_OR)                   \
+	DO(SAND_TOKEN_PRINT)                \
+	DO(SAND_TOKEN_RETURN)               \
+	DO(SAND_TOKEN_TRUE)                 \
+	DO(SAND_TOKEN_VAR)                  \
+	DO(SAND_TOKEN_WHILE)                \
+	/* Special tokens */                \
+	DO(SAND_TOKEN_ERROR)                \
+	DO(SAND_TOKEN_EOF)
+
+
+typedef enum { // Every kind of token; the enumerators are generated from SAND_EACH_TOKEN.
+#define SAND_DECLARE_TOKEN_KIND(kind) kind, /* one enumerator per list entry */
+SAND_EACH_TOKEN(SAND_DECLARE_TOKEN_KIND)
+#undef SAND_DECLARE_TOKEN_KIND
+} SandTokenKind;
+
+typedef struct { // A single token, as returned by sand_get_next_token().
+	SandTokenKind kind; // Which kind of token this is.
+	const char *content; // Token text. NOTE(review): presumably points into the source buffer and is not NUL-terminated — confirm.
+	size_t content_length; // Length of `content`.
+	SandLocation location; // Type declared in location.h. NOTE(review): presumably where the token appears in the source — confirm.
+} SandToken;
+
+typedef struct { // Tokenizer state. It owns no resources, so there is no destructor.
+	const char *const filename; // Const member: fixed at initialization, and it makes the whole struct non-assignable.
+
+	const char *start; // NOTE(review): presumably the start of the token currently being scanned — confirm.
+	unsigned start_line; // Line paired with `start`.
+	unsigned start_column; // Column paired with `start`.
+
+	const char *current; // NOTE(review): presumably the next unread character — confirm.
+	unsigned current_line; // Line paired with `current`.
+	unsigned current_column; // Column paired with `current`.
+} SandTokenizer; // NOTE(review): no end-of-source field, although sand_create_tokenizer takes source_length — confirm how end of input is detected.
+
+// Creates a tokenizer for `source`; `source` and `filename` must stay valid while its tokens are in use. No destructor: it owns no resources.
+SandTokenizer sand_create_tokenizer(const char *source, size_t source_length, const char *filename);
+
+SandToken sand_get_next_token(SandTokenizer *tokenizer); // Returns the next token from `tokenizer`. NOTE(review): presumably advances its state — confirm.
+
+// Returns the string representation of the token kind `kind`.
+const char *sand_token_kind_to_string(SandTokenKind kind);
+
+#endif