From 1ad63a50dcf01171a3e7b9a04b37d36933a769ef Mon Sep 17 00:00:00 2001
From: Linnnus <linnnus@users.noreply.github.com>
Date: Fri, 16 Feb 2024 12:23:27 +0100
Subject: feat(creole): Support raw URLs

---
 src/creole-test.c | 42 ++++++++++++++++++++++++------------------
 src/creole.c      | 44 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 67 insertions(+), 19 deletions(-)

(limited to 'src')
diff --git a/src/creole-test.c b/src/creole-test.c
index c5b6590..8844a88 100644
--- a/src/creole-test.c
+++ b/src/creole-test.c
@@ -90,6 +90,30 @@ struct {
 		.input   =  "[[]]",
 		.output  =  "<p><a href=\"\"></a></p>"
 	},
+	{
+		.name    =  "Raw HTTP URL",
+		.input   =  "Here is a http://example.com/examplepage link.",
+		.output  =  "<p>Here is a <a href=\"http://example.com/examplepage\">"
+		            "http://example.com/examplepage</a> link.</p>"
+	},
+	{ // This is interesting because it doesn't contain a "://".
+		.name    =  "Raw mailto URL",
+		.input   =  "mailto:quandale@dingle.com",
+		.output  =  "<p><a href=\"mailto:quandale@dingle.com\">"
+		            "mailto:quandale@dingle.com</a></p>"
+	},
+	{
+		.name    =  "Unnamed URL",
+		.input   =  "[[http //example.com/examplepage]]",
+		.output  =  "<p><a href=\"http //example.com/examplepage\">"
+		            "http //example.com/examplepage</a></p>"
+	},
+	{
+		.name    =  "Named URL",
+		.input   =  "[[http //example.com/examplepage|Example Page]]",
+		.output  =  "<p>"
+		            "<a href=\"http //example.com/examplepage\">Example Page</a></p>"
+	},
 #if 0
 	{
 		.name    =  "Simple unordered list",
@@ -148,24 +172,6 @@ struct {
 		            "<tr><td> <em>C</em> </td>"
 		            "<td> <strong>D</strong> <br /> E </td></tr></table>"
 	},
-	{
-		.name    =  "Raw URL",
-		.input   =  "http //example.com/examplepage",
-		.output  =  "<p><a href=\"http //example.com/examplepage\">"
-		            "http //example.com/examplepage</a></p>"
-	},
-	{
-		.name    =  "Unnamed URL",
-		.input   =  "[[http //example.com/examplepage]]",
-		.output  =  "<p><a href=\"http //example.com/examplepage\">"
-		            "http //example.com/examplepage</a></p>"
-	},
-	{
-		.name    =  "Named URL",
-		.input   =  "[[http //example.com/examplepage|Example Page]]",
-		.output  =  "<p>"
-		            "<a href=\"http //example.com/examplepage\">Example Page</a></p>"
-	},
 	{
 		.name    =  "Image",
 		.input   =  "{{image.gif|my image}}",
diff --git a/src/creole.c b/src/creole.c
index 79a767d..107b50b 100644
--- a/src/creole.c
+++ b/src/creole.c
@@ -17,6 +17,7 @@ int do_headers(const char *begin, const char *end, bool new_block, FILE *out);
 int do_paragraph(const char *begin, const char *end, bool new_block, FILE *out);
 int do_replacements(const char *begin, const char *end, bool new_block, FILE *out);
 int do_link(const char *begin, const char *end, bool new_block, FILE *out);
+int do_raw_url(const char *begin, const char *end, bool new_block, FILE *out);
 
 // Prints string escaped.
 void hprint(FILE *out, const char *begin, const char *end) {
@@ -41,7 +42,7 @@ void hprint(FILE *out, const char *begin, const char *end) {
 // The sign of the return value determines whether a new block should begin, after the consumed text.
 typedef int (* parser_t)(const char *begin, const char *end, bool new_block, FILE *out);
 
-static parser_t parsers[] = { do_headers, do_paragraph, do_link, do_replacements };
+static parser_t parsers[] = { do_headers, do_paragraph, do_link, do_raw_url, do_replacements };
 
 int do_headers(const char *begin, const char *end, bool new_block, FILE *out) {
 	if (!new_block) { // Headers are block-level elements.
@@ -180,6 +181,47 @@ int do_link(const char *begin, const char *end, bool new_block, FILE *out)
 	return stop - start + 4 /* [[]] */;
 }
 
+// Recognises a raw URL (scheme, ":", then at least one non-whitespace byte) at `begin` and
+// writes it to `out` as an <a> element via hprint. Returns bytes consumed, or 0 on no match.
+int do_raw_url(const char *begin, const char *end, bool new_block, FILE *out)
+{
+	// Eat a scheme followed by a ":". Here are the relevant rules from RFC 3986.
+	// - URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+	// - scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+	// See: <https://www.rfc-editor.org/rfc/rfc3986#section-3.1>
+	const char *p = begin;
+	if (p >= end || !isalpha((unsigned char)*p)) { // <ctype.h> needs unsigned char values.
+		return 0;
+	}
+	while (p < end && (isalnum((unsigned char)*p) || *p == '+' || *p == '-' || *p == '.')) {
+		p += 1;
+	}
+	if (p >= end || p[0] != ':') {
+		return 0;
+	}
+	p += 1;
+
+	// Eat the remainder of the URI: everything up to whitespace (a good enough heuristic).
+	const char *q = p;
+	while (q < end && !isspace((unsigned char)*q)) {
+		q += 1;
+	}
+
+	// If nothing follows the colon, don't accept it as a raw URL. Otherwise we'd
+	// incorrectly find a link with the "said" scheme here: "And he said: blah blah".
+	if (q == p) {
+		return 0;
+	}
+
+	fputs("<a href=\"", out);
+	hprint(out, begin, q);
+	fputs("\">", out);
+	hprint(out, begin, q);
+	fputs("</a>", out);
+
+	return q - begin;
+}
+
 void process(const char *begin, const char *end, bool new_block, FILE *out) {
 	const char *p = begin;
 	while (p < end) {
-- 
cgit v1.2.3