/*
 * NOTE(review): everything from here to the end of docodefence() looks like
 * it went through a lossy text extraction: line breaks are collapsed, the
 * header names after #include are missing, and the markup that the string
 * literals presumably contained is gone (several literals now span lines).
 * Confirm against the upstream file before building. The code below is
 * left untouched.
 */
/* smu - simple markup * Copyright (C) <2007, 2008> Enno Boland * 2019-2022 Karl Bartel * 2022 bzt * * See LICENSE for further informations */ #include #include #include #include #include #include #define LENGTH(x) sizeof(x)/sizeof(x[0]) #define ADDC(b,i) if (i % BUFSIZ == 0) { b = realloc(b, (i + BUFSIZ) * sizeof(char)); if (!b) eprint("Malloc failed."); } b[i] typedef int (*Parser)(const char *, const char *, int); typedef struct { char *search; int process; char *before, *after; } Tag; static int docomment(const char *begin, const char *end, int newblock); /* Parser for html-comments */ static int docodefence(const char *begin, const char *end, int newblock); /* Parser for code fences */ static int dohtml(const char *begin, const char *end, int newblock); /* Parser for html */ static int dolineprefix(const char *begin, const char *end, int newblock);/* Parser for line prefix tags */ static int dolink(const char *begin, const char *end, int newblock); /* Parser for links and images */ static int dolist(const char *begin, const char *end, int newblock); /* Parser for lists */ static int dotable(const char *begin, const char *end, int newblock); /* Parser for tables */ static int doparagraph(const char *begin, const char *end, int newblock); /* Parser for paragraphs */ static int doreplace(const char *begin, const char *end, int newblock); /* Parser for simple replaces */ static int doshortlink(const char *begin, const char *end, int newblock); /* Parser for links and images */ static int dosurround(const char *begin, const char *end, int newblock); /* Parser for surrounding tags */ static int dounderline(const char *begin, const char *end, int newblock); /* Parser for underline tags */ static void *ereallocz(void *p, size_t size); static void hprint(const char *begin, const char *end); /* escapes HTML and prints it to output */ static void process(const char *begin, const char *end, int isblock); /* Processes range between begin and end. 
*/ /* list of parsers */ static Parser parsers[] = { dounderline, docomment, docodefence, dolineprefix, dolist, dotable, doparagraph, dosurround, dolink, doshortlink, dohtml, doreplace }; static int nohtml = 0; static int in_paragraph = 0; regex_t p_end_regex; /* End of paragraph */ static Tag lineprefix[] = { { " ", 0, "

", "\n

" }, { "\t", 0, "

", "\n

" }, { ">", 2, "

", "

" }, { "###### ", 1, "

", "

" }, { "##### ", 1, "

", "

" }, { "#### ", 1, "

", "

" }, { "### ", 1, "

", "

" }, { "## ", 1, "

", "

" }, { "# ", 1, "

", "

" }, { "- - -\n", 1, "

", ""}, { "---\n", 1, "

", ""}, }; static Tag underline[] = { { "=", 1, "

", "

\n" }, { "-", 1, "

", "

\n" }, }; static Tag surround[] = { { "```", 0, "", "" }, { "``", 0, "", "" }, { "`", 0, "", "" }, { "___", 1, "", "" }, { "***", 1, "", "" }, { "__", 1, "", "" }, { "**", 1, "", "" }, { "_", 1, "", "" }, { "*", 1, "", "" }, }; static const char *replace[][2] = { /* Backslash escapes */ { "\\\\", "\\" }, { "\\`", "`" }, { "\\*", "*" }, { "\\_", "_" }, { "\\{", "{" }, { "\\}", "}" }, { "\\[", "[" }, { "\\]", "]" }, { "\$", "(" }, { "\$", ")" }, { "\\#", "#" }, { "\\+", "+" }, { "\\-", "-" }, { "\\.", "." }, { "\\!", "!" }, { "\\\"", """ }, { "\\$", "$" }, { "\\%", "%" }, { "\\&", "&" }, { "\\'", "'" }, { "\\,", "," }, { "\\-", "-" }, { "\\.", "." }, { "\\/", "/" }, { "\\:", ":" }, { "\\;", ";" }, { "\\<", "<" }, { "\\>", ">" }, { "\\=", "=" }, { "\\?", "?" }, { "\\@", "@" }, { "\\^", "^" }, { "\\|", "|" }, { "\\~", "~" }, /* HTML syntax symbols that need to be turned into entities */ { "<", "<" }, { ">", ">" }, { "&", "&" }, /* Avoid replacing the & in & */ { "&", "&" }, /* Preserve newlines with two spaces before linebreak */ { " \n", "
\n" }, }; static const char *code_fence = "```"; void eprint(const char *format, ...) { va_list ap; va_start(ap, format); vfprintf(stderr, format, ap); va_end(ap); exit(EXIT_FAILURE); } void end_paragraph(void) { if (in_paragraph) { fputs("

\n", stdout); in_paragraph = 0; } } int docomment(const char *begin, const char *end, int newblock) { char *p; if (nohtml || strncmp(""); if (!p || p + 3 >= end) return 0; fprintf(stdout, "%.*s\n", (int)(p + 3 - begin), begin); return (p + 3 - begin) * (newblock ? -1 : 1); } int docodefence(const char *begin, const char *end, int newblock) { const char *p, *start, *stop, *lang_start, *lang_stop; unsigned int l = strlen(code_fence); if (!newblock) return 0; if (strncmp(begin, code_fence, l) != 0) return 0; /* Find start of content and read language string */ start = begin + l; lang_start = start; while (start[0] != '\n') start++; lang_stop = start; start++; /* Find end of fence */ p = start - 1; do { stop = p; p = strstr(p + 1, code_fence); } while (p && p[-1] == '\\'); if (p && p[-1] != '\\') stop = p; /* No closing code fence means the rest of file is code (CommonMark) */ if (!p) stop = end; /* Print output */ if (lang_start == lang_stop) { fputs("

", stdout);
	} else {
		/* NOTE(review): lang_start/lang_stop are not used in this visible
		 * branch; the opening tag that carried the language string was
		 * presumably lost together with the literal's content. */
		fputs("", stdout);
	}
	/* The fenced content itself goes through hprint(), not process(). */
	hprint(start, stop);
	fputs("\n", stdout);
	/* Negative result: process() starts a new block after a negative
	 * return. The length covers the opening fence, the content and l more
	 * bytes for the closing fence. */
	return -(stop - begin + l);
}

/*
 * Parser for html.
 *
 * Declines (returns 0) when nohtml is set, when fewer than three bytes are
 * left in [begin, end), or when the text does not start with '<' followed
 * by a letter. On the success paths visible here the matched bytes are
 * copied from begin to stdout unchanged and their count is returned.
 * newblock is not referenced in the visible code.
 */
int
dohtml(const char *begin, const char *end, int newblock) {
	const char *p, *tag, *tagend;

	if (nohtml || begin + 2 >= end)
		return 0;
	p = begin;
	if (p[0] != '<' || !isalpha(p[1]))
		return 0;
	p++;
	tag = p;
	/* The tag name is the run of alphanumeric characters after '<'. */
	for (; isalnum(*p) && p < end; p++);
	tagend = p;
	if (p > end || tag == tagend)
		return 0;
	/* NOTE(review): the next line is truncated in this copy (the string
	 * literal and the rest of the loop header are missing). Presumably the
	 * loop searched for the closing tag that matches tag..tagend; confirm
	 * against the upstream source. */
	while ((p = strstr(p, "') {
			p++;
			fwrite(begin, sizeof(char), p - begin + tagend - tag, stdout);
			return p - begin + tagend - tag;
		}
	}
	/* Fallback: pass the text through up to the first '>' after the tag
	 * name. The length p - begin + 2 also covers the byte following that
	 * '>'. */
	p = strchr(tagend, '>');
	if (p) {
		fwrite(begin, sizeof(char), p - begin + 2, stdout);
		return p - begin + 2;
	}
	else
		return 0;
}

/*
 * Parser for line prefix tags (the entries of lineprefix[]).
 *
 * begin, end: range to examine.
 * newblock:   non-zero when begin is the start of a block. Otherwise a
 *             prefix is only recognised directly after a '\n' at *begin,
 *             and that newline counts as consumed input.
 *
 * Returns 0 when no prefix matches. For a prefix whose search string ends
 * in '\n' the result is the positive number of bytes consumed, not counting
 * that newline. For every other prefix the result is the negated number of
 * bytes consumed.
 */
int
dolineprefix(const char *begin, const char *end, int newblock) {
	unsigned int i, j, l;
	char *buffer;
	const char *p;
	int consumed_input = 0;

	if (newblock)
		p = begin;
	else if (*begin == '\n') {
		p = begin + 1;
		consumed_input += 1;
	} else
		return 0;
	for (i = 0; i < LENGTH(lineprefix); i++) {
		l = strlen(lineprefix[i].search);
		if (end - p + 1 < l)
			continue;
		if (strncmp(lineprefix[i].search, p, l))
			continue;
		if (*begin == '\n')
			fputc('\n', stdout);

		/* All line prefixes add a block element. These are not allowed
		 * inside paragraphs, so we must end the paragraph first. */
		end_paragraph();

		fputs(lineprefix[i].before, stdout);
		/* A search string ending in '\n' is a complete line on its own:
		 * there is no content to collect. */
		if (lineprefix[i].search[l-1] == '\n') {
			fputc('\n', stdout);
			return l - 1 + consumed_input;
		}
		if (!(buffer = malloc(BUFSIZ)))
			eprint("Malloc failed.");
		buffer[0] = '\0';

		/* Collect lines into buffer while they start with the prefix */
		j = 0;
		while ((strncmp(lineprefix[i].search, p, l) == 0) && p + l < end) {
			p += l;

			/* Special case for blockquotes: optional space after > */
			if (lineprefix[i].search[0] == '>' && *p == ' ') {
				p++;
			}

			while (p < end) {
				ADDC(buffer, j) = *p;
				j++;
				if (*(p++) == '\n')
					break;
			}
		}

		/* Drop trailing newlines from the collected text. The j > 0
		 * guard matters when nothing was collected (prefix at the very
		 * end of the input): without it buffer[-1] would be read and j
		 * would wrap around. */
		while (j > 0 && buffer[j - 1] == '\n') {
			j--;
		}

		ADDC(buffer, j) = '\0';
		/* process == 0: print escaped; 1: inline parsing; >= 2: parse the
		 * collected text as a block of its own. */
		if (lineprefix[i].process)
			process(buffer, buffer + strlen(buffer), lineprefix[i].process >= 2);
		else
			hprint(buffer, buffer + strlen(buffer));
		puts(lineprefix[i].after);
		free(buffer);
		return -(p - begin);
	}
	return 0;
}

/*
 * Parser for links and images.
 *
 * Recognises "[desc](target)" and "![desc](target)", where target may be
 * followed by a title in single or double quotes and may be wrapped in
 * angle brackets. Returns 0 when the text at begin is not such a
 * construct, otherwise the number of bytes from begin through the closing
 * parenthesis. newblock is not referenced.
 */
int
dolink(const char *begin, const char *end, int newblock) {
	int img, len, sep, parens_depth = 1;
	const char *desc, *link, *p, *q, *descend, *linkend;
	const char *title = NULL, *titleend = NULL;

	if (*begin == '[')
		img = 0;
	else if (strncmp(begin, "![", 2) == 0)
		img = 1;
	else
		return 0;
	/* The description starts after "[" or "![". */
	p = desc = begin + 1 + img;
	if (!(p = strstr(desc, "](")) || p > end)
		return 0;
	/* For every "![" found before the current "](", move on to the next
	 * "](" (presumably so that images nested in the description do not
	 * end it early). */
	for (q = strstr(desc, "!["); q && q < end && q < p; q = strstr(q + 1, "!["))
		if (!(p = strstr(p + 1, "](")) || p > end)
			return 0;
	descend = p;
	link = p + 2;

	/* find end of link while handling nested parens */
	q = link;
	while (parens_depth) {
		if (!(q = strpbrk(q, "()")) || q > end)
			return 0;
		if (*q == '(')
			parens_depth++;
		else
			parens_depth--;
		if (parens_depth && q < end)
			q++;
	}

	/* From here on q points at the closing ')'. A quote character between
	 * link and q starts the title. */
	if ((p = strpbrk(link, "\"'")) && p < end && q > p) {
		sep = p[0]; /* separator: can be " or ' */
		title = p + 1;
		/* strip trailing whitespace */
		for (linkend = p; linkend > link && isspace(*(linkend - 1)); linkend--);
		for (titleend = q - 1; titleend > link && isspace(*(titleend)); titleend--);
		/* The title must be closed by the same quote that opened it. */
		if (titleend < title || *titleend != sep) {
			return 0;
		}
	}
	else {
		linkend = q;
	}

	/* Links can be given in angular brackets */
	if (*link == '<' && *(linkend - 1) == '>') {
		link++;
		linkend--;
	}

	len = q + 1 - begin;
	/* NOTE(review): link, linkend, title and titleend are computed above but
	 * never printed in the visible code below; the statements that emitted
	 * them appear to have been lost along with the content of the string
	 * literals. Confirm against the upstream source. */
	if (img) {
		fputs("", stdout);
	}
	else {
		fputs("", stdout);
		/* The description of a link is parsed for inline markup. */
		process(desc, descend, 0);
		fputs("", stdout);
	}
	return len;
}

/*
 * Parser for lists.
 *
 * A list starts at a block start, or right after a '\n' at *begin, with
 * '-', '*' or '+', or with digits followed by '.', and then at least one
 * space or tab. Each item is collected into a heap buffer and handed to
 * process(). Returns 0 when no list starts here, otherwise the negated
 * number of bytes consumed.
 *
 * NOTE(review): several output literals below are empty or only "\n" in
 * this copy; the list markup they presumably carried is missing.
 */
int
dolist(const char *begin, const char *end, int newblock) {
	unsigned int i, j, indent, run, isblock, start_number;
	const char *p, *q, *num_start;
	char *buffer = NULL;
	char marker = '\0';  /* Bullet symbol, or \0 for numbered lists */

	isblock = 0;
	if (newblock)
		p = begin;
	else if (*begin == '\n')
		p = begin + 1;
	else
		return 0;
	q = p;
	if (*p == '-' || *p == '*' || *p == '+') {
		marker = *p;
	} else {
		/* Numbered item: digits followed by '.'. start_number is only
		 * assigned (and only read) on this path. */
		num_start = p;
		for (; p < end && *p >= '0' && *p <= '9'; p++);
		if (p >= end || *p != '.')
			return 0;
		start_number = atoi(num_start);
	}
	p++;
	if (p >= end || !(*p == ' ' || *p == '\t'))
		return 0;

	end_paragraph();

	/* indent = width of the marker plus the whitespace that follows it;
	 * continuation lines must be indented by the same amount. */
	for (p++; p != end && (*p == ' ' || *p == '\t'); p++);
	indent = p - q;
	buffer = ereallocz(buffer, BUFSIZ);
	if (!newblock)
		fputc('\n', stdout);

	if (marker) {
		fputs("\n", stdout);
	} else if (start_number == 1) {
		fputs("\n", stdout);
	} else {
		printf("\n", start_number);
	}
	run = 1;
	/* Outer loop: one iteration per list item. Inner loop: copy the
	 * item's text into buffer, i being the write position. */
	for (; p < end && run; p++) {
		for (i = 0; p < end && run; p++, i++) {
			if (*p == '\n') {
				if (p + 1 == end)
					break;
				else {
					/* Handle empty lines */
					for (q = p + 1; (*q == ' ' || *q == '\t') && q < end; q++);
					if (*q == '\n') {
						ADDC(buffer, i) = '\n';
						i++;
						run = 0;
						isblock++;
						p = q;
					}
				}
				/* Look at the start of the next line: j counts the
				 * bytes taken by an item marker plus indentation. */
				q = p + 1;
				j = 0;
				if (marker && *q == marker)
					j = 1;
				else {
					for (; q + j != end && q[j] >= '0' && q[j] <= '9' && j < indent; j++);
					if (q + j == end)
						break;
					if (j > 0 && q[j] == '.')
						j++;
					else
						j = 0;
				}
				if (q + indent < end)
					for (; (q[j] == ' ' || q[j] == '\t') && j < indent; j++);
				if (j == indent) {
					/* The line belongs to the list: whitespace at its
					 * start continues the current item, anything else
					 * is a new item marker and ends this item. */
					ADDC(buffer, i) = '\n';
					i++;
					p += indent;
					run = 1;
					if (*q == ' ' || *q == '\t')
						p++;
					else
						break;
				}
				else if (j < indent)
					run = 0;
			}
			ADDC(buffer, i) = *p;
		}
		ADDC(buffer, i) = '\0';
		fputs("", stdout);
		/* The item is parsed as a block when blank lines were seen. */
		process(buffer, buffer + i, isblock > 1 || (isblock == 1 && run));
		fputs("\n", stdout);
	}
	fputs(marker ? "\n" : "\n", stdout);
	free(buffer);
	/* Step back over trailing newlines so they are not counted as
	 * consumed. */
	p--;
	while (*(--p) == '\n');
	return -(p - begin + 1);
}

/*
 * Parser for tables.
 *
 * Called at every '|'. The table, row and cell state lives in function-level
 * static variables, so it carries over from one call to the next. Returns
 * 0 when *begin is not '|', otherwise the positive number of bytes
 * consumed. newblock is not referenced.
 *
 * NOTE(review): the format strings below are empty in this copy although
 * arguments are still passed to fprintf(); the cell/row markup they
 * presumably contained is missing.
 */
int
dotable(const char *begin, const char *end, int newblock) {
	/* table state */
	/* intable: 0 = no table, 1 = set when a table is opened, 2 = header
	 *          row closed (the alignment row comes next), 3 = after it.
	 * inrow:   -1 = header row, 1 = other row, 0 = not in a row.
	 * incell:  number of cells opened in the current row. */
	static signed char intable, inrow, incell;
	/* Two bits per column; the value indexes align_table. */
	static unsigned long int calign;
	static const char *align_table[] = {
		"",
		" style=\"text-align: left\"",
		" style=\"text-align: right\"",
		" style=\"text-align: center\"",
	};

	const char *p;
	/* l starts out as the number of two-bit slots in calign. */
	int i, l = (int)sizeof(calign) * 4;

	if(*begin != '|')
		return 0;
	if (intable == 2) { /* in alignment row, skip it. */
		++intable;
		for (p = begin; p < end && *p != '\n'; ++p);
		return p - begin + 1;
	}
	if(inrow && (begin + 1 >= end || begin[1] == '\n')) {       /* close cell and row and if ends, table too */
		fprintf(stdout, "", inrow == -1 ? 'h' : 'd');
		if (inrow == -1)
			intable = 2;
		inrow = 0;
		if(end - begin <= 2 || begin[2] == '\n') {
			intable = 0;
			fputs("\n\n", stdout);
		}
		return 1;
	}

	if(!intable) {                                              /* open table */
		intable = 1; inrow = -1; incell = 0; calign = 0;
		for (p = begin; p < end && *p != '\n'; ++p);
		if(*p == '\n') { /* load alignment from 2nd line */
			for(i = -1, ++p; p < end && *p != '\n'; p++) {
				if(*p == '|') {
					i++;
					do { p++; } while(p < end && (*p == ' ' || *p == '\t'));
					/* ':' as the first non-blank of a column
					 * sets the low bit (left). */
					if(i < l && *p == ':')
						calign |= 1ul << (i * 2);
					if (*p == '\n')
						break;
				} else if(i < l && *p == ':') {
					/* Any other ':' in the column sets the high
					 * bit (right); both bits select center. */
					calign |= 1ul << (i * 2 + 1);
				}
			}
		}
		fputs("\n", stdout);
	}
	if(!inrow) {                                                /* open row */
		inrow = 1; incell = 0;
		fputs("", stdout);
	}
	if(incell)                                                  /* close cell */
		fprintf(stdout, "", inrow == -1 ? 'h' : 'd');
	/* From here on l is reused as the index into align_table. */
	l = incell < l ? (calign >> (incell * 2)) & 3 : 0;          /* open cell */
	fprintf(stdout, "", inrow == -1 ? 'h' : 'd', align_table[l]);
	incell++;
	/* Consume the '|' and the spaces that follow it. */
	for(p = begin + 1; p < end && *p == ' '; p++);
	return p - begin;
}

/*
 * Parser for paragraphs.
 *
 * Only acts at the start of a block (newblock != 0). The paragraph runs
 * from begin up to the first match of p_end_regex after the first byte, or
 * up to end when there is no match. Its content is handed to process() as
 * inline text, and in_paragraph is set for the duration so that block
 * parsers can close the paragraph early through end_paragraph().
 *
 * Returns 0 when newblock is zero, otherwise the negated number of bytes
 * consumed.
 */
int
doparagraph(const char *begin, const char *end, int newblock) {
	const char *p;
	regmatch_t match;

	if (!newblock)
		return 0;
	if (regexec(&p_end_regex, begin + 1, 1, &match, 0)) {
		p = end;
	} else {
		p = begin + 1 + match.rm_so;
		/* regexec() scans up to the terminating NUL and knows nothing
		 * about end. A match beyond end must not extend the paragraph
		 * past the range we were given. */
		if (p > end)
			p = end;
	}

	fputs("", stdout);
	in_paragraph = 1;
	process(begin, p, 0);
	end_paragraph();

	return -(p - begin);
}

/*
 * Parser for simple replaces.
 *
 * Walks the replace[] table in order and, for the first entry whose search
 * text is found at begin, prints the replacement text to stdout.
 * Returns the length of the matched search text, or 0 when no entry
 * matches. newblock is not used.
 */
int
doreplace(const char *begin, const char *end, int newblock) {
	unsigned int idx = 0;

	while (idx < LENGTH(replace)) {
		const char *needle = replace[idx][0];
		unsigned int needle_len = strlen(needle);

		/* Entries longer than the remaining input cannot match. */
		if (!(end - begin < needle_len)
		    && strncmp(needle, begin, needle_len) == 0) {
			fputs(replace[idx][1], stdout);
			return needle_len;
		}
		idx++;
	}
	return 0;
}

/*
 * Parser for short links of the form "<...>".
 *
 * Scans from the byte after '<' to the matching '>'. Whitespace before
 * the '>' means this is not a short link. Returns 0 when nothing was
 * recognised, otherwise the number of bytes through the closing '>'.
 * newblock is not referenced.
 */
int
doshortlink(const char *begin, const char *end, int newblock) {
	const char *p, *c;
	/* 0: neither '#', ':' nor '@' seen yet; 1: '@' seen first (treated
	 * as a mail address); -1: '#' or ':' seen. */
	int ismail = 0;

	if (*begin != '<')
		return 0;
	for (p = begin + 1; p != end; p++) {
		switch (*p) {
		case ' ':
		case '\t':
		case '\n':
			return 0;
		case '#':
		case ':':
			ismail = -1;
			break;
		case '@':
			if (ismail == 0)
				ismail = 1;
			break;
		case '>':
			/* Without '#', ':' or '@' this is not a short link. */
			if (ismail == 0)
				return 0;
			/* NOTE(review): the opening of the branch below is missing
			 * in this copy (the loop's indentation and the dangling
			 * "} else {" suggest a lost "if" on ismail); confirm
			 * against the upstream source. The loop prints every
			 * character up to '>' as a numeric character reference. */
			fputs("", stdout);
				for (c = begin + 1; *c != '>'; c++)
					fprintf(stdout, "&#%u;", *c);
			}
			else {
				/* The target is printed twice through hprint(): once
				 * before the "\">" literal and once after it. */
				hprint(begin + 1, p);
				fputs("\">", stdout);
				hprint(begin + 1, p);
			}
			fputs("", stdout);
			return p - begin + 1;
		}
	}
	return 0;
}

/*
 * Parser for surrounding tags (the entries of surround[]).
 *
 * For the first table entry whose delimiter is found at begin and that has
 * an unescaped closing delimiter before end, prints the entry's "before"
 * text, the enclosed content (parsed through process() or escaped through
 * hprint(), depending on the entry's process flag) and the "after" text.
 *
 * Returns the number of bytes consumed, including both delimiters, or 0
 * when nothing matched. newblock is not used.
 */
int
dosurround(const char *begin, const char *end, int newblock) {
	unsigned int i, l;
	const char *p, *start, *stop;

	for (i = 0; i < LENGTH(surround); i++) {
		l = strlen(surround[i].search);
		if (end - begin < 2*l || strncmp(begin, surround[i].search, l) != 0)
			continue;
		start = begin + l;

		/* Look for the closing delimiter, skipping occurrences that
		 * are preceded by a backslash. */
		p = start;
		do {
			p = strstr(p + 1, surround[i].search);
		} while (p && p[-1] == '\\');

		/* No unescaped closing delimiter inside the range: this is not
		 * a surround. Accepting it anyway would print an empty tag
		 * pair and swallow the bytes after the opening delimiter. */
		if (!p || p >= end)
			continue;
		stop = p;
		fputs(surround[i].before, stdout);

		/* Single space at start and end are ignored */
		if (start[0] == ' ' && stop[-1] == ' ' && start < stop - 1) {
			start++;
			stop--;
			l++;
		}

		if (surround[i].process)
			process(start, stop, 0);
		else
			hprint(start, stop);
		fputs(surround[i].after, stdout);
		return stop - start + 2 * l;
	}
	return 0;
}

/*
 * Parser for underline tags (the entries of underline[]).
 *
 * At the start of a block, checks whether the first line is followed by a
 * line that begins with at least three copies of an entry's marker
 * character. If so, prints the first line wrapped in that entry's
 * before/after text and returns the negated number of bytes consumed
 * (first line, its newline and the marker run). Returns 0 otherwise.
 */
int
dounderline(const char *begin, const char *end, int newblock) {
	unsigned int title_len = 0, entry, run;
	const char *second;

	if (!newblock)
		return 0;

	/* Length of the first line, without its newline. */
	while (begin + title_len != end && begin[title_len] != '\n')
		title_len++;
	if (title_len == 0)
		return 0;

	second = begin + title_len + 1;
	for (entry = 0; entry < LENGTH(underline); entry++) {
		const char mark = underline[entry].search[0];

		/* Count how many marker characters open the second line. */
		run = 0;
		while (second + run < end && second[run] != '\n' && second[run] == mark)
			run++;
		if (run < 3)
			continue;

		fputs(underline[entry].before, stdout);
		if (underline[entry].process)
			process(begin, begin + title_len, 0);
		else
			hprint(begin, begin + title_len);
		fputs(underline[entry].after, stdout);
		return -(run + second - begin);
	}
	return 0;
}

/*
 * realloc() wrapper that does not return on failure.
 *
 * p, size: passed to realloc() unchanged.
 * Returns the pointer obtained from realloc(). When realloc() yields NULL
 * the failure is reported through eprint(), which terminates the program.
 */
void *
ereallocz(void *p, size_t size) {
	void *grown = realloc(p, size);

	if (grown == NULL)
		eprint("realloc: %zu bytes\n", size);
	return grown;
}

/*
 * Escapes HTML and prints it to stdout.
 *
 * Writes the bytes in [begin, end) to stdout, replacing the characters
 * that are special in HTML text and attribute values by entities:
 * '&' -> "&amp;", '"' -> "&quot;", '>' -> "&gt;", '<' -> "&lt;".
 * All other bytes are written unchanged.
 */
void
hprint(const char *begin, const char *end) {
	const char *p;

	for (p = begin; p != end; p++) {
		switch (*p) {
		case '&':
			fputs("&amp;", stdout);
			break;
		case '"':
			fputs("&quot;", stdout);
			break;
		case '>':
			fputs("&gt;", stdout);
			break;
		case '<':
			fputs("&lt;", stdout);
			break;
		default:
			fputc(*p, stdout);
			break;
		}
	}
}

/*
 * Processes the range between begin and end.
 *
 * At each position the parsers[] entries are tried in order; the first one
 * that returns a non-zero value has consumed that many bytes (by absolute
 * value). When no parser matches, one byte is copied to stdout.
 * newblock tells the parsers whether the current position starts a block;
 * it is set again after every step: to 1 in front of an empty line,
 * otherwise to whether the last parser returned a negative value.
 */
void
process(const char *begin, const char *end, int newblock) {
	const char *cur = begin;

	while (cur < end) {
		int consumed = 0;
		unsigned int idx;

		/* Leading newlines of a block are skipped, not printed. */
		if (newblock) {
			while (*cur == '\n') {
				cur++;
				if (cur == end)
					return;
			}
		}

		for (idx = 0; idx < LENGTH(parsers); idx++) {
			consumed = parsers[idx](cur, end, newblock);
			if (consumed != 0)
				break;
		}

		if (consumed != 0) {
			cur += abs(consumed);
		} else {
			fputc(*cur, stdout);
			cur++;
		}

		/* Don't print single newline at end */
		if (cur + 1 == end && *cur == '\n')
			return;

		newblock = (cur[0] == '\n' && cur + 1 != end && cur[1] == '\n')
			|| consumed < 0;
	}
}

/*
 * Entry point.
 *
 * Options: -v prints the version through eprint() (which exits), -n sets
 * nohtml, -- ends option parsing. The first non-option argument names the
 * input file; without one the input is read from stdin. The whole input is
 * read into one NUL-terminated buffer and handed to process() as a block.
 */
int
main(int argc, char *argv[]) {
	char *buffer = NULL;
	int i;
	size_t s;
	unsigned long len, bsize;
	FILE *source = stdin;

	/* doparagraph() relies on this regex; using it after a failed
	 * regcomp() would be undefined, so stop right here. */
	if (regcomp(&p_end_regex, "(\n\n|(^|\n)```)", REG_EXTENDED) != 0)
		eprint("Cannot compile paragraph regex\n");

	for (i = 1; i < argc; i++) {
		if (!strcmp("-v", argv[i]))
			eprint("simple markup %s (C) Enno Boland\n",VERSION);
		else if (!strcmp("-n", argv[i]))
			nohtml = 1;
		else if (argv[i][0] != '-')
			break;
		else if (!strcmp("--", argv[i])) {
			i++;
			break;
		}
		else
			eprint("Usage %s [-n] [file]\n -n escape html strictly\n", argv[0]);
	}
	if (i < argc && !(source = fopen(argv[i], "r")))
		eprint("Cannot open file `%s`\n",argv[i]);

	/* Read in chunks of BUFSIZ. The buffer always keeps room for one
	 * more chunk plus the terminating NUL. */
	bsize = 2 * BUFSIZ;
	buffer = ereallocz(buffer, bsize);
	len = 0;
	while ((s = fread(buffer + len, 1, BUFSIZ, source)) > 0) {
		len += s;
		if (BUFSIZ + len + 1 > bsize) {
			bsize += BUFSIZ;
			if (!(buffer = realloc(buffer, bsize)))
				eprint("realloc failed.");
		}
	}
	buffer[len] = '\0';
	process(buffer, buffer + len, 1);
	fclose(source);
	free(buffer);
	regfree(&p_end_regex);
	return EXIT_SUCCESS;
}