From cebd9a4d78455b80e35e6cdc905936bde678dc2e Mon Sep 17 00:00:00 2001
From: Aleteoryx <alyx@aleteoryx.me>
Date: Thu, 28 Aug 2025 20:15:26 -0400
Subject: [PATCH] rewrite markup language in C

---
 gloss.py       |  88 +++++++-----
 markup.c       | 375 +++++++++++++++++++++++++++++++++++++++++++++++++
 testout/a.html |   2 +-
 testout/b.html |   2 +-
 4 files changed, 433 insertions(+), 34 deletions(-)
 create mode 100644 markup.c
diff --git a/gloss.py b/gloss.py
index 33cd4529f9abc6131add9c81ad74488c461d25f5..5a17559c4fec193086a1f71f5762cb7b217802bb 100755
--- a/gloss.py
+++ b/gloss.py
@@ -3,11 +3,13 @@
 import html
 import re
 from datetime import datetime
-from os import stat
+from os import stat, system
 from glob import glob
 from typing import List, Optional, Union, Dict, Set, Tuple
 from dataclasses import dataclass
 from sys import argv, stderr, exit
+from pathlib import Path
+from subprocess import Popen, PIPE, run as runcmd
 
 usage = f'''
 usage: {argv[0]} <SRCDIR> <OUTDIR>
@@ -115,10 +117,47 @@ def first_pass(slug, fp):
 	
 	return GlsFile(slug, title, [*names], blocks, see_also)
 
-quote_pat = re.compile("((?:(?!@@)(?!//).)+)(?:@@((?:(?!//).)+))?(?://(.+))?")
-link_pat = re.compile("(?<!\\\\)<([^<>|]+)(?:\\|([^>]*))?>(?<!\\\\>)")
-stripped_link_pat = "(?<!\\\\)<[^<>|]+(?:\\|[^>]*)?>(?<!\\\\>)"
-italic_pat = re.compile(f"(?:(?<=\\W)|^)/(?:[^<>/]|{stripped_link_pat})+/(?:(?=\\W)|$)")
+class Markup:
+	"""Wraps the C markup converter (markup.c), which runs as a long-lived `convert` subprocess."""
+	def __init__(self, where, *, cfile='markup.c', bfile='markup'):
+		"""Build where/bfile from where/cfile when it is missing or older, then start it."""
+		self.cfile = str(Path(where, cfile))
+		self.bfile = str(Path(where, bfile))
+		if not Path(self.bfile).exists() or stat(self.cfile).st_mtime > stat(self.bfile).st_mtime:
+			print("recompiling markup subsystem...")
+			runcmd(['cc', self.cfile, '-DDEBUG', '-o', self.bfile, '-Wall'], check=True)
+			print("recompiled!")
+		self.proc = Popen([self.bfile, 'convert'], stdin=PIPE, stdout=PIPE, text=True)
+	
+	def process(self, text):
+		"""Convert one line of markup (no embedded newlines); 1-tuples in the result are finished HTML, strs are raw text."""
+		print(f'{text=}')
+		self.proc.stdin.write(text+"\n")
+		self.proc.stdin.flush()
+
+		segments = []
+		while (line := self.proc.stdout.readline()) != '':
+			if line.startswith('NEXT'):
+				break
+			# reply lines look like "TYPE(NNNN): payload".  NNNN counts bytes on the C side,
+			# so it must not be used to slice the decoded str; split off the header instead.
+			head, _, ltext = line.rstrip('\n').partition('): ')
+			ty = head[:4]
+			print(f'{ltext=}')
+			if ty == 'HTML':
+				segments.append((ltext,))
+			elif ty == 'IESC':
+				segments.append((html.escape(ltext),))
+			elif ty == 'TEXT':
+				segments.append(ltext)
+			else:
+				print(f'read in unknown type "{ty}" from markup subprocess', file=stderr)
+		
+		print(segments)
+		return segments
+			
+
+quote_pat = re.compile("((?:(?!@@)(?!//).)*)(?:@@((?:(?!//).)+))?(?://(.+))?")	# quote meta: sauce[@@date][//url]; groups are (sauce, date, url)
 
 
 ### GENERATION ###
@@ -153,29 +192,9 @@ class Indexes:
 		
 		sorted(self.names_sorted, key=lambda x: len(x[0]))
 			
-def gen_inner_html(file, idx):
-	for block in file.blocks:		# populate italics+external links, listify block.text
-		text = block.text
-		working = []
-		while (m := italic_pat.search(text)) is not None:
-			s,e = m.span()
-			working.append(text[:s])
-			working.append(('<i>',))
-			working.append(text[s:e])
-			working.append(('</i>',))
-			text = text[e:]
-		working.append(text)
-		block.text = []
-		for text in working:
-			if type(text) != str:
-				block.text.append(text)
-				continue
-			while (m := link_pat.search(text)) is not None:
-				s,e = m.span()
-				block.text.append(text[:s].replace('\\<', '<').replace('\\>', '>'))
-				block.text.append((link_repl(m),))
-				text = text[e:]
-			block.text.append(text)
+def gen_inner_html(fmt, file, idx):
+	for block in file.blocks:		# format text, listify it
+		block.text = fmt.process(block.text)
 
 	blacklist = set()
 	for name, pat in idx.names_sorted:	# populate local links
@@ -205,9 +224,12 @@ def gen_inner_html(file, idx):
 		if block.ty == 'para':
 			content += f"\n<p>{text}</p>"
 		elif block.ty == 'quote':
-			sauce, date, url = map(
-				lambda x: x if x is None else html.escape(x.strip()),
-				quote_pat.match(block.meta).groups() )
+			if block.meta is not None and len(block.meta):
+				sauce, date, url = map(
+					lambda x: x if x is None else html.escape(x.strip()),
+					quote_pat.match(block.meta).groups() )
+			else:
+				sauce = date = url = None
 
 			content += "\n<div>\n\t<blockquote" + ('' if url is None else f' cite="{url}"') + '>'
 			content += text
@@ -272,6 +294,8 @@ if __name__ == '__main__':
 </html>
 '''
 
+	fmt = Markup(Path(__file__).parent)
+
 	files = []
 	for fn in glob('*.gls', root_dir=srcdir):
 		with open(f'{srcdir}/{fn}', 'rt') as fp:
@@ -285,7 +309,7 @@ if __name__ == '__main__':
 			ctx = {
 				'title': html.escape(file.title),
 				'slug': html.escape(file.slug),
-				'body': gen_inner_html(file, indexes),
+				'body': gen_inner_html(fmt, file, indexes),
 				'modtime': datetime.fromtimestamp(stat(f'{srcdir}/{file.slug}.gls').st_mtime)
 			}
 			fp.write(template.format(**ctx))
diff --git a/markup.c b/markup.c
new file mode 100644
index 0000000000000000000000000000000000000000..5501850e8255b133e06f0e92ac348a57812e2439
--- /dev/null
+++ b/markup.c
@@ -0,0 +1,375 @@
+/* a minimal markup language */
+
+#include <string.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+
+#define SEG_HEAD 0	// never export
+#define SEG_HTML 1	// OK to include in the output
+#define SEG_TEXT 2	// raw user-generated text
+#define SEG_IESC 3	// should be immediately escaped python-side and treated as html
+
+#ifdef DEBUG	// normalise -DDEBUG to a 0/1 value so it can be tested with a plain if()
+#undef DEBUG
+#define PDEBUG(...) fprintf(stderr, __VA_ARGS__);	// printf-style trace to stderr
+#define PEXPR(exp) fprintf(stderr, "(%s) = %ld\n", #exp, (long)(exp));	// trace an expression and its value
+#define DEBUG 1
+#else
+#define PDEBUG(...)
+#define PEXPR(exp)
+#define DEBUG 0
+#endif
+
+#define PLINE() fprintf(stderr, "on line %d in func %s\n", __LINE__, __func__)	// __func__, not __FILE__: the message names the function
+
+#define LEN(x) (sizeof(x) / sizeof(x[0]))	// element count of a true array (not a pointer)
+
+static char *argv0 = "markup";	// program name printed by usage(); main() replaces it with argv[0]
+
+
+struct segment {	// one node of the singly linked output list
+	int type;	// one of the SEG_* constants
+	size_t length;	// number of bytes of text that belong to this segment
+	const char *text;	// not NUL-terminated at length; stored as given, never copied
+	struct segment *next;	// following segment, NULL at the end of the list
+};
+
+static int	// nonzero if c counts as a word character for the italic rules ('_', '<', '>' or alphanumeric)
+isword(char c)
+{
+	return c == '_' || c == '<' || c == '>' || isalnum((unsigned char)c);	// cast: a negative char (any UTF-8 byte) is undefined for <ctype.h>
+}
+
+static void	// emit one protocol line "TYPE(NNNN): text" on stdout; echoed to stderr in DEBUG builds
+print_segment(struct segment *seg)
+{
+	const char *ty = "UNKN";	// shown for a type outside the switch instead of reading an uninitialised pointer
+	switch(seg->type){
+	case SEG_HEAD:
+		ty = "HEAD";
+		break;
+	case SEG_HTML:
+		ty = "HTML";
+		break;
+	case SEG_TEXT:
+		ty = "TEXT";
+		break;
+	case SEG_IESC:
+		ty = "IESC";
+		break;
+	}
+	if(DEBUG)
+		fprintf(stderr, "%s(%04zu)->%p: %.*s\n", ty, seg->length, (void *)seg->next, (int)seg->length, seg->text ? seg->text : "");
+	printf("%s(%04zu): %.*s\n", ty, seg->length, (int)seg->length, seg->text ? seg->text : "");
+}
+
+static inline struct segment *	// allocate an unlinked segment; text is stored as given, not copied
+new_segment(int type, size_t length, const char *text)
+{
+	struct segment *new = malloc(sizeof (struct segment));
+	if(new == NULL){	// out of memory: no partial output is useful, so stop
+		perror("malloc");
+		exit(1);
+	}
+	*new = (struct segment){ .type = type, .length = length, .text = text, .next = NULL };
+
+	PDEBUG("new_segment @ %p: %d / %zu / %.*s\n", (void *)new, type, length, (int)length, text ? text : "");
+	return new;
+}
+
+static inline void	// append a fresh segment after *seg, then advance *seg to it
+push_segment(struct segment **seg, int type, size_t length, const char *text)
+{
+	struct segment *prev = *seg;
+
+	*seg = new_segment(type, length, text);
+	prev->next = *seg;
+}
+
+static inline void	// push_segment() shorthand for a SEG_HTML segment
+push_html(struct segment **seg, size_t length, const char *text)
+{
+	push_segment(seg, SEG_HTML, length, text);
+}
+static inline void	// push_segment() shorthand for a SEG_TEXT segment
+push_text(struct segment **seg, size_t length, const char *text)
+{
+	push_segment(seg, SEG_TEXT, length, text);
+}
+static inline void	// push_segment() shorthand for a SEG_IESC segment
+push_iesc(struct segment **seg, size_t length, const char *text)
+{
+	push_segment(seg, SEG_IESC, length, text);
+}
+
+/*
+ * attempt to convert a string of the form <href[|name]> into either:
+ * - H'<a href="'  I[href]  H'">&lt;'  I[href]  H'&gt;</a>'
+ * - H'<a href="'  I[href]  H'">'      I[name]      H'</a>'
+ */
+static struct segment *	// returns the head of the new chain (its last node via *tailp), or NULL if no '>' comes before end
+try_link(const char **ip, const char *end, struct segment **tailp)	// *ip points at the '<'; on success it is moved to the closing '>'
+{
+	const char *i = *ip+1, *href = i, *hend = NULL, *name = NULL;
+	struct segment *head, *tail;
+
+	for(; i < end; i++){
+		switch(*i){
+		case '|':
+			if(name == NULL){	// only the first '|' separates href from name
+				name = i+1;
+				hend = i;
+			}
+			break;
+		case '>':
+			if(hend == NULL)	// no '|' seen: the href runs up to the '>'
+				hend = i;
+
+			head = tail = new_segment(SEG_HTML, 9, "<a href=\"");
+			push_iesc(&tail, hend - href, href);
+			if(name == NULL){
+				push_html(&tail, 6, "\">&lt;");	// the '>' closes the <a ...> tag before the visible "&lt;"
+				push_iesc(&tail, hend - href, href);
+				push_html(&tail, 8, "&gt;</a>");
+			}else{
+				push_html(&tail, 2, "\">");
+				push_iesc(&tail, i - name, name);
+				push_html(&tail, 4, "</a>");
+			}
+			
+			*ip = i;
+			*tailp = tail;
+			return head;
+		}
+	}
+
+	return NULL;
+}
+
+static struct segment * // TODO: rewrite with ICU
+segmentize(size_t length, const char *source)	// split source[0..length) into a malloc()ed segment list; returns its first node
+{
+	// p is the start of the most recent text segment
+	const char *p = source, *i, *newi, *end = source + length;
+	int in_italic = 0, in_escape = 0;	// in_escape is never read below
+	struct segment *head = new_segment(SEG_HEAD, 0, NULL), *ret;	// placeholder first node so tail is never NULL; freed before returning
+	struct segment *tail = head;
+	struct segment *italic_head, *italic_tail;	// side chain for an open italic; joined to the main list only when it closes
+	struct segment *link_head, *link_tail;
+
+	for(i = source; i < end; i++){
+		switch(*i){
+		case '\\':
+			i++;	// skip the escaped character; the backslash itself stays in the text
+			break;
+		case '/':
+			if(!in_italic){	// begin italic section
+				PDEBUG("starting italics @ source[%zu]\n", i - source);
+				PEXPR(i != source);
+				PEXPR(i != source && isword(i[-1]));
+				PEXPR(end - i);
+				PEXPR(!isword(i[1]));
+				PEXPR((i != source && isword(i[-1])) || (end - i) < 3 || !isword(i[1]));
+				if((i != source && isword(i[-1])) ||	// previous char must be ^ or non-word
+				   (end - i) < 3 ||			// cannot start near EOB
+				   !isword(i[1]))			// next char must be word
+				   	continue;
+				in_italic = 1;
+				PDEBUG("head = %p, head->next = %p, tail = %p\n", head, head->next, tail);
+				push_text(&tail, i - p, p);	// flush the plain text that precedes the '/'
+				PDEBUG("head = %p, head->next = %p, tail = %p\n", head, head->next, tail);
+				italic_head = italic_tail = new_segment(SEG_HTML, 3, "<i>");
+				
+				p = i;	// the opening '/' stays part of the pending text
+			}else{		// end italic section
+				PDEBUG("stopping italics @ source[%zu]\n", i - source);
+				PEXPR(!isword(i[-1]));
+				PEXPR((i+1) != end);
+				PEXPR(isword(i[1]));
+				PEXPR(!isword(i[-1]) || ((i+1) != end && isword(i[1])));
+				if(!isword(i[-1]) ||			// previous char must be word
+				   ((i+1) != end && isword(i[1])))	// next char must be EOB or non-word
+					continue;
+				
+				in_italic = 0;
+				push_text(&italic_tail, i - p + 1, p);	// +1 keeps the closing '/' in the text
+				push_html(&italic_tail, 4, "</i>");
+				tail->next = italic_head;	// splice the finished italic chain onto the main list
+				tail = italic_tail;
+				p = i + 1;
+			}
+			break;
+		case '<':
+			newi = i;
+			link_head = try_link(&newi, end, &link_tail);
+			if(link_head == NULL){ // reached EOB before link could finish, bail out
+				i = end;
+				break;
+			}
+			
+			if (DEBUG)
+				print_segment(link_tail);
+			PDEBUG("*newi = '%c'\n", *newi);
+			
+			if(in_italic){	// inside an open italic the link goes on the side chain
+				push_text(&italic_tail, i - p, p);
+				italic_tail->next = link_head;
+				italic_tail = link_tail;
+			}else{
+				push_text(&tail, i - p, p);
+				tail->next = link_head;
+				tail = link_tail;
+			}
+			i = newi;	// newi is the closing '>' as set by try_link
+			p = i+1;
+			break;
+		}
+	}
+	if(in_italic){	// the italic never closed: keep what the side chain collected but drop its "<i>"
+		PLINE();
+		tail->next = italic_head->next;
+		if(tail->next != NULL)
+			tail = italic_tail;
+		
+		tail->next = italic_head; // reuse it :D
+		tail = tail->next;
+		tail->type = SEG_TEXT;	// the former "<i>" node now carries the trailing text
+		tail->length = end - p;
+		tail->text = p;
+		tail->next = NULL;
+	}else{
+		tail->next = new_segment(SEG_TEXT, end - p, p);	// whatever follows the last segment, possibly empty
+	}
+
+	ret = head->next;
+	free(head);
+	return ret;
+}
+
+static const char *	// read one line from stdin into a reused, growing buffer; NULL once EOF is hit
+nextline()
+{
+	static size_t buflen = 1024;
+	static char *buf = NULL;
+	size_t off = 0;
+	int c;	// int, not char: EOF must stay distinguishable from every byte value
+
+	if(buf == NULL && (buf = malloc(buflen)) == NULL){
+		perror("malloc");
+		exit(1);
+	}
+	while((c = getc(stdin)) != EOF){
+		if(c == '\n'){
+			buf[off] = '\0';	// the newline is replaced by the terminator
+			return buf;
+		}else if(c == '\0'){	// drop NULs so strlen() in the caller sees the whole line
+			continue;
+		}
+		buf[off++] = c;
+		if(off == buflen){	// grow by half while there is still no room for the terminator
+			buflen += buflen / 2;
+			if((buf = realloc(buf, buflen)) == NULL){
+				perror("realloc");
+				exit(1);
+			}
+		}
+	}
+	return NULL;	// a final line that lacks its '\n' is discarded
+}
+
+static void	// filter mode: for each input line print its segments, then a "NEXT" line
+convert()
+{
+	const char *s;
+	struct segment *seg, *next;
+	while((s = nextline()) != NULL){
+		PDEBUG("processing line: %s\n", s);
+		// segments point into s, so print and free them before the next line is read
+		for(seg = segmentize(strlen(s), s); seg != NULL; seg = next){
+			next = seg->next;
+			print_segment(seg);
+			free(seg);	// this loop runs for the whole process lifetime; do not accumulate nodes
+		}
+		puts("NEXT");
+		fflush(stdout);	// the reader on the other end of the pipe waits for this line
+	}
+}
+
+static void	// print the command-line help to stdout and exit with status 1
+usage()
+{
+	printf(
+	"usage: %s <VERB> [...args]\n\n"
+	"VERB may be one of:\n"
+	"	test [N]  -  format the in-built test strings, or test string N if provided.\n"
+	"	convert   -  read input lines, convert them, and print the result until EOF.\n"
+	, argv0);
+	exit(1);
+}
+
+const static char *strings[] = {	// inputs for the `test` verb; the NULL entry is what stops run_tests()
+	"This string should be one SEG_TEXT.",
+	"And this//",
+	"This string should have an /italic/ part.",
+	"And this /o/",
+	"/This/ string should have /two/.",
+	"<link>",
+	"<example.com|link text>",
+	"how about an /<href|italicized link>/?",
+	"but no <href|/partially/ italicized ones>, right?",
+	NULL
+};
+
+static void	// segmentize one string and print every resulting segment
+run_test(const char *s)
+{
+	size_t n = strlen(s);
+	struct segment *cur;
+
+	printf("\nsegmentizing: \"%s\"(%zu)\n", s, n);
+	for(cur = segmentize(n, s); cur != NULL; cur = cur->next)
+		print_segment(cur);
+}
+
+static void	// run only built-in test string number n (0-based); exits with status 2 if there is none
+run_testn(int n)
+{
+	if(n >= 0 && (size_t)n < LEN(strings) - 1)	// the last slot is the NULL terminator, not a test
+		run_test(strings[n]);
+	else{
+		fprintf(stderr, "test # out of range: %d\n", n);
+		exit(2);
+	}
+}
+
+static void	// run every built-in test string, in table order
+run_tests()
+{
+	size_t k;
+
+	for(k = 0; strings[k] != NULL; k++)
+		run_test(strings[k]);
+}
+
+int	// dispatch on the verb in argv[1]: "test [N]" or "convert"; anything else prints the usage text
+main(int argc, char **argv)
+{
+	if(argc > 0)	// set first, so a usage() caused by a missing verb shows the real program name
+		argv0 = argv[0];
+	if(argc < 2)
+		usage();
+	argv++, argc--;	// argv[0] is now the verb
+
+	if(strcmp(argv[0], "test") == 0){
+		if(argc > 1)
+			run_testn(atoi(argv[1]));
+		else
+			run_tests();
+	}else if(strcmp(argv[0], "convert") == 0)
+		convert();
+	else
+		usage();
+	return 0;
+}
+
diff --git a/testout/a.html b/testout/a.html
index 5bdae30351991253dfac966b58c83924b5dabe0a..1dd63ae7cf15edfcab5260d87bd2abf9c7860d32 100644
--- a/testout/a.html
+++ b/testout/a.html
@@ -17,7 +17,7 @@
 <h3>See Also:</h3>
 <ul>
 	<li><a href="b.html">The Letter B</a></li>
-</li>
+</ul>
 </main>
 <footer>File last modified Fri, 2025-15-51 00:51:39 </footer>
 </body>
diff --git a/testout/b.html b/testout/b.html
index cc370364413a22092bb7a397a05031dfd0f6f61b..4d80b42e3c2a9c185ac298f3a7f19056d93df2be 100644
--- a/testout/b.html
+++ b/testout/b.html
@@ -12,7 +12,7 @@
 <h3>See Also:</h3>
 <ul>
 	<li><a href="a.html">The Letter A</a></li>
-</li>
+</ul>
 </main>
 <footer>File last modified Thu, 2025-14-40 18:40:52 </footer>
 </body>