~aleteoryx/gloss

cebd9a4d78455b80e35e6cdc905936bde678dc2e — Aleteoryx 3 months ago 3a6bec3
rewrite markup language in C
4 files changed, 433 insertions(+), 34 deletions(-)

M gloss.py
A markup.c
M testout/a.html
M testout/b.html
M gloss.py => gloss.py +56 -32
@@ 3,11 3,13 @@
import html
import re
from datetime import datetime
from os import stat
from os import stat, system
from glob import glob
from typing import List, Optional, Union, Dict, Set, Tuple
from dataclasses import dataclass
from sys import argv, stderr, exit
from pathlib import Path
from subprocess import Popen, PIPE, run as runcmd

usage = f'''
usage: {argv[0]} <SRCDIR> <OUTDIR>


@@ 115,10 117,47 @@ def first_pass(slug, fp):
	
	return GlsFile(slug, title, [*names], blocks, see_also)

# Quote metadata: three capture groups, unpacked by the quote renderer as
# (sauce, date, url).  Group 1 is the text before any "@@" or "//", group 2
# the optional part after "@@", group 3 the optional part after "//".
# NOTE(review): quote_pat is assigned again further down with `*` instead of
# `+` in the first group; the later assignment is the one in effect.
quote_pat = re.compile("((?:(?!@@)(?!//).)+)(?:@@((?:(?!//).)+))?(?://(.+))?")
# A link of the form <href> or <href|name> whose "<" is not preceded by a
# backslash; group 1 is the href, group 2 the optional name.
link_pat = re.compile("(?<!\\\\)<([^<>|]+)(?:\\|([^>]*))?>(?<!\\\\>)")
# Same link syntax without capture groups, kept as a plain string so it can be
# embedded inside italic_pat.
stripped_link_pat = "(?<!\\\\)<[^<>|]+(?:\\|[^>]*)?>(?<!\\\\>)"
# An italic run: "/.../" bounded by a non-word character or the string edge on
# both sides, containing non-"<>/" characters and/or whole links.
italic_pat = re.compile(f"(?:(?<=\\W)|^)/(?:[^<>/]|{stripped_link_pat})+/(?:(?=\\W)|$)")
class Markup:
	"""Client for the compiled markup helper program.

	On construction the helper binary is (re)built from its C source when it
	is missing or older than the source, then started once in ``convert``
	mode.  Each call to :meth:`process` sends one line of text to it and
	reads the resulting segments back over a pipe.
	"""

	def __init__(self, where, *, cfile='markup.c', bfile='markup'):
		"""Build the helper if needed and start it.

		where -- directory holding the C source and the compiled binary.
		cfile -- file name of the C source inside ``where``.
		bfile -- file name of the compiled binary inside ``where``.

		Raises subprocess.CalledProcessError if compilation fails.
		"""
		self.cfile = str(Path(where, cfile))
		self.bfile = str(Path(where, bfile))

		binary = Path(self.bfile)
		# A missing binary (first run) counts as stale; calling stat() on it
		# would raise FileNotFoundError before anything could be compiled.
		if not binary.exists() or stat(self.cfile).st_mtime > binary.stat().st_mtime:
			print("recompiling markup subsystem...")
			runcmd(['cc', self.cfile, '-DDEBUG', '-o', self.bfile, '-Wall'], check=True)
			print("recompiled!")

		# The helper is byte-transparent; pin UTF-8 on both pipe ends so the
		# byte lengths it reports can be interpreted in process().
		self.proc = Popen([self.bfile, 'convert'], stdin=PIPE, stdout=PIPE,
		                  text=True, encoding='utf-8')

	def process(self, text):
		"""Convert one line of markup into a list of segments.

		Returns a list whose items are either 1-tuples ``(html,)`` holding
		markup that is ready for output, or plain ``str`` items holding raw
		user text.

		Raises RuntimeError if the helper closes its output before finishing
		the reply.

		NOTE(review): the protocol is line based, so ``text`` is assumed to
		contain no newline characters -- confirm against the callers.
		"""
		self.proc.stdin.write(text + "\n")
		self.proc.stdin.flush()

		segments = []
		while (line := self.proc.stdout.readline()) != '':
			ty = line[0:4]
			if ty == 'NEXT':
				return segments

			# Reply lines look like "TYPE(NNNN): payload\n".  The length has
			# at least four digits but may have more, so locate the end of
			# the header instead of slicing at fixed offsets.
			header, _, body = line.partition('): ')
			length = int(header[5:])
			# The helper counts bytes, not characters: slice the encoded
			# payload so multi-byte text does not drag the newline along.
			ltext = body.encode('utf-8')[:length].decode('utf-8')

			if ty == 'HTML':
				segments.append((ltext,))
			elif ty == 'IESC':
				segments.append((html.escape(ltext),))
			elif ty == 'TEXT':
				segments.append(ltext)
			else:
				print(f'read in unknown type "{ty}" from markup subprocess', file=stderr)

		# EOF without a NEXT marker: the helper exited mid-reply.
		raise RuntimeError('markup subprocess closed its output unexpectedly')
			

# Quote metadata: group 1 is the (possibly empty) text before any "@@" or
# "//", group 2 the optional part after "@@", group 3 the optional part after
# "//".  The quote renderer unpacks the groups as (sauce, date, url).
quote_pat = re.compile("((?:(?!@@)(?!//).)*)(?:@@((?:(?!//).)+))?(?://(.+))?")


### GENERATION ###


@@ 153,29 192,9 @@ class Indexes:
		
		sorted(self.names_sorted, key=lambda x: len(x[0]))
			
def gen_inner_html(file, idx):
	for block in file.blocks:		# populate italics+external links, listify block.text
		text = block.text
		working = []
		while (m := italic_pat.search(text)) is not None:
			s,e = m.span()
			working.append(text[:s])
			working.append(('<i>',))
			working.append(text[s:e])
			working.append(('</i>',))
			text = text[e:]
		working.append(text)
		block.text = []
		for text in working:
			if type(text) != str:
				block.text.append(text)
				continue
			while (m := link_pat.search(text)) is not None:
				s,e = m.span()
				block.text.append(text[:s].replace('\\<', '<').replace('\\>', '>'))
				block.text.append((link_repl(m),))
				text = text[e:]
			block.text.append(text)
def gen_inner_html(fmt, file, idx):
	for block in file.blocks:		# format text, listify it
		block.text = fmt.process(block.text)

	blacklist = set()
	for name, pat in idx.names_sorted:	# populate local links


@@ 205,9 224,12 @@ def gen_inner_html(file, idx):
		if block.ty == 'para':
			content += f"\n<p>{text}</p>"
		elif block.ty == 'quote':
			sauce, date, url = map(
				lambda x: x if x is None else html.escape(x.strip()),
				quote_pat.match(block.meta).groups() )
			if block.meta is not None and len(block.meta):
				sauce, date, url = map(
					lambda x: x if x is None else html.escape(x.strip()),
					quote_pat.match(block.meta).groups() )
			else:
				sauce = date = url = None

			content += "\n<div>\n\t<blockquote" + ('' if url is None else f' cite="{url}"') + '>'
			content += text


@@ 272,6 294,8 @@ if __name__ == '__main__':
</html>
'''

	fmt = Markup(Path(__file__).parent)

	files = []
	for fn in glob('*.gls', root_dir=srcdir):
		with open(f'{srcdir}/{fn}', 'rt') as fp:


@@ 285,7 309,7 @@ if __name__ == '__main__':
			ctx = {
				'title': html.escape(file.title),
				'slug': html.escape(file.slug),
				'body': gen_inner_html(file, indexes),
				'body': gen_inner_html(fmt, file, indexes),
				'modtime': datetime.fromtimestamp(stat(f'{srcdir}/{file.slug}.gls').st_mtime)
			}
			fp.write(template.format(**ctx))

A markup.c => markup.c +375 -0
@@ 0,0 1,375 @@
/* a minimal markup language */

#include <string.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>


#define SEG_HEAD 0	// never export
#define SEG_HTML 1	// OK to include in the output
#define SEG_TEXT 2	// raw user-generated text
#define SEG_IESC 3	// should be immediately escaped python-side and treated as html

/*
 * Debug tracing.  Building with -DDEBUG turns PDEBUG/PEXPR into stderr traces
 * and redefines DEBUG as 1; otherwise they expand to empty statements and
 * DEBUG is 0, so `if(DEBUG)` is valid in both builds.
 *
 * The macros are wrapped in do/while(0) and carry no trailing semicolon, so
 * `PDEBUG(...);` is exactly one statement and is safe inside if/else.
 */
#ifdef DEBUG
#undef DEBUG
#define PDEBUG(...) do { fprintf(stderr, __VA_ARGS__); } while(0)
#define PEXPR(exp) do { fprintf(stderr, "(%s) = %ld\n", #exp, (long)(exp)); } while(0)
#define DEBUG 1
#else
#define PDEBUG(...) do { } while(0)
#define PEXPR(exp) do { } while(0)
#define DEBUG 0
#endif

/* report the current line and enclosing function on stderr */
#define PLINE() fprintf(stderr, "on line %d in func %s\n", __LINE__, __func__)

/* element count of a true array (not a pointer) */
#define LEN(x) (sizeof(x) / sizeof((x)[0]))

/* program name shown by usage(); main() replaces the default with argv[0] */
static char *argv0 = "markup";


/* one node of a singly linked list of output segments */
struct segment {
	int type;		// one of the SEG_* constants
	size_t length;		// number of bytes of `text` that belong to this segment
	const char *text;	// not owned: points into the input line or a string literal; may be NULL
	struct segment *next;	// following segment, or NULL at the end of the list
};

/*
 * Return nonzero when c counts as a "word" character for italic-delimiter
 * purposes: alphanumerics, '_', and the link brackets '<' and '>'.
 */
static int
isword(char c)
{
	/*
	 * The <ctype.h> functions require a value representable as unsigned
	 * char (or EOF); a plain char holding a byte >= 0x80 may be negative.
	 */
	return c == '_' || c == '<' || c == '>' || isalnum((unsigned char)c);
}

/*
 * Write one segment to stdout as "TYPE(NNNN): text\n", where TYPE is a
 * four-letter tag and NNNN the byte length padded to at least four digits.
 * In DEBUG builds the same line, plus the `next` pointer, also goes to stderr.
 */
static void
print_segment(struct segment *seg)
{
	const char *ty;
	const char *text = seg->text != NULL ? seg->text : "";

	switch(seg->type){
	case SEG_HEAD:
		ty = "HEAD";
		break;
	case SEG_HTML:
		ty = "HTML";
		break;
	case SEG_TEXT:
		ty = "TEXT";
		break;
	case SEG_IESC:
		ty = "IESC";
		break;
	default:
		/* keep the tag four bytes wide so the line layout stays fixed */
		ty = "????";
		break;
	}
	if(DEBUG)
		fprintf(stderr, "%s(%04zu)->%p: %.*s\n", ty, seg->length, (void *)seg->next, (int)seg->length, text);
	printf("%s(%04zu): %.*s\n", ty, seg->length, (int)seg->length, text);
}

static inline struct segment *
new_segment(int type, size_t length, const char *text)
{
	struct segment *new = malloc(sizeof (struct segment));
	new->type = type;
	new->length = length;
	new->text = text;
	new->next = NULL;

	fprintf(stderr, "new_segment @ %p: %d / %zu / %.*s\n", new, type, length, (int)length, text);

	return new;
}

/*
 * Append a freshly allocated segment after the list tail *seg and move *seg
 * forward so that it points at the new tail.
 */
static inline void
push_segment(struct segment **seg, int type, size_t length, const char *text)
{
	struct segment *prev = *seg;

	*seg = new_segment(type, length, text);
	prev->next = *seg;
}

/* append a SEG_HTML segment after the tail *seg and advance *seg to it */
static inline void
push_html(struct segment **seg, size_t length, const char *text)
{
	push_segment(seg, SEG_HTML, length, text);
}
/* append a SEG_TEXT segment after the tail *seg and advance *seg to it */
static inline void
push_text(struct segment **seg, size_t length, const char *text)
{
	push_segment(seg, SEG_TEXT, length, text);
}
/* append a SEG_IESC segment after the tail *seg and advance *seg to it */
static inline void
push_iesc(struct segment **seg, size_t length, const char *text)
{
	push_segment(seg, SEG_IESC, length, text);
}

/*
 * attempt to convert a string of the form <href[|name]> into either:
 * - H'<a href="'  I[href]  H'">&lt;'  I[href]  H'&gt;</a>'
 * - H'<a href="'  I[href]  H'">'      I[name]      H'</a>'
 *
 * *ip must point at the opening '<'; scanning stops at `end`.
 *
 * On success returns the head of a newly allocated segment chain, stores its
 * last node in *tailp and moves *ip to the closing '>'.  If no '>' is found
 * before `end`, returns NULL, allocates nothing and leaves *ip and *tailp
 * untouched.
 */
static struct segment *
try_link(const char **ip, const char *end, struct segment **tailp)
{
	const char *i = *ip+1, *href = i, *hend = NULL, *name = NULL;
	struct segment *head, *tail;

	for(; i < end; i++){
		switch(*i){
		case '|':
			/* only the first '|' separates href from name */
			if(name == NULL){
				name = i+1;
				hend = i;
			}
			break;
		case '>':
			/* no '|' seen: the href runs up to the closing bracket */
			if(hend == NULL)
				hend = i;

			head = tail = new_segment(SEG_HTML, 9, "<a href=\"");
			push_iesc(&tail, hend - href, href);
			if(name == NULL){
				/* close the attribute AND the opening tag before the visible "<href>" */
				push_html(&tail, 6, "\">&lt;");
				push_iesc(&tail, hend - href, href);
				push_html(&tail, 8, "&gt;</a>");
			}else{
				push_html(&tail, 2, "\">");
				push_iesc(&tail, i - name, name);
				push_html(&tail, 4, "</a>");
			}

			*ip = i;
			*tailp = tail;
			return head;
		}
	}

	return NULL;
}

/*
 * Split source[0..length) into a linked list of segments: plain text,
 * italic runs wrapped in <i>...</i>, and links expanded by try_link().
 * Segment text pointers refer into `source` (or to string literals), so
 * `source` must stay valid while the list is in use.  Returns the first
 * real segment; the internal SEG_HEAD sentinel is freed before returning.
 */
static struct segment * // TODO: rewrite with ICU
segmentize(size_t length, const char *source)
{
	// p is the start of the most recent text segment
	const char *p = source, *i, *newi, *end = source + length;
	// NOTE(review): in_escape is declared but never read or written below
	int in_italic = 0, in_escape = 0;
	struct segment *head = new_segment(SEG_HEAD, 0, NULL), *ret;
	struct segment *tail = head;
	// while an italic run is open, its segments are collected on a separate
	// list and only spliced onto the main list once the run is closed
	struct segment *italic_head, *italic_tail;
	struct segment *link_head, *link_tail;

	for(i = source; i < end; i++){
		switch(*i){
		case '\\':
			// skip the character after the backslash so it is not treated
			// as markup; p is not moved, so the backslash stays in the text
			i++;
			break;
		case '/':
			if(!in_italic){	// begin italic section
				PDEBUG("starting italics @ source[%zu]\n", i - source);
				PEXPR(i != source);
				PEXPR(i != source && isword(i[-1]));
				PEXPR(end - i);
				PEXPR(!isword(i[1]));
				PEXPR((i != source && isword(i[-1])) || (end - i) < 3 || !isword(i[1]));
				// `continue` applies to the enclosing for loop: move on to
				// the next character and treat this slash as ordinary text
				if((i != source && isword(i[-1])) ||	// previous char must be ^ or non-word
				   (end - i) < 3 ||			// cannot start near EOB
				   !isword(i[1]))			// next char must be word
				   	continue;
				in_italic = 1;
				PDEBUG("head = %p, head->next = %p, tail = %p\n", head, head->next, tail);
				// flush the text before the slash onto the main list
				push_text(&tail, i - p, p);
				PDEBUG("head = %p, head->next = %p, tail = %p\n", head, head->next, tail);
				italic_head = italic_tail = new_segment(SEG_HTML, 3, "<i>");
				
				// the italic text starts AT the slash, so the slash is kept
				p = i;
			}else{		// end italic section
				PDEBUG("stopping italics @ source[%zu]\n", i - source);
				PEXPR(!isword(i[-1]));
				PEXPR((i+1) != end);
				PEXPR(isword(i[1]));
				PEXPR(!isword(i[-1]) || ((i+1) != end && isword(i[1])));
				if(!isword(i[-1]) ||			// previous char must be word
				   ((i+1) != end && isword(i[1])))	// next char must be EOB or non-word
					continue;
				
				in_italic = 0;
				// +1 so the closing slash is part of the italic text
				push_text(&italic_tail, i - p + 1, p);
				push_html(&italic_tail, 4, "</i>");
				// splice the finished italic list onto the main list
				tail->next = italic_head;
				tail = italic_tail;
				p = i + 1;
			}
			break;
		case '<':
			newi = i;
			link_head = try_link(&newi, end, &link_tail);
			if(link_head == NULL){ // reached EOB before link could finish, bail out
				// stop scanning; everything from p on is handled after the loop
				i = end;
				break;
			}
			
			if (DEBUG)
				print_segment(link_tail);
			PDEBUG("*newi = '%c'\n", *newi);
			
			// flush the text before the link, then splice the link chain
			// onto whichever list is currently being built
			if(in_italic){
				push_text(&italic_tail, i - p, p);
				italic_tail->next = link_head;
				italic_tail = link_tail;
			}else{
				push_text(&tail, i - p, p);
				tail->next = link_head;
				tail = link_tail;
			}
			// newi is the closing '>'; text resumes right after it
			i = newi;
			p = i+1;
			break;
		}
	}
	if(in_italic){
		// the italic run was never closed: drop the "<i>" marker and keep
		// whatever was collected after it as ordinary segments
		PLINE();	// writes to stderr in every build (not gated on DEBUG)
		tail->next = italic_head->next;
		if(tail->next != NULL)
			tail = italic_tail;
		
		// the "<i>" node is recycled as the trailing text segment
		tail->next = italic_head; // reuse it :D
		tail = tail->next;
		tail->type = SEG_TEXT;
		tail->length = end - p;
		tail->text = p;
		tail->next = NULL;
	}else{
		// remaining text after the last markup element (possibly empty)
		tail->next = new_segment(SEG_TEXT, end - p, p);
	}

	// discard the sentinel and hand back the first real segment
	ret = head->next;
	free(head);
	return ret;
}

/*
 * Read one line from stdin into a growable static buffer.
 *
 * Returns the line without its trailing newline, or NULL once EOF is reached
 * (a final line with no terminating newline is discarded).  Embedded NUL
 * bytes are dropped.  The returned pointer is only valid until the next call.
 * Exits the program if memory cannot be allocated.
 */
static const char *
nextline(void)
{
	static size_t buflen = 1024;
	static char *buf = NULL;
	size_t used = 0, newlen;
	char *grown;
	int c;	/* must be int: a char cannot represent EOF distinctly from a data byte */

	if(buf == NULL){
		buf = malloc(buflen);
		if(buf == NULL){
			perror("malloc");
			exit(EXIT_FAILURE);
		}
	}

	while((c = getc(stdin)) != EOF){
		if(c == '\n'){
			buf[used] = '\0';
			return buf;
		}
		if(c == '\0')
			continue;	/* a NUL would truncate the C string; skip it */

		buf[used++] = (char)c;
		/* grow by 1.5x as soon as the buffer is full, so there is always room for the terminator */
		if(used == buflen){
			newlen = buflen + buflen / 2;
			grown = realloc(buf, newlen);
			if(grown == NULL){
				perror("realloc");
				exit(EXIT_FAILURE);
			}
			buf = grown;
			buflen = newlen;
		}
	}
	return NULL;
}

static void
convert()
{
	size_t length;
	const char *s;
	struct segment *seg;

	while((s = nextline()) != NULL){
		fprintf(stderr, "processing line: %s\n", s);
		length = strlen(s);
		seg = segmentize(length, s);
		for(; seg != NULL; seg = seg->next)
			print_segment(seg);
		puts("NEXT");
		fflush(stdout);
	}
}

/* print the command-line help to stdout and exit with status 1 */
static void
usage()
{
	static const char verbs[] =
		"VERB may be one of:\n"
		"	test [N]  -  format the in-built test strings, or test string N if provided.\n"
		"	convert   -  read input lines, convert them, and print the result until EOF.\n";

	printf("usage: %s <VERB> [...args]\n\n", argv0);
	fputs(verbs, stdout);
	exit(1);
}

/*
 * In-built inputs for the "test" verb.  The table ends with a NULL sentinel,
 * so LEN(strings) is one larger than the number of test strings.
 */
const static char *strings[] = {
	"This string should be one SEG_TEXT.",
	"And this//",
	"This string should have an /italic/ part.",
	"And this /o/",
	"/This/ string should have /two/.",
	"<link>",
	"<example.com|link text>",
	"how about an /<href|italicized link>/?",
	"but no <href|/partially/ italicized ones>, right?",
	NULL
};

/* segmentize one string, print the resulting segments, and release them */
static void
run_test(const char *s)
{
	struct segment *seg, *next;
	size_t length = strlen(s);

	printf("\nsegmentizing: \"%s\"(%zu)\n", s, length);
	/* every node comes from malloc in new_segment; free each after printing */
	for(seg = segmentize(length, s); seg != NULL; seg = next){
		next = seg->next;
		print_segment(seg);
		free(seg);
	}
}

/*
 * Run the in-built test string with index n, or exit with status 2 when n
 * does not name a test string.
 */
static void
run_testn(int n)
{
	/*
	 * LEN(strings) counts the trailing NULL sentinel, which is not a test:
	 * the last valid index is LEN(strings) - 2.
	 */
	if(n >= 0 && (size_t)n < LEN(strings) - 1)
		run_test(strings[n]);
	else{
		fprintf(stderr, "test # out of range: %d\n", n);
		exit(2);
	}
}

/* run every in-built test string, in table order */
static void
run_tests()
{
	size_t k = 0;

	/* the table is terminated by a NULL entry */
	while(strings[k] != NULL)
		run_test(strings[k++]);
}

/*
 * Entry point: dispatch on the first argument.
 *   test [N]  - run all in-built tests, or only test N
 *   convert   - convert stdin line by line until EOF
 * Anything else prints the usage text and exits with status 1.
 */
int
main(int argc, char **argv)
{
	/* record the program name first so usage() can report it in every case */
	if(argc > 0 && argv[0] != NULL)
		argv0 = argv[0];
	if(argc < 2)
		usage();
	/* drop the program name: argv[0] is now the verb */
	argv++;
	argc--;

	if(strcmp(argv[0], "test") == 0){
		if(argc > 1)
			run_testn(atoi(argv[1]));
		else
			run_tests();
	}else if(strcmp(argv[0], "convert") == 0)
		convert();
	else
		usage();

	return 0;
}


M testout/a.html => testout/a.html +1 -1
@@ 17,7 17,7 @@
<h3>See Also:</h3>
<ul>
	<li><a href="b.html">The Letter B</a></li>
</li>
</ul>
</main>
<footer>File last modified Fri, 2025-15-51 00:51:39 </footer>
</body>

M testout/b.html => testout/b.html +1 -1
@@ 12,7 12,7 @@
<h3>See Also:</h3>
<ul>
	<li><a href="a.html">The Letter A</a></li>
</li>
</ul>
</main>
<footer>File last modified Thu, 2025-14-40 18:40:52 </footer>
</body>