From cebd9a4d78455b80e35e6cdc905936bde678dc2e Mon Sep 17 00:00:00 2001 From: Aleteoryx Date: Thu, 28 Aug 2025 20:15:26 -0400 Subject: [PATCH] rewrite markup language in C --- gloss.py | 88 +++++++----- markup.c | 375 +++++++++++++++++++++++++++++++++++++++++++++++++ testout/a.html | 2 +- testout/b.html | 2 +- 4 files changed, 433 insertions(+), 34 deletions(-) create mode 100644 markup.c diff --git a/gloss.py b/gloss.py index 33cd4529f9abc6131add9c81ad74488c461d25f5..5a17559c4fec193086a1f71f5762cb7b217802bb 100755 --- a/gloss.py +++ b/gloss.py @@ -3,11 +3,13 @@ import html import re from datetime import datetime -from os import stat +from os import stat, system from glob import glob from typing import List, Optional, Union, Dict, Set, Tuple from dataclasses import dataclass from sys import argv, stderr, exit +from pathlib import Path +from subprocess import Popen, PIPE, run as runcmd usage = f''' usage: {argv[0]} @@ -115,10 +117,47 @@ def first_pass(slug, fp): return GlsFile(slug, title, [*names], blocks, see_also) -quote_pat = re.compile("((?:(?!@@)(?!//).)+)(?:@@((?:(?!//).)+))?(?://(.+))?") -link_pat = re.compile("(?|]+)(?:\\|([^>]*))?>(?)") -stripped_link_pat = "(?|]+(?:\\|[^>]*)?>(?)" -italic_pat = re.compile(f"(?:(?<=\\W)|^)/(?:[^<>/]|{stripped_link_pat})+/(?:(?=\\W)|$)") +class Markup: + def __init__(self, where, *, cfile='markup.c', bfile='markup'): + self.cfile = str(Path(where, cfile)) + self.bfile = str(Path(where, bfile)) + + if stat(self.cfile).st_mtime > stat(self.bfile).st_mtime: + print("recompiling markup subsystem...") + runcmd(['cc', self.cfile, '-DDEBUG', '-o', self.bfile, '-Wall'], check=True) + print("recompiled!") + + self.proc = Popen([self.bfile, 'convert'], stdin=PIPE, stdout=PIPE, text=True) + + def process(self, text): + print(f'{text=}') + self.proc.stdin.write(text+"\n") + self.proc.stdin.flush() + + segments = [] + while (line := self.proc.stdout.readline()) != '': + ty = line[0:4] + if ty == 'NEXT': + break + + length = int(line[5:9]) + ltext = line[12:12+length] + print(f'{ltext=}') + + if ty == 'HTML': + segments.append((ltext,)) + elif ty == 'IESC': + segments.append((html.escape(ltext),)) + elif ty == 'TEXT': + segments.append(ltext) + else: + print(f'read in unknown type "{ty}" from markup subprocess', file=stderr) + + print(segments) + return segments + + +quote_pat = re.compile("((?:(?!@@)(?!//).)*)(?:@@((?:(?!//).)+))?(?://(.+))?") ### GENERATION ### @@ -153,29 +192,9 @@ class Indexes: sorted(self.names_sorted, key=lambda x: len(x[0])) -def gen_inner_html(file, idx): - for block in file.blocks: # populate italics+external links, listify block.text - text = block.text - working = [] - while (m := italic_pat.search(text)) is not None: - s,e = m.span() - working.append(text[:s]) - working.append(('',)) - working.append(text[s:e]) - working.append(('',)) - text = text[e:] - working.append(text) - block.text = [] - for text in working: - if type(text) != str: - block.text.append(text) - continue - while (m := link_pat.search(text)) is not None: - s,e = m.span() - block.text.append(text[:s].replace('\\<', '<').replace('\\>', '>')) - block.text.append((link_repl(m),)) - text = text[e:] - block.text.append(text) +def gen_inner_html(fmt, file, idx): + for block in file.blocks: # format text, listify it + block.text = fmt.process(block.text) blacklist = set() for name, pat in idx.names_sorted: # populate local links @@ -205,9 +224,12 @@ def gen_inner_html(file, idx): if block.ty == 'para': content += f"\n

{text}

" elif block.ty == 'quote': - sauce, date, url = map( - lambda x: x if x is None else html.escape(x.strip()), - quote_pat.match(block.meta).groups() ) + if block.meta is not None and len(block.meta): + sauce, date, url = map( + lambda x: x if x is None else html.escape(x.strip()), + quote_pat.match(block.meta).groups() ) + else: + sauce = date = url = None content += "\n
\n\t' content += text @@ -272,6 +294,8 @@ if __name__ == '__main__': ''' + fmt = Markup(Path(__file__).parent) + files = [] for fn in glob('*.gls', root_dir=srcdir): with open(f'{srcdir}/{fn}', 'rt') as fp: @@ -285,7 +309,7 @@ if __name__ == '__main__': ctx = { 'title': html.escape(file.title), 'slug': html.escape(file.slug), - 'body': gen_inner_html(file, indexes), + 'body': gen_inner_html(fmt, file, indexes), 'modtime': datetime.fromtimestamp(stat(f'{srcdir}/{file.slug}.gls').st_mtime) } fp.write(template.format(**ctx)) diff --git a/markup.c b/markup.c new file mode 100644 index 0000000000000000000000000000000000000000..5501850e8255b133e06f0e92ac348a57812e2439 --- /dev/null +++ b/markup.c @@ -0,0 +1,375 @@ +/* a minimal markup language */ + +#include +#include +#include +#include + + +#define SEG_HEAD 0 // never export +#define SEG_HTML 1 // OK to include in the output +#define SEG_TEXT 2 // raw user-generated text +#define SEG_IESC 3 // should be immediately escaped python-side and treated as html + +#ifdef DEBUG +#undef DEBUG +#define PDEBUG(...) fprintf(stderr, __VA_ARGS__); +#define PEXPR(exp) fprintf(stderr, "(%s) = %ld\n", #exp, (long)(exp)); +#define DEBUG 1 +#else +#define PDEBUG(...) +#define PEXPR(exp) +#define DEBUG 0 +#endif + +#define PLINE() fprintf(stderr, "on line %d in func %s\n", __LINE__, __FILE__) + +#define LEN(x) (sizeof(x) / sizeof(x[0])) + +static char *argv0 = "markup"; + + +struct segment { + int type; + size_t length; + const char *text; + struct segment *next; +}; + +static int +isword(char c) +{ + return c == '_' || c == '<' || c == '>' || isalnum(c); +} + +static void +print_segment(struct segment *seg) +{ + const char *ty; + switch(seg->type){ + case SEG_HEAD: + ty = "HEAD"; + break; + case SEG_HTML: + ty = "HTML"; + break; + case SEG_TEXT: + ty = "TEXT"; + break; + case SEG_IESC: + ty = "IESC"; + break; + } + if(DEBUG) + fprintf(stderr, "%s(%04zu)->%p: %.*s\n", ty, seg->length, seg->next, (int)seg->length, seg->text ?: ""); + printf("%s(%04zu): %.*s\n", ty, seg->length, (int)seg->length, seg->text ?: ""); +} + +static inline struct segment * +new_segment(int type, size_t length, const char *text) +{ + struct segment *new = malloc(sizeof (struct segment)); + new->type = type; + new->length = length; + new->text = text; + new->next = NULL; + + fprintf(stderr, "new_segment @ %p: %d / %zu / %.*s\n", new, type, length, (int)length, text); + + return new; +} + +static inline void +push_segment(struct segment **seg, int type, size_t length, const char *text) +{ + struct segment *new = new_segment(type, length, text); + (*seg)->next = new; + *seg = new; +} + +static inline void +push_html(struct segment **seg, size_t length, const char *text) +{ + push_segment(seg, SEG_HTML, length, text); +} +static inline void +push_text(struct segment **seg, size_t length, const char *text) +{ + push_segment(seg, SEG_TEXT, length, text); +} +static inline void +push_iesc(struct segment **seg, size_t length, const char *text) +{ + push_segment(seg, SEG_IESC, length, text); +} + +/* + * attempt to convert a string of the form into either: + * - H'<' I[href] H'>' + * - H'' I[name] H'' + */ +static struct segment * +try_link(const char **ip, const char *end, struct segment **tailp) +{ + const char *i = *ip+1, *href = i, *hend = NULL, *name = NULL; + struct segment *head, *tail; + + for(; i < end; i++){ + switch(*i){ + case '|': + if(name == NULL){ + name = i+1; + hend = i; + } + break; + case '>': + if(hend == NULL) + hend = i; + + head = tail = new_segment(SEG_HTML, 9, ""); + }else{ + push_html(&tail, 2, "\">"); + push_iesc(&tail, i - name, name); + push_html(&tail, 4, ""); + } + + *ip = i; + *tailp = tail; + return head; + } + } + + return NULL; +} + +static struct segment * // TODO: rewrite with ICU +segmentize(size_t length, const char *source) +{ + // p is the start of the most recent text segment + const char *p = source, *i, *newi, *end = source + length; + int in_italic = 0, in_escape = 0; + struct segment *head = new_segment(SEG_HEAD, 0, NULL), *ret; + struct segment *tail = head; + struct segment *italic_head, *italic_tail; + struct segment *link_head, *link_tail; + + for(i = source; i < end; i++){ + switch(*i){ + case '\\': + i++; + break; + case '/': + if(!in_italic){ // begin italic section + PDEBUG("starting italics @ source[%zu]\n", i - source); + PEXPR(i != source); + PEXPR(i != source && isword(i[-1])); + PEXPR(end - i); + PEXPR(!isword(i[1])); + PEXPR((i != source && isword(i[-1])) || (end - i) < 3 || !isword(i[1])); + if((i != source && isword(i[-1])) || // previous char must be ^ or non-word + (end - i) < 3 || // cannot start near EOB + !isword(i[1])) // next char must be word + continue; + in_italic = 1; + PDEBUG("head = %p, head->next = %p, tail = %p\n", head, head->next, tail); + push_text(&tail, i - p, p); + PDEBUG("head = %p, head->next = %p, tail = %p\n", head, head->next, tail); + italic_head = italic_tail = new_segment(SEG_HTML, 3, ""); + + p = i; + }else{ // end italic section + PDEBUG("stopping italics @ source[%zu]\n", i - source); + PEXPR(!isword(i[-1])); + PEXPR((i+1) != end); + PEXPR(isword(i[1])); + PEXPR(!isword(i[-1]) || ((i+1) != end && isword(i[1]))); + if(!isword(i[-1]) || // previous char must be word + ((i+1) != end && isword(i[1]))) // next char must be EOB or non-word + continue; + + in_italic = 0; + push_text(&italic_tail, i - p + 1, p); + push_html(&italic_tail, 4, ""); + tail->next = italic_head; + tail = italic_tail; + p = i + 1; + } + break; + case '<': + newi = i; + link_head = try_link(&newi, end, &link_tail); + if(link_head == NULL){ // reached EOB before link could finish, bail out + i = end; + break; + } + + if (DEBUG) + print_segment(link_tail); + PDEBUG("*newi = '%c'\n", *newi); + + if(in_italic){ + push_text(&italic_tail, i - p, p); + italic_tail->next = link_head; + italic_tail = link_tail; + }else{ + push_text(&tail, i - p, p); + tail->next = link_head; + tail = link_tail; + } + i = newi; + p = i+1; + break; + } + } + if(in_italic){ + PLINE(); + tail->next = italic_head->next; + if(tail->next != NULL) + tail = italic_tail; + + tail->next = italic_head; // reuse it :D + tail = tail->next; + tail->type = SEG_TEXT; + tail->length = end - p; + tail->text = p; + tail->next = NULL; + }else{ + tail->next = new_segment(SEG_TEXT, end - p, p); + } + + ret = head->next; + free(head); + return ret; +} + +static const char * +nextline() +{ + static size_t buflen = 1024; + static char *buf = NULL; + size_t off; + char *p; + + if(buf == NULL) + buf = malloc(buflen); + p = buf; + + while((*p = getc(stdin)) != EOF){ + if(*p == '\n'){ + *p = '\0'; + return buf; + }else if(*p == '\0'){ + p--; + } + + if(++p == buf + buflen){ + off = p - buf; + buflen *= 3; + buflen /= 2; + buf = realloc(buf, buflen); + p = buf + off; + } + } + return NULL; +} + +static void +convert() +{ + size_t length; + const char *s; + struct segment *seg; + + while((s = nextline()) != NULL){ + fprintf(stderr, "processing line: %s\n", s); + length = strlen(s); + seg = segmentize(length, s); + for(; seg != NULL; seg = seg->next) + print_segment(seg); + puts("NEXT"); + fflush(stdout); + } +} + +static void +usage() +{ + printf( + "usage: %s [...args]\n\n" + "VERB may be one of:\n" + " test [N] - format the in-built test strings, or test string N if provided.\n" + " convert - read input lines, convert them, and print the result until EOF.\n" + , argv0); + exit(1); +} + +const static char *strings[] = { + "This string should be one SEG_TEXT.", + "And this//", + "This string should have an /italic/ part.", + "And this /o/", + "/This/ string should have /two/.", + "", + "", + "how about an //?", + "but no , right?", + NULL +}; + +static void +run_test(const char *s) +{ + struct segment *seg; + + printf("\nsegmentizing: \"%s\"(%zu)\n", s, strlen(s)); + seg = segmentize(strlen(s), s); + for(; seg != NULL; seg = seg->next) + print_segment(seg); +} + +static void +run_testn(int n) +{ + if(n < LEN(strings)) + run_test(strings[n]); + else{ + fprintf(stderr, "test # out of range: %d\n", n); + exit(2); + } +} + +static void +run_tests() +{ + const char **s; + for(s = strings; *s != NULL; s++){ + run_test(*s); + } +} + +int +main(int argc, char **argv) +{ + if(argc < 2) + usage(); + argv0 = (argv++)[0]; + argc--; + + if(strcmp(argv[0], "test") == 0){ + if(argc > 1) + run_testn(atoi(argv[1])); + else + run_tests(); + }else if(strcmp(argv[0], "convert") == 0) + convert(); + else + usage(); + + return 0; +} + diff --git a/testout/a.html b/testout/a.html index 5bdae30351991253dfac966b58c83924b5dabe0a..1dd63ae7cf15edfcab5260d87bd2abf9c7860d32 100644 --- a/testout/a.html +++ b/testout/a.html @@ -17,7 +17,7 @@

See Also:

File last modified Fri, 2025-15-51 00:51:39
diff --git a/testout/b.html b/testout/b.html index cc370364413a22092bb7a397a05031dfd0f6f61b..4d80b42e3c2a9c185ac298f3a7f19056d93df2be 100644 --- a/testout/b.html +++ b/testout/b.html @@ -12,7 +12,7 @@

See Also: