@@ 3,11 3,13 @@
import html
import re
from datetime import datetime
-from os import stat
+from os import stat, system
from glob import glob
from typing import List, Optional, Union, Dict, Set, Tuple
from dataclasses import dataclass
from sys import argv, stderr, exit
+from pathlib import Path
+from subprocess import Popen, PIPE, run as runcmd
usage = f'''
usage: {argv[0]} <SRCDIR> <OUTDIR>
@@ 115,10 117,47 @@ def first_pass(slug, fp):
return GlsFile(slug, title, [*names], blocks, see_also)
-quote_pat = re.compile("((?:(?!@@)(?!//).)+)(?:@@((?:(?!//).)+))?(?://(.+))?")
-link_pat = re.compile("(?<!\\\\)<([^<>|]+)(?:\\|([^>]*))?>(?<!\\\\>)")
-stripped_link_pat = "(?<!\\\\)<[^<>|]+(?:\\|[^>]*)?>(?<!\\\\>)"
-italic_pat = re.compile(f"(?:(?<=\\W)|^)/(?:[^<>/]|{stripped_link_pat})+/(?:(?=\\W)|$)")
+class Markup:
+    """Client for the external ``markup`` converter program.
+
+    On construction the C source is (re)compiled when the binary is missing
+    or older than the source, then ``<bfile> convert`` is started with pipes
+    on stdin and stdout.  ``process`` sends one line of text to it and reads
+    back the segment lines it prints, up to the ``NEXT`` marker.
+    """
+
+    # Every segment line starts with 'TYPE(<digits>): '.  The digit count is
+    # not fixed: the C side prints it with %04zu, which grows past 4 digits.
+    _header_pat = re.compile(r'(.{4})\(\d+\): ')
+
+    def __init__(self, where, *, cfile='markup.c', bfile='markup'):
+        """Build the converter if needed and start it.
+
+        where -- directory holding both the C source and the binary
+        cfile -- file name of the C source inside ``where``
+        bfile -- file name of the compiled binary inside ``where``
+
+        Raises FileNotFoundError if the C source does not exist and
+        CalledProcessError if compilation fails.
+        """
+        self.cfile = str(Path(where, cfile))
+        self.bfile = str(Path(where, bfile))
+
+        src_mtime = stat(self.cfile).st_mtime
+        try:
+            stale = src_mtime > stat(self.bfile).st_mtime
+        except FileNotFoundError:
+            # first run: there is no binary yet, so it has to be built
+            stale = True
+
+        if stale:
+            print("recompiling markup subsystem...")
+            runcmd(['cc', self.cfile, '-DDEBUG', '-o', self.bfile, '-Wall'], check=True)
+            print("recompiled!")
+
+        self.proc = Popen([self.bfile, 'convert'], stdin=PIPE, stdout=PIPE, text=True)
+
+    def process(self, text):
+        """Convert one block of text into a list of segments.
+
+        Returns a list in which a 1-tuple ``(html,)`` is markup that is ready
+        for output (IESC segments are HTML-escaped here first) and a plain
+        ``str`` is raw text.
+        """
+        print(f'{text=}')
+        # The converter handles exactly one request per input line; an
+        # embedded newline would be answered as several requests and every
+        # later call would read a stale reply.
+        self.proc.stdin.write(text.replace('\n', ' ') + "\n")
+        self.proc.stdin.flush()
+
+        segments = []
+        while True:
+            line = self.proc.stdout.readline()
+            if line == '':
+                # EOF: the converter went away before finishing its reply
+                print('markup subprocess closed its output unexpectedly', file=stderr)
+                break
+            if line[0:4] == 'NEXT':
+                break
+
+            m = self._header_pat.match(line)
+            if m is None:
+                print(f'read in unknown type "{line[0:4]}" from markup subprocess', file=stderr)
+                continue
+
+            ty = m.group(1)
+            # The length in the header counts bytes, while this pipe is
+            # decoded to str, so it cannot be used as a slice width.  Each
+            # segment occupies exactly one line, so the payload is simply
+            # the rest of the line without its terminating newline.
+            ltext = line[m.end():]
+            if ltext.endswith('\n'):
+                ltext = ltext[:-1]
+            print(f'{ltext=}')
+
+            if ty == 'HTML':
+                segments.append((ltext,))
+            elif ty == 'IESC':
+                segments.append((html.escape(ltext),))
+            elif ty == 'TEXT':
+                segments.append(ltext)
+            else:
+                print(f'read in unknown type "{ty}" from markup subprocess', file=stderr)
+
+        print(segments)
+        return segments
+
+
+# Splits a quote block's meta string into three groups: the text before any
+# "@@" or "//" (may be empty), the optional text between "@@" and "//", and
+# the optional text after "//".  gen_inner_html unpacks them as
+# (sauce, date, url).
+quote_pat = re.compile("((?:(?!@@)(?!//).)*)(?:@@((?:(?!//).)+))?(?://(.+))?")
### GENERATION ###
@@ 153,29 192,9 @@ class Indexes:
sorted(self.names_sorted, key=lambda x: len(x[0]))
-def gen_inner_html(file, idx):
- for block in file.blocks: # populate italics+external links, listify block.text
- text = block.text
- working = []
- while (m := italic_pat.search(text)) is not None:
- s,e = m.span()
- working.append(text[:s])
- working.append(('<i>',))
- working.append(text[s:e])
- working.append(('</i>',))
- text = text[e:]
- working.append(text)
- block.text = []
- for text in working:
- if type(text) != str:
- block.text.append(text)
- continue
- while (m := link_pat.search(text)) is not None:
- s,e = m.span()
- block.text.append(text[:s].replace('\\<', '<').replace('\\>', '>'))
- block.text.append((link_repl(m),))
- text = text[e:]
- block.text.append(text)
+def gen_inner_html(fmt, file, idx):
+ for block in file.blocks: # format text, listify it
+ block.text = fmt.process(block.text)
blacklist = set()
for name, pat in idx.names_sorted: # populate local links
@@ 205,9 224,12 @@ def gen_inner_html(file, idx):
if block.ty == 'para':
content += f"\n<p>{text}</p>"
elif block.ty == 'quote':
- sauce, date, url = map(
- lambda x: x if x is None else html.escape(x.strip()),
- quote_pat.match(block.meta).groups() )
+ if block.meta is not None and len(block.meta):
+ sauce, date, url = map(
+ lambda x: x if x is None else html.escape(x.strip()),
+ quote_pat.match(block.meta).groups() )
+ else:
+ sauce = date = url = None
content += "\n<div>\n\t<blockquote" + ('' if url is None else f' cite="{url}"') + '>'
content += text
@@ 272,6 294,8 @@ if __name__ == '__main__':
</html>
'''
+ fmt = Markup(Path(__file__).parent)
+
files = []
for fn in glob('*.gls', root_dir=srcdir):
with open(f'{srcdir}/{fn}', 'rt') as fp:
@@ 285,7 309,7 @@ if __name__ == '__main__':
ctx = {
'title': html.escape(file.title),
'slug': html.escape(file.slug),
- 'body': gen_inner_html(file, indexes),
+ 'body': gen_inner_html(fmt, file, indexes),
'modtime': datetime.fromtimestamp(stat(f'{srcdir}/{file.slug}.gls').st_mtime)
}
fp.write(template.format(**ctx))
@@ 0,0 1,375 @@
+/* a minimal markup language */
+
+#include <string.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+
+#define SEG_HEAD 0 // never export
+#define SEG_HTML 1 // OK to include in the output
+#define SEG_TEXT 2 // raw user-generated text
+#define SEG_IESC 3 // should be immediately escaped python-side and treated as html
+
+/*
+ * Debug helpers.  When the file is compiled with -DDEBUG, PDEBUG prints a
+ * printf-style message and PEXPR prints an expression's source text and its
+ * value (as a long) to stderr; otherwise both expand to nothing.  DEBUG is
+ * then redefined to 1 or 0 so it can also be tested with a plain if().
+ * NOTE(review): both macros already end in ';', so "PDEBUG(...);" expands
+ * to two statements -- do not use them as the unbraced body of an if/else.
+ */
+#ifdef DEBUG
+#undef DEBUG
+#define PDEBUG(...) fprintf(stderr, __VA_ARGS__);
+#define PEXPR(exp) fprintf(stderr, "(%s) = %ld\n", #exp, (long)(exp));
+#define DEBUG 1
+#else
+#define PDEBUG(...)
+#define PEXPR(exp)
+#define DEBUG 0
+#endif
+
+/* print the current source line and enclosing function name to stderr */
+#define PLINE() fprintf(stderr, "on line %d in func %s\n", __LINE__, __func__)
+
+#define LEN(x) (sizeof(x) / sizeof(x[0]))
+
+static char *argv0 = "markup";
+
+
+/* one node of the singly linked list of output segments */
+struct segment {
+ int type; // one of the SEG_* constants
+ size_t length; // number of bytes of text that belong to this segment
+ const char *text; // stored as given, never copied; NULL for the list head
+ struct segment *next; // following segment, or NULL at the end of the list
+};
+
+/*
+ * Return non-zero if c counts as a "word" character when deciding where an
+ * italic section may start or stop: alphanumerics, '_', and the link
+ * delimiters '<' and '>'.
+ */
+static int
+isword(char c)
+{
+    /*
+     * isalnum() is only defined for EOF and values representable as
+     * unsigned char; a plain char holding a UTF-8 byte may be negative.
+     */
+    return c == '_' || c == '<' || c == '>' || isalnum((unsigned char)c);
+}
+
+/*
+ * Print one segment to stdout as "TYPE(NNNN): text\n", where NNNN is the
+ * segment length zero-padded to at least four digits.  In debug builds the
+ * same line, plus the address of the next segment, is also written to
+ * stderr.
+ */
+static void
+print_segment(struct segment *seg)
+{
+    /* placeholder so an unknown type never prints an uninitialised pointer */
+    const char *ty = "????";
+    /* the list head carries a NULL text; never hand NULL to %s */
+    const char *text = seg->text != NULL ? seg->text : "";
+
+    switch(seg->type){
+    case SEG_HEAD:
+        ty = "HEAD";
+        break;
+    case SEG_HTML:
+        ty = "HTML";
+        break;
+    case SEG_TEXT:
+        ty = "TEXT";
+        break;
+    case SEG_IESC:
+        ty = "IESC";
+        break;
+    }
+    if(DEBUG)
+        fprintf(stderr, "%s(%04zu)->%p: %.*s\n", ty, seg->length, (void *)seg->next, (int)seg->length, text);
+    printf("%s(%04zu): %.*s\n", ty, seg->length, (int)seg->length, text);
+}
+
+/*
+ * Allocate a segment that is not linked to anything yet.  text is stored as
+ * given (not copied) and may be NULL.  Exits the program if memory cannot
+ * be allocated.
+ */
+static inline struct segment *
+new_segment(int type, size_t length, const char *text)
+{
+    struct segment *new = malloc(sizeof (struct segment));
+
+    if(new == NULL){
+        perror("malloc");
+        exit(1);
+    }
+    new->type = type;
+    new->length = length;
+    new->text = text;
+    new->next = NULL;
+
+    /* trace only in debug builds; the list head is created with a NULL text */
+    if(DEBUG)
+        fprintf(stderr, "new_segment @ %p: %d / %zu / %.*s\n", (void *)new, type, length, (int)length, text != NULL ? text : "");
+
+    return new;
+}
+
+/*
+ * Append a freshly allocated segment after *seg and advance *seg to it, so
+ * that *seg keeps pointing at the tail of the list being built.
+ */
+static inline void
+push_segment(struct segment **seg, int type, size_t length, const char *text)
+{
+    struct segment *prev = *seg;
+
+    prev->next = new_segment(type, length, text);
+    *seg = prev->next;
+}
+
+/* append a SEG_HTML segment after *seg and advance *seg to it */
+static inline void
+push_html(struct segment **seg, size_t length, const char *text)
+{
+ push_segment(seg, SEG_HTML, length, text);
+}
+/* append a SEG_TEXT segment after *seg and advance *seg to it */
+static inline void
+push_text(struct segment **seg, size_t length, const char *text)
+{
+ push_segment(seg, SEG_TEXT, length, text);
+}
+/* append a SEG_IESC segment after *seg and advance *seg to it */
+static inline void
+push_iesc(struct segment **seg, size_t length, const char *text)
+{
+ push_segment(seg, SEG_IESC, length, text);
+}
+
+/*
+ * attempt to convert a string of the form <href[|name]> into either:
+ * - H'<a href="' I[href] H'">&lt;' I[href] H'&gt;</a>'   (no name given)
+ * - H'<a href="' I[href] H'">' I[name] H'</a>'           (name given)
+ * where H'...' is a SEG_HTML segment and I[...] a SEG_IESC segment.
+ *
+ * *ip must point at the opening '<' and end one past the last byte that may
+ * be scanned.  On success the head of the new list is returned, *tailp is
+ * set to its last segment and *ip to the closing '>'.  If no '>' is found
+ * before end, NULL is returned and *ip and *tailp are left untouched.
+ */
+static struct segment *
+try_link(const char **ip, const char *end, struct segment **tailp)
+{
+    /*
+     * The fixed pieces of markup.  Their lengths are taken with sizeof so
+     * they can never disagree with the literals; the angle brackets shown
+     * around a bare href are written as entities so they are displayed
+     * instead of being parsed as a tag.
+     */
+    static const char a_open[] = "<a href=\"";
+    static const char bare_mid[] = "\">&lt;";
+    static const char bare_end[] = "&gt;</a>";
+    static const char named_mid[] = "\">";
+    static const char a_close[] = "</a>";
+    const char *href = *ip + 1, *hend = NULL, *name = NULL, *i;
+    struct segment *head, *tail;
+
+    for(i = href; i < end; i++){
+        /* only the first '|' separates href from name */
+        if(*i == '|' && name == NULL){
+            name = i + 1;
+            hend = i;
+        }
+        if(*i != '>')
+            continue;
+
+        if(hend == NULL)
+            hend = i;
+
+        head = tail = new_segment(SEG_HTML, sizeof a_open - 1, a_open);
+        push_iesc(&tail, hend - href, href);
+        if(name == NULL){
+            push_html(&tail, sizeof bare_mid - 1, bare_mid);
+            push_iesc(&tail, hend - href, href);
+            push_html(&tail, sizeof bare_end - 1, bare_end);
+        }else{
+            push_html(&tail, sizeof named_mid - 1, named_mid);
+            push_iesc(&tail, i - name, name);
+            push_html(&tail, sizeof a_close - 1, a_close);
+        }
+
+        *ip = i;
+        *tailp = tail;
+        return head;
+    }
+
+    return NULL;
+}
+
+/*
+ * Split source[0..length) into a linked list of segments and return its
+ * first node.  Text between a matching pair of '/' is wrapped in "<i>" and
+ * "</i>" HTML segments (the slashes themselves stay in the text), and
+ * <href[|name]> links are expanded by try_link().  TEXT segments point into
+ * source rather than copying it.
+ */
+static struct segment * // TODO: rewrite with ICU
+segmentize(size_t length, const char *source)
+{
+ // p is the start of the most recent text segment
+ const char *p = source, *i, *newi, *end = source + length;
+ int in_italic = 0, in_escape = 0; // NOTE(review): in_escape is never read
+ // head is a dummy node so that tail->next is always valid; freed before returning
+ struct segment *head = new_segment(SEG_HEAD, 0, NULL), *ret;
+ struct segment *tail = head;
+ // segments of an open italic section are collected on their own list and
+ // only spliced onto the main list once the closing '/' is found
+ struct segment *italic_head, *italic_tail;
+ struct segment *link_head, *link_tail;
+
+ for(i = source; i < end; i++){
+ switch(*i){
+ case '\\':
+ // skip the next character so it is not examined by this switch;
+ // NOTE(review): the backslash itself is left in the emitted text
+ i++;
+ break;
+ case '/':
+ if(!in_italic){ // begin italic section
+ PDEBUG("starting italics @ source[%zu]\n", i - source);
+ PEXPR(i != source);
+ PEXPR(i != source && isword(i[-1]));
+ PEXPR(end - i);
+ PEXPR(!isword(i[1]));
+ PEXPR((i != source && isword(i[-1])) || (end - i) < 3 || !isword(i[1]));
+ // "continue" here resumes the for loop: this '/' stays ordinary text
+ if((i != source && isword(i[-1])) || // previous char must be ^ or non-word
+ (end - i) < 3 || // cannot start near EOB
+ !isword(i[1])) // next char must be word
+ continue;
+ in_italic = 1;
+ PDEBUG("head = %p, head->next = %p, tail = %p\n", head, head->next, tail);
+ // flush the plain text that precedes the opening slash
+ push_text(&tail, i - p, p);
+ PDEBUG("head = %p, head->next = %p, tail = %p\n", head, head->next, tail);
+ italic_head = italic_tail = new_segment(SEG_HTML, 3, "<i>");
+
+ // the italic text starts at the slash itself
+ p = i;
+ }else{ // end italic section
+ PDEBUG("stopping italics @ source[%zu]\n", i - source);
+ PEXPR(!isword(i[-1]));
+ PEXPR((i+1) != end);
+ PEXPR(isword(i[1]));
+ PEXPR(!isword(i[-1]) || ((i+1) != end && isword(i[1])));
+ if(!isword(i[-1]) || // previous char must be word
+ ((i+1) != end && isword(i[1]))) // next char must be EOB or non-word
+ continue;
+
+ in_italic = 0;
+ // +1 so the closing slash is part of the italic text
+ push_text(&italic_tail, i - p + 1, p);
+ push_html(&italic_tail, 4, "</i>");
+ // splice the finished italic list onto the main list
+ tail->next = italic_head;
+ tail = italic_tail;
+ p = i + 1;
+ }
+ break;
+ case '<':
+ newi = i;
+ link_head = try_link(&newi, end, &link_tail);
+ if(link_head == NULL){ // reached EOB before link could finish, bail out
+ // ends the scan; the code after the loop emits what is left from p
+ i = end;
+ break;
+ }
+
+ if (DEBUG)
+ print_segment(link_tail);
+ PDEBUG("*newi = '%c'\n", *newi);
+
+ // flush the text before the link, then append the link's segments to
+ // whichever list is currently being built
+ if(in_italic){
+ push_text(&italic_tail, i - p, p);
+ italic_tail->next = link_head;
+ italic_tail = link_tail;
+ }else{
+ push_text(&tail, i - p, p);
+ tail->next = link_head;
+ tail = link_tail;
+ }
+ // continue after the closing '>' that try_link stopped on
+ i = newi;
+ p = i+1;
+ break;
+ }
+ }
+ if(in_italic){
+ // the italic section was never closed: drop its "<i>" and emit its
+ // contents as ordinary segments instead
+ PLINE();
+ // move everything collected after "<i>" onto the main list
+ tail->next = italic_head->next;
+ if(tail->next != NULL)
+ tail = italic_tail;
+
+ // the "<i>" node becomes the final TEXT segment holding the rest
+ tail->next = italic_head; // reuse it :D
+ tail = tail->next;
+ tail->type = SEG_TEXT;
+ tail->length = end - p;
+ tail->text = p;
+ tail->next = NULL;
+ }else{
+ // whatever follows the last special construct is plain text
+ tail->next = new_segment(SEG_TEXT, end - p, p);
+ }
+
+ ret = head->next;
+ free(head);
+ return ret;
+}
+
+/*
+ * Read the next line from stdin into a static, growing buffer and return it
+ * without its terminating newline.  NUL bytes in the input are dropped.  A
+ * final line that is not newline-terminated is still returned.  Returns
+ * NULL once there is nothing left to read.  The returned string is
+ * overwritten by the next call.  Exits the program if memory runs out.
+ */
+static const char *
+nextline(void)
+{
+    static size_t buflen = 1024;
+    static char *buf = NULL;
+    size_t used = 0;
+    char *grown;
+    int c; /* int, not char: EOF must stay distinguishable from byte 0xFF */
+
+    if(buf == NULL && (buf = malloc(buflen)) == NULL){
+        perror("malloc");
+        exit(1);
+    }
+
+    while((c = getc(stdin)) != EOF){
+        if(c == '\n'){
+            buf[used] = '\0';
+            return buf;
+        }
+        if(c == '\0')
+            continue;
+
+        buf[used++] = (char)c;
+        /* grow by half whenever full, so buf[used] is always writable */
+        if(used == buflen){
+            buflen *= 3;
+            buflen /= 2;
+            if((grown = realloc(buf, buflen)) == NULL){
+                perror("realloc");
+                exit(1);
+            }
+            buf = grown;
+        }
+    }
+
+    /* input ended in the middle of a line: hand back what was read */
+    if(used > 0){
+        buf[used] = '\0';
+        return buf;
+    }
+    return NULL;
+}
+
+/*
+ * Read lines from stdin until EOF.  For each line, print its segments to
+ * stdout followed by a line containing only "NEXT", then flush so a reader
+ * on the other end of a pipe sees the complete reply.
+ */
+static void
+convert()
+{
+    size_t length;
+    const char *s;
+    struct segment *seg, *next;
+
+    while((s = nextline()) != NULL){
+        if(DEBUG)
+            fprintf(stderr, "processing line: %s\n", s);
+        length = strlen(s);
+        /*
+         * Release each node once it has been printed; this loop runs for
+         * the lifetime of the process, so the lists must not pile up.
+         * Only the nodes are freed: their text is not owned by them.
+         */
+        for(seg = segmentize(length, s); seg != NULL; seg = next){
+            next = seg->next;
+            print_segment(seg);
+            free(seg);
+        }
+        puts("NEXT");
+        fflush(stdout);
+    }
+}
+
+/* print the command line synopsis to stdout and exit with status 1 */
+static void
+usage()
+{
+    printf("usage: %s <VERB> [...args]\n\n", argv0);
+    fputs("VERB may be one of:\n", stdout);
+    fputs(" test [N] - format the in-built test strings, or test string N if provided.\n", stdout);
+    fputs(" convert - read input lines, convert them, and print the result until EOF.\n", stdout);
+    exit(1);
+}
+
+/*
+ * Built-in inputs for the "test" verb.  The final NULL is a sentinel that
+ * ends the list; it is not a test string itself.
+ */
+const static char *strings[] = {
+ "This string should be one SEG_TEXT.",
+ "And this//",
+ "This string should have an /italic/ part.",
+ "And this /o/",
+ "/This/ string should have /two/.",
+ "<link>",
+ "<example.com|link text>",
+ "how about an /<href|italicized link>/?",
+ "but no <href|/partially/ italicized ones>, right?",
+ NULL
+};
+
+/* segmentize the string s and print it followed by each resulting segment */
+static void
+run_test(const char *s)
+{
+    size_t len = strlen(s);
+    struct segment *cur;
+
+    printf("\nsegmentizing: \"%s\"(%zu)\n", s, len);
+    cur = segmentize(len, s);
+    while(cur != NULL){
+        print_segment(cur);
+        cur = cur->next;
+    }
+}
+
+/*
+ * Run the built-in test string with index n.  Prints a message to stderr
+ * and exits with status 2 if n does not name a test string.
+ */
+static void
+run_testn(int n)
+{
+    /*
+     * The last slot of strings[] is the NULL sentinel, not a test, so it is
+     * excluded from the valid range; negative n is rejected explicitly
+     * rather than through the signed-to-unsigned conversion.
+     */
+    if(n >= 0 && (size_t)n < LEN(strings) - 1)
+        run_test(strings[n]);
+    else{
+        fprintf(stderr, "test # out of range: %d\n", n);
+        exit(2);
+    }
+}
+
+/* run every built-in test string in order, stopping at the NULL sentinel */
+static void
+run_tests()
+{
+    size_t k;
+
+    for(k = 0; strings[k] != NULL; k++)
+        run_test(strings[k]);
+}
+
+/*
+ * Entry point: dispatch on the verb given as the first argument.  "test"
+ * runs one built-in test string (when an index follows) or all of them,
+ * "convert" runs the stdin-to-stdout converter, and anything else prints
+ * the usage text.
+ */
+int
+main(int argc, char **argv)
+{
+    const char *verb;
+
+    if(argc < 2)
+        usage();
+    argv0 = argv[0];
+    verb = argv[1];
+
+    if(strcmp(verb, "convert") == 0){
+        convert();
+    }else if(strcmp(verb, "test") == 0){
+        /* an index, if present, is the argument right after the verb */
+        if(argc > 2)
+            run_testn(atoi(argv[2]));
+        else
+            run_tests();
+    }else{
+        usage();
+    }
+
+    return 0;
+}
+
+