/* a minimal markup language */
/*
 * Splits one line of user text into a linked list of typed segments and
 * prints them, one per output line, for a consumer process to assemble:
 *
 *   /word/        -> HTML "<i>", TEXT "/word/", HTML "</i>"
 *   <href>        -> anchor whose visible text is the href itself
 *   <href|name>   -> anchor whose visible text is `name`
 *   \x            -> `x` is not interpreted as markup
 *
 * NOTE(review): this file was recovered from a copy in which everything that
 * looked like an HTML tag had been stripped (header names, markup literals,
 * part of try_link). The markup literals are restored from their declared
 * lengths; spots that could not be recovered exactly are marked below.
 */
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SEG_HEAD 0 // never export
#define SEG_HTML 1 // OK to include in the output
#define SEG_TEXT 2 // raw user-generated text
#define SEG_IESC 3 // should be immediately escaped python-side and treated as html

/*
 * Build with -DDEBUG to get tracing on stderr. After this block DEBUG is
 * always defined (0 or 1) so it can be used in ordinary `if (DEBUG)` tests.
 */
#ifdef DEBUG
#undef DEBUG
#define PDEBUG(...) fprintf(stderr, __VA_ARGS__)
#define PEXPR(exp) fprintf(stderr, "(%s) = %ld\n", #exp, (long)(exp))
#define DEBUG 1
#else
#define PDEBUG(...) ((void)0)
#define PEXPR(exp) ((void)0)
#define DEBUG 0
#endif
#define PLINE() fprintf(stderr, "on line %d in func %s\n", __LINE__, __func__)
#define LEN(x) (sizeof(x) / sizeof((x)[0]))

static const char *argv0 = "markup";

/*
 * One piece of output. `text` is NOT owned by the segment: it points either
 * at a string literal or into the caller's source buffer, and is not
 * NUL-terminated at `length`.
 */
struct segment {
    int type;
    size_t length;
    const char *text;
    struct segment *next;
};

/* malloc that terminates the program on failure instead of returning NULL. */
static void *
xmalloc(size_t size)
{
    void *mem = malloc(size);

    if (mem == NULL) {
        fprintf(stderr, "%s: out of memory\n", argv0);
        exit(3);
    }
    return mem;
}

/*
 * True for characters that count as part of a word when deciding whether a
 * '/' opens or closes italics. '<' and '>' are included so that a link may
 * sit directly inside the slashes.
 */
static int
isword(char c)
{
    /* <ctype.h> functions require a value representable as unsigned char */
    return c == '_' || c == '<' || c == '>' || isalnum((unsigned char)c);
}

/*
 * Print one segment on stdout as "TYPE(LEN): text". With DEBUG, the same
 * information plus the `next` pointer is also written to stderr.
 */
static void
print_segment(struct segment *seg)
{
    const char *ty;
    const char *text = seg->text != NULL ? seg->text : "";

    switch (seg->type) {
    case SEG_HEAD: ty = "HEAD"; break;
    case SEG_HTML: ty = "HTML"; break;
    case SEG_TEXT: ty = "TEXT"; break;
    case SEG_IESC: ty = "IESC"; break;
    default:       ty = "????"; break; /* never leave ty uninitialised */
    }

    if (DEBUG)
        fprintf(stderr, "%s(%04zu)->%p: %.*s\n", ty, seg->length,
                (void *)seg->next, (int)seg->length, text);
    printf("%s(%04zu): %.*s\n", ty, seg->length, (int)seg->length, text);
}

/* Allocate an unlinked segment; the caller owns it (see free_segments). */
static inline struct segment *
new_segment(int type, size_t length, const char *text)
{
    struct segment *seg = xmalloc(sizeof *seg);

    seg->type = type;
    seg->length = length;
    seg->text = text;
    seg->next = NULL;
    if (DEBUG)
        fprintf(stderr, "new_segment @ %p: %d / %zu / %.*s\n", (void *)seg,
                type, length, (int)length, text != NULL ? text : "");
    return seg;
}

/* Release every node of a segment list. The text pointers are not freed. */
static void
free_segments(struct segment *seg)
{
    while (seg != NULL) {
        struct segment *next = seg->next;

        free(seg);
        seg = next;
    }
}

/* Append a new segment after *seg and advance *seg to it. */
static inline void
push_segment(struct segment **seg, int type, size_t length, const char *text)
{
    struct segment *node = new_segment(type, length, text);

    (*seg)->next = node;
    *seg = node;
}

static inline void
push_html(struct segment **seg, size_t length, const char *text)
{
    push_segment(seg, SEG_HTML, length, text);
}

static inline void
push_text(struct segment **seg, size_t length, const char *text)
{
    push_segment(seg, SEG_TEXT, length, text);
}

static inline void
push_iesc(struct segment **seg, size_t length, const char *text)
{
    push_segment(seg, SEG_IESC, length, text);
}

/*
 * attempt to convert a string of the form <href|name> into either:
 *  - H'<a href="' I[href] H'">' I[href] H'</a>'   (no name given)
 *  - H'<a href="' I[href] H'">' I[name] H'</a>'
 *
 * *ip must point at the opening '<'. On success the head of a new segment
 * list is returned, *tailp is set to its last node and *ip to the closing
 * '>'. If no '>' is found before `end`, NULL is returned and nothing is
 * allocated or modified.
 *
 * NOTE(review): the name-less branch was lost from the recovered source;
 * using the href as the visible text is a reconstruction - confirm against
 * the consumer.
 */
static struct segment *
try_link(const char **ip, const char *end, struct segment **tailp)
{
    const char *href = *ip + 1;
    const char *hend = NULL; /* one past the last href character */
    const char *name = NULL; /* first character after the first '|' */

    for (const char *i = href; i < end; i++) {
        if (*i == '|') {
            /* only the first '|' separates href from name */
            if (name == NULL) {
                name = i + 1;
                hend = i;
            }
        } else if (*i == '>') {
            struct segment *head, *tail;

            if (hend == NULL)
                hend = i;
            head = tail = new_segment(SEG_HTML, 9, "<a href=\"");
            push_iesc(&tail, (size_t)(hend - href), href);
            push_html(&tail, 2, "\">");
            if (name == NULL)
                push_iesc(&tail, (size_t)(hend - href), href);
            else
                push_iesc(&tail, (size_t)(i - name), name);
            push_html(&tail, 4, "</a>");
            *ip = i;
            *tailp = tail;
            return head;
        }
    }
    return NULL;
}

/*
 * Split source[0..length) into a segment list. The returned list is owned by
 * the caller (free_segments); its text pointers refer into `source`, which
 * must therefore outlive the list. The last node is always a TEXT segment
 * holding whatever follows the final piece of markup (possibly empty).
 *
 * Italic content is collected on a side list that starts with the "<i>"
 * node and is only spliced into the main list once the closing '/' is seen,
 * so an unterminated '/' can be demoted back to plain text.
 */
static struct segment * // TODO: rewrite with ICU
segmentize(size_t length, const char *source)
{
    const char *end = source + length;
    const char *p = source; /* start of the most recent text segment */
    const char *i = source;
    int in_italic = 0;
    int bail = 0;
    struct segment *head = new_segment(SEG_HEAD, 0, NULL);
    struct segment *tail = head;
    struct segment *italic_head = NULL, *italic_tail = NULL;
    struct segment *ret;

    while (!bail && i < end) {
        switch (*i) {
        case '\\':
            /* skip the escaped character, but never step beyond `end` */
            if (end - i > 1)
                i++;
            break;
        case '/':
            if (!in_italic) {
                if ((i != source && isword(i[-1])) || // previous char must be ^ or non-word
                    (end - i) < 3 ||                  // cannot start near EOB
                    !isword(i[1]))                    // next char must be word
                    break;
                PDEBUG("starting italics @ source[%zu]\n", (size_t)(i - source));
                in_italic = 1;
                push_text(&tail, (size_t)(i - p), p);
                italic_head = italic_tail = new_segment(SEG_HTML, 3, "<i>");
                p = i; /* the opening '/' stays part of the text */
            } else {
                if (!isword(i[-1]) ||                     // previous char must be word
                    ((i + 1) != end && isword(i[1])))     // next char must be EOB or non-word
                    break;
                PDEBUG("stopping italics @ source[%zu]\n", (size_t)(i - source));
                in_italic = 0;
                /* +1 keeps the closing '/' in the text as well */
                push_text(&italic_tail, (size_t)(i - p) + 1, p);
                push_html(&italic_tail, 4, "</i>");
                tail->next = italic_head;
                tail = italic_tail;
                p = i + 1;
            }
            break;
        case '<': {
            const char *newi = i;
            struct segment *link_tail = NULL;
            struct segment *link_head = try_link(&newi, end, &link_tail);

            if (link_head == NULL) {
                // reached EOB before link could finish, bail out
                bail = 1;
                break;
            }
            if (DEBUG)
                print_segment(link_tail);
            PDEBUG("*newi = '%c'\n", *newi);
            if (in_italic) {
                push_text(&italic_tail, (size_t)(i - p), p);
                italic_tail->next = link_head;
                italic_tail = link_tail;
            } else {
                push_text(&tail, (size_t)(i - p), p);
                tail->next = link_head;
                tail = link_tail;
            }
            i = newi; /* now at the closing '>' */
            p = i + 1;
            break;
        }
        default:
            break;
        }
        i++;
    }

    if (in_italic) {
        /*
         * Italics were never closed. Splice whatever was collected after
         * the "<i>" node into the main list, then recycle the "<i>" node
         * itself as the trailing text segment.
         */
        struct segment *collected = italic_head->next;

        if (DEBUG)
            PLINE();
        if (collected != NULL) {
            tail->next = collected;
            tail = italic_tail;
        }
        italic_head->type = SEG_TEXT;
        italic_head->length = (size_t)(end - p);
        italic_head->text = p;
        italic_head->next = NULL;
        tail->next = italic_head;
    } else {
        tail->next = new_segment(SEG_TEXT, (size_t)(end - p), p);
    }

    ret = head->next;
    free(head);
    return ret;
}

/*
 * Read one line from stdin into a growing static buffer and return it
 * without the trailing newline, or NULL once input is exhausted. NUL bytes
 * in the input are dropped. The returned pointer is only valid until the
 * next call. A final line that lacks a newline is still returned.
 */
static const char *
nextline(void)
{
    static size_t buflen = 1024;
    static char *buf = NULL;
    size_t used = 0;
    int c; /* must be int: a char cannot represent EOF distinctly */

    if (buf == NULL)
        buf = xmalloc(buflen);

    /* invariant: used < buflen, so buf[used] is always writable */
    while ((c = getc(stdin)) != EOF) {
        if (c == '\n') {
            buf[used] = '\0';
            return buf;
        }
        if (c == '\0')
            continue;
        buf[used++] = (char)c;
        if (used == buflen) {
            size_t grown = buflen + buflen / 2;
            char *tmp = realloc(buf, grown);

            if (tmp == NULL) {
                fprintf(stderr, "%s: out of memory\n", argv0);
                exit(3);
            }
            buf = tmp;
            buflen = grown;
        }
    }

    if (used > 0) {
        buf[used] = '\0';
        return buf;
    }
    return NULL;
}

/*
 * Convert every input line: print its segments followed by a "NEXT" marker
 * line, flushing after each so a consumer on a pipe sees results promptly.
 */
static void
convert(void)
{
    const char *s;

    while ((s = nextline()) != NULL) {
        struct segment *list, *seg;

        if (DEBUG)
            fprintf(stderr, "processing line: %s\n", s);
        list = segmentize(strlen(s), s);
        for (seg = list; seg != NULL; seg = seg->next)
            print_segment(seg);
        free_segments(list);
        puts("NEXT");
        fflush(stdout);
    }
}

/* Print the command-line help and exit with status 1. */
static void
usage(void)
{
    printf(
        "usage: %s <VERB> [...args]\n\n"
        "VERB may be one of:\n"
        " test [N] - format the in-built test strings, or test string N if provided.\n"
        " convert - read input lines, convert them, and print the result until EOF.\n"
        , argv0);
    exit(1);
}

/*
 * Built-in samples, terminated by a NULL sentinel.
 * NOTE(review): the four link samples were stripped from the recovered
 * source; the URLs and names below are placeholders.
 */
static const char *strings[] = {
    "This string should be one SEG_TEXT.",
    "And this//",
    "This string should have an /italic/ part.",
    "And this /o/",
    "/This/ string should have /two/.",
    "<http://example.com>",
    "<http://example.com|an example>",
    "how about an /<http://example.com|italic link>/?",
    "but no <http://example.com|/italic/ name>, right?",
    NULL
};

/* Segmentize one string and print the result. */
static void
run_test(const char *s)
{
    size_t length = strlen(s);
    struct segment *list, *seg;

    printf("\nsegmentizing: \"%s\"(%zu)\n", s, length);
    list = segmentize(length, s);
    for (seg = list; seg != NULL; seg = seg->next)
        print_segment(seg);
    free_segments(list);
}

/* Run built-in sample number n, or exit with status 2 if it does not exist. */
static void
run_testn(int n)
{
    /* the last array slot is the NULL sentinel, not a sample */
    if (n >= 0 && (size_t)n < LEN(strings) - 1) {
        run_test(strings[n]);
    } else {
        fprintf(stderr, "test # out of range: %d\n", n);
        exit(2);
    }
}

/* Run every built-in sample in order. */
static void
run_tests(void)
{
    const char *const *s;

    for (s = strings; *s != NULL; s++)
        run_test(*s);
}

int
main(int argc, char **argv)
{
    /* set the program name first so usage() can report it */
    if (argc > 0 && argv[0] != NULL)
        argv0 = argv[0];
    if (argc < 2)
        usage();
    argv++;
    argc--;

    if (strcmp(argv[0], "test") == 0) {
        if (argc > 1) {
            char *stop;
            long n = strtol(argv[1], &stop, 10);

            if (stop == argv[1] || *stop != '\0') {
                fprintf(stderr, "invalid test number: %s\n", argv[1]);
                exit(2);
            }
            /* anything outside int range is certainly out of range */
            run_testn(n >= 0 && n <= INT_MAX ? (int)n : -1);
        } else {
            run_tests();
        }
    } else if (strcmp(argv[0], "convert") == 0) {
        convert();
    } else {
        usage();
    }
    return 0;
}