/* a minimal markup language */
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Kinds of segment; the value is stored in the `type` member of a segment. */
enum {
	SEG_HEAD = 0, // never export
	SEG_HTML = 1, // OK to include in the output
	SEG_TEXT = 2, // raw user-generated text
	SEG_IESC = 3  // should be immediately escaped python-side and treated as html
};
/*
 * Debug helpers.
 *
 * Building with -DDEBUG turns PDEBUG/PEXPR into stderr traces; otherwise they
 * expand to a no-op expression. After this block DEBUG is always defined as
 * 0 or 1, so it can be tested with a plain `if(DEBUG)`.
 *
 * The macros expand to a single expression WITHOUT a trailing semicolon, so
 * `if(x) PDEBUG(...); else ...` parses as expected.
 */
#ifdef DEBUG
#undef DEBUG
#define PDEBUG(...) fprintf(stderr, __VA_ARGS__)
#define PEXPR(exp) fprintf(stderr, "(%s) = %ld\n", #exp, (long)(exp))
#define DEBUG 1
#else
#define PDEBUG(...) ((void)0)
#define PEXPR(exp) ((void)0)
#define DEBUG 0
#endif
/* report the current line and enclosing function on stderr (always enabled) */
#define PLINE() fprintf(stderr, "on line %d in func %s\n", __LINE__, __func__)
/* element count of a true array (not a pointer) */
#define LEN(x) (sizeof(x) / sizeof((x)[0]))
/* program name shown in the usage text; main() replaces the default with argv[0] */
static const char *argv0 = "markup";
/*
 * One node of the singly linked output list.
 * The node stores the text pointer it was given; the bytes are not copied.
 */
struct segment {
	int             type;   /* one of the SEG_* kinds */
	size_t          length; /* number of bytes of `text` that belong to this node */
	const char     *text;   /* may be NULL (the list head uses NULL with length 0) */
	struct segment *next;   /* following node, or NULL at the end of the list */
};
/*
 * Return non-zero when c counts as a "word" character for italic-boundary
 * detection: '_', '<', '>' or anything isalnum() accepts.
 *
 * The value is converted to unsigned char first: passing a negative plain
 * char (any byte >= 0x80 where char is signed) to isalnum() is undefined.
 */
static int
isword(char c)
{
	if(c == '_' || c == '<' || c == '>')
		return 1;
	return isalnum((unsigned char)c) != 0;
}
/*
 * Write one segment to stdout as "KIND(LLLL): text", where KIND is the
 * four-letter name of seg->type and LLLL is seg->length zero-padded to four
 * digits. In DEBUG builds the same line, plus the address of the next node,
 * is also written to stderr.
 *
 * A NULL seg->text is printed as an empty string. A type outside the SEG_*
 * set is printed as "????" instead of reading an uninitialised pointer.
 */
static void
print_segment(struct segment *seg)
{
	const char *ty;
	const char *text = seg->text != NULL ? seg->text : "";

	switch(seg->type){
	case SEG_HEAD:
		ty = "HEAD";
		break;
	case SEG_HTML:
		ty = "HTML";
		break;
	case SEG_TEXT:
		ty = "TEXT";
		break;
	case SEG_IESC:
		ty = "IESC";
		break;
	default:
		ty = "????";
		break;
	}
	if(DEBUG)
		fprintf(stderr, "%s(%04zu)->%p: %.*s\n", ty, seg->length, (void *)seg->next, (int)seg->length, text);
	printf("%s(%04zu): %.*s\n", ty, seg->length, (int)seg->length, text);
}
/*
 * Allocate a detached segment (next == NULL) of the given type that refers
 * to `length` bytes starting at `text`. The text pointer is stored as-is;
 * the bytes are not copied. `text` may be NULL.
 *
 * Never returns NULL: the process exits with a diagnostic if the allocation
 * fails, because no caller checks the result.
 * A one-line trace of every new segment is written to stderr.
 */
static inline struct segment *
new_segment(int type, size_t length, const char *text)
{
	struct segment *seg = malloc(sizeof *seg);

	if(seg == NULL){
		perror("malloc");
		exit(EXIT_FAILURE);
	}
	seg->type = type;
	seg->length = length;
	seg->text = text;
	seg->next = NULL;
	/* %p requires void *, and %s must not be handed NULL even with precision 0 */
	fprintf(stderr, "new_segment @ %p: %d / %zu / %.*s\n", (void *)seg, type, length, (int)length, text != NULL ? text : "");
	return seg;
}
/*
 * Create a segment, link it after the node *seg currently points at, and
 * advance *seg to the new node. *seg must not be NULL.
 */
static inline void
push_segment(struct segment **seg, int type, size_t length, const char *text)
{
	struct segment *fresh = new_segment(type, length, text);
	struct segment *prev = *seg;

	prev->next = fresh;
	*seg = fresh;
}
/* Append a SEG_HTML segment after *seg and advance *seg to it. */
static inline void
push_html(struct segment **seg, size_t length, const char *text)
{
	const int kind = SEG_HTML;

	push_segment(seg, kind, length, text);
}
/* Append a SEG_TEXT segment after *seg and advance *seg to it. */
static inline void
push_text(struct segment **seg, size_t length, const char *text)
{
	const int kind = SEG_TEXT;

	push_segment(seg, kind, length, text);
}
/* Append a SEG_IESC segment after *seg and advance *seg to it. */
static inline void
push_iesc(struct segment **seg, size_t length, const char *text)
{
	const int kind = SEG_IESC;

	push_segment(seg, kind, length, text);
}
/*
 * attempt to convert a string of the form <href[|name]> into either:
 * - H'<a href="' I[href] H'">&lt;' I[href] H'&gt;</a>'   (no name given)
 * - H'<a href="' I[href] H'">' I[name] H'</a>'           (name given)
 * where H is a SEG_HTML segment and I is a SEG_IESC segment.
 *
 * The angle brackets shown around a bare href are emitted as entities
 * because they sit in SEG_HTML segments, which go to the output as-is.
 *
 * ip    - in: points at the opening '<';
 *         out (success only): points at the closing '>'.
 * end   - one past the last byte that may be examined.
 * tailp - out (success only): last segment of the produced chain.
 *
 * Returns the first segment of the chain, or NULL when no '>' is found
 * before end; in that case nothing is allocated and *ip / *tailp are
 * left untouched. Only the first '|' separates href from name.
 */
static struct segment *
try_link(const char **ip, const char *end, struct segment **tailp)
{
	/* lengths are taken from the literals so the two cannot drift apart */
	static const char open_attr[] = "<a href=\"";
	static const char close_attr[] = "\">";
	static const char close_attr_lt[] = "\">&lt;";
	static const char gt_close_tag[] = "&gt;</a>";
	static const char close_tag[] = "</a>";
	const char *i = *ip+1, *href = i, *hend = NULL, *name = NULL;
	struct segment *head, *tail;

	for(; i < end; i++){
		if(*i == '|'){
			if(name == NULL){
				name = i+1;
				hend = i;
			}
			continue;
		}
		if(*i != '>')
			continue;
		/* closing bracket found: without a '|' the href runs up to here */
		if(hend == NULL)
			hend = i;
		head = tail = new_segment(SEG_HTML, sizeof open_attr - 1, open_attr);
		push_iesc(&tail, hend - href, href);
		if(name == NULL){
			push_html(&tail, sizeof close_attr_lt - 1, close_attr_lt);
			push_iesc(&tail, hend - href, href);
			push_html(&tail, sizeof gt_close_tag - 1, gt_close_tag);
		}else{
			push_html(&tail, sizeof close_attr - 1, close_attr);
			push_iesc(&tail, i - name, name);
			push_html(&tail, sizeof close_tag - 1, close_tag);
		}
		*ip = i;
		*tailp = tail;
		return head;
	}
	return NULL;
}
/*
 * Scan source[0..length) once and build a singly linked list of segments:
 * - plain runs become SEG_TEXT segments,
 * - /italic/ spans are wrapped in SEG_HTML "<i>" and "</i>" segments,
 * - <href[|name]> spans are replaced by the chain try_link() produces.
 *
 * Returns the first real segment; the SEG_HEAD placeholder used while
 * building is freed before returning. The list always ends with a SEG_TEXT
 * segment (possibly of length 0) covering the bytes from p to end.
 * Segment text pointers refer into `source` or to string literals; nothing
 * is copied here.
 */
static struct segment * // TODO: rewrite with ICU
segmentize(size_t length, const char *source)
{
// p is the start of the most recent text segment
const char *p = source, *i, *newi, *end = source + length;
// NOTE(review): in_escape is initialised but never read in this function
int in_italic = 0, in_escape = 0;
struct segment *head = new_segment(SEG_HEAD, 0, NULL), *ret;
struct segment *tail = head;
// while in_italic is set, new segments are queued on this side list and
// only spliced onto the main list once the closing '/' is accepted
struct segment *italic_head, *italic_tail;
struct segment *link_head, *link_tail;
for(i = source; i < end; i++){
switch(*i){
case '\\':
// step over the byte after the backslash so it is not examined by this
// switch; p is not moved, so both bytes stay in the pending text run
i++;
break;
case '/':
if(!in_italic){ // begin italic section
PDEBUG("starting italics @ source[%zu]\n", i - source);
PEXPR(i != source);
PEXPR(i != source && isword(i[-1]));
PEXPR(end - i);
// NOTE(review): in DEBUG builds the next two traces evaluate i[1] before
// the (end - i) < 3 check below, i.e. source[length] when '/' is the last byte
PEXPR(!isword(i[1]));
PEXPR((i != source && isword(i[-1])) || (end - i) < 3 || !isword(i[1]));
if((i != source && isword(i[-1])) || // previous char must be ^ or non-word
(end - i) < 3 || // cannot start near EOB
!isword(i[1])) // next char must be word
// `continue` applies to the enclosing for loop: the '/' stays ordinary text
continue;
in_italic = 1;
PDEBUG("head = %p, head->next = %p, tail = %p\n", head, head->next, tail);
// flush the text that precedes the opening '/' onto the main list
push_text(&tail, i - p, p);
PDEBUG("head = %p, head->next = %p, tail = %p\n", head, head->next, tail);
italic_head = italic_tail = new_segment(SEG_HTML, 3, "<i>");
// the pending text run now starts at the opening '/' itself
p = i;
}else{ // end italic section
PDEBUG("stopping italics @ source[%zu]\n", i - source);
PEXPR(!isword(i[-1]));
PEXPR((i+1) != end);
// NOTE(review): in DEBUG builds this evaluates i[1] even when i+1 == end
PEXPR(isword(i[1]));
PEXPR(!isword(i[-1]) || ((i+1) != end && isword(i[1])));
if(!isword(i[-1]) || // previous char must be word
((i+1) != end && isword(i[1]))) // next char must be EOB or non-word
continue;
in_italic = 0;
// text from p through this closing '/' inclusive
push_text(&italic_tail, i - p + 1, p);
push_html(&italic_tail, 4, "</i>");
// splice the finished italic side list onto the main list
tail->next = italic_head;
tail = italic_tail;
p = i + 1;
}
break;
case '<':
newi = i;
link_head = try_link(&newi, end, &link_tail);
if(link_head == NULL){ // reached EOB before link could finish, bail out
// ends the scan: the for loop's i++ then leaves i past end and the
// loop condition fails; p is unchanged, so the rest becomes trailing text
i = end;
break;
}
if (DEBUG)
print_segment(link_tail);
PDEBUG("*newi = '%c'\n", *newi);
// flush the text before '<', then append the link chain to whichever
// list is currently being built
if(in_italic){
push_text(&italic_tail, i - p, p);
italic_tail->next = link_head;
italic_tail = link_tail;
}else{
push_text(&tail, i - p, p);
tail->next = link_head;
tail = link_tail;
}
// resume after the closing '>' that try_link() reported through newi
i = newi;
p = i+1;
break;
}
}
if(in_italic){
// an italic span was opened but never closed: the "<i>" marker is not
// emitted. PLINE() writes to stderr regardless of DEBUG.
PLINE();
// move whatever was queued after the marker onto the main list
tail->next = italic_head->next;
if(tail->next != NULL)
tail = italic_tail;
// recycle the marker node as the final SEG_TEXT segment
tail->next = italic_head; // reuse it :D
tail = tail->next;
tail->type = SEG_TEXT;
tail->length = end - p;
tail->text = p;
tail->next = NULL;
}else{
// remaining bytes after the last consumed construct (may be zero-length)
tail->next = new_segment(SEG_TEXT, end - p, p);
}
// drop the SEG_HEAD placeholder; callers only see the real segments
ret = head->next;
free(head);
return ret;
}
/*
 * Read the next line from stdin into an internal, growing buffer.
 *
 * Returns a NUL-terminated string without the trailing '\n', or NULL once
 * the input is exhausted. The buffer is reused: the returned pointer is only
 * valid until the next call. NUL bytes in the input are dropped.
 *
 * A final line that is not terminated by '\n' is returned as well instead of
 * being discarded. Exits with a diagnostic if the buffer cannot be allocated.
 */
static const char *
nextline(void)
{
	static size_t buflen = 1024;
	static char *buf = NULL;
	size_t used = 0;
	int c; /* int, not char: EOF must stay distinguishable from byte 0xFF */

	if(buf == NULL){
		buf = malloc(buflen);
		if(buf == NULL){
			perror("malloc");
			exit(EXIT_FAILURE);
		}
	}
	while((c = getc(stdin)) != EOF){
		if(c == '\n'){
			buf[used] = '\0';
			return buf;
		}
		if(c == '\0')
			continue;
		buf[used++] = (char)c;
		/* grow by half while one free slot is still guaranteed for the terminator */
		if(used == buflen){
			size_t newlen = buflen + buflen / 2;
			char *grown = realloc(buf, newlen);

			if(grown == NULL){
				perror("realloc");
				exit(EXIT_FAILURE);
			}
			buf = grown;
			buflen = newlen;
		}
	}
	if(used > 0){
		buf[used] = '\0';
		return buf;
	}
	return NULL;
}
/*
 * Filter mode: for every input line, print its segments (one per output
 * line) followed by a line containing "NEXT", then flush stdout so a reader
 * on the other end of a pipe sees the complete answer.
 *
 * Each node is freed once printed. Only the nodes are released: their text
 * pointers refer into the line buffer or to string literals.
 */
static void
convert(void)
{
	const char *line;

	while((line = nextline()) != NULL){
		struct segment *seg, *next;

		fprintf(stderr, "processing line: %s\n", line);
		for(seg = segmentize(strlen(line), line); seg != NULL; seg = next){
			next = seg->next; /* fetch before the node is freed */
			print_segment(seg);
			free(seg);
		}
		puts("NEXT");
		fflush(stdout);
	}
}
/* Print the command-line synopsis to stdout and terminate with status 1. */
static void
usage()
{
	static const char verbs[] =
		"VERB may be one of:\n"
		" test [N] - format the in-built test strings, or test string N if provided.\n"
		" convert - read input lines, convert them, and print the result until EOF.\n";

	printf("usage: %s <VERB> [...args]\n\n", argv0);
	fputs(verbs, stdout);
	exit(1);
}
/* Built-in sample inputs for the `test` verb; the list is NULL-terminated. */
static const char *strings[] = {
	/* 0 */ "This string should be one SEG_TEXT.",
	/* 1 */ "And this//",
	/* 2 */ "This string should have an /italic/ part.",
	/* 3 */ "And this /o/",
	/* 4 */ "/This/ string should have /two/.",
	/* 5 */ "<link>",
	/* 6 */ "<example.com|link text>",
	/* 7 */ "how about an /<href|italicized link>/?",
	/* 8 */ "but no <href|/partially/ italicized ones>, right?",
	NULL
};
/*
 * Segmentize one NUL-terminated string and print the input followed by each
 * resulting segment. Every node is freed after it has been printed.
 * s must not be NULL.
 */
static void
run_test(const char *s)
{
	const size_t len = strlen(s);
	struct segment *seg, *next;

	printf("\nsegmentizing: \"%s\"(%zu)\n", s, len);
	for(seg = segmentize(len, s); seg != NULL; seg = next){
		next = seg->next; /* fetch before the node is freed */
		print_segment(seg);
		free(seg);
	}
}
/*
 * Run the built-in test string with index n.
 * Prints a diagnostic to stderr and exits with status 2 when n does not
 * name a test.
 */
static void
run_testn(int n)
{
	/* the last slot of strings[] is the NULL terminator, not a test */
	const size_t ntests = LEN(strings) - 1;

	if(n < 0 || (size_t)n >= ntests){
		fprintf(stderr, "test # out of range: %d\n", n);
		exit(2);
	}
	run_test(strings[n]);
}
/* Run every built-in test string in order, stopping at the NULL terminator. */
static void
run_tests()
{
	size_t idx = 0;

	while(strings[idx] != NULL){
		run_test(strings[idx]);
		idx++;
	}
}
/*
 * Entry point: markup <VERB> [...args]
 *   test [N]  - run all built-in test strings, or only number N
 *   convert   - convert lines read from stdin until EOF
 * Any other invocation prints the usage text and exits with status 1.
 * A test number that is not a complete decimal integer is rejected with
 * status 2 instead of being silently read as 0.
 */
int
main(int argc, char **argv)
{
	const char *verb;

	if(argc < 2)
		usage();
	argv0 = argv[0];
	verb = argv[1];
	if(strcmp(verb, "test") == 0){
		if(argc > 2){
			char *rest;
			long n = strtol(argv[2], &rest, 10);

			/* reject empty input, trailing garbage and values that do not fit an int */
			if(rest == argv[2] || *rest != '\0' || n < INT_MIN || n > INT_MAX){
				fprintf(stderr, "%s: invalid test number: %s\n", argv0, argv[2]);
				exit(2);
			}
			run_testn((int)n);
		}else
			run_tests();
	}else if(strcmp(verb, "convert") == 0)
		convert();
	else
		usage();
	return 0;
}