htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

html.c (34967B)


      1 #include <stdio.h>
      2 #include <string.h>
      3 #include <stdint.h>
      4 #include <stdbool.h>
      5 #include <stdlib.h>
      6 #include <grapheme.h>
      7 #include "html.h"
      8 #include "entities.h"
      9 #include "misc.h"
     10 
     11 static const char *void_elements[] = {
     12 	"area", "base", "br", "col", "embed", "hr", "img",
     13 	"input", "link", "meta", "source", "track", "wbr"
     14 };
     15 
     16 /* Only needed for debugging */
     17 /* static const char *state_to_string(enum State state)
     18 {
     19 	switch(state) {
     20 		case STATE_INNER_TEXT: return "STATE_INNER_TEXT";
     21 		case STATE_TAG: return "STATE_TAG";
     22 		case STATE_BEGIN_TAG_NAME: return "STATE_BEGIN_TAG_NAME";
     23 		case STATE_END_TAG_NAME: return "STATE_END_TAG_NAME";
     24 		case STATE_ATTR_NAME: return "STATE_ATTR_NAME";
     25 		case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE";
     26 		case STATE_COMMENT: return "STATE_COMMENT";
     27 		case STATE_SCRIPT: return "STATE_SCRIPT";
     28 		case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG";
     29 		case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG";
     30 		case STATE_STYLE: return "STATE_STYLE";
     31 		case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG";
     32 		case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG";
     33 		case STATE_CHAR_REF: return "STATE_CHAR_REF";
     34 		case STATE_CHAR_REF_NUMERIC: return "STATE_CHAR_REF_NUMERIC";
     35 	}
     36 	return "";
     37 } */
     38 
     39 static inline bool tag_is_void_element(struct Tag *tag)
     40 {
     41 	for (int i=0; i<13; i++) {
     42 		if (strcmp(tag->name, void_elements[i]) == 0)
     43 			return true;
     44 	}
     45 	return false;
     46 }
     47 
     48 static inline bool is_c0_control(uint_least32_t cp)
     49 {
     50 	if (cp >= 0x00 && cp <= 0x1F)
     51 		return true;
     52 	return false;
     53 }
     54 
     55 static inline bool is_control(uint_least32_t cp)
     56 {
     57 	if (is_c0_control(cp))
     58 		return true;
     59 	if (cp >= 0x7F && cp <= 0x9F)
     60 		return true;
     61 	return false;
     62 }
     63 
     64 static inline bool is_non_char(uint_least32_t cp)
     65 {
     66 	if (cp >= 0xFDD0 && cp <= 0xFDEF)
     67 		return true;
     68 	if (
     69 		cp == 0xFFFE		|| cp == 0xFFFF		||
     70 		cp == 0x1FFFE		|| cp == 0x1FFFF	||
     71 		cp == 0x2FFFE		|| cp == 0x2FFFF	||
     72 		cp == 0x3FFFE		|| cp == 0x3FFFF	||
     73 		cp == 0x4FFFE		|| cp == 0x4FFFF	||
     74 		cp == 0x5FFFE		|| cp == 0x5FFFF	||
     75 		cp == 0x6FFFE		|| cp == 0x6FFFF	||
     76 		cp == 0x7FFFE		|| cp == 0x7FFFF	||
     77 		cp == 0x8FFFE		|| cp == 0x8FFFF	||
     78 		cp == 0x9FFFE		|| cp == 0x9FFFF	||
     79 		cp == 0xAFFFE		|| cp == 0xAFFFF	||
     80 		cp == 0xBFFFE		|| cp == 0xBFFFF	||
     81 		cp == 0xCFFFE		|| cp == 0xCFFFF	||
     82 		cp == 0xDFFFE		|| cp == 0xDFFFF	||
     83 		cp == 0xEFFFE		|| cp == 0xEFFFF	||
     84 		cp == 0xFFFFE		|| cp == 0xFFFFF	||
     85 		cp == 0x10FFFE      || cp == 0x10FFFF
     86 	)
     87 		return true;
     88 	return false;
     89 }
     90 
     91 static inline bool attr_name_char_is_valid(uint_least32_t cp)
     92 {
     93 	if (is_control(cp))
     94 		return false;
     95 	if (is_non_char(cp))
     96 		return false;
     97 	if (
     98 		cp == SPACE             ||
     99 		cp == QUOTATION_MARK    ||
    100 		cp == APOSTROPHE        ||
    101 		cp == GREATER_THAN_SIGN ||
    102 		cp == SOLIDUS           ||
    103 		cp == EQUALS_SIGN
    104 	)
    105 		return false;
    106 	return true;
    107 }
    108 
    109 static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp)
    110 {
    111 	/*
    112 		Not mentioned invalid characters.
    113 		They are already handled before
    114 		function call.
    115 	*/
    116 	if (
    117 		cp == EQUALS_SIGN       ||
    118 		cp == LESS_THAN_SIGN    ||
    119 		cp == GREATER_THAN_SIGN ||
    120 		cp == GRAVE_ACCENT
    121 	)
    122 		return false;
    123 	return true;
    124 }
    125 
    126 static inline bool ascii_is_digit(uint_least32_t cp)
    127 {
    128 	if (cp >= 0x30 && cp <= 0x39)
    129 		return true;
    130 	return false;
    131 }
    132 
    133 static inline bool ascii_alpha_is_upper(uint_least32_t cp)
    134 {
    135 	if (cp >= 0x41 && cp <= 0x5A)
    136 		return true;
    137 	return false;
    138 }
    139 
    140 static inline bool ascii_alpha_is_lower(uint_least32_t cp)
    141 {
    142 	if (cp >= 0x61 && cp <= 0x7A)
    143 		return true;
    144 	return false;
    145 }
    146 
    147 static inline bool ascii_is_alpha(uint_least32_t cp)
    148 {
    149 	if (ascii_alpha_is_lower(cp) || ascii_alpha_is_upper(cp))
    150 		return true;
    151 	return false;
    152 }
    153 
    154 static inline bool ascii_is_whitespace(uint_least32_t cp)
    155 {
    156 	if (
    157 		cp == TAB	||
    158 		cp == LF	||
    159 		cp == FF	||
    160 		cp == CR	||
    161 		cp == SPACE
    162 	)
    163 		return true;
    164 	return false;
    165 }
    166 
    167 static bool find_opts_exist(struct FindOpts *opts)
    168 {
    169 	if (opts->tag && strlen(opts->tag) > 0)
    170 		return true;
    171 	if (opts->attr && strlen(opts->attr) > 0)
    172 		return true;
    173 	if (opts->key && strlen(opts->key) > 0)
    174 		return true;
    175 	return false;
    176 }
    177 
    178 struct FindOpts *find_opts_parse(const char *pattern)
    179 {
    180 	struct FindOpts *opts = malloc(sizeof(struct FindOpts));
    181 	opts->out = OUT_OUTER_HTML;
    182     opts->tag = NULL;
    183     opts->attr = NULL;
    184     opts->key = NULL;
    185 	bool is_class_value = false;
    186 	bool is_id_value = false;
    187 	int i = 0;
    188 	bool is_attr_key = false;
    189 	bool is_attr_or_tag = true;
    190 	char *attr_or_tag = NULL;
    191 	int aot = 0;
    192 	int ak = 0;
    193 	int av = 0;
    194 	switch (pattern[0]) {
    195 		case '.':
    196 			is_class_value = true;
    197 			i = 1;
    198 			break;
    199 		case '#':
    200 			is_id_value = true;
    201 			i = 1;
    202 			break;
    203 	}
    204 	for (; i<strlen(pattern); i++) {
    205 		if (pattern[i] == ']')
    206 			break;
    207 		if (
    208 		    !is_attr_key &&
    209 			!is_attr_or_tag &&
    210 			pattern[i] != ']' &&
    211 			pattern[i] != '"'
    212 		) {
    213 			opts->attr = realloc(opts->attr, (av+1) * sizeof(char));
    214 			opts->attr[av] = pattern[i];
    215 			av++;
    216 		}
    217 		if (pattern[i] == '=')
    218 			is_attr_key = false;
    219 		if (is_attr_key && !is_attr_or_tag) {
    220 			opts->key = realloc(opts->key, (ak+1) * sizeof(char));
    221 			opts->key[ak] = pattern[i];
    222 			ak++;
    223 		}
    224 		if (pattern[i] == '[') {
    225 			is_attr_key = true;
    226 			is_attr_or_tag = false;
    227 		}
    228 		if (is_attr_or_tag) {
    229 			attr_or_tag = realloc(attr_or_tag, (aot+1) * sizeof(char));
    230 			attr_or_tag[aot] = pattern[i];
    231 			aot++;
    232 		}
    233 	}
    234 	attr_or_tag = realloc(attr_or_tag, (aot+1) * sizeof(char));
    235 	attr_or_tag[aot] = 0;
    236 	if (is_id_value) {
    237 		free(opts->key);
    238 		opts->key = NULL;
    239 		free(opts->attr);
    240 		opts->attr = NULL;
    241 		opts->attr = attr_or_tag;
    242 		opts->key = realloc(opts->key, 3 * sizeof(char));
    243 		opts->key[0] = 'i';
    244 		opts->key[1] = 'd';
    245 		opts->key[2] = 0;
    246 	} else if (is_class_value) {
    247 		free(opts->key);
    248 		opts->key = NULL;
    249 		free(opts->attr);
    250 		opts->attr = NULL;
    251 		opts->attr = attr_or_tag;
    252 		opts->key = realloc(opts->key, 6 * sizeof(char));
    253 		opts->key[0] = 'c';
    254 		opts->key[1] = 'l';
    255 		opts->key[2] = 'a';
    256 		opts->key[3] = 's';
    257 		opts->key[4] = 's';
    258 		opts->key[5] = 0;
    259 	} else {
    260 		free(opts->tag);
    261 		opts->tag = attr_or_tag;
    262 		if (av > 0) {
    263 			opts->attr = realloc(opts->attr, (av+1) * sizeof(char));
    264 			opts->attr[av] = 0;
    265 		}
    266 		if (ak > 0) {
    267 			opts->key = realloc(opts->key, (ak+1) * sizeof(char));
    268 			opts->key[ak] = 0;
    269 		}
    270 	}
    271 	return opts;
    272 }
    273 
    274 void find_opts_free(struct FindOpts *opts)
    275 {
    276 	free(opts->tag);
    277 	free(opts->attr);
    278 	free(opts->key);
    279 	free(opts);
    280 }
    281 
    282 enum OutType output_type_parse(const char *type)
    283 {
    284 	if (type == NULL)
    285 		return OUT_OUTER_HTML;
    286 	if (strcmp(type, "outerhtml") == 0)
    287 		return OUT_OUTER_HTML;
    288 	if (strcmp(type, "innerhtml") == 0)
    289 		return OUT_INNER_HTML;
    290 	if (strcmp(type, "innertext") == 0)
    291 		return OUT_INNER_TEXT;
    292 	if (strcmp(type, "attr_value") == 0)
    293 		return OUT_ATTR_VALUE;
    294 	return -1;
    295 }
    296 
    297 static struct Attr *attr_init(void)
    298 {
    299 	struct Attr *attr = malloc(sizeof(struct Attr));
    300     attr->name = NULL;
    301     attr->value = NULL;
    302 	return attr;
    303 }
    304 
    305 static char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base)
    306 {
    307     size_t old_offset = offset;
    308     char *character = malloc(MAX_CODEPOINT_SIZE * sizeof(char));
    309     char *numeric_charref = NULL;
    310     size_t ret;
    311     uint_least32_t cp;
    312     do {
    313         ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp);
    314         numeric_charref = string_concat(numeric_charref, cp_to_string(cp, ret));
    315         offset += ret;
    316     } while (cp != SEMICOLON);
    317     *new_offset = offset - old_offset;
    318     long i = strtol(numeric_charref, NULL, base);
    319     ret = grapheme_encode_utf8((uint_least32_t)i, character, MAX_CODEPOINT_SIZE);
    320     character[ret] = 0;
    321     free(numeric_charref);
    322     return character;
    323 }
    324 
    325 static char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs)
    326 {
    327     uint_least32_t stop_at = 0;
    328     switch(avs) {
    329         case AVS_QUOTATION_MARK:
    330             stop_at = QUOTATION_MARK;
    331             break;
    332         case AVS_APOSTROPHE:
    333             stop_at = APOSTROPHE;
    334             break;
    335         case AVS_UNQUOTED:
    336             stop_at = GREATER_THAN_SIGN;
    337             break;
    338         case AVS_NO: /* Just to silence the compilier warning */
    339             break;
    340     }
    341     char *named_charref = NULL;
    342     size_t ret;
    343     uint_least32_t cp;
    344     int i = 0;
    345     for (;;) {
    346         ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp);
    347         if (cp == AMPERSAND || ascii_is_whitespace(cp))
    348             break;
    349         if (avs > AVS_NO && cp == stop_at)
    350             break;
    351         named_charref = string_concat(named_charref, cp_to_string(cp, ret));
    352         if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF)
    353             break;
    354         offset += ret;
    355         i++;
    356     }
    357     return named_charref;
    358 }
    359 
    360 static void charref_named_concat_remaining_string(const char *parsed_string, const char *charref, char **buf)
    361 {
    362     const char *remaining = &parsed_string[strlen(charref)];
    363     size_t remaining_len = strlen(remaining);
    364     size_t buf_len = strlen(*buf);
    365     if (remaining_len > 0) {
    366         if (remaining_len == 1 && remaining[0] == ';')
    367             return;
    368         *buf = realloc(*buf, buf_len+remaining_len+1);
    369         strcat(*buf, remaining);
    370     }
    371 }
    372 
    373 static char *charref_named_encode(const char *name)
    374 {
    375     char *buf = NULL;
    376     size_t len;
    377     int i;
    378     if (name) {
    379         for (i=0; i<2138; i++) {
    380             if (string_starts_with(name, single_cp_entities[i].name)) {
    381                 buf = realloc(buf, MAX_CODEPOINT_SIZE+1);
    382                 len = grapheme_encode_utf8(single_cp_entities[i].cp, buf, MAX_CODEPOINT_SIZE);
    383                 buf[len] = 0;
    384                 charref_named_concat_remaining_string(name, single_cp_entities[i].name, &buf);
    385                 return buf;
    386             }
    387         }
    388         for (i=0; i<93; i++) {
    389             if (string_starts_with(name, double_cp_entities[i].name)) {
    390                 size_t buf_len = 0;
    391                 buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1);
    392                 len = grapheme_encode_utf8(double_cp_entities[i].cp[0], buf, MAX_CODEPOINT_SIZE);
    393                 buf_len += len;
    394                 buf += len;
    395                 len = grapheme_encode_utf8(double_cp_entities[i].cp[1], buf, MAX_CODEPOINT_SIZE);
    396                 buf_len += len;
    397                 buf[buf_len] = 0;
    398                 charref_named_concat_remaining_string(name, double_cp_entities[i].name, &buf);
    399                 return buf;
    400             }
    401         }
    402         buf = realloc(buf, (strlen(name)+2) * sizeof(char));
    403         buf[0] = '&';
    404         buf[1] = 0;
    405         strcat(buf, name);
    406         return buf;
    407     } else {
    408         buf = realloc(buf, 2 * sizeof(char));
    409         buf[0] = '&';
    410         buf[1] = 0;
    411         return buf;
    412     }
    413 }
    414 
    415 static struct Tag *tag_init(void)
    416 {
    417 	struct Tag *tag = malloc(sizeof(struct Tag));
    418     tag->name = NULL;
    419     tag->inner_text = NULL;
    420 	tag->attrs = NULL;
    421 	tag->children = NULL;
    422 	tag->attrs_len = 0;
    423 	tag->children_len = 0;
    424 	tag->is_void_element = false;
    425 	tag->is_closed = false;
    426 	tag->outer_html_begin_offset = 0;
    427     tag->outer_html_end_offset = 0;
    428     tag->inner_html_begin_offset = 0;
    429     tag->inner_html_end_offset = 0;
    430 	return tag;
    431 }
    432 
    433 static struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset)
    434 {
    435 	for (int i=tag_list->len-1; i>-1; i--) {
    436 		if (strcmp(tag_list->tags[i]->name, end_tag_name) == 0 && !tag_list->tags[i]->is_closed) {
    437 			tag_list->tags[i]->is_closed = true;
    438 			tag_list->tags[i]->outer_html_end_offset = end_offset;
    439 			return tag_list->tags[i];
    440 		}
    441 	}
    442 	return NULL;
    443 }
    444 
    445 static struct Tag *tag_get_last_open(struct TagList *tag_list)
    446 {
    447 	for (int i=tag_list->len-1; i>-1; i--) {
    448 		if (!tag_list->tags[i]->is_void_element && !tag_list->tags[i]->is_closed) {
    449 			return tag_list->tags[i];
    450 		}
    451 	}
    452 	return tag_list->tags[0];
    453 }
    454 
    455 static char *tag_get_outer_html(struct Tag *tag, char *text)
    456 {
    457 	char *outer_html = NULL;
    458 	int o = 0;
    459 	for (int i=tag->outer_html_begin_offset; i<tag->outer_html_end_offset; i++) {
    460 		outer_html = realloc(outer_html, (o+1) * sizeof(char));
    461 		outer_html[o] = text[i];
    462 		o++;
    463 	}
    464 	outer_html = realloc(outer_html, (o+1) * sizeof(char));
    465 	outer_html[o] = 0;
    466 	return outer_html;
    467 }
    468 
    469 static char *tag_get_inner_html(struct Tag *tag, char *text)
    470 {
    471 	char *inner_html = NULL;
    472 	int o = 0;
    473 	for (int i=tag->inner_html_begin_offset; i<tag->inner_html_end_offset; i++) {
    474 		inner_html = realloc(inner_html, (o+1) * sizeof(char));
    475 		inner_html[o] = text[i];
    476 		o++;
    477 	}
    478 	inner_html = realloc(inner_html, (o+1) * sizeof(char));
    479 	inner_html[o] = 0;
    480 	return inner_html;
    481 }
    482 
    483 static enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset)
    484 {
    485 	tag->inner_html_begin_offset = offset+1;
    486 	tag->is_void_element = tag_is_void_element(tag);
    487 	if (tag->is_void_element)
    488 		tag->outer_html_end_offset = offset+1;
    489 	if (strcmp(tag->name, "script") == 0)
    490 		return STATE_SCRIPT;
    491 	else if (strcmp(tag->name, "style") == 0)
    492 		return STATE_STYLE;
    493     return STATE_INNER_TEXT;
    494 }
    495 
    496 static void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset)
    497 {
    498 	int i = offset;
    499 	while (text[i] != '<')
    500 		i--;
    501 	closed_tag->inner_html_end_offset = i;
    502 }
    503 
    504 static void tag_free(struct Tag *tag)
    505 {
    506     free(tag->name);
    507 	free(tag->inner_text);
    508 	for (int i=0; i<tag->attrs_len; i++) {
    509 		free(tag->attrs[i]->name);
    510 		free(tag->attrs[i]->value);
    511 		free(tag->attrs[i]);
    512 	}
    513 	free(tag->attrs);
    514 	for (int i=0; i<tag->children_len; i++) {
    515 		if (tag->children[i] != NULL)
    516 			tag_free(tag->children[i]);
    517 	}
    518 	free(tag->children);
    519 	free(tag);
    520 }
    521 
    522 static void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags)
    523 {
    524 	if (opts->limit > 0 && found_tags->len == opts->limit)
    525 		return;
    526 	bool matches_tag = false;
    527 	bool matches_attr_key = false;
    528 	bool matches_attr_value = false;
    529     if (!string_is_empty(opts->tag)) {
    530         if (strcmp(tag->name, opts->tag) == 0)
    531             matches_tag = true;
    532     }
    533     if (!string_is_empty(opts->key)) {
    534         for (int i=0; i<tag->attrs_len; i++) {
    535             if (strcmp(tag->attrs[i]->name, opts->key) == 0)
    536                 matches_attr_key = true;
    537         }
    538     }
    539     if (!string_is_empty(opts->attr)) {
    540         for (int i=0; i<tag->attrs_len; i++) {
    541             if (tag->attrs[i]->value && strcmp(tag->attrs[i]->value, opts->attr) == 0)
    542                 matches_attr_value = true;
    543         }
    544     }
    545 	if (!string_is_empty(opts->tag) && !string_is_empty(opts->key) && !string_is_empty(opts->attr)) {
    546 		if (matches_tag && matches_attr_key && matches_attr_value) {
    547 			found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag));
    548 			found_tags->tags[found_tags->len] = tag;
    549 			found_tags->len++;
    550 		}
    551 	} else if (!string_is_empty(opts->tag) && !string_is_empty(opts->key)) {
    552 		if (matches_tag && matches_attr_key) {
    553 			found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag));
    554 			found_tags->tags[found_tags->len] = tag;
    555 			found_tags->len++;
    556 		}
    557 	} else if (!string_is_empty(opts->tag)) {
    558 		if (matches_tag) {
    559 			found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag));
    560 			found_tags->tags[found_tags->len] = tag;
    561 			found_tags->len++;
    562 		}
    563 	} else if (!string_is_empty(opts->key) && !string_is_empty(opts->attr)) {
    564 		if (matches_attr_key && matches_attr_value) {
    565 			found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag));
    566 			found_tags->tags[found_tags->len] = tag;
    567 			found_tags->len++;
    568 		}
    569 	} else if (!string_is_empty(opts->key)) {
    570 		if (matches_attr_key) {
    571 			found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag));
    572 			found_tags->tags[found_tags->len] = tag;
    573 			found_tags->len++;
    574 		}
    575 	} else if (!string_is_empty(opts->attr)) {
    576 		if (matches_attr_value) {
    577 			found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag));
    578 			found_tags->tags[found_tags->len] = tag;
    579 			found_tags->len++;
    580 		}
    581 	}
    582 	for (int i=tag->children_len-1; i>-1; i--) {
    583 		tag_find(tag->children[i], opts, found_tags);
    584 	}
    585 }
    586 
    587 static int tag_doctype_parse(const char *text)
    588 {
    589 	size_t offset = 0;
    590 	enum DoctypeState state = DSTATE_TEXT;
    591 	char *doctype = NULL;
    592 	char *lower_doctype = NULL;
    593 	uint_least32_t cp;
    594 	size_t len = strlen(text);
    595 	size_t ret, off;
    596 	for (off = 0; off<len; off += ret) {
    597 		if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) {
    598             fprintf(stderr, "htex: parseDoctype.grapheme_decode_utf8 failed.\n");
    599 		} else {
    600 			switch (state) {
    601 				case DSTATE_TEXT:
    602 					if (cp == LESS_THAN_SIGN) {
    603 						state = DSTATE_POSSIBLE_DTYPE;
    604 						break;
    605 					}
    606 					if (cp == GREATER_THAN_SIGN) {
    607 						offset = off;
    608 						goto CLEANUP;
    609 					}
    610 					break;
    611 				case DSTATE_POSSIBLE_DTYPE:
    612 					if (cp == EXCLAMATION_MARK)
    613 						state = DSTATE_DTYPE_OR_COMMENT;
    614 					else
    615 						goto CLEANUP;
    616 					break;
    617 				case DSTATE_DTYPE_OR_COMMENT:
    618 					if (cp == HYPHEN_MINUS) {
    619 						goto CLEANUP;
    620                     } else {
    621 						doctype = string_concat(doctype, cp_to_string(cp, ret));
    622 						state = DSTATE_DTYPE;
    623 						break;
    624 					}
    625 					break;
    626 				case DSTATE_DTYPE:
    627 					if (ascii_is_whitespace(cp)) {
    628 						size_t dlen = strlen(doctype)+1;
    629 						lower_doctype = malloc(dlen * sizeof(char));
    630 						grapheme_to_lowercase_utf8(doctype, dlen, lower_doctype, dlen);
    631 						if (strcmp(lower_doctype, "doctype") == 0) {
    632 							state = DSTATE_TEXT;
    633                         } else {
    634 							offset = -1;
    635 							goto CLEANUP;
    636 						}
    637 						break;
    638 					}
    639 					doctype = string_concat(doctype, cp_to_string(cp, ret));
    640 					break;
    641 			}
    642 		}
    643 	}
    644 CLEANUP:
    645 	free(doctype);
    646 	free(lower_doctype);
    647 	return offset;
    648 }
    649 
    650 static struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state)
    651 {
    652     struct Tag *tag = tag_init();
    653     tag->outer_html_begin_offset= offset-1;
    654     tag_list->tags = realloc(tag_list->tags, (tag_list->len+1) * sizeof(struct Tag));
    655     tag_list->tags[tag_list->len] = tag;
    656     tag_list->len++;
    657     struct Tag *still_open_tag = tag;
    658     char *end_tag = NULL;
    659     enum State return_to_state = STATE_INNER_TEXT;
    660     size_t a = 0;
    661     size_t attr_name_count = 0;
    662     enum AttrValueSyntax avs = AVS_NO;
    663     size_t hyphen_count = 0;
    664     uint_least32_t cp;
    665     size_t len = strlen(text);
    666     size_t ret, off;
    667     for (off = offset; off<len; off += ret) {
    668         if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) {
    669             fprintf(stderr, "htex: parseTag.grapheme_decode_utf8 failed.\n");
    670         } else {
    671             /* char *the_codepoint = cp_to_string(cp, ret);
    672             printf("cp: %02X, %s, %s\n", cp, the_codepoint, state_to_string(state));
    673             free(the_codepoint); */
    674             switch (state) {
    675                 case STATE_INNER_TEXT:
    676                     if (cp == LESS_THAN_SIGN) {
    677                         state = STATE_TAG;
    678                         break;
    679                     }
    680                     if (cp == AMPERSAND) {
    681                         return_to_state = STATE_INNER_TEXT;
    682                         state = STATE_CHAR_REF;
    683                         break;
    684                     }
    685                     still_open_tag = tag_get_last_open(tag_list);
    686                     still_open_tag->inner_text = string_concat(still_open_tag->inner_text, cp_to_string(cp, ret));
    687                     break;
    688                 case STATE_TAG:
    689                     if (cp == SOLIDUS) {
    690                         state = STATE_END_TAG_NAME;
    691                         break;
    692                     }
    693                     if (cp == EXCLAMATION_MARK) {
    694                         state = STATE_COMMENT;
    695                         break;
    696                     }
    697                     still_open_tag = tag_get_last_open(tag_list);
    698                     struct Tag *one_tag = tag_parse(tag_list, text, off, STATE_BEGIN_TAG_NAME);
    699                     still_open_tag->children = realloc(
    700                         still_open_tag->children,
    701                         (still_open_tag->children_len+1) * sizeof(struct Tag)
    702                     );
    703                     still_open_tag->children[still_open_tag->children_len] = one_tag;
    704                     still_open_tag->children_len++;
    705                     free(end_tag);
    706                     return tag;
    707                 case STATE_BEGIN_TAG_NAME:
    708                     if (cp == GREATER_THAN_SIGN) {
    709                         state = tag_process_end_of_opening_tag(tag, off);
    710                         break;
    711                     }
    712                     if (ascii_is_whitespace(cp)) {
    713                         state = STATE_ATTR_NAME;
    714                         break;
    715                     }
    716                     if (ascii_is_digit(cp) || ascii_is_alpha(cp)) {
    717                         tag->name = string_concat(tag->name, cp_to_string(cp, ret));
    718                     }
    719                     break;
    720                 case STATE_END_TAG_NAME:
    721                     if (cp == GREATER_THAN_SIGN) {
    722                         struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
    723                         if (closed_tag != NULL)
    724                             tag_set_inner_html_end_offset(closed_tag, text, off);
    725                         free(end_tag);
    726                         end_tag = NULL;
    727                         state = STATE_INNER_TEXT;
    728                         break;
    729                     }
    730                     if (!ascii_is_whitespace(cp))
    731                         end_tag = string_concat(end_tag, cp_to_string(cp, ret));
    732                     break;
    733                 case STATE_ATTR_NAME:
    734                     if (cp == GREATER_THAN_SIGN) {
    735                         state = tag_process_end_of_opening_tag(tag, off);
    736                         break;
    737                     }
    738                     if (ascii_is_whitespace(cp)) {
    739                         if (attr_name_count == a+1)
    740                             a++;
    741                         break;
    742                     }
    743                     if (cp == EQUALS_SIGN) {
    744                         state = STATE_ATTR_VALUE;
    745                         break;
    746                     }
    747                     if (attr_name_char_is_valid(cp)) {
    748                         if (attr_name_count != a+1) {
    749                             tag->attrs = realloc(tag->attrs, (a+1) * sizeof(struct Attr));
    750                             tag->attrs[a] = attr_init();
    751                             attr_name_count = a + 1;
    752                             tag->attrs_len = attr_name_count;
    753                         }
    754                         tag->attrs[a]->name = string_concat(tag->attrs[a]->name, cp_to_string(cp, ret));
    755                     }
    756                     break;
    757                 case STATE_ATTR_VALUE:
    758                     if (ascii_is_whitespace(cp)) {
    759                         if (avs == AVS_UNQUOTED) {
    760                             avs = AVS_NO;
    761                             state = STATE_ATTR_NAME;
    762                         } else if (avs == AVS_QUOTATION_MARK || avs == AVS_APOSTROPHE) {
    763                             if (
    764                                 strcmp("id", tag->attrs[a]->name) == 0 ||
    765                                 strcmp("class", tag->attrs[a]->name) == 0
    766                             ) {
    767                                 char *tmp_name = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char));
    768                                 strcpy(tmp_name, tag->attrs[a]->name);
    769                                 tag->attrs = realloc(
    770                                     tag->attrs,
    771                                     (a+1) * sizeof(struct Attr)
    772                                 );
    773                                 a++;
    774                                 tag->attrs[a] = attr_init();
    775                                 free(tag->attrs[a]->name);
    776                                 tag->attrs[a]->name = tmp_name;
    777                                 tag->attrs_len++;
    778                                 attr_name_count = a + 1;
    779                             } else {
    780                                 tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret));
    781                             }
    782                         }
    783                         break;
    784                     }
    785                     if (cp == QUOTATION_MARK) {
    786                         if (avs == AVS_NO) {
    787                             avs = AVS_QUOTATION_MARK;
    788                             break;
    789                         }
    790                         if (avs == AVS_QUOTATION_MARK) {
    791                             avs = AVS_NO;
    792                             state = STATE_ATTR_NAME;
    793                             break;
    794                         }
    795                     }
    796                     if (cp == APOSTROPHE) {
    797                         if (avs == AVS_NO) {
    798                             avs = AVS_APOSTROPHE;
    799                             break;
    800                         }
    801                         if (avs == AVS_APOSTROPHE) {
    802                             avs = AVS_NO;
    803                             state = STATE_ATTR_NAME;
    804                             break;
    805                         }
    806                     }
    807                     if (cp == GREATER_THAN_SIGN) {
    808                         state = tag_process_end_of_opening_tag(tag, off);
    809                         break;
    810                     }
    811                     if (avs == AVS_NO && attr_value_unquoted_char_is_valid(cp)) {
    812                         avs = AVS_UNQUOTED;
    813                     }
    814                     if (avs > AVS_NO) {
    815                         if (cp == AMPERSAND) {
    816                             state = STATE_CHAR_REF;
    817                             return_to_state = STATE_ATTR_VALUE;
    818                             break;
    819                         }
    820                         tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret));
    821                     }
    822                     break;
    823                 case STATE_COMMENT:
    824                     if (cp == GREATER_THAN_SIGN && hyphen_count >= 2) {
    825                         state = STATE_INNER_TEXT;
    826                         break;
    827                     }
    828                     if (cp == HYPHEN_MINUS)
    829                         hyphen_count++;
    830                     else
    831                         hyphen_count = 0;
    832                     break;
    833                 case STATE_STYLE:
    834                     if (cp == LESS_THAN_SIGN) {
    835                         state = STATE_STYLE_POSSIBLE_END_TAG;
    836                         break;
    837                     }
    838                     break;
    839                 case STATE_STYLE_POSSIBLE_END_TAG:
    840                     if (cp == SOLIDUS)
    841                         state = STATE_STYLE_END_TAG;
    842                     else
    843                         state = STATE_STYLE;
    844                     break;
    845                 case STATE_STYLE_END_TAG:
    846                     if (cp == GREATER_THAN_SIGN) {
    847                         struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
    848                         if (closed_tag != NULL)
    849                             tag_set_inner_html_end_offset(closed_tag, text, off);
    850                         free(end_tag);
    851                         end_tag = NULL;
    852                         state = STATE_INNER_TEXT;
    853                         break;
    854                     }
    855                     if (!ascii_is_whitespace(cp))
    856                         end_tag = string_concat(end_tag, cp_to_string(cp, ret));
    857                     break;
    858                 case STATE_SCRIPT:
    859                     if (cp == LESS_THAN_SIGN) {
    860                         state = STATE_SCRIPT_POSSIBLE_END_TAG;
    861                         break;
    862                     }
    863                     break;
    864                 case STATE_SCRIPT_POSSIBLE_END_TAG:
    865                     if (cp == SOLIDUS)
    866                         state = STATE_SCRIPT_END_TAG;
    867                     else
    868                         state = STATE_SCRIPT;
    869                     break;
    870                 case STATE_SCRIPT_END_TAG:
    871                     if (cp == GREATER_THAN_SIGN) {
    872                         struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
    873                         if (closed_tag != NULL)
    874                             tag_set_inner_html_end_offset(closed_tag, text, off);
    875                         free(end_tag);
    876                         end_tag = NULL;
    877                         state = STATE_INNER_TEXT;
    878                         break;
    879                     }
    880                     if (!ascii_is_whitespace(cp))
    881                         end_tag = string_concat(end_tag, cp_to_string(cp, ret));
    882                     break;
    883                 case STATE_CHAR_REF:
    884                     if (cp == NUMBER_SIGN) { /* hashtag */
    885                         state = STATE_CHAR_REF_NUMERIC;
    886                         break;
    887                     }
    888                     char *named_charref = charref_named_parse(text, off, len, avs);
    889                     if (named_charref) {
    890                         off += strlen(named_charref)-1;
    891                     } else {
    892                         off--;
    893                     }
    894                     char *encoded_named_charref = charref_named_encode(named_charref);
    895                     if (return_to_state == STATE_INNER_TEXT) {
    896                         still_open_tag = tag_get_last_open(tag_list);
    897                         still_open_tag->inner_text = string_concat(still_open_tag->inner_text, encoded_named_charref);
    898                     } else if (return_to_state == STATE_ATTR_VALUE) {
    899                         tag->attrs[a]->value = string_concat(tag->attrs[a]->value, encoded_named_charref);
    900                     }
    901                     free(named_charref);
    902                     state = return_to_state;
    903                     break;
    904                 case STATE_CHAR_REF_NUMERIC:
    905                     if (cp == SMALL_LETTER_X || cp == CAPITAL_LETTER_X) {
    906                         size_t new_offset;
    907                         char *numeric_charref = charref_numeric_parse_and_encode(text, off+1, &new_offset, 16);
    908                         off += new_offset;
    909                         if (return_to_state == STATE_INNER_TEXT) {
    910                             still_open_tag = tag_get_last_open(tag_list);
    911                             still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref);
    912                         } else if (return_to_state == STATE_ATTR_VALUE) {
    913                             tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref);
    914                         }
    915                         state = return_to_state;
    916                         break;
    917                     } else if (ascii_is_digit(cp)) {
    918                         size_t new_offset;
    919                         char *numeric_charref = charref_numeric_parse_and_encode(text, off, &new_offset, 10);
    920                         off += new_offset-1;
    921                         if (return_to_state == STATE_INNER_TEXT) {
    922                             still_open_tag = tag_get_last_open(tag_list);
    923                             still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref);
    924                         } else if (return_to_state == STATE_ATTR_VALUE) {
    925                             tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref);
    926                         }
    927                         state = return_to_state;
    928                         break;
    929                     }
    930                 state = return_to_state;
    931                 break;
    932             }
    933         }
    934     }
    935     free(end_tag);
    936     return tag;
    937 }
    938 
    939 static void tag_debug_print(struct Tag *tag, int indent)
    940 {
    941 	for (int i=0; i<indent; i++)
    942 		putchar(' ');
    943 	printf("%s", tag->name);
    944 	for (int i=0; i<tag->attrs_len; i++)
    945 		printf(" %s=%s", tag->attrs[i]->name, tag->attrs[i]->value);
    946 	printf("\n");
    947 	indent++;
    948 	for (int i=tag->children_len-1; i>-1; i--)
    949 		tag_debug_print(tag->children[i], indent);
    950 }
    951 
    952 static struct TagList *tag_list_init(void)
    953 {
    954 	struct TagList *tag_list = malloc(sizeof(struct TagList));
    955 	tag_list->tags = NULL;
    956 	tag_list->len = 0;
    957 	return tag_list;
    958 }
    959 
    960 void tag_list_free(struct TagList *tag_list)
    961 {
    962 	free(tag_list->tags);
    963 	free(tag_list);
    964 }
    965 
    966 static struct HTMLDocument *html_document_init(void)
    967 {
    968     struct HTMLDocument *document = malloc(sizeof(struct HTMLDocument));
    969     document->buffer = NULL;
    970     document->tag = NULL;
    971     document->tag_list = NULL;
    972     return document;
    973 }
    974 
    975 void html_document_free(struct HTMLDocument *document)
    976 {
    977     // free(doc->buffer);
    978     tag_free(document->tag);
    979     tag_list_free(document->tag_list);
    980     free(document);
    981 }
    982 
    983 struct HTMLDocument *html_document_parse(char *buffer)
    984 {
    985     struct HTMLDocument *document = html_document_init();
    986     document->buffer = buffer;
    987     document->tag_list = tag_list_init();
    988     int len = tag_doctype_parse(document->buffer);
    989 	if (len == -1) {
    990 		fprintf(stderr, "htex: Error parsing <!DOCTYPE ....\n");
    991         html_document_free(document);
    992         return NULL;
    993 	} else {
    994 		document->buffer += len;
    995     }
    996     document->tag = tag_parse(document->tag_list, document->buffer, 0, STATE_INNER_TEXT);
    997     document->tag->name = malloc(sizeof(char));
    998     document->tag->name[0] = 0;
    999     return document;
   1000 }
   1001 
   1002 struct TagList *html_document_find(struct HTMLDocument *document, struct FindOpts *opts)
   1003 {
   1004 	struct TagList *found_tags = tag_list_init();
   1005 	if (!find_opts_exist(opts)) {
   1006 		found_tags->tags = realloc(found_tags->tags, sizeof(struct Tag));
   1007 		found_tags->tags[0] = document->tag->children[0];
   1008 		found_tags->len = 1;
   1009 	} else {
   1010 		tag_find(document->tag, opts, found_tags);
   1011     }
   1012     return found_tags;
   1013 }
   1014 
   1015 void html_document_print_find_result(struct HTMLDocument *document, struct TagList *found_tags, struct FindOpts *opts)
   1016 {
   1017 	if (opts->is_except) {
   1018 		bool is_match = false;
   1019 		for (int i=0; i<strlen(document->buffer); i++) {
   1020 			is_match = false;
   1021 			for (int k=0; k<found_tags->len; k++) {
   1022 				if (
   1023 					found_tags->tags[k]->outer_html_begin_offset <= i &&
   1024 					found_tags->tags[k]->outer_html_end_offset > i
   1025 				)
   1026 					is_match = true;
   1027 			}
   1028 			if (!is_match)
   1029 				putchar(document->buffer[i]);
   1030 		}
   1031 	} else {
   1032 		char *requested_text = NULL;
   1033 		char *trimmed_text = NULL;
   1034 		for (int i=0; i<found_tags->len; i++) {
   1035 			switch (opts->out) {
   1036 				case OUT_INNER_HTML:
   1037 					requested_text = tag_get_inner_html(found_tags->tags[i], document->buffer);
   1038 					trimmed_text = string_trim(requested_text);
   1039 					free(requested_text);
   1040 					break;
   1041 				case OUT_OUTER_HTML:
   1042 					requested_text = tag_get_outer_html(found_tags->tags[i], document->buffer);
   1043 					trimmed_text = string_trim(requested_text);
   1044 					free(requested_text);
   1045 					break;
   1046 				case OUT_INNER_TEXT:
   1047 					trimmed_text = string_trim(found_tags->tags[i]->inner_text);
   1048 					break;
   1049 				case OUT_ATTR_VALUE:
   1050 					if (strlen(opts->key) > 0 && strlen(opts->tag) > 0) {
   1051 						for (int k=0; k<found_tags->tags[i]->attrs_len; k++) {
   1052 							if (strcmp(found_tags->tags[i]->attrs[k]->name, opts->key) == 0)
   1053 								printf("%s\n", found_tags->tags[i]->attrs[k]->value);
   1054 						}
   1055 					} else if (strlen(opts->tag) > 0) {
   1056 						for (int k=0; k<found_tags->tags[i]->attrs_len; k++)
   1057 							printf("%s\n", found_tags->tags[i]->attrs[k]->value);
   1058 					}
   1059 					break;
   1060 			}
   1061 			if (trimmed_text) {
   1062 				if (strlen(trimmed_text) > 0)
   1063 					printf("%s\n", trimmed_text);
   1064 				free(trimmed_text);
   1065 			}
   1066 		}
   1067 	}
   1068 }
   1069 
   1070 void html_document_debug_print_tree(struct HTMLDocument *document)
   1071 {
   1072     tag_debug_print(document->tag, -1);
   1073 }