htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 572566e1c67d1d3e41dc16dbce8b1cec56107ff9
parent 3fae4d3606f7663d42ae792a72bc3e86a513b21e
Author: Robin <kroekerrobin@gmail.com>
Date:   Sat,  6 Apr 2024 12:55:35 +0200

Restructure and make html.c more standalone

Diffstat:
M Makefile | 4 ++--
M htex.1 | 2 +-
M htex.c | 154 +++++++------------------------------------------------------------------------
D htex.h | 20 --------------------
M src/html.c | 1563 ++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
M src/html.h | 69 +++++++++++++++++++++++++++++----------------------------------------
M src/misc.c | 8 ++++++++
A src/misc.h | 6 ++++++
M todo | 4 ++--
9 files changed, 927 insertions(+), 903 deletions(-)

diff --git a/Makefile b/Makefile @@ -2,9 +2,9 @@ PREFIX = /usr/local MANPREFIX = $(PREFIX)/share/man all: - $(CC) -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme + $(CC) -O -pedantic -Werror -Wall -o htex src/misc.c src/html.c htex.c -lgrapheme debug: - $(CC) -fsanitize=address -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme + $(CC) -fsanitize=address -O -pedantic -Werror -Wall -o htex src/misc.c src/html.c htex.c -lgrapheme clean: rm htex install: all diff --git a/htex.1 b/htex.1 @@ -35,7 +35,7 @@ Prints everything except the found html tags' outerHTML. \fB\,-l\/\fR, \fB\,--limit\/\fR \fI\,NUM\/\fR Find maximum \fI\,NUM\/\fR html tags. .SH INNER_TEXT -Still in progress. +Coming soon. .SH EXAMPLES .sp .RS 4 diff --git a/htex.c b/htex.c @@ -5,142 +5,8 @@ #include <getopt.h> #include <inttypes.h> #include <grapheme.h> -#include "htex.h" -#include "src/misc.c" -#include "src/html.c" - -bool find_opts_exist(struct FindOpts *opts) -{ - if (strlen(opts->tag) > 0) - return true; - if (strlen(opts->attr) > 0) - return true; - if (strlen(opts->key) > 0) - return true; - return false; -} - -struct FindOpts *find_opts_parse(const char *pattern) -{ - struct FindOpts *opts = malloc(sizeof(struct FindOpts)); - opts->out = OUT_OUTER_HTML; - opts->tag = malloc(sizeof(char)); - opts->tag[0] = 0; - opts->attr = malloc(sizeof(char)); - opts->attr[0] = 0; - opts->key = malloc(sizeof(char)); - opts->key[0] = 0; - bool is_class_value = false; - bool is_id_value = false; - int i = 0; - bool is_attr_key = false; - bool is_attr_or_tag = true; - char *attr_or_tag = NULL; - int aot = 0; - int ak = 0; - int av = 0; - switch (pattern[0]) { - case '.': - is_class_value = true; - i = 1; - break; - case '#': - is_id_value = true; - i = 1; - break; - } - for (; i<strlen(pattern); i++) { - if (pattern[i] == ']') - break; - if ( - !is_attr_key && - !is_attr_or_tag && - pattern[i] != ']' && - pattern[i] != '"' - ) { - opts->attr = realloc(opts->attr, (av+1) * sizeof(char)); - opts->attr[av] 
= pattern[i]; - av++; - } - if (pattern[i] == '=') - is_attr_key = false; - if (is_attr_key && !is_attr_or_tag) { - opts->key = realloc(opts->key, (ak+1) * sizeof(char)); - opts->key[ak] = pattern[i]; - ak++; - } - if (pattern[i] == '[') { - is_attr_key = true; - is_attr_or_tag = false; - } - if (is_attr_or_tag) { - attr_or_tag = realloc(attr_or_tag, (aot+1) * sizeof(char)); - attr_or_tag[aot] = pattern[i]; - aot++; - } - } - attr_or_tag = realloc(attr_or_tag, (aot+1) * sizeof(char)); - attr_or_tag[aot] = 0; - if (is_id_value) { - free(opts->key); - opts->key = NULL; - free(opts->attr); - opts->attr = NULL; - opts->attr = attr_or_tag; - opts->key = realloc(opts->key, 3 * sizeof(char)); - opts->key[0] = 'i'; - opts->key[1] = 'd'; - opts->key[2] = 0; - } else if (is_class_value) { - free(opts->key); - opts->key = NULL; - free(opts->attr); - opts->attr = NULL; - opts->attr = attr_or_tag; - opts->key = realloc(opts->key, 6 * sizeof(char)); - opts->key[0] = 'c'; - opts->key[1] = 'l'; - opts->key[2] = 'a'; - opts->key[3] = 's'; - opts->key[4] = 's'; - opts->key[5] = 0; - } else { - free(opts->tag); - opts->tag = attr_or_tag; - if (av > 0) { - opts->attr = realloc(opts->attr, (av+1) * sizeof(char)); - opts->attr[av] = 0; - } - if (ak > 0) { - opts->key = realloc(opts->key, (ak+1) * sizeof(char)); - opts->key[ak] = 0; - } - } - return opts; -} - -void find_opts_free(struct FindOpts *opts) -{ - free(opts->tag); - free(opts->attr); - free(opts->key); - free(opts); -} - -enum OutType output_type_parse(const char *type) -{ - if (type == NULL) - return OUT_OUTER_HTML; - if (strcmp(type, "outerhtml") == 0) - return OUT_OUTER_HTML; - if (strcmp(type, "innerhtml") == 0) - return OUT_INNER_HTML; - if (strcmp(type, "innertext") == 0) - return OUT_INNER_TEXT; - if (strcmp(type, "attr_value") == 0) - return OUT_ATTR_VALUE; - return -1; -} +#include "src/misc.h" +#include "src/html.h" int main(int argc, char *argv[]) { @@ -214,13 +80,17 @@ int main(int argc, char *argv[]) return 0; } } 
- struct FindOpts *options = find_opts_parse(search_pattern); - options->out = out; - options->is_except = is_except; - options->limit = limit; - html_filter(text, options); + struct FindOpts *opts = find_opts_parse(search_pattern); + opts->out = out; + opts->is_except = is_except; + opts->limit = limit; + struct HTMLDocument *document = html_document_parse(text); + struct TagList *found_tags = html_document_find(document, opts); + html_document_print_find_result(document, found_tags, opts); + html_document_free(document); + tag_list_free(found_tags); + find_opts_free(opts); free(output); - find_opts_free(options); free(text); return 0; } diff --git a/htex.h b/htex.h @@ -1,20 +0,0 @@ -enum OutType { - OUT_INNER_HTML, - OUT_OUTER_HTML, - OUT_INNER_TEXT, - OUT_ATTR_VALUE -}; - -struct FindOpts { - char *tag; - char *attr; - char *key; - enum OutType out; - bool is_except; - int limit; -}; - -bool find_opts_exist(struct FindOpts *opts); -struct FindOpts *find_opts_parse(const char *pattern); -void find_opts_free(struct FindOpts *opts); -enum OutType output_type_parse(const char *type); diff --git a/src/html.c b/src/html.c @@ -1,359 +1,438 @@ +#include <stdio.h> +#include <string.h> +#include <stdint.h> +#include <stdbool.h> +#include <stdlib.h> +#include <grapheme.h> #include "html.h" #include "entities.h" +#include "misc.h" -void html_filter(char *text, struct FindOpts *opts) +static const char *void_elements[] = { + "area", "base", "br", "col", "embed", "hr", "img", + "input", "link", "meta", "source", "track", "wbr" +}; + +/* Only needed for debugging */ +/* static const char *state_to_string(enum State state) { - struct TagList *tag_list = tag_list_init(); - struct TagList *found_tags = tag_list_init(); - int len = tag_doctype_parse(text); - if (len == -1) { - fprintf(stderr, "htex: Error parsing <!DOCTYPE ....\n"); - goto CLEAN; - } else { - text += len; - } - struct Tag *root_tag = tag_parse(tag_list, text, 0, STATE_INNER_TEXT); - if (!find_opts_exist(opts)) { - 
found_tags->tags = realloc(found_tags->tags, sizeof(struct Tag)); - found_tags->tags[0] = root_tag->children[0]; - found_tags->len = 1; + switch(state) { + case STATE_INNER_TEXT: return "STATE_INNER_TEXT"; + case STATE_TAG: return "STATE_TAG"; + case STATE_BEGIN_TAG_NAME: return "STATE_BEGIN_TAG_NAME"; + case STATE_END_TAG_NAME: return "STATE_END_TAG_NAME"; + case STATE_ATTR_NAME: return "STATE_ATTR_NAME"; + case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE"; + case STATE_COMMENT: return "STATE_COMMENT"; + case STATE_SCRIPT: return "STATE_SCRIPT"; + case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG"; + case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG"; + case STATE_STYLE: return "STATE_STYLE"; + case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG"; + case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG"; + case STATE_CHAR_REF: return "STATE_CHAR_REF"; + case STATE_CHAR_REF_NUMERIC: return "STATE_CHAR_REF_NUMERIC"; + } + return ""; +} */ + +static inline bool tag_is_void_element(struct Tag *tag) +{ + for (int i=0; i<13; i++) { + if (strcmp(tag->name, void_elements[i]) == 0) + return true; + } + return false; +} + +static inline bool is_c0_control(uint_least32_t cp) +{ + if (cp >= 0x00 && cp <= 0x1F) + return true; + return false; +} + +static inline bool is_control(uint_least32_t cp) +{ + if (is_c0_control(cp)) + return true; + if (cp >= 0x7F && cp <= 0x9F) + return true; + return false; +} + +static inline bool is_non_char(uint_least32_t cp) +{ + if (cp >= 0xFDD0 && cp <= 0xFDEF) + return true; + if ( + cp == 0xFFFE || cp == 0xFFFF || + cp == 0x1FFFE || cp == 0x1FFFF || + cp == 0x2FFFE || cp == 0x2FFFF || + cp == 0x3FFFE || cp == 0x3FFFF || + cp == 0x4FFFE || cp == 0x4FFFF || + cp == 0x5FFFE || cp == 0x5FFFF || + cp == 0x6FFFE || cp == 0x6FFFF || + cp == 0x7FFFE || cp == 0x7FFFF || + cp == 0x8FFFE || cp == 0x8FFFF || + cp == 0x9FFFE || cp == 0x9FFFF || + cp == 0xAFFFE || cp == 0xAFFFF || + cp == 0xBFFFE || cp == 
0xBFFFF || + cp == 0xCFFFE || cp == 0xCFFFF || + cp == 0xDFFFE || cp == 0xDFFFF || + cp == 0xEFFFE || cp == 0xEFFFF || + cp == 0xFFFFE || cp == 0xFFFFF || + cp == 0x10FFFE || cp == 0x10FFFF + ) + return true; + return false; +} + +static inline bool attr_name_char_is_valid(uint_least32_t cp) +{ + if (is_control(cp)) + return false; + if (is_non_char(cp)) + return false; + if ( + cp == SPACE || + cp == QUOTATION_MARK || + cp == APOSTROPHE || + cp == GREATER_THAN_SIGN || + cp == SOLIDUS || + cp == EQUALS_SIGN + ) + return false; + return true; +} + +static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp) +{ + /* + Not mentioned invalid characters. + They are already handled before + function call. + */ + if ( + cp == EQUALS_SIGN || + cp == LESS_THAN_SIGN || + cp == GREATER_THAN_SIGN || + cp == GRAVE_ACCENT + ) + return false; + return true; +} + +static inline bool ascii_is_digit(uint_least32_t cp) +{ + if (cp >= 0x30 && cp <= 0x39) + return true; + return false; +} + +static inline bool ascii_alpha_is_upper(uint_least32_t cp) +{ + if (cp >= 0x41 && cp <= 0x5A) + return true; + return false; +} + +static inline bool ascii_alpha_is_lower(uint_least32_t cp) +{ + if (cp >= 0x61 && cp <= 0x7A) + return true; + return false; +} + +static inline bool ascii_is_alpha(uint_least32_t cp) +{ + if (ascii_alpha_is_lower(cp) || ascii_alpha_is_upper(cp)) + return true; + return false; +} + +static inline bool ascii_is_whitespace(uint_least32_t cp) +{ + if ( + cp == TAB || + cp == LF || + cp == FF || + cp == CR || + cp == SPACE + ) + return true; + return false; +} + +static bool find_opts_exist(struct FindOpts *opts) +{ + if (strlen(opts->tag) > 0) + return true; + if (strlen(opts->attr) > 0) + return true; + if (strlen(opts->key) > 0) + return true; + return false; +} + +struct FindOpts *find_opts_parse(const char *pattern) +{ + struct FindOpts *opts = malloc(sizeof(struct FindOpts)); + opts->out = OUT_OUTER_HTML; + opts->tag = malloc(sizeof(char)); + opts->tag[0] 
= 0; + opts->attr = malloc(sizeof(char)); + opts->attr[0] = 0; + opts->key = malloc(sizeof(char)); + opts->key[0] = 0; + bool is_class_value = false; + bool is_id_value = false; + int i = 0; + bool is_attr_key = false; + bool is_attr_or_tag = true; + char *attr_or_tag = NULL; + int aot = 0; + int ak = 0; + int av = 0; + switch (pattern[0]) { + case '.': + is_class_value = true; + i = 1; + break; + case '#': + is_id_value = true; + i = 1; + break; + } + for (; i<strlen(pattern); i++) { + if (pattern[i] == ']') + break; + if ( + !is_attr_key && + !is_attr_or_tag && + pattern[i] != ']' && + pattern[i] != '"' + ) { + opts->attr = realloc(opts->attr, (av+1) * sizeof(char)); + opts->attr[av] = pattern[i]; + av++; + } + if (pattern[i] == '=') + is_attr_key = false; + if (is_attr_key && !is_attr_or_tag) { + opts->key = realloc(opts->key, (ak+1) * sizeof(char)); + opts->key[ak] = pattern[i]; + ak++; + } + if (pattern[i] == '[') { + is_attr_key = true; + is_attr_or_tag = false; + } + if (is_attr_or_tag) { + attr_or_tag = realloc(attr_or_tag, (aot+1) * sizeof(char)); + attr_or_tag[aot] = pattern[i]; + aot++; + } + } + attr_or_tag = realloc(attr_or_tag, (aot+1) * sizeof(char)); + attr_or_tag[aot] = 0; + if (is_id_value) { + free(opts->key); + opts->key = NULL; + free(opts->attr); + opts->attr = NULL; + opts->attr = attr_or_tag; + opts->key = realloc(opts->key, 3 * sizeof(char)); + opts->key[0] = 'i'; + opts->key[1] = 'd'; + opts->key[2] = 0; + } else if (is_class_value) { + free(opts->key); + opts->key = NULL; + free(opts->attr); + opts->attr = NULL; + opts->attr = attr_or_tag; + opts->key = realloc(opts->key, 6 * sizeof(char)); + opts->key[0] = 'c'; + opts->key[1] = 'l'; + opts->key[2] = 'a'; + opts->key[3] = 's'; + opts->key[4] = 's'; + opts->key[5] = 0; } else { - tag_find(root_tag, opts, found_tags); - } - tag_print_find_result(root_tag, opts, found_tags, text); - // html_print(root_tag, -1); - tag_free(root_tag); -CLEAN: - tag_list_free(tag_list); - 
tag_list_free(found_tags); + free(opts->tag); + opts->tag = attr_or_tag; + if (av > 0) { + opts->attr = realloc(opts->attr, (av+1) * sizeof(char)); + opts->attr[av] = 0; + } + if (ak > 0) { + opts->key = realloc(opts->key, (ak+1) * sizeof(char)); + opts->key[ak] = 0; + } + } + return opts; } -void html_print(struct Tag *tag, int indent) +void find_opts_free(struct FindOpts *opts) { - for (int i=0; i<indent; i++) - putchar(' '); - printf("%s", tag->name); - for (int i=0; i<tag->attrs_len; i++) - printf(" %s=%s", tag->attrs[i]->name, tag->attrs[i]->value); - printf("\n"); - indent++; - for (int i=tag->children_len-1; i>-1; i--) - html_print(tag->children[i], indent); + free(opts->tag); + free(opts->attr); + free(opts->key); + free(opts); } -struct Tag *tag_init(void) +enum OutType output_type_parse(const char *type) { - struct Tag *t = malloc(sizeof(struct Tag)); - t->name = malloc(sizeof(char)); - t->name[0] = 0; - t->inner_text = malloc(sizeof(char)); - t->inner_text[0] = 0; - t->attrs = NULL; - t->children = NULL; - t->attrs_len = 0; - t->children_len = 0; - t->_is_void_element = false; - t->_is_closed = false; - t->_outer_html_begin_offset = 0; - t->_outer_html_end_offset = 0; - t->_inner_html_begin_offset = 0; - t->_inner_html_end_offset = 0; - return t; + if (type == NULL) + return OUT_OUTER_HTML; + if (strcmp(type, "outerhtml") == 0) + return OUT_OUTER_HTML; + if (strcmp(type, "innerhtml") == 0) + return OUT_INNER_HTML; + if (strcmp(type, "innertext") == 0) + return OUT_INNER_TEXT; + if (strcmp(type, "attr_value") == 0) + return OUT_ATTR_VALUE; + return -1; } -struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state) +static struct Attr *attr_init(void) { - struct Tag *tag = tag_init(); - tag->_outer_html_begin_offset= offset-1; - tag_list->tags = realloc(tag_list->tags, (tag_list->len+1) * sizeof(struct Tag)); - tag_list->tags[tag_list->len] = tag; - tag_list->len++; - struct Tag *still_open_tag = tag; - char *end_tag = 
malloc(sizeof(char)); - end_tag[0] = 0; - enum State return_to_state = STATE_INNER_TEXT; - size_t a = 0; - size_t attr_name_count = 0; - enum AttrValueSyntax avs = AVS_NO; - size_t hyphen_count = 0; + struct Attr *attr = malloc(sizeof(struct Attr)); + attr->name = malloc(sizeof(char)); + attr->name[0] = 0; + attr->value = malloc(sizeof(char)); + attr->value[0] = 0; + return attr; +} + +static char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base) +{ + size_t old_offset = offset; + char *character = malloc(MAX_CODEPOINT_SIZE * sizeof(char)); + char *numeric_charref = malloc(sizeof(char)); + numeric_charref[0] = 0; + size_t ret; uint_least32_t cp; - size_t len = strlen(text); - size_t ret, off; - for (off = offset; off<len; off += ret) { - if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { - fprintf(stderr, "htex: parseTag.grapheme_decode_utf8 failed.\n"); - } else { - /* char *the_codepoint = cp_to_string(cp, ret); - printf("cp: %02X, %s, %s\n", cp, the_codepoint, state_to_string(state)); - free(the_codepoint); */ - switch (state) { - case STATE_INNER_TEXT: - if (cp == LESS_THAN_SIGN) { - state = STATE_TAG; - break; - } - if (cp == AMPERSAND) { - return_to_state = STATE_INNER_TEXT; - state = STATE_CHAR_REF; - break; - } - still_open_tag = tag_get_last_open(tag_list); - still_open_tag->inner_text = string_concat(still_open_tag->inner_text, cp_to_string(cp, ret)); - break; - case STATE_TAG: - if (cp == SOLIDUS) { - state = STATE_END_TAG_NAME; - break; - } - if (cp == EXCLAMATION_MARK) { - state = STATE_COMMENT; - break; - } - still_open_tag = tag_get_last_open(tag_list); - struct Tag *one_tag = tag_parse(tag_list, text, off, STATE_BEGIN_TAG_NAME); - still_open_tag->children = realloc( - still_open_tag->children, - (still_open_tag->children_len+1) * sizeof(struct Tag) - ); - still_open_tag->children[still_open_tag->children_len] = one_tag; - still_open_tag->children_len++; - free(end_tag); - return tag; - case 
STATE_BEGIN_TAG_NAME: - if (cp == GREATER_THAN_SIGN) { - state = tag_process_end_of_opening_tag(tag, off); - break; - } - if (ascii_is_whitespace(cp)) { - state = STATE_ATTR_NAME; - break; - } - if (ascii_is_digit(cp) || ascii_is_alpha(cp)) { - tag->name = string_concat(tag->name, cp_to_string(cp, ret)); - } - break; - case STATE_END_TAG_NAME: - if (cp == GREATER_THAN_SIGN) { - struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); - if (closed_tag != NULL) - tag_set_inner_html_end_offset(closed_tag, text, off); - free(end_tag); - end_tag = malloc(sizeof(char)); - end_tag[0] = 0; - state = STATE_INNER_TEXT; - break; - } - if (!ascii_is_whitespace(cp)) - end_tag = string_concat(end_tag, cp_to_string(cp, ret)); - break; - case STATE_ATTR_NAME: - if (cp == GREATER_THAN_SIGN) { - state = tag_process_end_of_opening_tag(tag, off); - break; - } - if (ascii_is_whitespace(cp)) { - if (attr_name_count == a+1) - a++; - break; - } - if (cp == EQUALS_SIGN) { - state = STATE_ATTR_VALUE; - break; - } - if (attr_name_char_is_valid(cp)) { - if (attr_name_count != a+1) { - tag->attrs = realloc( - tag->attrs, - (a+1) * sizeof(struct Attr) - ); - tag->attrs[a] = attr_init(); - attr_name_count = a + 1; - tag->attrs_len = attr_name_count; - } - tag->attrs[a]->name = string_concat(tag->attrs[a]->name, cp_to_string(cp, ret)); - } - break; - case STATE_ATTR_VALUE: - if (ascii_is_whitespace(cp)) { - if (avs == AVS_UNQUOTED) { - avs = AVS_NO; - state = STATE_ATTR_NAME; - } else if (avs == AVS_QUOTATION_MARK || avs == AVS_APOSTROPHE) { - if ( - strcmp("id", tag->attrs[a]->name) == 0 || - strcmp("class", tag->attrs[a]->name) == 0 - ) { - char *tmp_name = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char)); - strcpy(tmp_name, tag->attrs[a]->name); - tag->attrs = realloc( - tag->attrs, - (a+1) * sizeof(struct Attr) - ); - a++; - tag->attrs[a] = attr_init(); - free(tag->attrs[a]->name); - tag->attrs[a]->name = tmp_name; - tag->attrs_len++; - attr_name_count = a + 1; - } 
else { - tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret)); - } - } - break; - } - if (cp == QUOTATION_MARK) { - if (avs == AVS_NO) { - avs = AVS_QUOTATION_MARK; - break; - } - if (avs == AVS_QUOTATION_MARK) { - avs = AVS_NO; - state = STATE_ATTR_NAME; - break; - } - } - if (cp == APOSTROPHE) { - if (avs == AVS_NO) { - avs = AVS_APOSTROPHE; - break; - } - if (avs == AVS_APOSTROPHE) { - avs = AVS_NO; - state = STATE_ATTR_NAME; - break; - } - } - if (cp == GREATER_THAN_SIGN) { - state = tag_process_end_of_opening_tag(tag, off); - break; - } - if (avs == AVS_NO && attr_value_unquoted_char_is_valid(cp)) { - avs = AVS_UNQUOTED; - } - if (avs > AVS_NO) { - if (cp == AMPERSAND) { - state = STATE_CHAR_REF; - return_to_state = STATE_ATTR_VALUE; - break; - } - tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret)); - } - break; - case STATE_COMMENT: - if (cp == GREATER_THAN_SIGN && hyphen_count >= 2) { - state = STATE_INNER_TEXT; - break; - } - if (cp == HYPHEN_MINUS) - hyphen_count++; - else - hyphen_count = 0; - break; - case STATE_STYLE: - if (cp == LESS_THAN_SIGN) { - state = STATE_STYLE_POSSIBLE_END_TAG; - break; - } - break; - case STATE_STYLE_POSSIBLE_END_TAG: - if (cp == SOLIDUS) - state = STATE_STYLE_END_TAG; - else - state = STATE_STYLE; - break; - case STATE_STYLE_END_TAG: - if (cp == GREATER_THAN_SIGN) { - struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); - if (closed_tag != NULL) - tag_set_inner_html_end_offset(closed_tag, text, off); - free(end_tag); - end_tag = malloc(sizeof(char)); - end_tag[0] = 0; - state = STATE_INNER_TEXT; - break; - } - if (!ascii_is_whitespace(cp)) - end_tag = string_concat(end_tag, cp_to_string(cp, ret)); - break; - case STATE_SCRIPT: - if (cp == LESS_THAN_SIGN) { - state = STATE_SCRIPT_POSSIBLE_END_TAG; - break; - } - break; - case STATE_SCRIPT_POSSIBLE_END_TAG: - if (cp == SOLIDUS) - state = STATE_SCRIPT_END_TAG; - else - state = STATE_SCRIPT; - 
break; - case STATE_SCRIPT_END_TAG: - if (cp == GREATER_THAN_SIGN) { - struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); - if (closed_tag != NULL) - tag_set_inner_html_end_offset(closed_tag, text, off); - free(end_tag); - end_tag = malloc(sizeof(char)); - end_tag[0] = 0; - state = STATE_INNER_TEXT; - break; - } - if (!ascii_is_whitespace(cp)) - end_tag = string_concat(end_tag, cp_to_string(cp, ret)); - break; - case STATE_CHAR_REF: - if (cp == NUMBER_SIGN) { /* hashtag */ - state = STATE_CHAR_REF_NUMERIC; - break; - } - char *named_charref = charref_named_parse(text, off, len, avs); - off += strlen(named_charref)-1; - char *encoded_named_charref = charref_named_encode(named_charref); - if (return_to_state == STATE_INNER_TEXT) { - still_open_tag = tag_get_last_open(tag_list); - still_open_tag->inner_text = string_concat(still_open_tag->inner_text, encoded_named_charref); - } else if (return_to_state == STATE_ATTR_VALUE) { - tag->attrs[a]->value = string_concat(tag->attrs[a]->value, encoded_named_charref); - } - free(named_charref); - state = return_to_state; - break; - case STATE_CHAR_REF_NUMERIC: - if (cp == SMALL_LETTER_X || cp == CAPITAL_LETTER_X) { - size_t new_offset; - char *numeric_charref = charref_numeric_parse_and_encode(text, off+1, &new_offset, 16); - off += new_offset; - if (return_to_state == STATE_INNER_TEXT) { - still_open_tag = tag_get_last_open(tag_list); - still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref); - } else if (return_to_state == STATE_ATTR_VALUE) { - tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref); - } - state = return_to_state; - break; - } else if (ascii_is_digit(cp)) { - size_t new_offset; - char *numeric_charref = charref_numeric_parse_and_encode(text, off, &new_offset, 10); - off += new_offset-1; - if (return_to_state == STATE_INNER_TEXT) { - still_open_tag = tag_get_last_open(tag_list); - still_open_tag->inner_text = 
string_concat(still_open_tag->inner_text, numeric_charref); - } else if (return_to_state == STATE_ATTR_VALUE) { - tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref); - } - state = return_to_state; - break; - } - state = return_to_state; - break; - } + do { + ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp); + numeric_charref = string_concat(numeric_charref, cp_to_string(cp, ret)); + offset += ret; + } while (cp != SEMICOLON); + *new_offset = offset - old_offset; + long i = strtol(numeric_charref, NULL, base); + ret = grapheme_encode_utf8((uint_least32_t)i, character, MAX_CODEPOINT_SIZE); + character[ret] = 0; + free(numeric_charref); + return character; +} + +static char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs) +{ + uint_least32_t stop_at = 0; + switch(avs) { + case AVS_QUOTATION_MARK: + stop_at = QUOTATION_MARK; + break; + case AVS_APOSTROPHE: + stop_at = APOSTROPHE; + break; + case AVS_UNQUOTED: + stop_at = GREATER_THAN_SIGN; + break; + case AVS_NO: /* Just to silence the compilier warning */ + break; + } + char *named_charref = malloc(sizeof(char)); + named_charref[0] = 0; + size_t ret; + uint_least32_t cp; + int i = 0; + for (;;) { + ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp); + if (cp == AMPERSAND || ascii_is_whitespace(cp)) + break; + if (avs > AVS_NO && cp == stop_at) + break; + named_charref = string_concat(named_charref, cp_to_string(cp, ret)); + if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF) + break; + offset += ret; + i++; + } + return named_charref; +} + +static void charref_named_concat_remaining_string(const char *parsed_string, const char *charref, char **buf) +{ + const char *remaining = &parsed_string[strlen(charref)]; + size_t remaining_len = strlen(remaining); + size_t buf_len = strlen(*buf); + if (remaining_len > 0) { + if (remaining_len == 1 && remaining[0] == ';') + return; + *buf = realloc(*buf, buf_len+remaining_len+1); + 
strcat(*buf, remaining); + } +} + +static char *charref_named_encode(const char *name) +{ + char *buf = NULL; + size_t len; + int i; + for (i=0; i<2138; i++) { + if (string_starts_with(name, single_cp_entities[i].name)) { + buf = realloc(buf, MAX_CODEPOINT_SIZE+1); + len = grapheme_encode_utf8(single_cp_entities[i].cp, buf, MAX_CODEPOINT_SIZE); + buf[len] = 0; + charref_named_concat_remaining_string(name, single_cp_entities[i].name, &buf); + return buf; + } + } + for (i=0; i<93; i++) { + if (string_starts_with(name, double_cp_entities[i].name)) { + size_t buf_len = 0; + buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1); + len = grapheme_encode_utf8(double_cp_entities[i].cp[0], buf, MAX_CODEPOINT_SIZE); + buf_len += len; + buf += len; + len = grapheme_encode_utf8(double_cp_entities[i].cp[1], buf, MAX_CODEPOINT_SIZE); + buf_len += len; + buf[buf_len] = 0; + charref_named_concat_remaining_string(name, double_cp_entities[i].name, &buf); + return buf; } } - free(end_tag); - return tag; + buf = realloc(buf, (strlen(name)+2) * sizeof(char)); + buf[0] = '&'; + buf[1] = 0; + strcat(buf, name); + return buf; +} + +static struct Tag *tag_init(void) +{ + struct Tag *tag = malloc(sizeof(struct Tag)); + tag->name = malloc(sizeof(char)); + tag->name[0] = 0; + tag->inner_text = malloc(sizeof(char)); + tag->inner_text[0] = 0; + tag->attrs = NULL; + tag->children = NULL; + tag->attrs_len = 0; + tag->children_len = 0; + tag->_is_void_element = false; + tag->_is_closed = false; + tag->_outer_html_begin_offset = 0; + tag->_outer_html_end_offset = 0; + tag->_inner_html_begin_offset = 0; + tag->_inner_html_end_offset = 0; + return tag; } -struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset) +static struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset) { for (int i=tag_list->len-1; i>-1; i--) { if (strcmp(tag_list->tags[i]->name, end_tag_name) == 0 && !tag_list->tags[i]->_is_closed) { @@ 
-365,7 +444,7 @@ struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_ta return NULL; } -struct Tag *tag_get_last_open(struct TagList *tag_list) +static struct Tag *tag_get_last_open(struct TagList *tag_list) { for (int i=tag_list->len-1; i>-1; i--) { if (!tag_list->tags[i]->_is_void_element && !tag_list->tags[i]->_is_closed) { @@ -375,70 +454,7 @@ struct Tag *tag_get_last_open(struct TagList *tag_list) return tag_list->tags[0]; } -int tag_doctype_parse(const char *text) -{ - size_t offset = 0; - enum DoctypeState state = DSTATE_TEXT; - char *doctype = NULL; - char *lower_doctype = NULL; - uint_least32_t cp; - size_t len = strlen(text); - size_t ret, off; - for (off = 0; off<len; off += ret) { - if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { - fprintf(stderr, "htex: parseDoctype.grapheme_decode_utf8 failed.\n"); - } else { - switch (state) { - case DSTATE_TEXT: - if (cp == LESS_THAN_SIGN) { - state = DSTATE_POSSIBLE_DTYPE; - break; - } - if (cp == GREATER_THAN_SIGN) { - offset = off; - goto CLEANUP; - } - break; - case DSTATE_POSSIBLE_DTYPE: - if (cp == EXCLAMATION_MARK) - state = DSTATE_DTYPE_OR_COMMENT; - else - goto CLEANUP; - break; - case DSTATE_DTYPE_OR_COMMENT: - if (cp == HYPHEN_MINUS) { - goto CLEANUP; - } else { - doctype = string_concat(doctype, cp_to_string(cp, ret)); - state = DSTATE_DTYPE; - break; - } - break; - case DSTATE_DTYPE: - if (ascii_is_whitespace(cp)) { - size_t dlen = strlen(doctype)+1; - lower_doctype = malloc(dlen * sizeof(char)); - grapheme_to_lowercase_utf8(doctype, dlen, lower_doctype, dlen); - if (strcmp(lower_doctype, "doctype") == 0) { - state = DSTATE_TEXT; - } else { - offset = -1; - goto CLEANUP; - } - break; - } - doctype = string_concat(doctype, cp_to_string(cp, ret)); - break; - } - } - } -CLEANUP: - free(doctype); - free(lower_doctype); - return offset; -} - -char *tag_get_outer_html(struct Tag *tag, char *text) +static char *tag_get_outer_html(struct Tag *tag, char *text) { 
char *outer_html = NULL; int o = 0; @@ -452,7 +468,7 @@ char *tag_get_outer_html(struct Tag *tag, char *text) return outer_html; } -char *tag_get_inner_html(struct Tag *tag, char *text) +static char *tag_get_inner_html(struct Tag *tag, char *text) { char *inner_html = NULL; int o = 0; @@ -466,7 +482,7 @@ char *tag_get_inner_html(struct Tag *tag, char *text) return inner_html; } -enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset) +static enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset) { tag->_inner_html_begin_offset = offset+1; tag->_is_void_element = tag_is_void_element(tag); @@ -479,16 +495,7 @@ enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset) return STATE_INNER_TEXT; } -static inline bool tag_is_void_element(struct Tag *tag) -{ - for (int i=0; i<13; i++) { - if (strcmp(tag->name, void_elements[i]) == 0) - return true; - } - return false; -} - -void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset) +static void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset) { int i = offset; while (text[i] != '<') @@ -496,7 +503,7 @@ void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t of closed_tag->_inner_html_end_offset = i; } -void tag_free(struct Tag *tag) +static void tag_free(struct Tag *tag) { free(tag->name); free(tag->inner_text); @@ -514,7 +521,7 @@ void tag_free(struct Tag *tag) free(tag); } -void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags) +static void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags) { if (opts->limit > 0 && found_tags->len == opts->limit) return; @@ -571,327 +578,491 @@ void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags } } -void tag_print_find_result(struct Tag *root_tag, struct FindOpts *opts, struct TagList *found_tags, char *text) +static int tag_doctype_parse(const char *text) { - if 
(opts->is_except) { - bool is_match = false; - for (int i=0; i<strlen(text); i++) { - is_match = false; - for (int k=0; k<found_tags->len; k++) { - if ( - found_tags->tags[k]->_outer_html_begin_offset <= i && - found_tags->tags[k]->_outer_html_end_offset > i - ) - is_match = true; - } - if (!is_match) - putchar(text[i]); - } - } else { - char *requested_text = NULL; - char *trimmed_text = NULL; - for (int i=0; i<found_tags->len; i++) { - switch (opts->out) { - case OUT_INNER_HTML: - requested_text = tag_get_inner_html(found_tags->tags[i], text); - trimmed_text = string_trim(requested_text); - free(requested_text); + size_t offset = 0; + enum DoctypeState state = DSTATE_TEXT; + char *doctype = NULL; + char *lower_doctype = NULL; + uint_least32_t cp; + size_t len = strlen(text); + size_t ret, off; + for (off = 0; off<len; off += ret) { + if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { + fprintf(stderr, "htex: parseDoctype.grapheme_decode_utf8 failed.\n"); + } else { + switch (state) { + case DSTATE_TEXT: + if (cp == LESS_THAN_SIGN) { + state = DSTATE_POSSIBLE_DTYPE; + break; + } + if (cp == GREATER_THAN_SIGN) { + offset = off; + goto CLEANUP; + } break; - case OUT_OUTER_HTML: - requested_text = tag_get_outer_html(found_tags->tags[i], text); - trimmed_text = string_trim(requested_text); - free(requested_text); + case DSTATE_POSSIBLE_DTYPE: + if (cp == EXCLAMATION_MARK) + state = DSTATE_DTYPE_OR_COMMENT; + else + goto CLEANUP; break; - case OUT_INNER_TEXT: - trimmed_text = string_trim(found_tags->tags[i]->inner_text); + case DSTATE_DTYPE_OR_COMMENT: + if (cp == HYPHEN_MINUS) { + goto CLEANUP; + } else { + doctype = string_concat(doctype, cp_to_string(cp, ret)); + state = DSTATE_DTYPE; + break; + } break; - case OUT_ATTR_VALUE: - if (strlen(opts->key) > 0 && strlen(opts->tag) > 0) { - for (int k=0; k<found_tags->tags[i]->attrs_len; k++) { - if (strcmp(found_tags->tags[i]->attrs[k]->name, opts->key) == 0) - printf("%s\n", 
found_tags->tags[i]->attrs[k]->value); + case DSTATE_DTYPE: + if (ascii_is_whitespace(cp)) { + size_t dlen = strlen(doctype)+1; + lower_doctype = malloc(dlen * sizeof(char)); + grapheme_to_lowercase_utf8(doctype, dlen, lower_doctype, dlen); + if (strcmp(lower_doctype, "doctype") == 0) { + state = DSTATE_TEXT; + } else { + offset = -1; + goto CLEANUP; } - } else if (strlen(opts->tag) > 0) { - for (int k=0; k<found_tags->tags[i]->attrs_len; k++) - printf("%s\n", found_tags->tags[i]->attrs[k]->value); + break; } + doctype = string_concat(doctype, cp_to_string(cp, ret)); break; } - if (trimmed_text) { - if (strlen(trimmed_text) > 0) - printf("%s\n", trimmed_text); - free(trimmed_text); - } } } +CLEANUP: + free(doctype); + free(lower_doctype); + return offset; } -struct TagList *tag_list_init(void) -{ - struct TagList *tag_list = malloc(sizeof(struct TagList)); - tag_list->tags = NULL; - tag_list->len = 0; - return tag_list; -} - -void tag_list_free(struct TagList *tag_list) -{ - free(tag_list->tags); - free(tag_list); -} - -struct Attr *attr_init(void) -{ - struct Attr *attr = malloc(sizeof(struct Attr)); - attr->name = malloc(sizeof(char)); - attr->name[0] = 0; - attr->value = malloc(sizeof(char)); - attr->value[0] = 0; - return attr; -} - -static inline bool attr_name_char_is_valid(uint_least32_t cp) -{ - if (is_control(cp)) - return false; - if (is_non_char(cp)) - return false; - if ( - cp == SPACE || - cp == QUOTATION_MARK || - cp == APOSTROPHE || - cp == GREATER_THAN_SIGN || - cp == SOLIDUS || - cp == EQUALS_SIGN - ) - return false; - return true; -} - -static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp) -{ - /* - Not mentioned invalid characters. - They are already handled before - function call. 
- */ - if ( - cp == EQUALS_SIGN || - cp == LESS_THAN_SIGN || - cp == GREATER_THAN_SIGN || - cp == GRAVE_ACCENT - ) - return false; - return true; -} - -char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base) -{ - size_t old_offset = offset; - char *character = malloc(MAX_CODEPOINT_SIZE * sizeof(char)); - char *numeric_charref = malloc(sizeof(char)); - numeric_charref[0] = 0; - size_t ret; - uint_least32_t cp; - do { - ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp); - numeric_charref = string_concat(numeric_charref, cp_to_string(cp, ret)); - offset += ret; - } while (cp != SEMICOLON); - *new_offset = offset - old_offset; - long i = strtol(numeric_charref, NULL, base); - ret = grapheme_encode_utf8((uint_least32_t)i, character, MAX_CODEPOINT_SIZE); - character[ret] = 0; - free(numeric_charref); - return character; -} - -char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs) +static struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state) { - uint_least32_t stop_at = 0; - switch(avs) { - case AVS_QUOTATION_MARK: - stop_at = QUOTATION_MARK; - break; - case AVS_APOSTROPHE: - stop_at = APOSTROPHE; - break; - case AVS_UNQUOTED: - stop_at = GREATER_THAN_SIGN; - break; - case AVS_NO: /* Just to silence the compilier warning */ - break; - } - char *named_charref = malloc(sizeof(char)); - named_charref[0] = 0; - size_t ret; + struct Tag *tag = tag_init(); + tag->_outer_html_begin_offset= offset-1; + tag_list->tags = realloc(tag_list->tags, (tag_list->len+1) * sizeof(struct Tag)); + tag_list->tags[tag_list->len] = tag; + tag_list->len++; + struct Tag *still_open_tag = tag; + char *end_tag = malloc(sizeof(char)); + end_tag[0] = 0; + enum State return_to_state = STATE_INNER_TEXT; + size_t a = 0; + size_t attr_name_count = 0; + enum AttrValueSyntax avs = AVS_NO; + size_t hyphen_count = 0; uint_least32_t cp; - int i = 0; - for (;;) { - ret = 
grapheme_decode_utf8(text+offset, strlen(text+offset), &cp); - if (cp == AMPERSAND || ascii_is_whitespace(cp)) - break; - if (avs > AVS_NO && cp == stop_at) - break; - named_charref = string_concat(named_charref, cp_to_string(cp, ret)); - if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF) - break; - offset += ret; - i++; - } - return named_charref; -} - -char *charref_named_encode(const char *name) -{ - char *buf = NULL; - size_t len; - int i; - for (i=0; i<2138; i++) { - if (string_starts_with(name, single_cp_entities[i].name)) { - buf = realloc(buf, MAX_CODEPOINT_SIZE+1); - len = grapheme_encode_utf8(single_cp_entities[i].cp, buf, MAX_CODEPOINT_SIZE); - buf[len] = 0; - charref_named_concat_remaining_string(name, single_cp_entities[i].name, &buf); - return buf; - } - } - for (i=0; i<93; i++) { - if (string_starts_with(name, double_cp_entities[i].name)) { - size_t buf_len = 0; - buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1); - len = grapheme_encode_utf8(double_cp_entities[i].cp[0], buf, MAX_CODEPOINT_SIZE); - buf_len += len; - buf += len; - len = grapheme_encode_utf8(double_cp_entities[i].cp[1], buf, MAX_CODEPOINT_SIZE); - buf_len += len; - buf[buf_len] = 0; - charref_named_concat_remaining_string(name, double_cp_entities[i].name, &buf); - return buf; + size_t len = strlen(text); + size_t ret, off; + for (off = offset; off<len; off += ret) { + if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { + fprintf(stderr, "htex: parseTag.grapheme_decode_utf8 failed.\n"); + } else { + /* char *the_codepoint = cp_to_string(cp, ret); + printf("cp: %02X, %s, %s\n", cp, the_codepoint, state_to_string(state)); + free(the_codepoint); */ + switch (state) { + case STATE_INNER_TEXT: + if (cp == LESS_THAN_SIGN) { + state = STATE_TAG; + break; + } + if (cp == AMPERSAND) { + return_to_state = STATE_INNER_TEXT; + state = STATE_CHAR_REF; + break; + } + still_open_tag = tag_get_last_open(tag_list); + still_open_tag->inner_text = string_concat(still_open_tag->inner_text, 
cp_to_string(cp, ret)); + break; + case STATE_TAG: + if (cp == SOLIDUS) { + state = STATE_END_TAG_NAME; + break; + } + if (cp == EXCLAMATION_MARK) { + state = STATE_COMMENT; + break; + } + still_open_tag = tag_get_last_open(tag_list); + struct Tag *one_tag = tag_parse(tag_list, text, off, STATE_BEGIN_TAG_NAME); + still_open_tag->children = realloc( + still_open_tag->children, + (still_open_tag->children_len+1) * sizeof(struct Tag) + ); + still_open_tag->children[still_open_tag->children_len] = one_tag; + still_open_tag->children_len++; + free(end_tag); + return tag; + case STATE_BEGIN_TAG_NAME: + if (cp == GREATER_THAN_SIGN) { + state = tag_process_end_of_opening_tag(tag, off); + break; + } + if (ascii_is_whitespace(cp)) { + state = STATE_ATTR_NAME; + break; + } + if (ascii_is_digit(cp) || ascii_is_alpha(cp)) { + tag->name = string_concat(tag->name, cp_to_string(cp, ret)); + } + break; + case STATE_END_TAG_NAME: + if (cp == GREATER_THAN_SIGN) { + struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); + if (closed_tag != NULL) + tag_set_inner_html_end_offset(closed_tag, text, off); + free(end_tag); + end_tag = malloc(sizeof(char)); + end_tag[0] = 0; + state = STATE_INNER_TEXT; + break; + } + if (!ascii_is_whitespace(cp)) + end_tag = string_concat(end_tag, cp_to_string(cp, ret)); + break; + case STATE_ATTR_NAME: + if (cp == GREATER_THAN_SIGN) { + state = tag_process_end_of_opening_tag(tag, off); + break; + } + if (ascii_is_whitespace(cp)) { + if (attr_name_count == a+1) + a++; + break; + } + if (cp == EQUALS_SIGN) { + state = STATE_ATTR_VALUE; + break; + } + if (attr_name_char_is_valid(cp)) { + if (attr_name_count != a+1) { + tag->attrs = realloc( + tag->attrs, + (a+1) * sizeof(struct Attr) + ); + tag->attrs[a] = attr_init(); + attr_name_count = a + 1; + tag->attrs_len = attr_name_count; + } + tag->attrs[a]->name = string_concat(tag->attrs[a]->name, cp_to_string(cp, ret)); + } + break; + case STATE_ATTR_VALUE: + if (ascii_is_whitespace(cp)) { + 
if (avs == AVS_UNQUOTED) { + avs = AVS_NO; + state = STATE_ATTR_NAME; + } else if (avs == AVS_QUOTATION_MARK || avs == AVS_APOSTROPHE) { + if ( + strcmp("id", tag->attrs[a]->name) == 0 || + strcmp("class", tag->attrs[a]->name) == 0 + ) { + char *tmp_name = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char)); + strcpy(tmp_name, tag->attrs[a]->name); + tag->attrs = realloc( + tag->attrs, + (a+1) * sizeof(struct Attr) + ); + a++; + tag->attrs[a] = attr_init(); + free(tag->attrs[a]->name); + tag->attrs[a]->name = tmp_name; + tag->attrs_len++; + attr_name_count = a + 1; + } else { + tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret)); + } + } + break; + } + if (cp == QUOTATION_MARK) { + if (avs == AVS_NO) { + avs = AVS_QUOTATION_MARK; + break; + } + if (avs == AVS_QUOTATION_MARK) { + avs = AVS_NO; + state = STATE_ATTR_NAME; + break; + } + } + if (cp == APOSTROPHE) { + if (avs == AVS_NO) { + avs = AVS_APOSTROPHE; + break; + } + if (avs == AVS_APOSTROPHE) { + avs = AVS_NO; + state = STATE_ATTR_NAME; + break; + } + } + if (cp == GREATER_THAN_SIGN) { + state = tag_process_end_of_opening_tag(tag, off); + break; + } + if (avs == AVS_NO && attr_value_unquoted_char_is_valid(cp)) { + avs = AVS_UNQUOTED; + } + if (avs > AVS_NO) { + if (cp == AMPERSAND) { + state = STATE_CHAR_REF; + return_to_state = STATE_ATTR_VALUE; + break; + } + tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret)); + } + break; + case STATE_COMMENT: + if (cp == GREATER_THAN_SIGN && hyphen_count >= 2) { + state = STATE_INNER_TEXT; + break; + } + if (cp == HYPHEN_MINUS) + hyphen_count++; + else + hyphen_count = 0; + break; + case STATE_STYLE: + if (cp == LESS_THAN_SIGN) { + state = STATE_STYLE_POSSIBLE_END_TAG; + break; + } + break; + case STATE_STYLE_POSSIBLE_END_TAG: + if (cp == SOLIDUS) + state = STATE_STYLE_END_TAG; + else + state = STATE_STYLE; + break; + case STATE_STYLE_END_TAG: + if (cp == GREATER_THAN_SIGN) { + struct Tag *closed_tag = 
tag_close_last_unclosed(tag_list, end_tag, off+ret); + if (closed_tag != NULL) + tag_set_inner_html_end_offset(closed_tag, text, off); + free(end_tag); + end_tag = malloc(sizeof(char)); + end_tag[0] = 0; + state = STATE_INNER_TEXT; + break; + } + if (!ascii_is_whitespace(cp)) + end_tag = string_concat(end_tag, cp_to_string(cp, ret)); + break; + case STATE_SCRIPT: + if (cp == LESS_THAN_SIGN) { + state = STATE_SCRIPT_POSSIBLE_END_TAG; + break; + } + break; + case STATE_SCRIPT_POSSIBLE_END_TAG: + if (cp == SOLIDUS) + state = STATE_SCRIPT_END_TAG; + else + state = STATE_SCRIPT; + break; + case STATE_SCRIPT_END_TAG: + if (cp == GREATER_THAN_SIGN) { + struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); + if (closed_tag != NULL) + tag_set_inner_html_end_offset(closed_tag, text, off); + free(end_tag); + end_tag = malloc(sizeof(char)); + end_tag[0] = 0; + state = STATE_INNER_TEXT; + break; + } + if (!ascii_is_whitespace(cp)) + end_tag = string_concat(end_tag, cp_to_string(cp, ret)); + break; + case STATE_CHAR_REF: + if (cp == NUMBER_SIGN) { /* hashtag */ + state = STATE_CHAR_REF_NUMERIC; + break; + } + char *named_charref = charref_named_parse(text, off, len, avs); + off += strlen(named_charref)-1; + char *encoded_named_charref = charref_named_encode(named_charref); + if (return_to_state == STATE_INNER_TEXT) { + still_open_tag = tag_get_last_open(tag_list); + still_open_tag->inner_text = string_concat(still_open_tag->inner_text, encoded_named_charref); + } else if (return_to_state == STATE_ATTR_VALUE) { + tag->attrs[a]->value = string_concat(tag->attrs[a]->value, encoded_named_charref); + } + free(named_charref); + state = return_to_state; + break; + case STATE_CHAR_REF_NUMERIC: + if (cp == SMALL_LETTER_X || cp == CAPITAL_LETTER_X) { + size_t new_offset; + char *numeric_charref = charref_numeric_parse_and_encode(text, off+1, &new_offset, 16); + off += new_offset; + if (return_to_state == STATE_INNER_TEXT) { + still_open_tag = 
tag_get_last_open(tag_list); + still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref); + } else if (return_to_state == STATE_ATTR_VALUE) { + tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref); + } + state = return_to_state; + break; + } else if (ascii_is_digit(cp)) { + size_t new_offset; + char *numeric_charref = charref_numeric_parse_and_encode(text, off, &new_offset, 10); + off += new_offset-1; + if (return_to_state == STATE_INNER_TEXT) { + still_open_tag = tag_get_last_open(tag_list); + still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref); + } else if (return_to_state == STATE_ATTR_VALUE) { + tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref); + } + state = return_to_state; + break; + } + state = return_to_state; + break; + } } } - buf = realloc(buf, (strlen(name)+2) * sizeof(char)); - buf[0] = '&'; - buf[1] = 0; - strcat(buf, name); - return buf; -} - -void charref_named_concat_remaining_string(const char *parsed_string, const char *charref, char **buf) -{ - const char *remaining = &parsed_string[strlen(charref)]; - size_t remaining_len = strlen(remaining); - size_t buf_len = strlen(*buf); - if (remaining_len > 0) { - if (remaining_len == 1 && remaining[0] == ';') - return; - *buf = realloc(*buf, buf_len+remaining_len+1); - strcat(*buf, remaining); - } + free(end_tag); + return tag; } -static inline bool ascii_is_digit(uint_least32_t cp) +static void tag_debug_print(struct Tag *tag, int indent) { - if (cp >= 0x30 && cp <= 0x39) - return true; - return false; + for (int i=0; i<indent; i++) + putchar(' '); + printf("%s", tag->name); + for (int i=0; i<tag->attrs_len; i++) + printf(" %s=%s", tag->attrs[i]->name, tag->attrs[i]->value); + printf("\n"); + indent++; + for (int i=tag->children_len-1; i>-1; i--) + tag_debug_print(tag->children[i], indent); } -static inline bool ascii_alpha_is_upper(uint_least32_t cp) +static struct TagList 
*tag_list_init(void) { - if (cp >= 0x41 && cp <= 0x5A) - return true; - return false; + struct TagList *tag_list = malloc(sizeof(struct TagList)); + tag_list->tags = NULL; + tag_list->len = 0; + return tag_list; } -static inline bool ascii_alpha_is_lower(uint_least32_t cp) +void tag_list_free(struct TagList *tag_list) { - if (cp >= 0x61 && cp <= 0x7A) - return true; - return false; + free(tag_list->tags); + free(tag_list); } -static inline bool ascii_is_alpha(uint_least32_t cp) +static struct HTMLDocument *html_document_init(void) { - if (ascii_alpha_is_lower(cp) || ascii_alpha_is_upper(cp)) - return true; - return false; + struct HTMLDocument *document = malloc(sizeof(struct HTMLDocument)); + document->buffer = NULL; + document->tag = NULL; + document->tag_list = NULL; + return document; } -static inline bool ascii_is_whitespace(uint_least32_t cp) +void html_document_free(struct HTMLDocument *document) { - if ( - cp == TAB || - cp == LF || - cp == FF || - cp == CR || - cp == SPACE - ) - return true; - return false; + // free(doc->buffer); + tag_free(document->tag); + tag_list_free(document->tag_list); + free(document); } -static inline bool is_c0_control(uint_least32_t cp) +struct HTMLDocument *html_document_parse(char *buffer) { - if (cp >= 0x00 && cp <= 0x1F) - return true; - return false; + struct HTMLDocument *document = html_document_init(); + document->buffer = buffer; + document->tag_list = tag_list_init(); + int len = tag_doctype_parse(document->buffer); + if (len == -1) { + fprintf(stderr, "htex: Error parsing <!DOCTYPE ....\n"); + html_document_free(document); + return NULL; + } else { + document->buffer += len; + } + document->tag = tag_parse(document->tag_list, document->buffer, 0, STATE_INNER_TEXT); + return document; } -static inline bool is_control(uint_least32_t cp) +struct TagList *html_document_find(struct HTMLDocument *document, struct FindOpts *opts) { - if (is_c0_control(cp)) - return true; - if (cp >= 0x7F && cp <= 0x9F) - return true; - 
return false; + struct TagList *found_tags = tag_list_init(); + if (!find_opts_exist(opts)) { + found_tags->tags = realloc(found_tags->tags, sizeof(struct Tag)); + found_tags->tags[0] = document->tag->children[0]; + found_tags->len = 1; + } else { + tag_find(document->tag, opts, found_tags); + } + return found_tags; } -static inline bool is_non_char(uint_least32_t cp) +void html_document_print_find_result(struct HTMLDocument *document, struct TagList *found_tags, struct FindOpts *opts) { - if (cp >= 0xFDD0 && cp <= 0xFDEF) - return true; - if ( - cp == 0xFFFE || cp == 0xFFFF || - cp == 0x1FFFE || cp == 0x1FFFF || - cp == 0x2FFFE || cp == 0x2FFFF || - cp == 0x3FFFE || cp == 0x3FFFF || - cp == 0x4FFFE || cp == 0x4FFFF || - cp == 0x5FFFE || cp == 0x5FFFF || - cp == 0x6FFFE || cp == 0x6FFFF || - cp == 0x7FFFE || cp == 0x7FFFF || - cp == 0x8FFFE || cp == 0x8FFFF || - cp == 0x9FFFE || cp == 0x9FFFF || - cp == 0xAFFFE || cp == 0xAFFFF || - cp == 0xBFFFE || cp == 0xBFFFF || - cp == 0xCFFFE || cp == 0xCFFFF || - cp == 0xDFFFE || cp == 0xDFFFF || - cp == 0xEFFFE || cp == 0xEFFFF || - cp == 0xFFFFE || cp == 0xFFFFF || - cp == 0x10FFFE || cp == 0x10FFFF - ) - return true; - return false; + if (opts->is_except) { + bool is_match = false; + for (int i=0; i<strlen(document->buffer); i++) { + is_match = false; + for (int k=0; k<found_tags->len; k++) { + if ( + found_tags->tags[k]->_outer_html_begin_offset <= i && + found_tags->tags[k]->_outer_html_end_offset > i + ) + is_match = true; + } + if (!is_match) + putchar(document->buffer[i]); + } + } else { + char *requested_text = NULL; + char *trimmed_text = NULL; + for (int i=0; i<found_tags->len; i++) { + switch (opts->out) { + case OUT_INNER_HTML: + requested_text = tag_get_inner_html(found_tags->tags[i], document->buffer); + trimmed_text = string_trim(requested_text); + free(requested_text); + break; + case OUT_OUTER_HTML: + requested_text = tag_get_outer_html(found_tags->tags[i], document->buffer); + trimmed_text = 
string_trim(requested_text); + free(requested_text); + break; + case OUT_INNER_TEXT: + trimmed_text = string_trim(found_tags->tags[i]->inner_text); + break; + case OUT_ATTR_VALUE: + if (strlen(opts->key) > 0 && strlen(opts->tag) > 0) { + for (int k=0; k<found_tags->tags[i]->attrs_len; k++) { + if (strcmp(found_tags->tags[i]->attrs[k]->name, opts->key) == 0) + printf("%s\n", found_tags->tags[i]->attrs[k]->value); + } + } else if (strlen(opts->tag) > 0) { + for (int k=0; k<found_tags->tags[i]->attrs_len; k++) + printf("%s\n", found_tags->tags[i]->attrs[k]->value); + } + break; + } + if (trimmed_text) { + if (strlen(trimmed_text) > 0) + printf("%s\n", trimmed_text); + free(trimmed_text); + } + } + } } -const char *state_to_string(enum State state) +void html_document_debug_print_tree(struct HTMLDocument *document) { - switch(state) { - case STATE_INNER_TEXT: return "STATE_INNER_TEXT"; - case STATE_TAG: return "STATE_TAG"; - case STATE_BEGIN_TAG_NAME: return "STATE_BEGIN_TAG_NAME"; - case STATE_END_TAG_NAME: return "STATE_END_TAG_NAME"; - case STATE_ATTR_NAME: return "STATE_ATTR_NAME"; - case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE"; - case STATE_COMMENT: return "STATE_COMMENT"; - case STATE_SCRIPT: return "STATE_SCRIPT"; - case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG"; - case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG"; - case STATE_STYLE: return "STATE_STYLE"; - case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG"; - case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG"; - case STATE_CHAR_REF: return "STATE_CHAR_REF"; - case STATE_CHAR_REF_NUMERIC: return "STATE_CHAR_REF_NUMERIC"; - } - return ""; + tag_debug_print(document->tag, -1); } diff --git a/src/html.h b/src/html.h @@ -22,11 +22,6 @@ #define LONGEST_NAMED_CHAR_REF 32 #define MAX_CODEPOINT_SIZE 4 -static const char *void_elements[] = { - "area", "base", "br", "col", "embed", "hr", "img", - "input", "link", "meta", "source", "track", "wbr" -}; - 
struct Attr { char *name; char *value; // optional @@ -52,6 +47,12 @@ struct TagList { size_t len; }; +struct HTMLDocument { + char *buffer; + struct Tag *tag; + struct TagList *tag_list; +}; + enum State { STATE_INNER_TEXT, STATE_TAG, @@ -84,42 +85,30 @@ enum AttrValueSyntax { AVS_UNQUOTED }; -void html_filter(char *text, struct FindOpts *opts); -void html_print(struct Tag *tag, int indent); - -struct Tag *tag_init(void); -struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state); -struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset); -struct Tag *tag_get_last_open(struct TagList *tag_list); -int tag_doctype_parse(const char *text); -char *tag_get_outer_html(struct Tag *tag, char *text); -char *tag_get_inner_html(struct Tag *tag, char *text); -enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset); -static inline bool tag_is_void_element(struct Tag *tag); -void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset); -void tag_free(struct Tag *tag); -void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags); -void tag_print_find_result(struct Tag *root_tag, struct FindOpts *opts, struct TagList *found_tags, char *text); - -struct TagList *tag_list_init(void); -void tag_list_free(struct TagList *tag_list); +enum OutType { + OUT_INNER_HTML, + OUT_OUTER_HTML, + OUT_INNER_TEXT, + OUT_ATTR_VALUE +}; -struct Attr *attr_init(void); -static inline bool attr_name_char_is_valid(uint_least32_t cp); -static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp); +struct FindOpts { + char *tag; + char *attr; + char *key; + enum OutType out; + bool is_except; + int limit; +}; -char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base); -char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs); -char *charref_named_encode(const char *name); 
-void charref_named_concat_remaining_string(const char *parsed_string, const char *charref, char **buf); +struct FindOpts *find_opts_parse(const char *pattern); +void find_opts_free(struct FindOpts *opts); +enum OutType output_type_parse(const char *type); -static inline bool ascii_is_digit(uint_least32_t cp); -static inline bool ascii_alpha_is_upper(uint_least32_t cp); -static inline bool ascii_alpha_is_lower(uint_least32_t cp); -static inline bool ascii_is_alpha(uint_least32_t cp); -static inline bool ascii_is_whitespace(uint_least32_t cp); -static inline bool is_c0_control(uint_least32_t cp); -static inline bool is_control(uint_least32_t cp); -static inline bool is_non_char(uint_least32_t cp); +struct HTMLDocument *html_document_parse(char *buffer); +struct TagList *html_document_find(struct HTMLDocument *document, struct FindOpts *opts); +void html_document_print_find_result(struct HTMLDocument *document, struct TagList *found_tags, struct FindOpts *opts); +void html_document_free(struct HTMLDocument *document); +void html_document_debug_print_tree(struct HTMLDocument *document); -const char *state_to_string(enum State s); +void tag_list_free(struct TagList *tag_list); diff --git a/src/misc.c b/src/misc.c @@ -1,3 +1,11 @@ +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <grapheme.h> +#include "misc.h" + char *string_concat(char *str1, char *str2) { size_t len2 = strlen(str2); diff --git a/src/misc.h b/src/misc.h @@ -0,0 +1,6 @@ +char *string_concat(char *str1, char *str2); +char *cp_to_string(uint_least32_t cp, size_t len); +char *string_trim(char *text); +bool string_starts_with(const char *string, const char *part); +bool file_try_read(char *buf, FILE *fp); +char *file_read(FILE *fp); diff --git a/todo b/todo @@ -1,3 +1,3 @@ -replace int,size_t with uint* handle correctly when no search pattern was provided -implement charref also for outerhtml,innerhtml +implement charref also for 
outerhtml,innerhtml: but how? +what about replacing FindOpts with CssSelector?