htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 7bc2de577806b2c880cb056afd5f0abdcbb95672
parent 61caacea3913f3844bb5aa7deb5c01f6d2dd5858
Author: Robin <kroekerrobin@gmail.com>
Date:   Thu,  4 Apr 2024 10:40:20 +0200

Clean up

Diffstat:
M htex.c | 10 +++++++---
D html.c | 883 -------------------------------------------------------------------------------
D html.h | 124 -------------------------------------------------------------------------------
R entities.h -> src/entities.h | 0
A src/html.c | 883 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/html.h | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R misc.c -> src/misc.c | 0
M todo | 1 +
8 files changed, 1015 insertions(+), 1010 deletions(-)

diff --git a/htex.c b/htex.c @@ -6,8 +6,8 @@ #include <inttypes.h> #include <grapheme.h> #include "htex.h" -#include "misc.c" -#include "html.c" +#include "src/misc.c" +#include "src/html.c" bool find_opts_exist(struct FindOpts *opts) { @@ -168,6 +168,10 @@ int main(int argc, char *argv[]) break; case 'l': limit = atoi(optarg); + if (limit <= 0) { + fprintf(stderr, "htex: Provide a valid limit value.\n"); + return -1; + } break; } } @@ -213,7 +217,7 @@ int main(int argc, char *argv[]) struct FindOpts *options = find_opts_parse(search_pattern); options->out = out; options->is_except = is_except; - options->limit = limit; + options->limit = (size_t)limit; html_filter(text, options); free(output); find_opts_free(options); diff --git a/html.c b/html.c @@ -1,883 +0,0 @@ -#include "html.h" -#include "entities.h" - -void html_filter(char *text, struct FindOpts *opts) -{ - struct TagList *tag_list = tag_list_init(); - struct TagList *found_tags = tag_list_init(); - size_t len = tag_doctype_parse(text); - if (len == -1) { - fprintf(stderr, "htex: Error parsing <!DOCTYPE ....\n"); - goto CLEAN; - } else { - text += len; - } - struct Tag *root_tag = tag_parse(tag_list, text, 0, STATE_INNER_TEXT); - if (!find_opts_exist(opts)) { - found_tags->tags = realloc(found_tags->tags, sizeof(struct Tag)); - found_tags->tags[0] = root_tag->children[0]; - found_tags->len = 1; - } else { - tag_find(root_tag, opts, found_tags); - } - tag_print_find_result(root_tag, opts, found_tags, text); - // html_print(root_tag, -1); - tag_free(root_tag); -CLEAN: - tag_list_free(tag_list); - tag_list_free(found_tags); -} - -void html_print(struct Tag *tag, int indent) -{ - for (int i=0; i<indent; i++) - putchar(' '); - printf("%s", tag->name); - for (int i=0; i<tag->attrs_len; i++) - printf(" %s=%s", tag->attrs[i]->name, tag->attrs[i]->value); - printf("\n"); - indent++; - for (int i=tag->children_len-1; i>-1; i--) - html_print(tag->children[i], indent); -} - -struct Tag *tag_init(void) -{ - struct Tag 
*t = malloc(sizeof(struct Tag)); - t->name = malloc(sizeof(char)); - t->name[0] = 0; - t->inner_text = malloc(sizeof(char)); - t->inner_text[0] = 0; - t->attrs = NULL; - t->children = NULL; - t->attrs_len = 0; - t->children_len = 0; - t->_is_void_element = false; - t->_is_closed = false; - t->_outer_html_begin_offset = 0; - t->_outer_html_end_offset = 0; - t->_inner_html_begin_offset = 0; - t->_inner_html_end_offset = 0; - return t; -} - -struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state) -{ - struct Tag *tag = tag_init(); - tag->_outer_html_begin_offset= offset-1; - tag_list->tags = realloc(tag_list->tags, (tag_list->len+1) * sizeof(struct Tag)); - tag_list->tags[tag_list->len] = tag; - tag_list->len++; - struct Tag *still_open_tag = tag; - char *end_tag = malloc(sizeof(char)); - end_tag[0] = 0; - enum State return_to_state = STATE_INNER_TEXT; - size_t a = 0; - size_t attr_name_count = 0; - enum AttrValueSyntax avs = AVS_NO; - size_t hyphen_count = 0; - uint_least32_t cp; - size_t len = strlen(text); - size_t ret, off; - for (off = offset; off<len; off += ret) { - if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { - fprintf(stderr, "htex: parseTag.grapheme_decode_utf8 failed.\n"); - } else { - // char *the_codepoint = cp_to_string(cp, ret); - // printf("cp: %02X, %s, %s\n", cp, the_codepoint, state_to_string(state)); - // free(the_codepoint); - switch (state) { - case STATE_INNER_TEXT: - if (cp == LESS_THAN_SIGN) { - state = STATE_TAG; - break; - } - if (cp == AMPERSAND) { - return_to_state = STATE_INNER_TEXT; - state = STATE_CHAR_REF; - break; - } - still_open_tag = tag_get_last_open(tag_list); - still_open_tag->inner_text = string_concat(still_open_tag->inner_text, cp_to_string(cp, ret)); - break; - case STATE_TAG: - if (cp == SOLIDUS) { - state = STATE_END_TAG_NAME; - break; - } - if (cp == EXCLAMATION_MARK) { - state = STATE_COMMENT; - break; - } - still_open_tag = tag_get_last_open(tag_list); - 
struct Tag *one_tag = tag_parse(tag_list, text, off, STATE_BEGIN_TAG_NAME); - still_open_tag->children = realloc( - still_open_tag->children, - (still_open_tag->children_len+1) * sizeof(struct Tag) - ); - still_open_tag->children[still_open_tag->children_len] = one_tag; - still_open_tag->children_len++; - free(end_tag); - return tag; - case STATE_BEGIN_TAG_NAME: - if (cp == GREATER_THAN_SIGN) { - state = tag_process_end_of_opening_tag(tag, off); - break; - } - if (ascii_is_whitespace(cp)) { - state = STATE_ATTR_NAME; - break; - } - if (ascii_is_digit(cp) || ascii_is_alpha(cp)) { - tag->name = string_concat(tag->name, cp_to_string(cp, ret)); - } - break; - case STATE_END_TAG_NAME: - if (cp == GREATER_THAN_SIGN) { - struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); - if (closed_tag != NULL) - tag_set_inner_html_end_offset(closed_tag, text, off); - free(end_tag); - end_tag = malloc(sizeof(char)); - end_tag[0] = 0; - state = STATE_INNER_TEXT; - break; - } - if (!ascii_is_whitespace(cp)) - end_tag = string_concat(end_tag, cp_to_string(cp, ret)); - break; - case STATE_ATTR_NAME: - if (cp == GREATER_THAN_SIGN) { - state = tag_process_end_of_opening_tag(tag, off); - break; - } - if (ascii_is_whitespace(cp)) { - if (attr_name_count == a+1) - a++; - break; - } - if (cp == EQUALS_SIGN) { - state = STATE_ATTR_VALUE; - break; - } - if (attr_name_char_is_valid(cp)) { - if (attr_name_count != a+1) { - tag->attrs = realloc( - tag->attrs, - (a+1) * sizeof(struct Attr) - ); - tag->attrs[a] = attr_init(); - attr_name_count = a + 1; - tag->attrs_len = attr_name_count; - } - tag->attrs[a]->name = string_concat(tag->attrs[a]->name, cp_to_string(cp, ret)); - } - break; - case STATE_ATTR_VALUE: - if (ascii_is_whitespace(cp)) { - if (avs == AVS_UNQUOTED) { - avs = AVS_NO; - state = STATE_ATTR_NAME; - } else if (avs == AVS_QUOTATION_MARK || avs == AVS_APOSTROPHE) { - if ( - strcmp("id", tag->attrs[a]->name) == 0 || - strcmp("class", tag->attrs[a]->name) == 0 - ) 
{ - char *tmp_name = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char)); - strcpy(tmp_name, tag->attrs[a]->name); - tag->attrs = realloc( - tag->attrs, - (a+1) * sizeof(struct Attr) - ); - a++; - tag->attrs[a] = attr_init(); - free(tag->attrs[a]->name); - tag->attrs[a]->name = tmp_name; - tag->attrs_len++; - attr_name_count = a + 1; - } else { - tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret)); - } - } - break; - } - if (cp == QUOTATION_MARK) { - if (avs == AVS_NO) { - avs = AVS_QUOTATION_MARK; - break; - } - if (avs == AVS_QUOTATION_MARK) { - avs = AVS_NO; - state = STATE_ATTR_NAME; - break; - } - } - if (cp == APOSTROPHE) { - if (avs == AVS_NO) { - avs = AVS_APOSTROPHE; - break; - } - if (avs == AVS_APOSTROPHE) { - avs = AVS_NO; - state = STATE_ATTR_NAME; - break; - } - } - if (cp == GREATER_THAN_SIGN) { - state = tag_process_end_of_opening_tag(tag, off); - break; - } - if (avs == AVS_NO && attr_value_unquoted_char_is_valid(cp)) { - avs = AVS_UNQUOTED; - } - if (avs > AVS_NO) { - if (cp == AMPERSAND) { - state = STATE_CHAR_REF; - return_to_state = STATE_ATTR_VALUE; - break; - } - tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret)); - } - break; - case STATE_COMMENT: - if (cp == GREATER_THAN_SIGN && hyphen_count >= 2) { - state = STATE_INNER_TEXT; - break; - } - if (cp == HYPHEN_MINUS) - hyphen_count++; - else - hyphen_count = 0; - break; - case STATE_STYLE: - if (cp == LESS_THAN_SIGN) { - state = STATE_STYLE_POSSIBLE_END_TAG; - break; - } - break; - case STATE_STYLE_POSSIBLE_END_TAG: - if (cp == SOLIDUS) - state = STATE_STYLE_END_TAG; - else - state = STATE_STYLE; - break; - case STATE_STYLE_END_TAG: - if (cp == GREATER_THAN_SIGN) { - struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); - if (closed_tag != NULL) - tag_set_inner_html_end_offset(closed_tag, text, off); - free(end_tag); - end_tag = malloc(sizeof(char)); - end_tag[0] = 0; - state = STATE_INNER_TEXT; - break; - 
} - if (!ascii_is_whitespace(cp)) - end_tag = string_concat(end_tag, cp_to_string(cp, ret)); - break; - case STATE_SCRIPT: - if (cp == LESS_THAN_SIGN) { - state = STATE_SCRIPT_POSSIBLE_END_TAG; - break; - } - break; - case STATE_SCRIPT_POSSIBLE_END_TAG: - if (cp == SOLIDUS) - state = STATE_SCRIPT_END_TAG; - else - state = STATE_SCRIPT; - break; - case STATE_SCRIPT_END_TAG: - if (cp == GREATER_THAN_SIGN) { - struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); - if (closed_tag != NULL) - tag_set_inner_html_end_offset(closed_tag, text, off); - free(end_tag); - end_tag = malloc(sizeof(char)); - end_tag[0] = 0; - state = STATE_INNER_TEXT; - break; - } - if (!ascii_is_whitespace(cp)) - end_tag = string_concat(end_tag, cp_to_string(cp, ret)); - break; - case STATE_CHAR_REF: - if (cp == NUMBER_SIGN) { /* hashtag */ - state = STATE_CHAR_REF_NUMERIC; - break; - } - char *named_charref = charref_named_parse(text, off, len, avs); - off += strlen(named_charref)-1; - char *encoded_named_charref = charref_named_encode(named_charref); - if (return_to_state == STATE_INNER_TEXT) { - still_open_tag = tag_get_last_open(tag_list); - still_open_tag->inner_text = string_concat(still_open_tag->inner_text, encoded_named_charref); - } else if (return_to_state == STATE_ATTR_VALUE) { - tag->attrs[a]->value = string_concat(tag->attrs[a]->value, encoded_named_charref); - } - free(named_charref); - state = return_to_state; - break; - case STATE_CHAR_REF_NUMERIC: - if (cp == SMALL_LETTER_X || cp == CAPITAL_LETTER_X) { - size_t new_offset; - char *numeric_charref = charref_numeric_parse_and_encode(text, off+1, &new_offset, 16); - off += new_offset; - if (return_to_state == STATE_INNER_TEXT) { - still_open_tag = tag_get_last_open(tag_list); - still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref); - } else if (return_to_state == STATE_ATTR_VALUE) { - tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref); - } - state = 
return_to_state; - break; - } else if (ascii_is_digit(cp)) { - size_t new_offset; - char *numeric_charref = charref_numeric_parse_and_encode(text, off, &new_offset, 10); - off += new_offset-1; - if (return_to_state == STATE_INNER_TEXT) { - still_open_tag = tag_get_last_open(tag_list); - still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref); - } else if (return_to_state == STATE_ATTR_VALUE) { - tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref); - } - state = return_to_state; - break; - } - state = return_to_state; - break; - } - } - } - free(end_tag); - return tag; -} - -struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset) -{ - for (int i=tag_list->len-1; i>-1; i--) { - if (strcmp(tag_list->tags[i]->name, end_tag_name) == 0 && !tag_list->tags[i]->_is_closed) { - tag_list->tags[i]->_is_closed = true; - tag_list->tags[i]->_outer_html_end_offset = end_offset; - return tag_list->tags[i]; - } - } - return NULL; -} - -struct Tag *tag_get_last_open(struct TagList *tag_list) -{ - for (int i=tag_list->len-1; i>-1; i--) { - if (!tag_list->tags[i]->_is_void_element && !tag_list->tags[i]->_is_closed) { - return tag_list->tags[i]; - } - } - return tag_list->tags[0]; -} - -size_t tag_doctype_parse(const char *text) -{ - size_t offset = 0; - enum DoctypeState state = DSTATE_TEXT; - char *doctype = NULL; - char *lower_doctype = NULL; - uint_least32_t cp; - size_t len = strlen(text); - size_t ret, off; - for (off = 0; off<len; off += ret) { - if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { - fprintf(stderr, "htex: parseDoctype.grapheme_decode_utf8 failed.\n"); - } else { - switch (state) { - case DSTATE_TEXT: - if (cp == LESS_THAN_SIGN) { - state = DSTATE_POSSIBLE_DTYPE; - break; - } - if (cp == GREATER_THAN_SIGN) { - offset = off; - goto CLEANUP; - } - break; - case DSTATE_POSSIBLE_DTYPE: - if (cp == EXCLAMATION_MARK) - state = 
DSTATE_DTYPE_OR_COMMENT; - else - goto CLEANUP; - break; - case DSTATE_DTYPE_OR_COMMENT: - if (cp == HYPHEN_MINUS) { - goto CLEANUP; - } else { - doctype = string_concat(doctype, cp_to_string(cp, ret)); - state = DSTATE_DTYPE; - break; - } - break; - case DSTATE_DTYPE: - if (ascii_is_whitespace(cp)) { - size_t dlen = strlen(doctype)+1; - lower_doctype = malloc(dlen * sizeof(char)); - grapheme_to_lowercase_utf8(doctype, dlen, lower_doctype, dlen); - if (strcmp(lower_doctype, "doctype") == 0) { - state = DSTATE_TEXT; - } else { - offset = -1; - goto CLEANUP; - } - break; - } - doctype = string_concat(doctype, cp_to_string(cp, ret)); - break; - } - } - } -CLEANUP: - free(doctype); - free(lower_doctype); - return offset; -} - -char *tag_get_outer_html(struct Tag *tag, char *text) -{ - char *outer_html = NULL; - int o = 0; - for (int i=tag->_outer_html_begin_offset; i<tag->_outer_html_end_offset; i++) { - outer_html = realloc(outer_html, (o+1) * sizeof(char)); - outer_html[o] = text[i]; - o++; - } - outer_html = realloc(outer_html, (o+1) * sizeof(char)); - outer_html[o] = 0; - return outer_html; -} - -char *tag_get_inner_html(struct Tag *tag, char *text) -{ - char *inner_html = NULL; - int o = 0; - for (int i=tag->_inner_html_begin_offset; i<tag->_inner_html_end_offset; i++) { - inner_html = realloc(inner_html, (o+1) * sizeof(char)); - inner_html[o] = text[i]; - o++; - } - inner_html = realloc(inner_html, (o+1) * sizeof(char)); - inner_html[o] = 0; - return inner_html; -} - -enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset) -{ - tag->_inner_html_begin_offset = offset+1; - tag->_is_void_element = tag_is_void_element(tag); - if (tag->_is_void_element) - tag->_outer_html_end_offset = offset+1; - if (strcmp(tag->name, "script") == 0) - return STATE_SCRIPT; - else if (strcmp(tag->name, "style") == 0) - return STATE_STYLE; - return STATE_INNER_TEXT; -} - -static inline bool tag_is_void_element(struct Tag *tag) -{ - for (int i=0; i<13; i++) { - if 
(strcmp(tag->name, void_elements[i]) == 0) - return true; - } - return false; -} - -void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset) -{ - int i = offset; - while (text[i] != '<') - i--; - closed_tag->_inner_html_end_offset = i; -} - -void tag_free(struct Tag *tag) -{ - free(tag->name); - free(tag->inner_text); - for (int i=0; i<tag->attrs_len; i++) { - free(tag->attrs[i]->name); - free(tag->attrs[i]->value); - free(tag->attrs[i]); - } - free(tag->attrs); - for (int i=0; i<tag->children_len; i++) { - if (tag->children[i] != NULL) - tag_free(tag->children[i]); - } - free(tag->children); - free(tag); -} - -void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags) -{ - if (opts->limit > 0 && found_tags->len == opts->limit) - return; - bool matches_tag = false; - bool matches_attr_key = false; - bool matches_attr_value = false; - if (strcmp(tag->name, opts->tag) == 0) - matches_tag = true; - for (int i=0; i<tag->attrs_len; i++) { - if (strcmp(tag->attrs[i]->name, opts->key) == 0) - matches_attr_key = true; - if (strcmp(tag->attrs[i]->value, opts->attr) == 0) - matches_attr_value = true; - } - if (strlen(opts->tag) > 0 && strlen(opts->key) > 0 && strlen(opts->attr) > 0) { - if (matches_tag && matches_attr_key && matches_attr_value) { - found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); - found_tags->tags[found_tags->len] = tag; - found_tags->len++; - } - } else if (strlen(opts->tag) > 0 && strlen(opts->key) > 0) { - if (matches_tag && matches_attr_key) { - found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); - found_tags->tags[found_tags->len] = tag; - found_tags->len++; - } - } else if (strlen(opts->tag) > 0) { - if (matches_tag) { - found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); - found_tags->tags[found_tags->len] = tag; - found_tags->len++; - } - } else if (strlen(opts->key) > 0 && 
strlen(opts->attr) > 0) { - if (matches_attr_key && matches_attr_value) { - found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); - found_tags->tags[found_tags->len] = tag; - found_tags->len++; - } - } else if (strlen(opts->key) > 0) { - if (matches_attr_key) { - found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); - found_tags->tags[found_tags->len] = tag; - found_tags->len++; - } - } else if (strlen(opts->attr) > 0) { - if (matches_attr_value) { - found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); - found_tags->tags[found_tags->len] = tag; - found_tags->len++; - } - } - for (int i=tag->children_len-1; i>-1; i--) { - tag_find(tag->children[i], opts, found_tags); - } -} - -void tag_print_find_result(struct Tag *root_tag, struct FindOpts *opts, struct TagList *found_tags, char *text) -{ - if (opts->is_except) { - bool is_match = false; - for (int i=0; i<strlen(text); i++) { - is_match = false; - for (int k=0; k<found_tags->len; k++) { - if ( - found_tags->tags[k]->_outer_html_begin_offset <= i && - found_tags->tags[k]->_outer_html_end_offset > i - ) - is_match = true; - } - if (!is_match) - putchar(text[i]); - } - } else { - char *requested_text = NULL; - char *trimmed_text = NULL; - for (int i=0; i<found_tags->len; i++) { - switch (opts->out) { - case OUT_INNER_HTML: - requested_text = tag_get_inner_html(found_tags->tags[i], text); - trimmed_text = string_trim(requested_text); - free(requested_text); - break; - case OUT_OUTER_HTML: - requested_text = tag_get_outer_html(found_tags->tags[i], text); - trimmed_text = string_trim(requested_text); - free(requested_text); - break; - case OUT_INNER_TEXT: - trimmed_text = string_trim(found_tags->tags[i]->inner_text); - break; - case OUT_ATTR_VALUE: - if (strlen(opts->key) > 0 && strlen(opts->tag) > 0) { - for (int k=0; k<found_tags->tags[i]->attrs_len; k++) { - if (strcmp(found_tags->tags[i]->attrs[k]->name, opts->key) 
== 0) - printf("%s\n", found_tags->tags[i]->attrs[k]->value); - } - } else if (strlen(opts->tag) > 0) { - for (int k=0; k<found_tags->tags[i]->attrs_len; k++) - printf("%s\n", found_tags->tags[i]->attrs[k]->value); - } - break; - } - if (trimmed_text) { - if (strlen(trimmed_text) > 0) - printf("%s\n", trimmed_text); - free(trimmed_text); - } - } - } -} - -struct TagList *tag_list_init(void) -{ - struct TagList *tag_list = malloc(sizeof(struct TagList)); - tag_list->tags = NULL; - tag_list->len = 0; - return tag_list; -} - -void tag_list_free(struct TagList *tag_list) -{ - free(tag_list->tags); - free(tag_list); -} - -struct Attr *attr_init(void) -{ - struct Attr *attr = malloc(sizeof(struct Attr)); - attr->name = malloc(sizeof(char)); - attr->name[0] = 0; - attr->value = malloc(sizeof(char)); - attr->value[0] = 0; - return attr; -} - -static inline bool attr_name_char_is_valid(uint_least32_t cp) -{ - if (is_control(cp)) - return false; - if (is_non_char(cp)) - return false; - if ( - cp == SPACE || - cp == QUOTATION_MARK || - cp == APOSTROPHE || - cp == GREATER_THAN_SIGN || - cp == SOLIDUS || - cp == EQUALS_SIGN - ) - return false; - return true; -} - -static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp) -{ - /* - Not mentioned invalid characters. - They are already handled before - function call. 
- */ - if ( - cp == EQUALS_SIGN || - cp == LESS_THAN_SIGN || - cp == GREATER_THAN_SIGN || - cp == GRAVE_ACCENT - ) - return false; - return true; -} - -char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base) -{ - size_t old_offset = offset; - char *character = malloc(MAX_CODEPOINT_SIZE * sizeof(char)); - char *numeric_charref = malloc(sizeof(char)); - numeric_charref[0] = 0; - size_t ret; - uint_least32_t cp; - do { - ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp); - numeric_charref = string_concat(numeric_charref, cp_to_string(cp, ret)); - offset += ret; - } while (cp != SEMICOLON); - *new_offset = offset - old_offset; - long i = strtol(numeric_charref, NULL, base); - ret = grapheme_encode_utf8((uint_least32_t)i, character, MAX_CODEPOINT_SIZE); - character[ret] = 0; - free(numeric_charref); - return character; -} - -char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs) -{ - uint_least32_t stop_at = 0; - switch(avs) { - case AVS_QUOTATION_MARK: - stop_at = QUOTATION_MARK; - break; - case AVS_APOSTROPHE: - stop_at = APOSTROPHE; - break; - case AVS_UNQUOTED: - stop_at = GREATER_THAN_SIGN; - break; - case AVS_NO: /* Just to silence the compilier warning */ - break; - } - char *named_charref = malloc(sizeof(char)); - named_charref[0] = 0; - size_t ret; - uint_least32_t cp; - int i = 0; - for (;;) { - ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp); - if (cp == AMPERSAND || ascii_is_whitespace(cp)) - break; - if (avs > AVS_NO && cp == stop_at) - break; - named_charref = string_concat(named_charref, cp_to_string(cp, ret)); - if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF) - break; - offset += ret; - i++; - } - return named_charref; -} - -char *charref_named_encode(const char *name) -{ - char *buf = malloc(2*MAX_CODEPOINT_SIZE+1); - char cp[MAX_CODEPOINT_SIZE]; - memset(&cp, 0, MAX_CODEPOINT_SIZE); - size_t len; - for (int i=0; i<NAMED_CHAR_REF_COUNT; i++) { 
- if (string_starts_with(name, entities[i].name)) { - len = grapheme_encode_utf8(entities[i].cp[0], cp, MAX_CODEPOINT_SIZE); - strcpy(buf, cp); - if (entities[i].cp[1] != 0) { - len += grapheme_encode_utf8(entities[i].cp[1], cp, MAX_CODEPOINT_SIZE); - strcat(buf, cp); - } - buf[len] = 0; - const char *part = &name[strlen(entities[i].name)]; - size_t part_len = strlen(part); - if (part_len > 0) { - if (part_len == 1 && part[0] == ';') - return buf; - buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1+part_len); - strcat(buf, &name[strlen(entities[i].name)]); - buf[len+part_len] = 0; - } - return buf; - } - } - buf = realloc(buf, (strlen(name)+2) * sizeof(char)); - buf[0] = '&'; - buf[1] = 0; - strcat(buf, name); - return buf; -} - -static inline bool ascii_is_digit(uint_least32_t cp) -{ - if (cp >= 0x30 && cp <= 0x39) - return true; - return false; -} - -static inline bool ascii_alpha_is_upper(uint_least32_t cp) -{ - if (cp >= 0x41 && cp <= 0x5A) - return true; - return false; -} - -static inline bool ascii_alpha_is_lower(uint_least32_t cp) -{ - if (cp >= 0x61 && cp <= 0x7A) - return true; - return false; -} - -static inline bool ascii_is_alpha(uint_least32_t cp) -{ - if (ascii_alpha_is_lower(cp) || ascii_alpha_is_upper(cp)) - return true; - return false; -} - -static inline bool ascii_is_whitespace(uint_least32_t cp) -{ - if ( - cp == TAB || - cp == LF || - cp == FF || - cp == CR || - cp == SPACE - ) - return true; - return false; -} - -static inline bool is_c0_control(uint_least32_t cp) -{ - if (cp >= 0x00 && cp <= 0x1F) - return true; - return false; -} - -static inline bool is_control(uint_least32_t cp) -{ - if (is_c0_control(cp)) - return true; - if (cp >= 0x7F && cp <= 0x9F) - return true; - return false; -} - -static inline bool is_non_char(uint_least32_t cp) -{ - if (cp >= 0xFDD0 && cp <= 0xFDEF) - return true; - if ( - cp == 0xFFFE || cp == 0xFFFF || - cp == 0x1FFFE || cp == 0x1FFFF || - cp == 0x2FFFE || cp == 0x2FFFF || - cp == 0x3FFFE || cp == 0x3FFFF || - cp == 
0x4FFFE || cp == 0x4FFFF || - cp == 0x5FFFE || cp == 0x5FFFF || - cp == 0x6FFFE || cp == 0x6FFFF || - cp == 0x7FFFE || cp == 0x7FFFF || - cp == 0x8FFFE || cp == 0x8FFFF || - cp == 0x9FFFE || cp == 0x9FFFF || - cp == 0xAFFFE || cp == 0xAFFFF || - cp == 0xBFFFE || cp == 0xBFFFF || - cp == 0xCFFFE || cp == 0xCFFFF || - cp == 0xDFFFE || cp == 0xDFFFF || - cp == 0xEFFFE || cp == 0xEFFFF || - cp == 0xFFFFE || cp == 0xFFFFF || - cp == 0x10FFFE || cp == 0x10FFFF - ) - return true; - return false; -} - -const char *state_to_string(enum State state) -{ - switch(state) { - case STATE_INNER_TEXT: return "STATE_INNER_TEXT"; - case STATE_TAG: return "STATE_TAG"; - case STATE_BEGIN_TAG_NAME: return "STATE_BEGIN_TAG_NAME"; - case STATE_END_TAG_NAME: return "STATE_END_TAG_NAME"; - case STATE_ATTR_NAME: return "STATE_ATTR_NAME"; - case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE"; - case STATE_COMMENT: return "STATE_COMMENT"; - case STATE_SCRIPT: return "STATE_SCRIPT"; - case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG"; - case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG"; - case STATE_STYLE: return "STATE_STYLE"; - case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG"; - case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG"; - case STATE_CHAR_REF: return "STATE_CHAR_REF"; - case STATE_CHAR_REF_NUMERIC: return "STATE_CHAR_REF_NUMERIC"; - } - return ""; -} diff --git a/html.h b/html.h @@ -1,124 +0,0 @@ -#define LESS_THAN_SIGN 0x3C -#define GREATER_THAN_SIGN 0x3E -#define EQUALS_SIGN 0x3D -#define TAB 0x09 -#define LF 0x0A -#define FF 0x0C -#define CR 0x0D -#define SPACE 0x20 -#define SOLIDUS 0x2F -#define EXCLAMATION_MARK 0x21 -#define QUOTATION_MARK 0x22 -#define NUMBER_SIGN 0x23 -#define AMPERSAND 0x26 -#define APOSTROPHE 0x27 -#define GRAVE_ACCENT 0x60 -#define HYPHEN_MINUS 0x2D -#define SEMICOLON 0x3B -#define SMALL_LETTER_X 0x78 -#define CAPITAL_LETTER_X 0x58 - -#define NAMED_CHAR_REF_COUNT 2231 -#define LONGEST_NAMED_CHAR_REF 
32 -#define MAX_CODEPOINT_SIZE 4 - -static const char *void_elements[] = { - "area", "base", "br", "col", "embed", "hr", "img", - "input", "link", "meta", "source", "track", "wbr" -}; - -struct Attr { - char *name; - char *value; // optional -}; - -struct Tag { - char *name; - struct Attr **attrs; - struct Tag **children; - char *inner_text; - size_t attrs_len; - size_t children_len; - bool _is_void_element; // means there is no closing tag - bool _is_closed; - size_t _outer_html_begin_offset; - size_t _outer_html_end_offset; - size_t _inner_html_begin_offset; - size_t _inner_html_end_offset; -}; - -struct TagList { - struct Tag **tags; - size_t len; -}; - -enum State { - STATE_INNER_TEXT, - STATE_TAG, - STATE_BEGIN_TAG_NAME, - STATE_END_TAG_NAME, - STATE_ATTR_NAME, - STATE_ATTR_VALUE, - STATE_COMMENT, - STATE_SCRIPT, - STATE_SCRIPT_POSSIBLE_END_TAG, - STATE_SCRIPT_END_TAG, - STATE_STYLE, - STATE_STYLE_POSSIBLE_END_TAG, - STATE_STYLE_END_TAG, - STATE_CHAR_REF, - STATE_CHAR_REF_NUMERIC -}; - -enum DoctypeState { - DSTATE_TEXT, - DSTATE_POSSIBLE_DTYPE, - DSTATE_DTYPE_OR_COMMENT, - DSTATE_DTYPE -}; - -enum AttrValueSyntax { - AVS_NO, - AVS_QUOTATION_MARK, - AVS_APOSTROPHE, - AVS_UNQUOTED -}; - -void html_filter(char *text, struct FindOpts *opts); -void html_print(struct Tag *tag, int indent); - -struct Tag *tag_init(void); -struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state); -struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset); -struct Tag *tag_get_last_open(struct TagList *tag_list); -size_t tag_doctype_parse(const char *text); -char *tag_get_outer_html(struct Tag *tag, char *text); -char *tag_get_inner_html(struct Tag *tag, char *text); -enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset); -static inline bool tag_is_void_element(struct Tag *tag); -void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset); -void 
tag_free(struct Tag *tag); -void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags); -void tag_print_find_result(struct Tag *root_tag, struct FindOpts *opts, struct TagList *found_tags, char *text); - -struct TagList *tag_list_init(void); -void tag_list_free(struct TagList *tag_list); - -struct Attr *attr_init(void); -static inline bool attr_name_char_is_valid(uint_least32_t cp); -static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp); - -char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base); -char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs); -char *charref_named_encode(const char *name); - -static inline bool ascii_is_digit(uint_least32_t cp); -static inline bool ascii_alpha_is_upper(uint_least32_t cp); -static inline bool ascii_alpha_is_lower(uint_least32_t cp); -static inline bool ascii_is_alpha(uint_least32_t cp); -static inline bool ascii_is_whitespace(uint_least32_t cp); -static inline bool is_c0_control(uint_least32_t cp); -static inline bool is_control(uint_least32_t cp); -static inline bool is_non_char(uint_least32_t cp); - -const char *state_to_string(enum State s); diff --git a/entities.h b/src/entities.h diff --git a/src/html.c b/src/html.c @@ -0,0 +1,883 @@ +#include "html.h" +#include "entities.h" + +void html_filter(char *text, struct FindOpts *opts) +{ + struct TagList *tag_list = tag_list_init(); + struct TagList *found_tags = tag_list_init(); + int len = tag_doctype_parse(text); + if (len == -1) { + fprintf(stderr, "htex: Error parsing <!DOCTYPE ....\n"); + goto CLEAN; + } else { + text += len; + } + struct Tag *root_tag = tag_parse(tag_list, text, 0, STATE_INNER_TEXT); + if (!find_opts_exist(opts)) { + found_tags->tags = realloc(found_tags->tags, sizeof(struct Tag)); + found_tags->tags[0] = root_tag->children[0]; + found_tags->len = 1; + } else { + tag_find(root_tag, opts, found_tags); + } + 
tag_print_find_result(root_tag, opts, found_tags, text); + // html_print(root_tag, -1); + tag_free(root_tag); +CLEAN: + tag_list_free(tag_list); + tag_list_free(found_tags); +} + +void html_print(struct Tag *tag, int indent) +{ + for (int i=0; i<indent; i++) + putchar(' '); + printf("%s", tag->name); + for (int i=0; i<tag->attrs_len; i++) + printf(" %s=%s", tag->attrs[i]->name, tag->attrs[i]->value); + printf("\n"); + indent++; + for (int i=tag->children_len-1; i>-1; i--) + html_print(tag->children[i], indent); +} + +struct Tag *tag_init(void) +{ + struct Tag *t = malloc(sizeof(struct Tag)); + t->name = malloc(sizeof(char)); + t->name[0] = 0; + t->inner_text = malloc(sizeof(char)); + t->inner_text[0] = 0; + t->attrs = NULL; + t->children = NULL; + t->attrs_len = 0; + t->children_len = 0; + t->_is_void_element = false; + t->_is_closed = false; + t->_outer_html_begin_offset = 0; + t->_outer_html_end_offset = 0; + t->_inner_html_begin_offset = 0; + t->_inner_html_end_offset = 0; + return t; +} + +struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state) +{ + struct Tag *tag = tag_init(); + tag->_outer_html_begin_offset= offset-1; + tag_list->tags = realloc(tag_list->tags, (tag_list->len+1) * sizeof(struct Tag)); + tag_list->tags[tag_list->len] = tag; + tag_list->len++; + struct Tag *still_open_tag = tag; + char *end_tag = malloc(sizeof(char)); + end_tag[0] = 0; + enum State return_to_state = STATE_INNER_TEXT; + size_t a = 0; + size_t attr_name_count = 0; + enum AttrValueSyntax avs = AVS_NO; + size_t hyphen_count = 0; + uint_least32_t cp; + size_t len = strlen(text); + size_t ret, off; + for (off = offset; off<len; off += ret) { + if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { + fprintf(stderr, "htex: parseTag.grapheme_decode_utf8 failed.\n"); + } else { + /* char *the_codepoint = cp_to_string(cp, ret); + printf("cp: %02X, %s, %s\n", cp, the_codepoint, state_to_string(state)); + free(the_codepoint); */ + switch 
(state) { + case STATE_INNER_TEXT: + if (cp == LESS_THAN_SIGN) { + state = STATE_TAG; + break; + } + if (cp == AMPERSAND) { + return_to_state = STATE_INNER_TEXT; + state = STATE_CHAR_REF; + break; + } + still_open_tag = tag_get_last_open(tag_list); + still_open_tag->inner_text = string_concat(still_open_tag->inner_text, cp_to_string(cp, ret)); + break; + case STATE_TAG: + if (cp == SOLIDUS) { + state = STATE_END_TAG_NAME; + break; + } + if (cp == EXCLAMATION_MARK) { + state = STATE_COMMENT; + break; + } + still_open_tag = tag_get_last_open(tag_list); + struct Tag *one_tag = tag_parse(tag_list, text, off, STATE_BEGIN_TAG_NAME); + still_open_tag->children = realloc( + still_open_tag->children, + (still_open_tag->children_len+1) * sizeof(struct Tag) + ); + still_open_tag->children[still_open_tag->children_len] = one_tag; + still_open_tag->children_len++; + free(end_tag); + return tag; + case STATE_BEGIN_TAG_NAME: + if (cp == GREATER_THAN_SIGN) { + state = tag_process_end_of_opening_tag(tag, off); + break; + } + if (ascii_is_whitespace(cp)) { + state = STATE_ATTR_NAME; + break; + } + if (ascii_is_digit(cp) || ascii_is_alpha(cp)) { + tag->name = string_concat(tag->name, cp_to_string(cp, ret)); + } + break; + case STATE_END_TAG_NAME: + if (cp == GREATER_THAN_SIGN) { + struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); + if (closed_tag != NULL) + tag_set_inner_html_end_offset(closed_tag, text, off); + free(end_tag); + end_tag = malloc(sizeof(char)); + end_tag[0] = 0; + state = STATE_INNER_TEXT; + break; + } + if (!ascii_is_whitespace(cp)) + end_tag = string_concat(end_tag, cp_to_string(cp, ret)); + break; + case STATE_ATTR_NAME: + if (cp == GREATER_THAN_SIGN) { + state = tag_process_end_of_opening_tag(tag, off); + break; + } + if (ascii_is_whitespace(cp)) { + if (attr_name_count == a+1) + a++; + break; + } + if (cp == EQUALS_SIGN) { + state = STATE_ATTR_VALUE; + break; + } + if (attr_name_char_is_valid(cp)) { + if (attr_name_count != a+1) { + 
tag->attrs = realloc( + tag->attrs, + (a+1) * sizeof(struct Attr) + ); + tag->attrs[a] = attr_init(); + attr_name_count = a + 1; + tag->attrs_len = attr_name_count; + } + tag->attrs[a]->name = string_concat(tag->attrs[a]->name, cp_to_string(cp, ret)); + } + break; + case STATE_ATTR_VALUE: + if (ascii_is_whitespace(cp)) { + if (avs == AVS_UNQUOTED) { + avs = AVS_NO; + state = STATE_ATTR_NAME; + } else if (avs == AVS_QUOTATION_MARK || avs == AVS_APOSTROPHE) { + if ( + strcmp("id", tag->attrs[a]->name) == 0 || + strcmp("class", tag->attrs[a]->name) == 0 + ) { + char *tmp_name = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char)); + strcpy(tmp_name, tag->attrs[a]->name); + tag->attrs = realloc( + tag->attrs, + (a+1) * sizeof(struct Attr) + ); + a++; + tag->attrs[a] = attr_init(); + free(tag->attrs[a]->name); + tag->attrs[a]->name = tmp_name; + tag->attrs_len++; + attr_name_count = a + 1; + } else { + tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret)); + } + } + break; + } + if (cp == QUOTATION_MARK) { + if (avs == AVS_NO) { + avs = AVS_QUOTATION_MARK; + break; + } + if (avs == AVS_QUOTATION_MARK) { + avs = AVS_NO; + state = STATE_ATTR_NAME; + break; + } + } + if (cp == APOSTROPHE) { + if (avs == AVS_NO) { + avs = AVS_APOSTROPHE; + break; + } + if (avs == AVS_APOSTROPHE) { + avs = AVS_NO; + state = STATE_ATTR_NAME; + break; + } + } + if (cp == GREATER_THAN_SIGN) { + state = tag_process_end_of_opening_tag(tag, off); + break; + } + if (avs == AVS_NO && attr_value_unquoted_char_is_valid(cp)) { + avs = AVS_UNQUOTED; + } + if (avs > AVS_NO) { + if (cp == AMPERSAND) { + state = STATE_CHAR_REF; + return_to_state = STATE_ATTR_VALUE; + break; + } + tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret)); + } + break; + case STATE_COMMENT: + if (cp == GREATER_THAN_SIGN && hyphen_count >= 2) { + state = STATE_INNER_TEXT; + break; + } + if (cp == HYPHEN_MINUS) + hyphen_count++; + else + hyphen_count = 0; + break; + case 
STATE_STYLE: + if (cp == LESS_THAN_SIGN) { + state = STATE_STYLE_POSSIBLE_END_TAG; + break; + } + break; + case STATE_STYLE_POSSIBLE_END_TAG: + if (cp == SOLIDUS) + state = STATE_STYLE_END_TAG; + else + state = STATE_STYLE; + break; + case STATE_STYLE_END_TAG: + if (cp == GREATER_THAN_SIGN) { + struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); + if (closed_tag != NULL) + tag_set_inner_html_end_offset(closed_tag, text, off); + free(end_tag); + end_tag = malloc(sizeof(char)); + end_tag[0] = 0; + state = STATE_INNER_TEXT; + break; + } + if (!ascii_is_whitespace(cp)) + end_tag = string_concat(end_tag, cp_to_string(cp, ret)); + break; + case STATE_SCRIPT: + if (cp == LESS_THAN_SIGN) { + state = STATE_SCRIPT_POSSIBLE_END_TAG; + break; + } + break; + case STATE_SCRIPT_POSSIBLE_END_TAG: + if (cp == SOLIDUS) + state = STATE_SCRIPT_END_TAG; + else + state = STATE_SCRIPT; + break; + case STATE_SCRIPT_END_TAG: + if (cp == GREATER_THAN_SIGN) { + struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); + if (closed_tag != NULL) + tag_set_inner_html_end_offset(closed_tag, text, off); + free(end_tag); + end_tag = malloc(sizeof(char)); + end_tag[0] = 0; + state = STATE_INNER_TEXT; + break; + } + if (!ascii_is_whitespace(cp)) + end_tag = string_concat(end_tag, cp_to_string(cp, ret)); + break; + case STATE_CHAR_REF: + if (cp == NUMBER_SIGN) { /* hashtag */ + state = STATE_CHAR_REF_NUMERIC; + break; + } + char *named_charref = charref_named_parse(text, off, len, avs); + off += strlen(named_charref)-1; + char *encoded_named_charref = charref_named_encode(named_charref); + if (return_to_state == STATE_INNER_TEXT) { + still_open_tag = tag_get_last_open(tag_list); + still_open_tag->inner_text = string_concat(still_open_tag->inner_text, encoded_named_charref); + } else if (return_to_state == STATE_ATTR_VALUE) { + tag->attrs[a]->value = string_concat(tag->attrs[a]->value, encoded_named_charref); + } + free(named_charref); + state = 
return_to_state; + break; + case STATE_CHAR_REF_NUMERIC: + if (cp == SMALL_LETTER_X || cp == CAPITAL_LETTER_X) { + size_t new_offset; + char *numeric_charref = charref_numeric_parse_and_encode(text, off+1, &new_offset, 16); + off += new_offset; + if (return_to_state == STATE_INNER_TEXT) { + still_open_tag = tag_get_last_open(tag_list); + still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref); + } else if (return_to_state == STATE_ATTR_VALUE) { + tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref); + } + state = return_to_state; + break; + } else if (ascii_is_digit(cp)) { + size_t new_offset; + char *numeric_charref = charref_numeric_parse_and_encode(text, off, &new_offset, 10); + off += new_offset-1; + if (return_to_state == STATE_INNER_TEXT) { + still_open_tag = tag_get_last_open(tag_list); + still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref); + } else if (return_to_state == STATE_ATTR_VALUE) { + tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref); + } + state = return_to_state; + break; + } + state = return_to_state; + break; + } + } + } + free(end_tag); + return tag; +} + +struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset) +{ + for (int i=tag_list->len-1; i>-1; i--) { + if (strcmp(tag_list->tags[i]->name, end_tag_name) == 0 && !tag_list->tags[i]->_is_closed) { + tag_list->tags[i]->_is_closed = true; + tag_list->tags[i]->_outer_html_end_offset = end_offset; + return tag_list->tags[i]; + } + } + return NULL; +} + +struct Tag *tag_get_last_open(struct TagList *tag_list) +{ + for (int i=tag_list->len-1; i>-1; i--) { + if (!tag_list->tags[i]->_is_void_element && !tag_list->tags[i]->_is_closed) { + return tag_list->tags[i]; + } + } + return tag_list->tags[0]; +} + +int tag_doctype_parse(const char *text) +{ + size_t offset = 0; + enum DoctypeState state = DSTATE_TEXT; + char *doctype = NULL; + 
char *lower_doctype = NULL; + uint_least32_t cp; + size_t len = strlen(text); + size_t ret, off; + for (off = 0; off<len; off += ret) { + if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { + fprintf(stderr, "htex: parseDoctype.grapheme_decode_utf8 failed.\n"); + } else { + switch (state) { + case DSTATE_TEXT: + if (cp == LESS_THAN_SIGN) { + state = DSTATE_POSSIBLE_DTYPE; + break; + } + if (cp == GREATER_THAN_SIGN) { + offset = off; + goto CLEANUP; + } + break; + case DSTATE_POSSIBLE_DTYPE: + if (cp == EXCLAMATION_MARK) + state = DSTATE_DTYPE_OR_COMMENT; + else + goto CLEANUP; + break; + case DSTATE_DTYPE_OR_COMMENT: + if (cp == HYPHEN_MINUS) { + goto CLEANUP; + } else { + doctype = string_concat(doctype, cp_to_string(cp, ret)); + state = DSTATE_DTYPE; + break; + } + break; + case DSTATE_DTYPE: + if (ascii_is_whitespace(cp)) { + size_t dlen = strlen(doctype)+1; + lower_doctype = malloc(dlen * sizeof(char)); + grapheme_to_lowercase_utf8(doctype, dlen, lower_doctype, dlen); + if (strcmp(lower_doctype, "doctype") == 0) { + state = DSTATE_TEXT; + } else { + offset = -1; + goto CLEANUP; + } + break; + } + doctype = string_concat(doctype, cp_to_string(cp, ret)); + break; + } + } + } +CLEANUP: + free(doctype); + free(lower_doctype); + return offset; +} + +char *tag_get_outer_html(struct Tag *tag, char *text) +{ + char *outer_html = NULL; + int o = 0; + for (int i=tag->_outer_html_begin_offset; i<tag->_outer_html_end_offset; i++) { + outer_html = realloc(outer_html, (o+1) * sizeof(char)); + outer_html[o] = text[i]; + o++; + } + outer_html = realloc(outer_html, (o+1) * sizeof(char)); + outer_html[o] = 0; + return outer_html; +} + +char *tag_get_inner_html(struct Tag *tag, char *text) +{ + char *inner_html = NULL; + int o = 0; + for (int i=tag->_inner_html_begin_offset; i<tag->_inner_html_end_offset; i++) { + inner_html = realloc(inner_html, (o+1) * sizeof(char)); + inner_html[o] = text[i]; + o++; + } + inner_html = realloc(inner_html, (o+1) * 
sizeof(char)); + inner_html[o] = 0; + return inner_html; +} + +enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset) +{ + tag->_inner_html_begin_offset = offset+1; + tag->_is_void_element = tag_is_void_element(tag); + if (tag->_is_void_element) + tag->_outer_html_end_offset = offset+1; + if (strcmp(tag->name, "script") == 0) + return STATE_SCRIPT; + else if (strcmp(tag->name, "style") == 0) + return STATE_STYLE; + return STATE_INNER_TEXT; +} + +static inline bool tag_is_void_element(struct Tag *tag) +{ + for (int i=0; i<13; i++) { + if (strcmp(tag->name, void_elements[i]) == 0) + return true; + } + return false; +} + +void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset) +{ + int i = offset; + while (text[i] != '<') + i--; + closed_tag->_inner_html_end_offset = i; +} + +void tag_free(struct Tag *tag) +{ + free(tag->name); + free(tag->inner_text); + for (int i=0; i<tag->attrs_len; i++) { + free(tag->attrs[i]->name); + free(tag->attrs[i]->value); + free(tag->attrs[i]); + } + free(tag->attrs); + for (int i=0; i<tag->children_len; i++) { + if (tag->children[i] != NULL) + tag_free(tag->children[i]); + } + free(tag->children); + free(tag); +} + +void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags) +{ + if (opts->limit > 0 && found_tags->len == opts->limit) + return; + bool matches_tag = false; + bool matches_attr_key = false; + bool matches_attr_value = false; + if (strcmp(tag->name, opts->tag) == 0) + matches_tag = true; + for (int i=0; i<tag->attrs_len; i++) { + if (strcmp(tag->attrs[i]->name, opts->key) == 0) + matches_attr_key = true; + if (strcmp(tag->attrs[i]->value, opts->attr) == 0) + matches_attr_value = true; + } + if (strlen(opts->tag) > 0 && strlen(opts->key) > 0 && strlen(opts->attr) > 0) { + if (matches_tag && matches_attr_key && matches_attr_value) { + found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); + found_tags->tags[found_tags->len] = 
tag; + found_tags->len++; + } + } else if (strlen(opts->tag) > 0 && strlen(opts->key) > 0) { + if (matches_tag && matches_attr_key) { + found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); + found_tags->tags[found_tags->len] = tag; + found_tags->len++; + } + } else if (strlen(opts->tag) > 0) { + if (matches_tag) { + found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); + found_tags->tags[found_tags->len] = tag; + found_tags->len++; + } + } else if (strlen(opts->key) > 0 && strlen(opts->attr) > 0) { + if (matches_attr_key && matches_attr_value) { + found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); + found_tags->tags[found_tags->len] = tag; + found_tags->len++; + } + } else if (strlen(opts->key) > 0) { + if (matches_attr_key) { + found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); + found_tags->tags[found_tags->len] = tag; + found_tags->len++; + } + } else if (strlen(opts->attr) > 0) { + if (matches_attr_value) { + found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); + found_tags->tags[found_tags->len] = tag; + found_tags->len++; + } + } + for (int i=tag->children_len-1; i>-1; i--) { + tag_find(tag->children[i], opts, found_tags); + } +} + +void tag_print_find_result(struct Tag *root_tag, struct FindOpts *opts, struct TagList *found_tags, char *text) +{ + if (opts->is_except) { + bool is_match = false; + for (int i=0; i<strlen(text); i++) { + is_match = false; + for (int k=0; k<found_tags->len; k++) { + if ( + found_tags->tags[k]->_outer_html_begin_offset <= i && + found_tags->tags[k]->_outer_html_end_offset > i + ) + is_match = true; + } + if (!is_match) + putchar(text[i]); + } + } else { + char *requested_text = NULL; + char *trimmed_text = NULL; + for (int i=0; i<found_tags->len; i++) { + switch (opts->out) { + case OUT_INNER_HTML: + requested_text = tag_get_inner_html(found_tags->tags[i], 
text); + trimmed_text = string_trim(requested_text); + free(requested_text); + break; + case OUT_OUTER_HTML: + requested_text = tag_get_outer_html(found_tags->tags[i], text); + trimmed_text = string_trim(requested_text); + free(requested_text); + break; + case OUT_INNER_TEXT: + trimmed_text = string_trim(found_tags->tags[i]->inner_text); + break; + case OUT_ATTR_VALUE: + if (strlen(opts->key) > 0 && strlen(opts->tag) > 0) { + for (int k=0; k<found_tags->tags[i]->attrs_len; k++) { + if (strcmp(found_tags->tags[i]->attrs[k]->name, opts->key) == 0) + printf("%s\n", found_tags->tags[i]->attrs[k]->value); + } + } else if (strlen(opts->tag) > 0) { + for (int k=0; k<found_tags->tags[i]->attrs_len; k++) + printf("%s\n", found_tags->tags[i]->attrs[k]->value); + } + break; + } + if (trimmed_text) { + if (strlen(trimmed_text) > 0) + printf("%s\n", trimmed_text); + free(trimmed_text); + } + } + } +} + +struct TagList *tag_list_init(void) +{ + struct TagList *tag_list = malloc(sizeof(struct TagList)); + tag_list->tags = NULL; + tag_list->len = 0; + return tag_list; +} + +void tag_list_free(struct TagList *tag_list) +{ + free(tag_list->tags); + free(tag_list); +} + +struct Attr *attr_init(void) +{ + struct Attr *attr = malloc(sizeof(struct Attr)); + attr->name = malloc(sizeof(char)); + attr->name[0] = 0; + attr->value = malloc(sizeof(char)); + attr->value[0] = 0; + return attr; +} + +static inline bool attr_name_char_is_valid(uint_least32_t cp) +{ + if (is_control(cp)) + return false; + if (is_non_char(cp)) + return false; + if ( + cp == SPACE || + cp == QUOTATION_MARK || + cp == APOSTROPHE || + cp == GREATER_THAN_SIGN || + cp == SOLIDUS || + cp == EQUALS_SIGN + ) + return false; + return true; +} + +static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp) +{ + /* + Not mentioned invalid characters. + They are already handled before + function call. 
+ */ + if ( + cp == EQUALS_SIGN || + cp == LESS_THAN_SIGN || + cp == GREATER_THAN_SIGN || + cp == GRAVE_ACCENT + ) + return false; + return true; +} + +char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base) +{ + size_t old_offset = offset; + char *character = malloc(MAX_CODEPOINT_SIZE * sizeof(char)); + char *numeric_charref = malloc(sizeof(char)); + numeric_charref[0] = 0; + size_t ret; + uint_least32_t cp; + do { + ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp); + numeric_charref = string_concat(numeric_charref, cp_to_string(cp, ret)); + offset += ret; + } while (cp != SEMICOLON); + *new_offset = offset - old_offset; + long i = strtol(numeric_charref, NULL, base); + ret = grapheme_encode_utf8((uint_least32_t)i, character, MAX_CODEPOINT_SIZE); + character[ret] = 0; + free(numeric_charref); + return character; +} + +char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs) +{ + uint_least32_t stop_at = 0; + switch(avs) { + case AVS_QUOTATION_MARK: + stop_at = QUOTATION_MARK; + break; + case AVS_APOSTROPHE: + stop_at = APOSTROPHE; + break; + case AVS_UNQUOTED: + stop_at = GREATER_THAN_SIGN; + break; + case AVS_NO: /* Just to silence the compilier warning */ + break; + } + char *named_charref = malloc(sizeof(char)); + named_charref[0] = 0; + size_t ret; + uint_least32_t cp; + int i = 0; + for (;;) { + ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp); + if (cp == AMPERSAND || ascii_is_whitespace(cp)) + break; + if (avs > AVS_NO && cp == stop_at) + break; + named_charref = string_concat(named_charref, cp_to_string(cp, ret)); + if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF) + break; + offset += ret; + i++; + } + return named_charref; +} + +char *charref_named_encode(const char *name) +{ + char *buf = malloc(2*MAX_CODEPOINT_SIZE+1); + char character[MAX_CODEPOINT_SIZE]; + memset(&character, 0, MAX_CODEPOINT_SIZE); + size_t len; + for (int i=0; 
i<NAMED_CHAR_REF_COUNT; i++) { + if (string_starts_with(name, entities[i].name)) { + len = grapheme_encode_utf8(entities[i].cp[0], character, MAX_CODEPOINT_SIZE); + strcpy(buf, character); + if (entities[i].cp[1] != 0) { + len += grapheme_encode_utf8(entities[i].cp[1], character, MAX_CODEPOINT_SIZE); + strcat(buf, character); + } + buf[len] = 0; + const char *part = &name[strlen(entities[i].name)]; + size_t part_len = strlen(part); + if (part_len > 0) { + if (part_len == 1 && part[0] == ';') + return buf; + buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1+part_len); + strcat(buf, &name[strlen(entities[i].name)]); + buf[len+part_len] = 0; + } + return buf; + } + } + buf = realloc(buf, (strlen(name)+2) * sizeof(char)); + buf[0] = '&'; + buf[1] = 0; + strcat(buf, name); + return buf; +} + +static inline bool ascii_is_digit(uint_least32_t cp) +{ + if (cp >= 0x30 && cp <= 0x39) + return true; + return false; +} + +static inline bool ascii_alpha_is_upper(uint_least32_t cp) +{ + if (cp >= 0x41 && cp <= 0x5A) + return true; + return false; +} + +static inline bool ascii_alpha_is_lower(uint_least32_t cp) +{ + if (cp >= 0x61 && cp <= 0x7A) + return true; + return false; +} + +static inline bool ascii_is_alpha(uint_least32_t cp) +{ + if (ascii_alpha_is_lower(cp) || ascii_alpha_is_upper(cp)) + return true; + return false; +} + +static inline bool ascii_is_whitespace(uint_least32_t cp) +{ + if ( + cp == TAB || + cp == LF || + cp == FF || + cp == CR || + cp == SPACE + ) + return true; + return false; +} + +static inline bool is_c0_control(uint_least32_t cp) +{ + if (cp >= 0x00 && cp <= 0x1F) + return true; + return false; +} + +static inline bool is_control(uint_least32_t cp) +{ + if (is_c0_control(cp)) + return true; + if (cp >= 0x7F && cp <= 0x9F) + return true; + return false; +} + +static inline bool is_non_char(uint_least32_t cp) +{ + if (cp >= 0xFDD0 && cp <= 0xFDEF) + return true; + if ( + cp == 0xFFFE || cp == 0xFFFF || + cp == 0x1FFFE || cp == 0x1FFFF || + cp == 0x2FFFE || cp 
== 0x2FFFF || + cp == 0x3FFFE || cp == 0x3FFFF || + cp == 0x4FFFE || cp == 0x4FFFF || + cp == 0x5FFFE || cp == 0x5FFFF || + cp == 0x6FFFE || cp == 0x6FFFF || + cp == 0x7FFFE || cp == 0x7FFFF || + cp == 0x8FFFE || cp == 0x8FFFF || + cp == 0x9FFFE || cp == 0x9FFFF || + cp == 0xAFFFE || cp == 0xAFFFF || + cp == 0xBFFFE || cp == 0xBFFFF || + cp == 0xCFFFE || cp == 0xCFFFF || + cp == 0xDFFFE || cp == 0xDFFFF || + cp == 0xEFFFE || cp == 0xEFFFF || + cp == 0xFFFFE || cp == 0xFFFFF || + cp == 0x10FFFE || cp == 0x10FFFF + ) + return true; + return false; +} + +const char *state_to_string(enum State state) +{ + switch(state) { + case STATE_INNER_TEXT: return "STATE_INNER_TEXT"; + case STATE_TAG: return "STATE_TAG"; + case STATE_BEGIN_TAG_NAME: return "STATE_BEGIN_TAG_NAME"; + case STATE_END_TAG_NAME: return "STATE_END_TAG_NAME"; + case STATE_ATTR_NAME: return "STATE_ATTR_NAME"; + case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE"; + case STATE_COMMENT: return "STATE_COMMENT"; + case STATE_SCRIPT: return "STATE_SCRIPT"; + case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG"; + case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG"; + case STATE_STYLE: return "STATE_STYLE"; + case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG"; + case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG"; + case STATE_CHAR_REF: return "STATE_CHAR_REF"; + case STATE_CHAR_REF_NUMERIC: return "STATE_CHAR_REF_NUMERIC"; + } + return ""; +} diff --git a/src/html.h b/src/html.h @@ -0,0 +1,124 @@ +#define LESS_THAN_SIGN 0x3C +#define GREATER_THAN_SIGN 0x3E +#define EQUALS_SIGN 0x3D +#define TAB 0x09 +#define LF 0x0A +#define FF 0x0C +#define CR 0x0D +#define SPACE 0x20 +#define SOLIDUS 0x2F +#define EXCLAMATION_MARK 0x21 +#define QUOTATION_MARK 0x22 +#define NUMBER_SIGN 0x23 +#define AMPERSAND 0x26 +#define APOSTROPHE 0x27 +#define GRAVE_ACCENT 0x60 +#define HYPHEN_MINUS 0x2D +#define SEMICOLON 0x3B +#define SMALL_LETTER_X 0x78 +#define CAPITAL_LETTER_X 0x58 + 
+#define NAMED_CHAR_REF_COUNT 2231 +#define LONGEST_NAMED_CHAR_REF 32 +#define MAX_CODEPOINT_SIZE 4 + +static const char *void_elements[] = { + "area", "base", "br", "col", "embed", "hr", "img", + "input", "link", "meta", "source", "track", "wbr" +}; + +struct Attr { + char *name; + char *value; // optional +}; + +struct Tag { + char *name; + struct Attr **attrs; + struct Tag **children; + char *inner_text; + size_t attrs_len; + size_t children_len; + bool _is_void_element; // means there is no closing tag + bool _is_closed; + size_t _outer_html_begin_offset; + size_t _outer_html_end_offset; + size_t _inner_html_begin_offset; + size_t _inner_html_end_offset; +}; + +struct TagList { + struct Tag **tags; + size_t len; +}; + +enum State { + STATE_INNER_TEXT, + STATE_TAG, + STATE_BEGIN_TAG_NAME, + STATE_END_TAG_NAME, + STATE_ATTR_NAME, + STATE_ATTR_VALUE, + STATE_COMMENT, + STATE_SCRIPT, + STATE_SCRIPT_POSSIBLE_END_TAG, + STATE_SCRIPT_END_TAG, + STATE_STYLE, + STATE_STYLE_POSSIBLE_END_TAG, + STATE_STYLE_END_TAG, + STATE_CHAR_REF, + STATE_CHAR_REF_NUMERIC +}; + +enum DoctypeState { + DSTATE_TEXT, + DSTATE_POSSIBLE_DTYPE, + DSTATE_DTYPE_OR_COMMENT, + DSTATE_DTYPE +}; + +enum AttrValueSyntax { + AVS_NO, + AVS_QUOTATION_MARK, + AVS_APOSTROPHE, + AVS_UNQUOTED +}; + +void html_filter(char *text, struct FindOpts *opts); +void html_print(struct Tag *tag, int indent); + +struct Tag *tag_init(void); +struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state); +struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset); +struct Tag *tag_get_last_open(struct TagList *tag_list); +int tag_doctype_parse(const char *text); +char *tag_get_outer_html(struct Tag *tag, char *text); +char *tag_get_inner_html(struct Tag *tag, char *text); +enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset); +static inline bool tag_is_void_element(struct Tag *tag); +void 
tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset); +void tag_free(struct Tag *tag); +void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags); +void tag_print_find_result(struct Tag *root_tag, struct FindOpts *opts, struct TagList *found_tags, char *text); + +struct TagList *tag_list_init(void); +void tag_list_free(struct TagList *tag_list); + +struct Attr *attr_init(void); +static inline bool attr_name_char_is_valid(uint_least32_t cp); +static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp); + +char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base); +char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs); +char *charref_named_encode(const char *name); + +static inline bool ascii_is_digit(uint_least32_t cp); +static inline bool ascii_alpha_is_upper(uint_least32_t cp); +static inline bool ascii_alpha_is_lower(uint_least32_t cp); +static inline bool ascii_is_alpha(uint_least32_t cp); +static inline bool ascii_is_whitespace(uint_least32_t cp); +static inline bool is_c0_control(uint_least32_t cp); +static inline bool is_control(uint_least32_t cp); +static inline bool is_non_char(uint_least32_t cp); + +const char *state_to_string(enum State s); diff --git a/misc.c b/src/misc.c diff --git a/todo b/todo @@ -1,2 +1,3 @@ replace int,size_t with uint* handle correctly when no search pattern was provided +implement charref also for outerhtml,innerhtml