commit 7bc2de577806b2c880cb056afd5f0abdcbb95672
parent 61caacea3913f3844bb5aa7deb5c01f6d2dd5858
Author: Robin <kroekerrobin@gmail.com>
Date: Thu, 4 Apr 2024 10:40:20 +0200
Clean up
Diffstat:
| M | htex.c | | | 10 | +++++++--- |
| D | html.c | | | 883 | ------------------------------------------------------------------------------- |
| D | html.h | | | 124 | ------------------------------------------------------------------------------- |
| R | entities.h -> src/entities.h | | | 0 | |
| A | src/html.c | | | 883 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | src/html.h | | | 124 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| R | misc.c -> src/misc.c | | | 0 | |
| M | todo | | | 1 | + |
8 files changed, 1015 insertions(+), 1010 deletions(-)
diff --git a/htex.c b/htex.c
@@ -6,8 +6,8 @@
#include <inttypes.h>
#include <grapheme.h>
#include "htex.h"
-#include "misc.c"
-#include "html.c"
+#include "src/misc.c"
+#include "src/html.c"
bool find_opts_exist(struct FindOpts *opts)
{
@@ -168,6 +168,10 @@ int main(int argc, char *argv[])
break;
case 'l':
limit = atoi(optarg);
+ if (limit <= 0) {
+ fprintf(stderr, "htex: Provide a valid limit value.\n");
+ return -1;
+ }
break;
}
}
@@ -213,7 +217,7 @@ int main(int argc, char *argv[])
struct FindOpts *options = find_opts_parse(search_pattern);
options->out = out;
options->is_except = is_except;
- options->limit = limit;
+ options->limit = (size_t)limit;
html_filter(text, options);
free(output);
find_opts_free(options);
diff --git a/html.c b/html.c
@@ -1,883 +0,0 @@
-#include "html.h"
-#include "entities.h"
-
-void html_filter(char *text, struct FindOpts *opts)
-{
- struct TagList *tag_list = tag_list_init();
- struct TagList *found_tags = tag_list_init();
- size_t len = tag_doctype_parse(text);
- if (len == -1) {
- fprintf(stderr, "htex: Error parsing <!DOCTYPE ....\n");
- goto CLEAN;
- } else {
- text += len;
- }
- struct Tag *root_tag = tag_parse(tag_list, text, 0, STATE_INNER_TEXT);
- if (!find_opts_exist(opts)) {
- found_tags->tags = realloc(found_tags->tags, sizeof(struct Tag));
- found_tags->tags[0] = root_tag->children[0];
- found_tags->len = 1;
- } else {
- tag_find(root_tag, opts, found_tags);
- }
- tag_print_find_result(root_tag, opts, found_tags, text);
- // html_print(root_tag, -1);
- tag_free(root_tag);
-CLEAN:
- tag_list_free(tag_list);
- tag_list_free(found_tags);
-}
-
-void html_print(struct Tag *tag, int indent)
-{
- for (int i=0; i<indent; i++)
- putchar(' ');
- printf("%s", tag->name);
- for (int i=0; i<tag->attrs_len; i++)
- printf(" %s=%s", tag->attrs[i]->name, tag->attrs[i]->value);
- printf("\n");
- indent++;
- for (int i=tag->children_len-1; i>-1; i--)
- html_print(tag->children[i], indent);
-}
-
-struct Tag *tag_init(void)
-{
- struct Tag *t = malloc(sizeof(struct Tag));
- t->name = malloc(sizeof(char));
- t->name[0] = 0;
- t->inner_text = malloc(sizeof(char));
- t->inner_text[0] = 0;
- t->attrs = NULL;
- t->children = NULL;
- t->attrs_len = 0;
- t->children_len = 0;
- t->_is_void_element = false;
- t->_is_closed = false;
- t->_outer_html_begin_offset = 0;
- t->_outer_html_end_offset = 0;
- t->_inner_html_begin_offset = 0;
- t->_inner_html_end_offset = 0;
- return t;
-}
-
-struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state)
-{
- struct Tag *tag = tag_init();
- tag->_outer_html_begin_offset= offset-1;
- tag_list->tags = realloc(tag_list->tags, (tag_list->len+1) * sizeof(struct Tag));
- tag_list->tags[tag_list->len] = tag;
- tag_list->len++;
- struct Tag *still_open_tag = tag;
- char *end_tag = malloc(sizeof(char));
- end_tag[0] = 0;
- enum State return_to_state = STATE_INNER_TEXT;
- size_t a = 0;
- size_t attr_name_count = 0;
- enum AttrValueSyntax avs = AVS_NO;
- size_t hyphen_count = 0;
- uint_least32_t cp;
- size_t len = strlen(text);
- size_t ret, off;
- for (off = offset; off<len; off += ret) {
- if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) {
- fprintf(stderr, "htex: parseTag.grapheme_decode_utf8 failed.\n");
- } else {
- // char *the_codepoint = cp_to_string(cp, ret);
- // printf("cp: %02X, %s, %s\n", cp, the_codepoint, state_to_string(state));
- // free(the_codepoint);
- switch (state) {
- case STATE_INNER_TEXT:
- if (cp == LESS_THAN_SIGN) {
- state = STATE_TAG;
- break;
- }
- if (cp == AMPERSAND) {
- return_to_state = STATE_INNER_TEXT;
- state = STATE_CHAR_REF;
- break;
- }
- still_open_tag = tag_get_last_open(tag_list);
- still_open_tag->inner_text = string_concat(still_open_tag->inner_text, cp_to_string(cp, ret));
- break;
- case STATE_TAG:
- if (cp == SOLIDUS) {
- state = STATE_END_TAG_NAME;
- break;
- }
- if (cp == EXCLAMATION_MARK) {
- state = STATE_COMMENT;
- break;
- }
- still_open_tag = tag_get_last_open(tag_list);
- struct Tag *one_tag = tag_parse(tag_list, text, off, STATE_BEGIN_TAG_NAME);
- still_open_tag->children = realloc(
- still_open_tag->children,
- (still_open_tag->children_len+1) * sizeof(struct Tag)
- );
- still_open_tag->children[still_open_tag->children_len] = one_tag;
- still_open_tag->children_len++;
- free(end_tag);
- return tag;
- case STATE_BEGIN_TAG_NAME:
- if (cp == GREATER_THAN_SIGN) {
- state = tag_process_end_of_opening_tag(tag, off);
- break;
- }
- if (ascii_is_whitespace(cp)) {
- state = STATE_ATTR_NAME;
- break;
- }
- if (ascii_is_digit(cp) || ascii_is_alpha(cp)) {
- tag->name = string_concat(tag->name, cp_to_string(cp, ret));
- }
- break;
- case STATE_END_TAG_NAME:
- if (cp == GREATER_THAN_SIGN) {
- struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
- if (closed_tag != NULL)
- tag_set_inner_html_end_offset(closed_tag, text, off);
- free(end_tag);
- end_tag = malloc(sizeof(char));
- end_tag[0] = 0;
- state = STATE_INNER_TEXT;
- break;
- }
- if (!ascii_is_whitespace(cp))
- end_tag = string_concat(end_tag, cp_to_string(cp, ret));
- break;
- case STATE_ATTR_NAME:
- if (cp == GREATER_THAN_SIGN) {
- state = tag_process_end_of_opening_tag(tag, off);
- break;
- }
- if (ascii_is_whitespace(cp)) {
- if (attr_name_count == a+1)
- a++;
- break;
- }
- if (cp == EQUALS_SIGN) {
- state = STATE_ATTR_VALUE;
- break;
- }
- if (attr_name_char_is_valid(cp)) {
- if (attr_name_count != a+1) {
- tag->attrs = realloc(
- tag->attrs,
- (a+1) * sizeof(struct Attr)
- );
- tag->attrs[a] = attr_init();
- attr_name_count = a + 1;
- tag->attrs_len = attr_name_count;
- }
- tag->attrs[a]->name = string_concat(tag->attrs[a]->name, cp_to_string(cp, ret));
- }
- break;
- case STATE_ATTR_VALUE:
- if (ascii_is_whitespace(cp)) {
- if (avs == AVS_UNQUOTED) {
- avs = AVS_NO;
- state = STATE_ATTR_NAME;
- } else if (avs == AVS_QUOTATION_MARK || avs == AVS_APOSTROPHE) {
- if (
- strcmp("id", tag->attrs[a]->name) == 0 ||
- strcmp("class", tag->attrs[a]->name) == 0
- ) {
- char *tmp_name = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char));
- strcpy(tmp_name, tag->attrs[a]->name);
- tag->attrs = realloc(
- tag->attrs,
- (a+1) * sizeof(struct Attr)
- );
- a++;
- tag->attrs[a] = attr_init();
- free(tag->attrs[a]->name);
- tag->attrs[a]->name = tmp_name;
- tag->attrs_len++;
- attr_name_count = a + 1;
- } else {
- tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret));
- }
- }
- break;
- }
- if (cp == QUOTATION_MARK) {
- if (avs == AVS_NO) {
- avs = AVS_QUOTATION_MARK;
- break;
- }
- if (avs == AVS_QUOTATION_MARK) {
- avs = AVS_NO;
- state = STATE_ATTR_NAME;
- break;
- }
- }
- if (cp == APOSTROPHE) {
- if (avs == AVS_NO) {
- avs = AVS_APOSTROPHE;
- break;
- }
- if (avs == AVS_APOSTROPHE) {
- avs = AVS_NO;
- state = STATE_ATTR_NAME;
- break;
- }
- }
- if (cp == GREATER_THAN_SIGN) {
- state = tag_process_end_of_opening_tag(tag, off);
- break;
- }
- if (avs == AVS_NO && attr_value_unquoted_char_is_valid(cp)) {
- avs = AVS_UNQUOTED;
- }
- if (avs > AVS_NO) {
- if (cp == AMPERSAND) {
- state = STATE_CHAR_REF;
- return_to_state = STATE_ATTR_VALUE;
- break;
- }
- tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret));
- }
- break;
- case STATE_COMMENT:
- if (cp == GREATER_THAN_SIGN && hyphen_count >= 2) {
- state = STATE_INNER_TEXT;
- break;
- }
- if (cp == HYPHEN_MINUS)
- hyphen_count++;
- else
- hyphen_count = 0;
- break;
- case STATE_STYLE:
- if (cp == LESS_THAN_SIGN) {
- state = STATE_STYLE_POSSIBLE_END_TAG;
- break;
- }
- break;
- case STATE_STYLE_POSSIBLE_END_TAG:
- if (cp == SOLIDUS)
- state = STATE_STYLE_END_TAG;
- else
- state = STATE_STYLE;
- break;
- case STATE_STYLE_END_TAG:
- if (cp == GREATER_THAN_SIGN) {
- struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
- if (closed_tag != NULL)
- tag_set_inner_html_end_offset(closed_tag, text, off);
- free(end_tag);
- end_tag = malloc(sizeof(char));
- end_tag[0] = 0;
- state = STATE_INNER_TEXT;
- break;
- }
- if (!ascii_is_whitespace(cp))
- end_tag = string_concat(end_tag, cp_to_string(cp, ret));
- break;
- case STATE_SCRIPT:
- if (cp == LESS_THAN_SIGN) {
- state = STATE_SCRIPT_POSSIBLE_END_TAG;
- break;
- }
- break;
- case STATE_SCRIPT_POSSIBLE_END_TAG:
- if (cp == SOLIDUS)
- state = STATE_SCRIPT_END_TAG;
- else
- state = STATE_SCRIPT;
- break;
- case STATE_SCRIPT_END_TAG:
- if (cp == GREATER_THAN_SIGN) {
- struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
- if (closed_tag != NULL)
- tag_set_inner_html_end_offset(closed_tag, text, off);
- free(end_tag);
- end_tag = malloc(sizeof(char));
- end_tag[0] = 0;
- state = STATE_INNER_TEXT;
- break;
- }
- if (!ascii_is_whitespace(cp))
- end_tag = string_concat(end_tag, cp_to_string(cp, ret));
- break;
- case STATE_CHAR_REF:
- if (cp == NUMBER_SIGN) { /* hashtag */
- state = STATE_CHAR_REF_NUMERIC;
- break;
- }
- char *named_charref = charref_named_parse(text, off, len, avs);
- off += strlen(named_charref)-1;
- char *encoded_named_charref = charref_named_encode(named_charref);
- if (return_to_state == STATE_INNER_TEXT) {
- still_open_tag = tag_get_last_open(tag_list);
- still_open_tag->inner_text = string_concat(still_open_tag->inner_text, encoded_named_charref);
- } else if (return_to_state == STATE_ATTR_VALUE) {
- tag->attrs[a]->value = string_concat(tag->attrs[a]->value, encoded_named_charref);
- }
- free(named_charref);
- state = return_to_state;
- break;
- case STATE_CHAR_REF_NUMERIC:
- if (cp == SMALL_LETTER_X || cp == CAPITAL_LETTER_X) {
- size_t new_offset;
- char *numeric_charref = charref_numeric_parse_and_encode(text, off+1, &new_offset, 16);
- off += new_offset;
- if (return_to_state == STATE_INNER_TEXT) {
- still_open_tag = tag_get_last_open(tag_list);
- still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref);
- } else if (return_to_state == STATE_ATTR_VALUE) {
- tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref);
- }
- state = return_to_state;
- break;
- } else if (ascii_is_digit(cp)) {
- size_t new_offset;
- char *numeric_charref = charref_numeric_parse_and_encode(text, off, &new_offset, 10);
- off += new_offset-1;
- if (return_to_state == STATE_INNER_TEXT) {
- still_open_tag = tag_get_last_open(tag_list);
- still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref);
- } else if (return_to_state == STATE_ATTR_VALUE) {
- tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref);
- }
- state = return_to_state;
- break;
- }
- state = return_to_state;
- break;
- }
- }
- }
- free(end_tag);
- return tag;
-}
-
-struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset)
-{
- for (int i=tag_list->len-1; i>-1; i--) {
- if (strcmp(tag_list->tags[i]->name, end_tag_name) == 0 && !tag_list->tags[i]->_is_closed) {
- tag_list->tags[i]->_is_closed = true;
- tag_list->tags[i]->_outer_html_end_offset = end_offset;
- return tag_list->tags[i];
- }
- }
- return NULL;
-}
-
-struct Tag *tag_get_last_open(struct TagList *tag_list)
-{
- for (int i=tag_list->len-1; i>-1; i--) {
- if (!tag_list->tags[i]->_is_void_element && !tag_list->tags[i]->_is_closed) {
- return tag_list->tags[i];
- }
- }
- return tag_list->tags[0];
-}
-
-size_t tag_doctype_parse(const char *text)
-{
- size_t offset = 0;
- enum DoctypeState state = DSTATE_TEXT;
- char *doctype = NULL;
- char *lower_doctype = NULL;
- uint_least32_t cp;
- size_t len = strlen(text);
- size_t ret, off;
- for (off = 0; off<len; off += ret) {
- if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) {
- fprintf(stderr, "htex: parseDoctype.grapheme_decode_utf8 failed.\n");
- } else {
- switch (state) {
- case DSTATE_TEXT:
- if (cp == LESS_THAN_SIGN) {
- state = DSTATE_POSSIBLE_DTYPE;
- break;
- }
- if (cp == GREATER_THAN_SIGN) {
- offset = off;
- goto CLEANUP;
- }
- break;
- case DSTATE_POSSIBLE_DTYPE:
- if (cp == EXCLAMATION_MARK)
- state = DSTATE_DTYPE_OR_COMMENT;
- else
- goto CLEANUP;
- break;
- case DSTATE_DTYPE_OR_COMMENT:
- if (cp == HYPHEN_MINUS) {
- goto CLEANUP;
- } else {
- doctype = string_concat(doctype, cp_to_string(cp, ret));
- state = DSTATE_DTYPE;
- break;
- }
- break;
- case DSTATE_DTYPE:
- if (ascii_is_whitespace(cp)) {
- size_t dlen = strlen(doctype)+1;
- lower_doctype = malloc(dlen * sizeof(char));
- grapheme_to_lowercase_utf8(doctype, dlen, lower_doctype, dlen);
- if (strcmp(lower_doctype, "doctype") == 0) {
- state = DSTATE_TEXT;
- } else {
- offset = -1;
- goto CLEANUP;
- }
- break;
- }
- doctype = string_concat(doctype, cp_to_string(cp, ret));
- break;
- }
- }
- }
-CLEANUP:
- free(doctype);
- free(lower_doctype);
- return offset;
-}
-
-char *tag_get_outer_html(struct Tag *tag, char *text)
-{
- char *outer_html = NULL;
- int o = 0;
- for (int i=tag->_outer_html_begin_offset; i<tag->_outer_html_end_offset; i++) {
- outer_html = realloc(outer_html, (o+1) * sizeof(char));
- outer_html[o] = text[i];
- o++;
- }
- outer_html = realloc(outer_html, (o+1) * sizeof(char));
- outer_html[o] = 0;
- return outer_html;
-}
-
-char *tag_get_inner_html(struct Tag *tag, char *text)
-{
- char *inner_html = NULL;
- int o = 0;
- for (int i=tag->_inner_html_begin_offset; i<tag->_inner_html_end_offset; i++) {
- inner_html = realloc(inner_html, (o+1) * sizeof(char));
- inner_html[o] = text[i];
- o++;
- }
- inner_html = realloc(inner_html, (o+1) * sizeof(char));
- inner_html[o] = 0;
- return inner_html;
-}
-
-enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset)
-{
- tag->_inner_html_begin_offset = offset+1;
- tag->_is_void_element = tag_is_void_element(tag);
- if (tag->_is_void_element)
- tag->_outer_html_end_offset = offset+1;
- if (strcmp(tag->name, "script") == 0)
- return STATE_SCRIPT;
- else if (strcmp(tag->name, "style") == 0)
- return STATE_STYLE;
- return STATE_INNER_TEXT;
-}
-
-static inline bool tag_is_void_element(struct Tag *tag)
-{
- for (int i=0; i<13; i++) {
- if (strcmp(tag->name, void_elements[i]) == 0)
- return true;
- }
- return false;
-}
-
-void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset)
-{
- int i = offset;
- while (text[i] != '<')
- i--;
- closed_tag->_inner_html_end_offset = i;
-}
-
-void tag_free(struct Tag *tag)
-{
- free(tag->name);
- free(tag->inner_text);
- for (int i=0; i<tag->attrs_len; i++) {
- free(tag->attrs[i]->name);
- free(tag->attrs[i]->value);
- free(tag->attrs[i]);
- }
- free(tag->attrs);
- for (int i=0; i<tag->children_len; i++) {
- if (tag->children[i] != NULL)
- tag_free(tag->children[i]);
- }
- free(tag->children);
- free(tag);
-}
-
-void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags)
-{
- if (opts->limit > 0 && found_tags->len == opts->limit)
- return;
- bool matches_tag = false;
- bool matches_attr_key = false;
- bool matches_attr_value = false;
- if (strcmp(tag->name, opts->tag) == 0)
- matches_tag = true;
- for (int i=0; i<tag->attrs_len; i++) {
- if (strcmp(tag->attrs[i]->name, opts->key) == 0)
- matches_attr_key = true;
- if (strcmp(tag->attrs[i]->value, opts->attr) == 0)
- matches_attr_value = true;
- }
- if (strlen(opts->tag) > 0 && strlen(opts->key) > 0 && strlen(opts->attr) > 0) {
- if (matches_tag && matches_attr_key && matches_attr_value) {
- found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag));
- found_tags->tags[found_tags->len] = tag;
- found_tags->len++;
- }
- } else if (strlen(opts->tag) > 0 && strlen(opts->key) > 0) {
- if (matches_tag && matches_attr_key) {
- found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag));
- found_tags->tags[found_tags->len] = tag;
- found_tags->len++;
- }
- } else if (strlen(opts->tag) > 0) {
- if (matches_tag) {
- found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag));
- found_tags->tags[found_tags->len] = tag;
- found_tags->len++;
- }
- } else if (strlen(opts->key) > 0 && strlen(opts->attr) > 0) {
- if (matches_attr_key && matches_attr_value) {
- found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag));
- found_tags->tags[found_tags->len] = tag;
- found_tags->len++;
- }
- } else if (strlen(opts->key) > 0) {
- if (matches_attr_key) {
- found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag));
- found_tags->tags[found_tags->len] = tag;
- found_tags->len++;
- }
- } else if (strlen(opts->attr) > 0) {
- if (matches_attr_value) {
- found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag));
- found_tags->tags[found_tags->len] = tag;
- found_tags->len++;
- }
- }
- for (int i=tag->children_len-1; i>-1; i--) {
- tag_find(tag->children[i], opts, found_tags);
- }
-}
-
-void tag_print_find_result(struct Tag *root_tag, struct FindOpts *opts, struct TagList *found_tags, char *text)
-{
- if (opts->is_except) {
- bool is_match = false;
- for (int i=0; i<strlen(text); i++) {
- is_match = false;
- for (int k=0; k<found_tags->len; k++) {
- if (
- found_tags->tags[k]->_outer_html_begin_offset <= i &&
- found_tags->tags[k]->_outer_html_end_offset > i
- )
- is_match = true;
- }
- if (!is_match)
- putchar(text[i]);
- }
- } else {
- char *requested_text = NULL;
- char *trimmed_text = NULL;
- for (int i=0; i<found_tags->len; i++) {
- switch (opts->out) {
- case OUT_INNER_HTML:
- requested_text = tag_get_inner_html(found_tags->tags[i], text);
- trimmed_text = string_trim(requested_text);
- free(requested_text);
- break;
- case OUT_OUTER_HTML:
- requested_text = tag_get_outer_html(found_tags->tags[i], text);
- trimmed_text = string_trim(requested_text);
- free(requested_text);
- break;
- case OUT_INNER_TEXT:
- trimmed_text = string_trim(found_tags->tags[i]->inner_text);
- break;
- case OUT_ATTR_VALUE:
- if (strlen(opts->key) > 0 && strlen(opts->tag) > 0) {
- for (int k=0; k<found_tags->tags[i]->attrs_len; k++) {
- if (strcmp(found_tags->tags[i]->attrs[k]->name, opts->key) == 0)
- printf("%s\n", found_tags->tags[i]->attrs[k]->value);
- }
- } else if (strlen(opts->tag) > 0) {
- for (int k=0; k<found_tags->tags[i]->attrs_len; k++)
- printf("%s\n", found_tags->tags[i]->attrs[k]->value);
- }
- break;
- }
- if (trimmed_text) {
- if (strlen(trimmed_text) > 0)
- printf("%s\n", trimmed_text);
- free(trimmed_text);
- }
- }
- }
-}
-
-struct TagList *tag_list_init(void)
-{
- struct TagList *tag_list = malloc(sizeof(struct TagList));
- tag_list->tags = NULL;
- tag_list->len = 0;
- return tag_list;
-}
-
-void tag_list_free(struct TagList *tag_list)
-{
- free(tag_list->tags);
- free(tag_list);
-}
-
-struct Attr *attr_init(void)
-{
- struct Attr *attr = malloc(sizeof(struct Attr));
- attr->name = malloc(sizeof(char));
- attr->name[0] = 0;
- attr->value = malloc(sizeof(char));
- attr->value[0] = 0;
- return attr;
-}
-
-static inline bool attr_name_char_is_valid(uint_least32_t cp)
-{
- if (is_control(cp))
- return false;
- if (is_non_char(cp))
- return false;
- if (
- cp == SPACE ||
- cp == QUOTATION_MARK ||
- cp == APOSTROPHE ||
- cp == GREATER_THAN_SIGN ||
- cp == SOLIDUS ||
- cp == EQUALS_SIGN
- )
- return false;
- return true;
-}
-
-static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp)
-{
- /*
- Not mentioned invalid characters.
- They are already handled before
- function call.
- */
- if (
- cp == EQUALS_SIGN ||
- cp == LESS_THAN_SIGN ||
- cp == GREATER_THAN_SIGN ||
- cp == GRAVE_ACCENT
- )
- return false;
- return true;
-}
-
-char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base)
-{
- size_t old_offset = offset;
- char *character = malloc(MAX_CODEPOINT_SIZE * sizeof(char));
- char *numeric_charref = malloc(sizeof(char));
- numeric_charref[0] = 0;
- size_t ret;
- uint_least32_t cp;
- do {
- ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp);
- numeric_charref = string_concat(numeric_charref, cp_to_string(cp, ret));
- offset += ret;
- } while (cp != SEMICOLON);
- *new_offset = offset - old_offset;
- long i = strtol(numeric_charref, NULL, base);
- ret = grapheme_encode_utf8((uint_least32_t)i, character, MAX_CODEPOINT_SIZE);
- character[ret] = 0;
- free(numeric_charref);
- return character;
-}
-
-char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs)
-{
- uint_least32_t stop_at = 0;
- switch(avs) {
- case AVS_QUOTATION_MARK:
- stop_at = QUOTATION_MARK;
- break;
- case AVS_APOSTROPHE:
- stop_at = APOSTROPHE;
- break;
- case AVS_UNQUOTED:
- stop_at = GREATER_THAN_SIGN;
- break;
- case AVS_NO: /* Just to silence the compilier warning */
- break;
- }
- char *named_charref = malloc(sizeof(char));
- named_charref[0] = 0;
- size_t ret;
- uint_least32_t cp;
- int i = 0;
- for (;;) {
- ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp);
- if (cp == AMPERSAND || ascii_is_whitespace(cp))
- break;
- if (avs > AVS_NO && cp == stop_at)
- break;
- named_charref = string_concat(named_charref, cp_to_string(cp, ret));
- if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF)
- break;
- offset += ret;
- i++;
- }
- return named_charref;
-}
-
-char *charref_named_encode(const char *name)
-{
- char *buf = malloc(2*MAX_CODEPOINT_SIZE+1);
- char cp[MAX_CODEPOINT_SIZE];
- memset(&cp, 0, MAX_CODEPOINT_SIZE);
- size_t len;
- for (int i=0; i<NAMED_CHAR_REF_COUNT; i++) {
- if (string_starts_with(name, entities[i].name)) {
- len = grapheme_encode_utf8(entities[i].cp[0], cp, MAX_CODEPOINT_SIZE);
- strcpy(buf, cp);
- if (entities[i].cp[1] != 0) {
- len += grapheme_encode_utf8(entities[i].cp[1], cp, MAX_CODEPOINT_SIZE);
- strcat(buf, cp);
- }
- buf[len] = 0;
- const char *part = &name[strlen(entities[i].name)];
- size_t part_len = strlen(part);
- if (part_len > 0) {
- if (part_len == 1 && part[0] == ';')
- return buf;
- buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1+part_len);
- strcat(buf, &name[strlen(entities[i].name)]);
- buf[len+part_len] = 0;
- }
- return buf;
- }
- }
- buf = realloc(buf, (strlen(name)+2) * sizeof(char));
- buf[0] = '&';
- buf[1] = 0;
- strcat(buf, name);
- return buf;
-}
-
-static inline bool ascii_is_digit(uint_least32_t cp)
-{
- if (cp >= 0x30 && cp <= 0x39)
- return true;
- return false;
-}
-
-static inline bool ascii_alpha_is_upper(uint_least32_t cp)
-{
- if (cp >= 0x41 && cp <= 0x5A)
- return true;
- return false;
-}
-
-static inline bool ascii_alpha_is_lower(uint_least32_t cp)
-{
- if (cp >= 0x61 && cp <= 0x7A)
- return true;
- return false;
-}
-
-static inline bool ascii_is_alpha(uint_least32_t cp)
-{
- if (ascii_alpha_is_lower(cp) || ascii_alpha_is_upper(cp))
- return true;
- return false;
-}
-
-static inline bool ascii_is_whitespace(uint_least32_t cp)
-{
- if (
- cp == TAB ||
- cp == LF ||
- cp == FF ||
- cp == CR ||
- cp == SPACE
- )
- return true;
- return false;
-}
-
-static inline bool is_c0_control(uint_least32_t cp)
-{
- if (cp >= 0x00 && cp <= 0x1F)
- return true;
- return false;
-}
-
-static inline bool is_control(uint_least32_t cp)
-{
- if (is_c0_control(cp))
- return true;
- if (cp >= 0x7F && cp <= 0x9F)
- return true;
- return false;
-}
-
-static inline bool is_non_char(uint_least32_t cp)
-{
- if (cp >= 0xFDD0 && cp <= 0xFDEF)
- return true;
- if (
- cp == 0xFFFE || cp == 0xFFFF ||
- cp == 0x1FFFE || cp == 0x1FFFF ||
- cp == 0x2FFFE || cp == 0x2FFFF ||
- cp == 0x3FFFE || cp == 0x3FFFF ||
- cp == 0x4FFFE || cp == 0x4FFFF ||
- cp == 0x5FFFE || cp == 0x5FFFF ||
- cp == 0x6FFFE || cp == 0x6FFFF ||
- cp == 0x7FFFE || cp == 0x7FFFF ||
- cp == 0x8FFFE || cp == 0x8FFFF ||
- cp == 0x9FFFE || cp == 0x9FFFF ||
- cp == 0xAFFFE || cp == 0xAFFFF ||
- cp == 0xBFFFE || cp == 0xBFFFF ||
- cp == 0xCFFFE || cp == 0xCFFFF ||
- cp == 0xDFFFE || cp == 0xDFFFF ||
- cp == 0xEFFFE || cp == 0xEFFFF ||
- cp == 0xFFFFE || cp == 0xFFFFF ||
- cp == 0x10FFFE || cp == 0x10FFFF
- )
- return true;
- return false;
-}
-
-const char *state_to_string(enum State state)
-{
- switch(state) {
- case STATE_INNER_TEXT: return "STATE_INNER_TEXT";
- case STATE_TAG: return "STATE_TAG";
- case STATE_BEGIN_TAG_NAME: return "STATE_BEGIN_TAG_NAME";
- case STATE_END_TAG_NAME: return "STATE_END_TAG_NAME";
- case STATE_ATTR_NAME: return "STATE_ATTR_NAME";
- case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE";
- case STATE_COMMENT: return "STATE_COMMENT";
- case STATE_SCRIPT: return "STATE_SCRIPT";
- case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG";
- case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG";
- case STATE_STYLE: return "STATE_STYLE";
- case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG";
- case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG";
- case STATE_CHAR_REF: return "STATE_CHAR_REF";
- case STATE_CHAR_REF_NUMERIC: return "STATE_CHAR_REF_NUMERIC";
- }
- return "";
-}
diff --git a/html.h b/html.h
@@ -1,124 +0,0 @@
-#define LESS_THAN_SIGN 0x3C
-#define GREATER_THAN_SIGN 0x3E
-#define EQUALS_SIGN 0x3D
-#define TAB 0x09
-#define LF 0x0A
-#define FF 0x0C
-#define CR 0x0D
-#define SPACE 0x20
-#define SOLIDUS 0x2F
-#define EXCLAMATION_MARK 0x21
-#define QUOTATION_MARK 0x22
-#define NUMBER_SIGN 0x23
-#define AMPERSAND 0x26
-#define APOSTROPHE 0x27
-#define GRAVE_ACCENT 0x60
-#define HYPHEN_MINUS 0x2D
-#define SEMICOLON 0x3B
-#define SMALL_LETTER_X 0x78
-#define CAPITAL_LETTER_X 0x58
-
-#define NAMED_CHAR_REF_COUNT 2231
-#define LONGEST_NAMED_CHAR_REF 32
-#define MAX_CODEPOINT_SIZE 4
-
-static const char *void_elements[] = {
- "area", "base", "br", "col", "embed", "hr", "img",
- "input", "link", "meta", "source", "track", "wbr"
-};
-
-struct Attr {
- char *name;
- char *value; // optional
-};
-
-struct Tag {
- char *name;
- struct Attr **attrs;
- struct Tag **children;
- char *inner_text;
- size_t attrs_len;
- size_t children_len;
- bool _is_void_element; // means there is no closing tag
- bool _is_closed;
- size_t _outer_html_begin_offset;
- size_t _outer_html_end_offset;
- size_t _inner_html_begin_offset;
- size_t _inner_html_end_offset;
-};
-
-struct TagList {
- struct Tag **tags;
- size_t len;
-};
-
-enum State {
- STATE_INNER_TEXT,
- STATE_TAG,
- STATE_BEGIN_TAG_NAME,
- STATE_END_TAG_NAME,
- STATE_ATTR_NAME,
- STATE_ATTR_VALUE,
- STATE_COMMENT,
- STATE_SCRIPT,
- STATE_SCRIPT_POSSIBLE_END_TAG,
- STATE_SCRIPT_END_TAG,
- STATE_STYLE,
- STATE_STYLE_POSSIBLE_END_TAG,
- STATE_STYLE_END_TAG,
- STATE_CHAR_REF,
- STATE_CHAR_REF_NUMERIC
-};
-
-enum DoctypeState {
- DSTATE_TEXT,
- DSTATE_POSSIBLE_DTYPE,
- DSTATE_DTYPE_OR_COMMENT,
- DSTATE_DTYPE
-};
-
-enum AttrValueSyntax {
- AVS_NO,
- AVS_QUOTATION_MARK,
- AVS_APOSTROPHE,
- AVS_UNQUOTED
-};
-
-void html_filter(char *text, struct FindOpts *opts);
-void html_print(struct Tag *tag, int indent);
-
-struct Tag *tag_init(void);
-struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state);
-struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset);
-struct Tag *tag_get_last_open(struct TagList *tag_list);
-size_t tag_doctype_parse(const char *text);
-char *tag_get_outer_html(struct Tag *tag, char *text);
-char *tag_get_inner_html(struct Tag *tag, char *text);
-enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset);
-static inline bool tag_is_void_element(struct Tag *tag);
-void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset);
-void tag_free(struct Tag *tag);
-void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags);
-void tag_print_find_result(struct Tag *root_tag, struct FindOpts *opts, struct TagList *found_tags, char *text);
-
-struct TagList *tag_list_init(void);
-void tag_list_free(struct TagList *tag_list);
-
-struct Attr *attr_init(void);
-static inline bool attr_name_char_is_valid(uint_least32_t cp);
-static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp);
-
-char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base);
-char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs);
-char *charref_named_encode(const char *name);
-
-static inline bool ascii_is_digit(uint_least32_t cp);
-static inline bool ascii_alpha_is_upper(uint_least32_t cp);
-static inline bool ascii_alpha_is_lower(uint_least32_t cp);
-static inline bool ascii_is_alpha(uint_least32_t cp);
-static inline bool ascii_is_whitespace(uint_least32_t cp);
-static inline bool is_c0_control(uint_least32_t cp);
-static inline bool is_control(uint_least32_t cp);
-static inline bool is_non_char(uint_least32_t cp);
-
-const char *state_to_string(enum State s);
diff --git a/entities.h b/src/entities.h
diff --git a/src/html.c b/src/html.c
@@ -0,0 +1,883 @@
+#include "html.h"
+#include "entities.h"
+
+/* Parse text as HTML and print what opts selects; without search options the root's children[0] is printed. */
+void html_filter(char *text, struct FindOpts *opts)
+{
+	struct TagList *tag_list = tag_list_init();
+	struct TagList *found_tags = tag_list_init();
+	int len = tag_doctype_parse(text);
+	if (len == -1) {
+		fprintf(stderr, "htex: Error parsing <!DOCTYPE ....\n");
+		goto CLEAN;
+	}
+	text += len; /* advance by the offset tag_doctype_parse() returned */
+	struct Tag *root_tag = tag_parse(tag_list, text, 0, STATE_INNER_TEXT);
+	if (find_opts_exist(opts)) {
+		tag_find(root_tag, opts, found_tags);
+	} else if (root_tag->children_len > 0) {
+		/* Guarded: a document without any tag leaves root_tag->children NULL. */
+		found_tags->tags = realloc(found_tags->tags, sizeof(*found_tags->tags));
+		found_tags->tags[0] = root_tag->children[0];
+		found_tags->len = 1;
+	}
+	tag_print_find_result(root_tag, opts, found_tags, text);
+	tag_free(root_tag);
+CLEAN:
+	tag_list_free(tag_list);
+	tag_list_free(found_tags);
+}
+
+/* Debug dump: name and attributes on one line, indented by depth, then the children from last to first. */
+void html_print(struct Tag *tag, int indent)
+{
+	printf("%*s%s", indent > 0 ? indent : 0, "", tag->name);
+	for (size_t a = 0; a < tag->attrs_len; a++) {
+		struct Attr *attr = tag->attrs[a];
+		printf(" %s=%s", attr->name, attr->value);
+	}
+	putchar('\n');
+	for (size_t c = tag->children_len; c > 0; c--)
+		html_print(tag->children[c-1], indent + 1);
+}
+
+/* Allocate a Tag with empty name and inner_text, no attrs or children, both flags false and all offsets 0. */
+struct Tag *tag_init(void)
+{
+	struct Tag *tag = malloc(sizeof(*tag));
+	char *empty_name = malloc(sizeof(char));
+	char *empty_text = malloc(sizeof(char));
+
+	empty_name[0] = 0;
+	empty_text[0] = 0;
+	tag->name = empty_name;
+	tag->inner_text = empty_text;
+	tag->attrs = NULL;
+	tag->children = NULL;
+	tag->attrs_len = tag->children_len = 0;
+	tag->_is_void_element = tag->_is_closed = false;
+	tag->_outer_html_begin_offset = tag->_outer_html_end_offset = 0;
+	tag->_inner_html_begin_offset = tag->_inner_html_end_offset = 0;
+	return tag;
+}
+
+struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state)
+{
+ struct Tag *tag = tag_init();
+ tag->_outer_html_begin_offset= offset-1; /* nested calls get the offset just past '<'; NOTE(review): wraps around for the root call (offset 0) */
+ tag_list->tags = realloc(tag_list->tags, (tag_list->len+1) * sizeof(struct Tag)); /* NOTE(review): elements are struct Tag *, so this over-allocates */
+ tag_list->tags[tag_list->len] = tag;
+ tag_list->len++;
+ struct Tag *still_open_tag = tag;
+ char *end_tag = malloc(sizeof(char)); /* name collected while reading an end tag */
+ end_tag[0] = 0;
+ enum State return_to_state = STATE_INNER_TEXT; /* state to resume after a character reference */
+ size_t a = 0; /* index of the attribute currently being filled */
+ size_t attr_name_count = 0;
+ enum AttrValueSyntax avs = AVS_NO;
+ size_t hyphen_count = 0; /* consecutive '-' seen inside a comment */
+ uint_least32_t cp;
+ size_t len = strlen(text);
+ size_t ret, off;
+ for (off = offset; off<len; off += ret) {
+ if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) {
+ fprintf(stderr, "htex: parseTag.grapheme_decode_utf8 failed.\n");
+ } else {
+ /* One decoded code point per iteration drives the state machine below.
+ Text-like states append to the innermost open tag, tag states fill in
+ `tag`, the element this call was started for. */
+ switch (state) {
+ case STATE_INNER_TEXT:
+ if (cp == LESS_THAN_SIGN) {
+ state = STATE_TAG;
+ break;
+ }
+ if (cp == AMPERSAND) {
+ return_to_state = STATE_INNER_TEXT;
+ state = STATE_CHAR_REF;
+ break;
+ }
+ still_open_tag = tag_get_last_open(tag_list);
+ still_open_tag->inner_text = string_concat(still_open_tag->inner_text, cp_to_string(cp, ret)); /* NOTE(review): ownership of the cp_to_string() result is not visible here - confirm string_concat() frees it */
+ break;
+ case STATE_TAG:
+ if (cp == SOLIDUS) {
+ state = STATE_END_TAG_NAME;
+ break;
+ }
+ if (cp == EXCLAMATION_MARK) {
+ state = STATE_COMMENT;
+ break;
+ }
+ still_open_tag = tag_get_last_open(tag_list);
+ struct Tag *one_tag = tag_parse(tag_list, text, off, STATE_BEGIN_TAG_NAME); /* the nested call consumes the rest of text */
+ still_open_tag->children = realloc(
+ still_open_tag->children,
+ (still_open_tag->children_len+1) * sizeof(struct Tag)
+ );
+ still_open_tag->children[still_open_tag->children_len] = one_tag;
+ still_open_tag->children_len++;
+ free(end_tag);
+ return tag; /* nothing is left to parse once the nested call has returned */
+ case STATE_BEGIN_TAG_NAME:
+ if (cp == GREATER_THAN_SIGN) {
+ state = tag_process_end_of_opening_tag(tag, off);
+ break;
+ }
+ if (ascii_is_whitespace(cp)) {
+ state = STATE_ATTR_NAME;
+ break;
+ }
+ if (ascii_is_digit(cp) || ascii_is_alpha(cp)) {
+ tag->name = string_concat(tag->name, cp_to_string(cp, ret));
+ }
+ break;
+ case STATE_END_TAG_NAME:
+ if (cp == GREATER_THAN_SIGN) {
+ struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
+ if (closed_tag != NULL)
+ tag_set_inner_html_end_offset(closed_tag, text, off);
+ free(end_tag);
+ end_tag = malloc(sizeof(char));
+ end_tag[0] = 0;
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (!ascii_is_whitespace(cp))
+ end_tag = string_concat(end_tag, cp_to_string(cp, ret));
+ break;
+ case STATE_ATTR_NAME:
+ if (cp == GREATER_THAN_SIGN) {
+ state = tag_process_end_of_opening_tag(tag, off);
+ break;
+ }
+ if (ascii_is_whitespace(cp)) {
+ if (attr_name_count == a+1) /* a name was started for slot a: move on to the next slot */
+ a++;
+ break;
+ }
+ if (cp == EQUALS_SIGN) {
+ state = STATE_ATTR_VALUE;
+ break;
+ }
+ if (attr_name_char_is_valid(cp)) {
+ if (attr_name_count != a+1) { /* first character of a new attribute name */
+ tag->attrs = realloc(
+ tag->attrs,
+ (a+1) * sizeof(struct Attr)
+ );
+ tag->attrs[a] = attr_init();
+ attr_name_count = a + 1;
+ tag->attrs_len = attr_name_count;
+ }
+ tag->attrs[a]->name = string_concat(tag->attrs[a]->name, cp_to_string(cp, ret));
+ }
+ break;
+ case STATE_ATTR_VALUE:
+ if (ascii_is_whitespace(cp)) {
+ if (avs == AVS_UNQUOTED) {
+ avs = AVS_NO;
+ state = STATE_ATTR_NAME;
+ } else if (avs == AVS_QUOTATION_MARK || avs == AVS_APOSTROPHE) {
+ if (
+ strcmp("id", tag->attrs[a]->name) == 0 ||
+ strcmp("class", tag->attrs[a]->name) == 0
+ ) { /* whitespace inside a quoted id/class value starts another Attr with the same name */
+ char *tmp_name = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char));
+ strcpy(tmp_name, tag->attrs[a]->name);
+ tag->attrs = realloc(
+ tag->attrs,
+ (a+1) * sizeof(struct Attr)
+ ); /* NOTE(review): sized for a+1 elements but index a+1 is written next; appears to fit only because sizeof(struct Attr) exceeds the pointer size - confirm */
+ a++;
+ tag->attrs[a] = attr_init();
+ free(tag->attrs[a]->name);
+ tag->attrs[a]->name = tmp_name;
+ tag->attrs_len++;
+ attr_name_count = a + 1;
+ } else {
+ tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret));
+ }
+ }
+ break;
+ }
+ if (cp == QUOTATION_MARK) {
+ if (avs == AVS_NO) {
+ avs = AVS_QUOTATION_MARK;
+ break;
+ }
+ if (avs == AVS_QUOTATION_MARK) {
+ avs = AVS_NO;
+ state = STATE_ATTR_NAME;
+ break;
+ }
+ }
+ if (cp == APOSTROPHE) {
+ if (avs == AVS_NO) {
+ avs = AVS_APOSTROPHE;
+ break;
+ }
+ if (avs == AVS_APOSTROPHE) {
+ avs = AVS_NO;
+ state = STATE_ATTR_NAME;
+ break;
+ }
+ }
+ if (cp == GREATER_THAN_SIGN) {
+ state = tag_process_end_of_opening_tag(tag, off);
+ break;
+ }
+ if (avs == AVS_NO && attr_value_unquoted_char_is_valid(cp)) {
+ avs = AVS_UNQUOTED;
+ }
+ if (avs > AVS_NO) {
+ if (cp == AMPERSAND) {
+ state = STATE_CHAR_REF;
+ return_to_state = STATE_ATTR_VALUE;
+ break;
+ }
+ tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret));
+ }
+ break;
+ case STATE_COMMENT:
+ if (cp == GREATER_THAN_SIGN && hyphen_count >= 2) { /* '>' directly after at least two '-' ends the comment */
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (cp == HYPHEN_MINUS)
+ hyphen_count++;
+ else
+ hyphen_count = 0;
+ break;
+ case STATE_STYLE:
+ if (cp == LESS_THAN_SIGN) {
+ state = STATE_STYLE_POSSIBLE_END_TAG;
+ break;
+ }
+ break;
+ case STATE_STYLE_POSSIBLE_END_TAG:
+ if (cp == SOLIDUS)
+ state = STATE_STYLE_END_TAG;
+ else
+ state = STATE_STYLE;
+ break;
+ case STATE_STYLE_END_TAG:
+ if (cp == GREATER_THAN_SIGN) {
+ struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
+ if (closed_tag != NULL)
+ tag_set_inner_html_end_offset(closed_tag, text, off);
+ free(end_tag);
+ end_tag = malloc(sizeof(char));
+ end_tag[0] = 0;
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (!ascii_is_whitespace(cp))
+ end_tag = string_concat(end_tag, cp_to_string(cp, ret));
+ break;
+ case STATE_SCRIPT:
+ if (cp == LESS_THAN_SIGN) {
+ state = STATE_SCRIPT_POSSIBLE_END_TAG;
+ break;
+ }
+ break;
+ case STATE_SCRIPT_POSSIBLE_END_TAG:
+ if (cp == SOLIDUS)
+ state = STATE_SCRIPT_END_TAG;
+ else
+ state = STATE_SCRIPT;
+ break;
+ case STATE_SCRIPT_END_TAG:
+ if (cp == GREATER_THAN_SIGN) {
+ struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
+ if (closed_tag != NULL)
+ tag_set_inner_html_end_offset(closed_tag, text, off);
+ free(end_tag);
+ end_tag = malloc(sizeof(char));
+ end_tag[0] = 0;
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (!ascii_is_whitespace(cp))
+ end_tag = string_concat(end_tag, cp_to_string(cp, ret));
+ break;
+ case STATE_CHAR_REF:
+ if (cp == NUMBER_SIGN) { /* hashtag */
+ state = STATE_CHAR_REF_NUMERIC;
+ break;
+ }
+ char *named_charref = charref_named_parse(text, off, len, avs);
+ off += strlen(named_charref)-1; /* skip the reference; the loop increment adds ret on top */
+ char *encoded_named_charref = charref_named_encode(named_charref); /* NOTE(review): not freed in this function - confirm string_concat() takes ownership */
+ if (return_to_state == STATE_INNER_TEXT) {
+ still_open_tag = tag_get_last_open(tag_list);
+ still_open_tag->inner_text = string_concat(still_open_tag->inner_text, encoded_named_charref);
+ } else if (return_to_state == STATE_ATTR_VALUE) {
+ tag->attrs[a]->value = string_concat(tag->attrs[a]->value, encoded_named_charref);
+ }
+ free(named_charref);
+ state = return_to_state;
+ break;
+ case STATE_CHAR_REF_NUMERIC:
+ if (cp == SMALL_LETTER_X || cp == CAPITAL_LETTER_X) {
+ size_t new_offset;
+ char *numeric_charref = charref_numeric_parse_and_encode(text, off+1, &new_offset, 16); /* digits start after the 'x' */
+ off += new_offset;
+ if (return_to_state == STATE_INNER_TEXT) {
+ still_open_tag = tag_get_last_open(tag_list);
+ still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref);
+ } else if (return_to_state == STATE_ATTR_VALUE) {
+ tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref);
+ }
+ state = return_to_state;
+ break;
+ } else if (ascii_is_digit(cp)) {
+ size_t new_offset;
+ char *numeric_charref = charref_numeric_parse_and_encode(text, off, &new_offset, 10); /* the current digit is part of the number */
+ off += new_offset-1;
+ if (return_to_state == STATE_INNER_TEXT) {
+ still_open_tag = tag_get_last_open(tag_list);
+ still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref);
+ } else if (return_to_state == STATE_ATTR_VALUE) {
+ tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref);
+ }
+ state = return_to_state;
+ break;
+ }
+ state = return_to_state;
+ break;
+ }
+ }
+ }
+ free(end_tag);
+ return tag;
+}
+
+struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset)
+{
+	size_t n = tag_list->len;
+	while (n > 0 && (tag_list->tags[n-1]->_is_closed || strcmp(tag_list->tags[n-1]->name, end_tag_name) != 0))
+		n--; /* newest first: skip entries already closed or named differently */
+	if (n == 0)
+		return NULL; /* no unclosed tag carries that name */
+	tag_list->tags[n-1]->_is_closed = true;
+	tag_list->tags[n-1]->_outer_html_end_offset = end_offset;
+	return tag_list->tags[n-1];
+}
+
+struct Tag *tag_get_last_open(struct TagList *tag_list)
+{
+	size_t n = tag_list->len;
+	/* Newest first: pass over void elements and tags that are already closed. */
+	while (n > 0 && (tag_list->tags[n-1]->_is_void_element || tag_list->tags[n-1]->_is_closed))
+		n--;
+	/* Nothing open at all: fall back to the first list entry. */
+	return tag_list->tags[n > 0 ? n-1 : 0];
+}
+
+/*
+ * Look for a leading <!DOCTYPE ...> declaration in text.
+ * Returns the byte offset of the first '>' met in text state; 0 when the
+ * first '<' is not followed by '!', when "<!" is followed by '-', or when no
+ * such '>' exists; -1 when the word after "<!" is not "doctype" (any case).
+ */
+int tag_doctype_parse(const char *text)
+{
+	int offset = 0;
+	enum DoctypeState state = DSTATE_TEXT;
+	char *doctype = NULL;
+	char *lower_doctype = NULL;
+	uint_least32_t cp;
+	size_t len = strlen(text);
+	size_t ret, off;
+	for (off = 0; off<len; off += ret) {
+		if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) {
+			fprintf(stderr, "htex: parseDoctype.grapheme_decode_utf8 failed.\n");
+			continue;
+		}
+		switch (state) {
+		case DSTATE_TEXT:
+			if (cp == LESS_THAN_SIGN) {
+				state = DSTATE_POSSIBLE_DTYPE;
+			} else if (cp == GREATER_THAN_SIGN) {
+				offset = (int)off;
+				goto CLEANUP;
+			}
+			break;
+		case DSTATE_POSSIBLE_DTYPE:
+			if (cp != EXCLAMATION_MARK)
+				goto CLEANUP;
+			state = DSTATE_DTYPE_OR_COMMENT;
+			break;
+		case DSTATE_DTYPE_OR_COMMENT:
+			if (cp == HYPHEN_MINUS)
+				goto CLEANUP; /* "<!-" is treated as a comment start, not a doctype */
+			doctype = string_concat(doctype, cp_to_string(cp, ret));
+			state = DSTATE_DTYPE;
+			break;
+		case DSTATE_DTYPE:
+			if (!ascii_is_whitespace(cp)) {
+				doctype = string_concat(doctype, cp_to_string(cp, ret));
+				break;
+			}
+			size_t dlen = strlen(doctype)+1;
+			free(lower_doctype); /* a second "<!word " before '>' would otherwise leak the first copy */
+			lower_doctype = malloc(dlen * sizeof(char));
+			grapheme_to_lowercase_utf8(doctype, dlen, lower_doctype, dlen);
+			if (strcmp(lower_doctype, "doctype") != 0) {
+				offset = -1;
+				goto CLEANUP;
+			}
+			state = DSTATE_TEXT;
+			break;
+		}
+	}
+CLEANUP:
+	free(doctype);
+	free(lower_doctype);
+	return offset;
+}
+
+/* Copy text[_outer_html_begin_offset, _outer_html_end_offset) into a new NUL-terminated string; the caller frees it. */
+char *tag_get_outer_html(struct Tag *tag, char *text)
+{
+	size_t begin = tag->_outer_html_begin_offset;
+	size_t end = tag->_outer_html_end_offset;
+	/* A range with end <= begin yields an empty string. */
+	size_t span = end > begin ? end - begin : 0;
+	char *outer_html = malloc((span+1) * sizeof(char)); /* one allocation instead of one realloc per byte */
+	if (span > 0)
+		memcpy(outer_html, text+begin, span);
+	outer_html[span] = 0;
+	return outer_html;
+}
+
+/* Copy text[_inner_html_begin_offset, _inner_html_end_offset) into a new NUL-terminated string; the caller frees it. */
+char *tag_get_inner_html(struct Tag *tag, char *text)
+{
+	size_t begin = tag->_inner_html_begin_offset;
+	size_t end = tag->_inner_html_end_offset;
+	/* A range with end <= begin yields an empty string. */
+	size_t span = end > begin ? end - begin : 0;
+	char *inner_html = malloc((span+1) * sizeof(char)); /* one allocation instead of one realloc per byte */
+	if (span > 0)
+		memcpy(inner_html, text+begin, span);
+	inner_html[span] = 0;
+	return inner_html;
+}
+
+/* Record the offsets known once the '>' of an opening tag sits at offset, and pick the state for what follows. */
+enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset)
+{
+	const size_t after_gt = offset+1;
+	tag->_inner_html_begin_offset = after_gt;
+	tag->_is_void_element = tag_is_void_element(tag);
+	if (tag->_is_void_element)
+		tag->_outer_html_end_offset = after_gt; /* a void element has no closing tag */
+	if (strcmp(tag->name, "script") == 0)
+		return STATE_SCRIPT;
+	return strcmp(tag->name, "style") == 0 ? STATE_STYLE : STATE_INNER_TEXT;
+}
+
+/* True when tag->name equals one of the entries of void_elements. */
+static inline bool tag_is_void_element(struct Tag *tag)
+{
+	for (size_t i = 0; i < sizeof(void_elements) / sizeof(void_elements[0]); i++) /* count follows the table */
+		if (strcmp(tag->name, void_elements[i]) == 0)
+			return true;
+	return false;
+}
+
+void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset)
+{
+	size_t i = offset;
+	while (i > 0 && text[i] != '<') /* back up to the '<' of the end tag, never below text[0] */
+		i--;
+	closed_tag->_inner_html_end_offset = i;
+}
+
+void tag_free(struct Tag *tag)
+{
+	/* Children first (recursively), then the attributes, then the tag's own storage. */
+	for (size_t c = 0; c < tag->children_len; c++)
+		if (tag->children[c] != NULL)
+			tag_free(tag->children[c]);
+	for (size_t a = 0; a < tag->attrs_len; a++) {
+		free(tag->attrs[a]->name);
+		free(tag->attrs[a]->value);
+		free(tag->attrs[a]);
+	}
+	free(tag->children);
+	free(tag->attrs);
+	free(tag->inner_text);
+	free(tag->name);
+	free(tag);
+}
+
+/*
+ * Depth-first search of tag and its descendants for tags selected by opts;
+ * every hit is appended to found_tags.
+ *
+ * tag:        subtree root to examine
+ * opts:       search options; tag, key and attr must be non-NULL strings
+ * found_tags: list that receives the hits
+ *
+ * The non-empty fields of opts decide what has to match:
+ *   tag + key + attr  -> tag name, an attribute name and an attribute value
+ *   tag + key         -> tag name and an attribute name
+ *   tag               -> tag name (attr, if given without key, is ignored)
+ *   key + attr        -> an attribute name and an attribute value
+ *   key               -> an attribute name
+ *   attr              -> an attribute value
+ * With all three empty nothing is selected. Name and value are tested
+ * independently, so they may sit on different attributes of one tag.
+ * Children are visited last to first; the limit check runs per visited tag.
+ */
+void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags)
+{
+	if (opts->limit > 0 && found_tags->len == opts->limit)
+		return;
+
+	bool want_tag = strlen(opts->tag) > 0;
+	bool want_key = strlen(opts->key) > 0;
+	bool want_attr = strlen(opts->attr) > 0;
+	bool has_name = strcmp(tag->name, opts->tag) == 0;
+	bool has_key = false;
+	bool has_value = false;
+
+	for (size_t a = 0; a < tag->attrs_len; a++) {
+		has_key = has_key || strcmp(tag->attrs[a]->name, opts->key) == 0;
+		has_value = has_value || strcmp(tag->attrs[a]->value, opts->attr) == 0;
+	}
+	/* A tag name without a key makes the value filter inert. */
+	if (want_tag && !want_key) {
+		want_attr = false;
+	}
+
+	bool selected = (want_tag || want_key || want_attr)
+		&& (!want_tag || has_name)
+		&& (!want_key || has_key)
+		&& (!want_attr || has_value);
+	if (selected) {
+		/* The array holds struct Tag pointers, so size the new slot from the array itself. */
+		found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(*found_tags->tags));
+		found_tags->tags[found_tags->len] = tag;
+		found_tags->len++;
+	}
+
+	/* Descend even when this tag itself was a hit. */
+	for (size_t c = tag->children_len; c > 0; c--) {
+		tag_find(tag->children[c-1], opts, found_tags);
+	}
+}
+
+/*
+ * Print the search outcome. With opts->is_except, text is echoed minus the bytes inside any found tag's outer-HTML range;
+ * otherwise each found tag is printed according to opts->out (trimmed results that are empty are skipped).
+ * root_tag is not used.
+ */
+void tag_print_find_result(struct Tag *root_tag, struct FindOpts *opts, struct TagList *found_tags, char *text)
+{
+	if (opts->is_except) {
+		const size_t text_len = strlen(text); /* hoisted: was recomputed for every byte */
+		for (size_t i = 0; i < text_len; i++) {
+			bool is_match = false;
+			for (size_t k = 0; k < found_tags->len && !is_match; k++)
+				is_match = found_tags->tags[k]->_outer_html_begin_offset <= i &&
+					found_tags->tags[k]->_outer_html_end_offset > i;
+			if (!is_match)
+				putchar(text[i]);
+		}
+		return;
+	}
+	for (size_t i = 0; i < found_tags->len; i++) {
+		struct Tag *found = found_tags->tags[i];
+		char *requested_text = NULL;
+		char *trimmed_text = NULL; /* per iteration, so a stale pointer is never freed twice */
+		switch (opts->out) {
+		case OUT_INNER_HTML:
+			requested_text = tag_get_inner_html(found, text);
+			trimmed_text = string_trim(requested_text);
+			free(requested_text);
+			break;
+		case OUT_OUTER_HTML:
+			requested_text = tag_get_outer_html(found, text);
+			trimmed_text = string_trim(requested_text);
+			free(requested_text);
+			break;
+		case OUT_INNER_TEXT:
+			trimmed_text = string_trim(found->inner_text);
+			break;
+		case OUT_ATTR_VALUE:
+			if (strlen(opts->tag) == 0)
+				break;
+			for (size_t k = 0; k < found->attrs_len; k++) {
+				/* Without a key every value is printed; with one, only values stored under that name. */
+				if (strlen(opts->key) == 0 || strcmp(found->attrs[k]->name, opts->key) == 0)
+					printf("%s\n", found->attrs[k]->value);
+			}
+			break;
+		}
+		if (trimmed_text) {
+			if (strlen(trimmed_text) > 0)
+				printf("%s\n", trimmed_text);
+			free(trimmed_text);
+		}
+	}
+}
+
+/* Allocate a TagList that starts out empty: no array yet, length 0. */
+struct TagList *tag_list_init(void)
+{
+	struct TagList *list = malloc(sizeof(*list));
+	*list = (struct TagList){ .tags = NULL, .len = 0 };
+	return list;
+}
+
+void tag_list_free(struct TagList *tag_list)
+{
+	struct Tag **array = tag_list->tags; /* only the pointer array is released here, not the tags it points to */
+	free(tag_list);
+	free(array);
+}
+
+/* Allocate an Attr whose name and value are both empty strings. */
+struct Attr *attr_init(void)
+{
+	struct Attr *attr = malloc(sizeof(*attr));
+	attr->name = malloc(sizeof(char));
+	attr->value = malloc(sizeof(char));
+	attr->name[0] = attr->value[0] = 0;
+	return attr;
+}
+
+/* True unless cp is a control, a noncharacter, or one of space, '"', '\'', '>', '/' and '='. */
+static inline bool attr_name_char_is_valid(uint_least32_t cp)
+{
+	if (is_control(cp) || is_non_char(cp))
+		return false;
+	switch (cp) {
+	case SPACE:
+	case QUOTATION_MARK:
+	case APOSTROPHE:
+	case GREATER_THAN_SIGN:
+	case SOLIDUS:
+	case EQUALS_SIGN:
+		return false;
+	default:
+		return true;
+	}
+}
+
+/*
+ * True when cp may appear in an unquoted attribute value.
+ * Only '=', '<', '>' and '`' are rejected here; the remaining invalid
+ * characters are already dealt with before this function is called.
+ */
+static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp)
+{
+	switch (cp) {
+	case EQUALS_SIGN:
+	case LESS_THAN_SIGN:
+	case GREATER_THAN_SIGN:
+	case GRAVE_ACCENT:
+		return false;
+	}
+	return true;
+}
+
+/* Read the digits at text+offset up to ';' in the given base; returns the code point as a new UTF-8 string, bytes read in *new_offset. */
+char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base)
+{
+	size_t old_offset = offset;
+	size_t end = offset + strlen(text+offset); /* an unterminated reference stops here instead of looping forever */
+	char *character = malloc((MAX_CODEPOINT_SIZE+1) * sizeof(char)); /* +1: a 4-byte encoding still needs its terminator */
+	char *numeric_charref = malloc(sizeof(char));
+	numeric_charref[0] = 0;
+	uint_least32_t cp = 0;
+	while (cp != SEMICOLON && offset < end) {
+		size_t ret = grapheme_decode_utf8(text+offset, end-offset, &cp);
+		numeric_charref = string_concat(numeric_charref, cp_to_string(cp, ret));
+		offset += ret;
+	}
+	*new_offset = offset - old_offset;
+	size_t written = grapheme_encode_utf8((uint_least32_t)strtol(numeric_charref, NULL, base), character, MAX_CODEPOINT_SIZE);
+	character[written] = 0;
+	free(numeric_charref);
+	return character;
+}
+
+/*
+ * Collect the characters of a named character reference starting at
+ * text+offset into a newly allocated string. Collection stops before '&',
+ * ASCII whitespace or the delimiter implied by avs, and after ';' or once
+ * LONGEST_NAMED_CHAR_REF+1 characters were taken. len is unused.
+ */
+char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs)
+{
+	/* Character that ends the surrounding attribute value; only consulted when avs is not AVS_NO. */
+	uint_least32_t stop_at = 0;
+	if (avs == AVS_QUOTATION_MARK)
+		stop_at = QUOTATION_MARK;
+	else if (avs == AVS_APOSTROPHE)
+		stop_at = APOSTROPHE;
+	else if (avs == AVS_UNQUOTED)
+		stop_at = GREATER_THAN_SIGN;
+
+	char *named_charref = malloc(sizeof(char));
+	named_charref[0] = 0;
+	uint_least32_t cp;
+
+	for (int taken = 0; ; taken++) {
+		size_t ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp);
+		bool at_delimiter = cp == AMPERSAND || ascii_is_whitespace(cp) || (avs > AVS_NO && cp == stop_at);
+		if (at_delimiter)
+			break;
+		named_charref = string_concat(named_charref, cp_to_string(cp, ret));
+		/* The terminating ';' is kept in the result. */
+		if (cp == SEMICOLON || taken >= LONGEST_NAMED_CHAR_REF)
+			break;
+		offset += ret;
+	}
+
+	return named_charref;
+}
+
+/*
+ * Translate the text that followed '&' into its replacement string.
+ * The first entry of entities for which string_starts_with(name, entry name)
+ * holds wins: its one or two code points are UTF-8 encoded and whatever
+ * follows the entry name in name is appended, except that a lone trailing ';'
+ * is dropped. Without a match "&" followed by name is returned (new memory).
+ */
+char *charref_named_encode(const char *name)
+{
+	char *buf = malloc(2*MAX_CODEPOINT_SIZE+1);
+	for (int i=0; i<NAMED_CHAR_REF_COUNT; i++) {
+		if (!string_starts_with(name, entities[i].name))
+			continue;
+		/* Encode straight into buf and track the length: a 4-byte result in a
+		   4-byte scratch array has no terminator for strcpy()/strcat() to find. */
+		size_t len = grapheme_encode_utf8(entities[i].cp[0], buf, MAX_CODEPOINT_SIZE);
+		if (entities[i].cp[1] != 0)
+			len += grapheme_encode_utf8(entities[i].cp[1], buf+len, MAX_CODEPOINT_SIZE);
+		buf[len] = 0;
+		const char *part = name + strlen(entities[i].name);
+		size_t part_len = strlen(part);
+		if (part_len == 0 || (part_len == 1 && part[0] == ';'))
+			return buf;
+		buf = realloc(buf, len+part_len+1);
+		memcpy(buf+len, part, part_len+1);
+		return buf;
+	}
+	buf = realloc(buf, (strlen(name)+2) * sizeof(char));
+	buf[0] = '&';
+	buf[1] = 0;
+	strcat(buf, name);
+	return buf;
+}
+
+/* True for the ASCII digits '0' (0x30) through '9' (0x39). */
+static inline bool ascii_is_digit(uint_least32_t cp)
+{
+	const uint_least32_t first = 0x30, last = 0x39;
+	return first <= cp && cp <= last;
+}
+
+/* True for the ASCII capital letters 'A' (0x41) through 'Z' (0x5A). */
+static inline bool ascii_alpha_is_upper(uint_least32_t cp)
+{
+	const uint_least32_t first = 0x41, last = 0x5A;
+	return first <= cp && cp <= last;
+}
+
+/* True for the ASCII small letters 'a' (0x61) through 'z' (0x7A). */
+static inline bool ascii_alpha_is_lower(uint_least32_t cp)
+{
+	const uint_least32_t first = 0x61, last = 0x7A;
+	return first <= cp && cp <= last;
+}
+
+/* True for ASCII letters of either case. */
+static inline bool ascii_is_alpha(uint_least32_t cp)
+{
+	bool is_lower = ascii_alpha_is_lower(cp);
+	return is_lower || ascii_alpha_is_upper(cp);
+}
+
+/*
+ * True when cp is one of TAB, LF, FF, CR or SPACE.
+ */
+static inline bool ascii_is_whitespace(uint_least32_t cp)
+{
+	static const uint_least32_t blanks[] = { TAB, LF, FF, CR, SPACE };
+	for (size_t i = 0; i < sizeof(blanks) / sizeof(blanks[0]); i++) {
+		if (cp == blanks[i])
+			return true;
+	}
+	return false;
+}
+
+/* True for the range 0x00..0x1F; cp is unsigned, so only the upper bound needs a test. */
+static inline bool is_c0_control(uint_least32_t cp)
+{
+	const uint_least32_t last_c0 = 0x1F;
+	return cp <= last_c0;
+}
+
+/* True when is_c0_control() accepts cp or when cp lies in the range
+   0x7F..0x9F. */
+static inline bool is_control(uint_least32_t cp)
+{
+	if (cp >= 0x7F && cp <= 0x9F)
+		return true; /* DELETE and the C1 range */
+	return is_c0_control(cp);
+}
+
+/*
+ * Report whether cp is one of the code points this parser treats as a
+ * noncharacter:
+ *   - the block 0xFDD0..0xFDEF, and
+ *   - the last two code points of every plane from 0 to 16, i.e.
+ *     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, ... 0x10FFFE, 0x10FFFF.
+ * Everything else, including values above 0x10FFFF, yields false.
+ */
+static inline bool is_non_char(uint_least32_t cp)
+{
+	const uint_least32_t block_first = 0xFDD0;
+	const uint_least32_t block_last = 0xFDEF;
+	const uint_least32_t highest_plane = 0x10;
+
+	if (cp >= block_first && cp <= block_last)
+		return true;
+
+	/* Split cp into its plane number and its position inside that plane. */
+	uint_least32_t plane = cp >> 16;
+	uint_least32_t position = cp & 0xFFFF;
+
+	/* Planes past 16 were never part of the list. */
+	if (plane > highest_plane)
+		return false;
+	return position == 0xFFFE || position == 0xFFFF;
+}
+
+const char *state_to_string(enum State state)
+{
+	static const char *const names[] = { /* indexed by enum State value; anything outside the table yields "" */
+		[STATE_INNER_TEXT] = "STATE_INNER_TEXT",
+		[STATE_TAG] = "STATE_TAG",
+		[STATE_BEGIN_TAG_NAME] = "STATE_BEGIN_TAG_NAME",
+		[STATE_END_TAG_NAME] = "STATE_END_TAG_NAME",
+		[STATE_ATTR_NAME] = "STATE_ATTR_NAME",
+		[STATE_ATTR_VALUE] = "STATE_ATTR_VALUE",
+		[STATE_COMMENT] = "STATE_COMMENT",
+		[STATE_SCRIPT] = "STATE_SCRIPT",
+		[STATE_SCRIPT_POSSIBLE_END_TAG] = "STATE_SCRIPT_POSSIBLE_END_TAG",
+		[STATE_SCRIPT_END_TAG] = "STATE_SCRIPT_END_TAG",
+		[STATE_STYLE] = "STATE_STYLE",
+		[STATE_STYLE_POSSIBLE_END_TAG] = "STATE_STYLE_POSSIBLE_END_TAG",
+		[STATE_STYLE_END_TAG] = "STATE_STYLE_END_TAG",
+		[STATE_CHAR_REF] = "STATE_CHAR_REF",
+		[STATE_CHAR_REF_NUMERIC] = "STATE_CHAR_REF_NUMERIC",
+	};
+	return (size_t)state < sizeof(names) / sizeof(names[0]) ? names[state] : "";
+}
diff --git a/src/html.h b/src/html.h
@@ -0,0 +1,124 @@
+#define LESS_THAN_SIGN 0x3C
+#define GREATER_THAN_SIGN 0x3E
+#define EQUALS_SIGN 0x3D
+#define TAB 0x09
+#define LF 0x0A
+#define FF 0x0C
+#define CR 0x0D
+#define SPACE 0x20
+#define SOLIDUS 0x2F
+#define EXCLAMATION_MARK 0x21
+#define QUOTATION_MARK 0x22
+#define NUMBER_SIGN 0x23
+#define AMPERSAND 0x26
+#define APOSTROPHE 0x27
+#define GRAVE_ACCENT 0x60
+#define HYPHEN_MINUS 0x2D
+#define SEMICOLON 0x3B
+#define SMALL_LETTER_X 0x78
+#define CAPITAL_LETTER_X 0x58
+
+#define NAMED_CHAR_REF_COUNT 2231
+#define LONGEST_NAMED_CHAR_REF 32
+#define MAX_CODEPOINT_SIZE 4
+
+static const char *void_elements[] = { // tag names compared against in tag_is_void_element()
+ "area", "base", "br", "col", "embed", "hr", "img",
+ "input", "link", "meta", "source", "track", "wbr"
+};
+
+struct Attr { // one attribute of a tag; both strings are heap-allocated by attr_init()
+ char *name;
+ char *value; // optional
+};
+
+struct Tag { // one parsed element; created by tag_init(), released by tag_free()
+ char *name; // tag name as collected in STATE_BEGIN_TAG_NAME ("" for the root created by the first tag_parse() call)
+ struct Attr **attrs; // attrs_len pointers, NULL while there are none
+ struct Tag **children; // children_len pointers, NULL while there are none
+ char *inner_text; // text collected in STATE_INNER_TEXT, with character references already replaced
+ size_t attrs_len;
+ size_t children_len;
+ bool _is_void_element; // means there is no closing tag
+ bool _is_closed; // set by tag_close_last_unclosed() when the matching end tag is seen
+ size_t _outer_html_begin_offset; // set to offset-1 on entry to tag_parse()
+ size_t _outer_html_end_offset; // one past the '>' that ends the element
+ size_t _inner_html_begin_offset; // one past the '>' of the opening tag
+ size_t _inner_html_end_offset; // position of the '<' of the end tag
+};
+
+struct TagList { // growable array of tag pointers; tag_list_free() releases the array but not the tags
+ struct Tag **tags;
+ size_t len; // number of entries in tags
+};
+
+enum State { // states of the parser loop in tag_parse()
+ STATE_INNER_TEXT, // between tags; also the state tag_parse() is started in for the root
+ STATE_TAG, // directly after '<'
+ STATE_BEGIN_TAG_NAME, // collecting the name of an opening tag
+ STATE_END_TAG_NAME, // collecting the name after "</"
+ STATE_ATTR_NAME,
+ STATE_ATTR_VALUE, // after '=' of an attribute
+ STATE_COMMENT, // after "<!"; left at '>' preceded by at least two '-'
+ STATE_SCRIPT, // content of a tag named "script"
+ STATE_SCRIPT_POSSIBLE_END_TAG, // '<' seen inside script content
+ STATE_SCRIPT_END_TAG, // "</" seen inside script content
+ STATE_STYLE, // content of a tag named "style"
+ STATE_STYLE_POSSIBLE_END_TAG, // '<' seen inside style content
+ STATE_STYLE_END_TAG, // "</" seen inside style content
+ STATE_CHAR_REF, // directly after '&'
+ STATE_CHAR_REF_NUMERIC // directly after "&#"
+};
+
+enum DoctypeState { // states of the loop in tag_doctype_parse()
+ DSTATE_TEXT, // initial state; re-entered once the word "doctype" was recognised
+ DSTATE_POSSIBLE_DTYPE, // directly after '<'
+ DSTATE_DTYPE_OR_COMMENT, // directly after "<!"
+ DSTATE_DTYPE // collecting the word that follows "<!"
+};
+
+enum AttrValueSyntax { // how the attribute value currently being read is delimited
+ AVS_NO, // no value has been started
+ AVS_QUOTATION_MARK, // value opened with '"'
+ AVS_APOSTROPHE, // value opened with '\''
+ AVS_UNQUOTED // value started without a quote
+};
+
+void html_filter(char *text, struct FindOpts *opts);
+void html_print(struct Tag *tag, int indent);
+
+struct Tag *tag_init(void);
+struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state);
+struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset);
+struct Tag *tag_get_last_open(struct TagList *tag_list);
+int tag_doctype_parse(const char *text);
+char *tag_get_outer_html(struct Tag *tag, char *text);
+char *tag_get_inner_html(struct Tag *tag, char *text);
+enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset);
+static inline bool tag_is_void_element(struct Tag *tag);
+void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset);
+void tag_free(struct Tag *tag);
+void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags);
+void tag_print_find_result(struct Tag *root_tag, struct FindOpts *opts, struct TagList *found_tags, char *text);
+
+struct TagList *tag_list_init(void);
+void tag_list_free(struct TagList *tag_list);
+
+struct Attr *attr_init(void);
+static inline bool attr_name_char_is_valid(uint_least32_t cp);
+static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp);
+
+char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base);
+char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs);
+char *charref_named_encode(const char *name);
+
+static inline bool ascii_is_digit(uint_least32_t cp);
+static inline bool ascii_alpha_is_upper(uint_least32_t cp);
+static inline bool ascii_alpha_is_lower(uint_least32_t cp);
+static inline bool ascii_is_alpha(uint_least32_t cp);
+static inline bool ascii_is_whitespace(uint_least32_t cp);
+static inline bool is_c0_control(uint_least32_t cp);
+static inline bool is_control(uint_least32_t cp);
+static inline bool is_non_char(uint_least32_t cp);
+
+const char *state_to_string(enum State s);
diff --git a/misc.c b/src/misc.c
diff --git a/todo b/todo
@@ -1,2 +1,3 @@
replace int,size_t with uint*
handle correctly when no search pattern was provided
+implement charref also for outerhtml,innerhtml