commit 572566e1c67d1d3e41dc16dbce8b1cec56107ff9
parent 3fae4d3606f7663d42ae792a72bc3e86a513b21e
Author: Robin <kroekerrobin@gmail.com>
Date: Sat, 6 Apr 2024 12:55:35 +0200
Restructure and make html.c more standalone
Diffstat:
| M | Makefile | | | 4 | ++-- |
| M | htex.1 | | | 2 | +- |
| M | htex.c | | | 154 | +++++++------------------------------------------------------------------------ |
| D | htex.h | | | 20 | -------------------- |
| M | src/html.c | | | 1563 | ++++++++++++++++++++++++++++++++++++++++++++----------------------------------- |
| M | src/html.h | | | 69 | +++++++++++++++++++++++++++++---------------------------------------- |
| M | src/misc.c | | | 8 | ++++++++ |
| A | src/misc.h | | | 6 | ++++++ |
| M | todo | | | 4 | ++-- |
9 files changed, 927 insertions(+), 903 deletions(-)
diff --git a/Makefile b/Makefile
@@ -2,9 +2,9 @@ PREFIX = /usr/local
MANPREFIX = $(PREFIX)/share/man
all:
- $(CC) -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme
+ $(CC) -O -pedantic -Werror -Wall -o htex src/misc.c src/html.c htex.c -lgrapheme
debug:
- $(CC) -fsanitize=address -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme
+ $(CC) -fsanitize=address -O -pedantic -Werror -Wall -o htex src/misc.c src/html.c htex.c -lgrapheme
clean:
rm htex
install: all
diff --git a/htex.1 b/htex.1
@@ -35,7 +35,7 @@ Prints everything except the found html tags' outerHTML.
\fB\,-l\/\fR, \fB\,--limit\/\fR \fI\,NUM\/\fR
Find maximum \fI\,NUM\/\fR html tags.
.SH INNER_TEXT
-Still in progress.
+Coming soon.
.SH EXAMPLES
.sp
.RS 4
diff --git a/htex.c b/htex.c
@@ -5,142 +5,8 @@
#include <getopt.h>
#include <inttypes.h>
#include <grapheme.h>
-#include "htex.h"
-#include "src/misc.c"
-#include "src/html.c"
-
-bool find_opts_exist(struct FindOpts *opts)
-{
- if (strlen(opts->tag) > 0)
- return true;
- if (strlen(opts->attr) > 0)
- return true;
- if (strlen(opts->key) > 0)
- return true;
- return false;
-}
-
-struct FindOpts *find_opts_parse(const char *pattern)
-{
- struct FindOpts *opts = malloc(sizeof(struct FindOpts));
- opts->out = OUT_OUTER_HTML;
- opts->tag = malloc(sizeof(char));
- opts->tag[0] = 0;
- opts->attr = malloc(sizeof(char));
- opts->attr[0] = 0;
- opts->key = malloc(sizeof(char));
- opts->key[0] = 0;
- bool is_class_value = false;
- bool is_id_value = false;
- int i = 0;
- bool is_attr_key = false;
- bool is_attr_or_tag = true;
- char *attr_or_tag = NULL;
- int aot = 0;
- int ak = 0;
- int av = 0;
- switch (pattern[0]) {
- case '.':
- is_class_value = true;
- i = 1;
- break;
- case '#':
- is_id_value = true;
- i = 1;
- break;
- }
- for (; i<strlen(pattern); i++) {
- if (pattern[i] == ']')
- break;
- if (
- !is_attr_key &&
- !is_attr_or_tag &&
- pattern[i] != ']' &&
- pattern[i] != '"'
- ) {
- opts->attr = realloc(opts->attr, (av+1) * sizeof(char));
- opts->attr[av] = pattern[i];
- av++;
- }
- if (pattern[i] == '=')
- is_attr_key = false;
- if (is_attr_key && !is_attr_or_tag) {
- opts->key = realloc(opts->key, (ak+1) * sizeof(char));
- opts->key[ak] = pattern[i];
- ak++;
- }
- if (pattern[i] == '[') {
- is_attr_key = true;
- is_attr_or_tag = false;
- }
- if (is_attr_or_tag) {
- attr_or_tag = realloc(attr_or_tag, (aot+1) * sizeof(char));
- attr_or_tag[aot] = pattern[i];
- aot++;
- }
- }
- attr_or_tag = realloc(attr_or_tag, (aot+1) * sizeof(char));
- attr_or_tag[aot] = 0;
- if (is_id_value) {
- free(opts->key);
- opts->key = NULL;
- free(opts->attr);
- opts->attr = NULL;
- opts->attr = attr_or_tag;
- opts->key = realloc(opts->key, 3 * sizeof(char));
- opts->key[0] = 'i';
- opts->key[1] = 'd';
- opts->key[2] = 0;
- } else if (is_class_value) {
- free(opts->key);
- opts->key = NULL;
- free(opts->attr);
- opts->attr = NULL;
- opts->attr = attr_or_tag;
- opts->key = realloc(opts->key, 6 * sizeof(char));
- opts->key[0] = 'c';
- opts->key[1] = 'l';
- opts->key[2] = 'a';
- opts->key[3] = 's';
- opts->key[4] = 's';
- opts->key[5] = 0;
- } else {
- free(opts->tag);
- opts->tag = attr_or_tag;
- if (av > 0) {
- opts->attr = realloc(opts->attr, (av+1) * sizeof(char));
- opts->attr[av] = 0;
- }
- if (ak > 0) {
- opts->key = realloc(opts->key, (ak+1) * sizeof(char));
- opts->key[ak] = 0;
- }
- }
- return opts;
-}
-
-void find_opts_free(struct FindOpts *opts)
-{
- free(opts->tag);
- free(opts->attr);
- free(opts->key);
- free(opts);
-}
-
-enum OutType output_type_parse(const char *type)
-{
- if (type == NULL)
- return OUT_OUTER_HTML;
- if (strcmp(type, "outerhtml") == 0)
- return OUT_OUTER_HTML;
- if (strcmp(type, "innerhtml") == 0)
- return OUT_INNER_HTML;
- if (strcmp(type, "innertext") == 0)
- return OUT_INNER_TEXT;
- if (strcmp(type, "attr_value") == 0)
- return OUT_ATTR_VALUE;
- return -1;
-}
+#include "src/misc.h"
+#include "src/html.h"
int main(int argc, char *argv[])
{
@@ -214,13 +80,17 @@ int main(int argc, char *argv[])
return 0;
}
}
- struct FindOpts *options = find_opts_parse(search_pattern);
- options->out = out;
- options->is_except = is_except;
- options->limit = limit;
- html_filter(text, options);
+ struct FindOpts *opts = find_opts_parse(search_pattern);
+ opts->out = out;
+ opts->is_except = is_except;
+ opts->limit = limit;
+ struct HTMLDocument *document = html_document_parse(text);
+ struct TagList *found_tags = html_document_find(document, opts);
+ html_document_print_find_result(document, found_tags, opts);
+ html_document_free(document);
+ tag_list_free(found_tags);
+ find_opts_free(opts);
free(output);
- find_opts_free(options);
free(text);
return 0;
}
diff --git a/htex.h b/htex.h
@@ -1,20 +0,0 @@
-enum OutType {
- OUT_INNER_HTML,
- OUT_OUTER_HTML,
- OUT_INNER_TEXT,
- OUT_ATTR_VALUE
-};
-
-struct FindOpts {
- char *tag;
- char *attr;
- char *key;
- enum OutType out;
- bool is_except;
- int limit;
-};
-
-bool find_opts_exist(struct FindOpts *opts);
-struct FindOpts *find_opts_parse(const char *pattern);
-void find_opts_free(struct FindOpts *opts);
-enum OutType output_type_parse(const char *type);
diff --git a/src/html.c b/src/html.c
@@ -1,359 +1,438 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <grapheme.h>
#include "html.h"
#include "entities.h"
+#include "misc.h"
-void html_filter(char *text, struct FindOpts *opts)
+static const char *void_elements[] = { /* names that tag_is_void_element() compares against tag->name */
+ "area", "base", "br", "col", "embed", "hr", "img",
+ "input", "link", "meta", "source", "track", "wbr"
+};
+
+/* Only needed for debugging */
+/* static const char *state_to_string(enum State state)
{
- struct TagList *tag_list = tag_list_init();
- struct TagList *found_tags = tag_list_init();
- int len = tag_doctype_parse(text);
- if (len == -1) {
- fprintf(stderr, "htex: Error parsing <!DOCTYPE ....\n");
- goto CLEAN;
- } else {
- text += len;
- }
- struct Tag *root_tag = tag_parse(tag_list, text, 0, STATE_INNER_TEXT);
- if (!find_opts_exist(opts)) {
- found_tags->tags = realloc(found_tags->tags, sizeof(struct Tag));
- found_tags->tags[0] = root_tag->children[0];
- found_tags->len = 1;
+ switch(state) {
+ case STATE_INNER_TEXT: return "STATE_INNER_TEXT";
+ case STATE_TAG: return "STATE_TAG";
+ case STATE_BEGIN_TAG_NAME: return "STATE_BEGIN_TAG_NAME";
+ case STATE_END_TAG_NAME: return "STATE_END_TAG_NAME";
+ case STATE_ATTR_NAME: return "STATE_ATTR_NAME";
+ case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE";
+ case STATE_COMMENT: return "STATE_COMMENT";
+ case STATE_SCRIPT: return "STATE_SCRIPT";
+ case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG";
+ case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG";
+ case STATE_STYLE: return "STATE_STYLE";
+ case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG";
+ case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG";
+ case STATE_CHAR_REF: return "STATE_CHAR_REF";
+ case STATE_CHAR_REF_NUMERIC: return "STATE_CHAR_REF_NUMERIC";
+ }
+ return "";
+} */
+
+static inline bool tag_is_void_element(struct Tag *tag) /*
+ * Report whether tag->name exactly matches (strcmp) one of the entries
+ * of void_elements.
+ * The loop bound is computed from the array itself so that adding or
+ * removing an entry cannot leave a stale hard-coded count behind.
+ */
+{
+ const size_t count = sizeof(void_elements) / sizeof(void_elements[0]);
+ for (size_t i=0; i<count; i++) {
+ if (strcmp(tag->name, void_elements[i]) == 0)
+ return true;
+ }
+ return false;
+}
+
+static inline bool is_c0_control(uint_least32_t cp) /* true when cp lies in 0x00..0x1F */
+{
+ if (cp >= 0x00 && cp <= 0x1F) /* cp is unsigned, so only the upper bound can ever fail */
+ return true;
+ return false;
+}
+
+static inline bool is_control(uint_least32_t cp) /* true for is_c0_control() code points and for 0x7F..0x9F */
+{
+ if (is_c0_control(cp))
+ return true;
+ if (cp >= 0x7F && cp <= 0x9F)
+ return true;
+ return false;
+}
+
+static inline bool is_non_char(uint_least32_t cp) /*
+ * True when cp is in 0xFDD0..0xFDEF or is one of the listed code points
+ * ending in FFFE or FFFF (0xFFFE/0xFFFF up to 0x10FFFE/0x10FFFF).
+ */
+{
+ if (cp >= 0xFDD0 && cp <= 0xFDEF)
+ return true;
+ if (
+ cp == 0xFFFE || cp == 0xFFFF ||
+ cp == 0x1FFFE || cp == 0x1FFFF ||
+ cp == 0x2FFFE || cp == 0x2FFFF ||
+ cp == 0x3FFFE || cp == 0x3FFFF ||
+ cp == 0x4FFFE || cp == 0x4FFFF ||
+ cp == 0x5FFFE || cp == 0x5FFFF ||
+ cp == 0x6FFFE || cp == 0x6FFFF ||
+ cp == 0x7FFFE || cp == 0x7FFFF ||
+ cp == 0x8FFFE || cp == 0x8FFFF ||
+ cp == 0x9FFFE || cp == 0x9FFFF ||
+ cp == 0xAFFFE || cp == 0xAFFFF ||
+ cp == 0xBFFFE || cp == 0xBFFFF ||
+ cp == 0xCFFFE || cp == 0xCFFFF ||
+ cp == 0xDFFFE || cp == 0xDFFFF ||
+ cp == 0xEFFFE || cp == 0xEFFFF ||
+ cp == 0xFFFFE || cp == 0xFFFFF ||
+ cp == 0x10FFFE || cp == 0x10FFFF
+ )
+ return true;
+ return false;
+}
+
+static inline bool attr_name_char_is_valid(uint_least32_t cp) /*
+ * False when cp is rejected by is_control() or is_non_char(), or is one
+ * of SPACE, QUOTATION_MARK, APOSTROPHE, GREATER_THAN_SIGN, SOLIDUS or
+ * EQUALS_SIGN; true for every other code point.
+ */
+{
+ if (is_control(cp))
+ return false;
+ if (is_non_char(cp))
+ return false;
+ if (
+ cp == SPACE ||
+ cp == QUOTATION_MARK ||
+ cp == APOSTROPHE ||
+ cp == GREATER_THAN_SIGN ||
+ cp == SOLIDUS ||
+ cp == EQUALS_SIGN
+ )
+ return false;
+ return true;
+}
+
+static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp) /* false for '=', '<', '>' and '`' (by constant name); true otherwise */
+{
+ /*
+ Other invalid characters are not checked here;
+ they are already handled before this function
+ is called.
+ */
+ if (
+ cp == EQUALS_SIGN ||
+ cp == LESS_THAN_SIGN ||
+ cp == GREATER_THAN_SIGN ||
+ cp == GRAVE_ACCENT
+ )
+ return false;
+ return true;
+}
+
+static inline bool ascii_is_digit(uint_least32_t cp) /* true when cp lies in 0x30..0x39 ('0'..'9') */
+{
+ if (cp >= 0x30 && cp <= 0x39)
+ return true;
+ return false;
+}
+
+static inline bool ascii_alpha_is_upper(uint_least32_t cp) /* true when cp lies in 0x41..0x5A ('A'..'Z') */
+{
+ if (cp >= 0x41 && cp <= 0x5A)
+ return true;
+ return false;
+}
+
+static inline bool ascii_alpha_is_lower(uint_least32_t cp) /* true when cp lies in 0x61..0x7A ('a'..'z') */
+{
+ if (cp >= 0x61 && cp <= 0x7A)
+ return true;
+ return false;
+}
+
+static inline bool ascii_is_alpha(uint_least32_t cp) /* true when cp passes ascii_alpha_is_lower() or ascii_alpha_is_upper() */
+{
+ if (ascii_alpha_is_lower(cp) || ascii_alpha_is_upper(cp))
+ return true;
+ return false;
+}
+
+static inline bool ascii_is_whitespace(uint_least32_t cp) /* true when cp equals one of TAB, LF, FF, CR or SPACE */
+{
+ if (
+ cp == TAB ||
+ cp == LF ||
+ cp == FF ||
+ cp == CR ||
+ cp == SPACE
+ )
+ return true;
+ return false;
+}
+
+static bool find_opts_exist(struct FindOpts *opts) /*
+ * True when at least one of opts->tag, opts->attr or opts->key is a
+ * non-empty string. All three are passed to strlen(), so none may be NULL.
+ */
+{
+ if (strlen(opts->tag) > 0)
+ return true;
+ if (strlen(opts->attr) > 0)
+ return true;
+ if (strlen(opts->key) > 0)
+ return true;
+ return false;
+}
+
+struct FindOpts *find_opts_parse(const char *pattern) /*
+ * Build a heap-allocated FindOpts from a search pattern.
+ *
+ * - A leading '.' sets key to "class"; a leading '#' sets key to "id".
+ *   In both cases the text that follows (up to the first '[' or ']',
+ *   or the end of the pattern) becomes attr, and tag stays "".
+ * - Otherwise that leading text becomes tag; after a '[' the characters
+ *   up to '=' are collected into key and the characters after '=' into
+ *   attr, with '"' characters dropped.
+ * - Scanning stops at the first ']'.
+ *
+ * out is set to OUT_OUTER_HTML. No other FindOpts field besides out,
+ * tag, attr and key is assigned here. Allocation results are not
+ * checked. The result is released with find_opts_free().
+ */
+{
+ struct FindOpts *opts = malloc(sizeof(struct FindOpts));
+ opts->out = OUT_OUTER_HTML;
+ opts->tag = malloc(sizeof(char));
+ opts->tag[0] = 0;
+ opts->attr = malloc(sizeof(char));
+ opts->attr[0] = 0;
+ opts->key = malloc(sizeof(char));
+ opts->key[0] = 0;
+ bool is_class_value = false;
+ bool is_id_value = false;
+ int i = 0;
+ bool is_attr_key = false; /* true while scanning between '[' and '=' */
+ bool is_attr_or_tag = true; /* true until the first '[' is seen */
+ char *attr_or_tag = NULL; /* text collected before the first '[' */
+ int aot = 0; /* number of chars stored in attr_or_tag */
+ int ak = 0; /* number of chars stored in opts->key */
+ int av = 0; /* number of chars stored in opts->attr */
+ switch (pattern[0]) {
+ case '.':
+ is_class_value = true;
+ i = 1; /* skip the '.' prefix */
+ break;
+ case '#':
+ is_id_value = true;
+ i = 1; /* skip the '#' prefix */
+ break;
+ }
+ for (; i<strlen(pattern); i++) {
+ if (pattern[i] == ']')
+ break;
+ if ( /* value part: is_attr_key is still true on the '=' itself, so '=' is not stored */
+ !is_attr_key &&
+ !is_attr_or_tag &&
+ pattern[i] != ']' &&
+ pattern[i] != '"'
+ ) {
+ opts->attr = realloc(opts->attr, (av+1) * sizeof(char));
+ opts->attr[av] = pattern[i];
+ av++;
+ }
+ if (pattern[i] == '=')
+ is_attr_key = false;
+ if (is_attr_key && !is_attr_or_tag) { /* key part; the '[' is excluded because the flags flip only below */
+ opts->key = realloc(opts->key, (ak+1) * sizeof(char));
+ opts->key[ak] = pattern[i];
+ ak++;
+ }
+ if (pattern[i] == '[') {
+ is_attr_key = true;
+ is_attr_or_tag = false;
+ }
+ if (is_attr_or_tag) {
+ attr_or_tag = realloc(attr_or_tag, (aot+1) * sizeof(char));
+ attr_or_tag[aot] = pattern[i];
+ aot++;
+ }
+ }
+ attr_or_tag = realloc(attr_or_tag, (aot+1) * sizeof(char));
+ attr_or_tag[aot] = 0;
+ if (is_id_value) { /* "#value": any key/attr collected from a [...] part is discarded */
+ free(opts->key);
+ opts->key = NULL;
+ free(opts->attr);
+ opts->attr = NULL;
+ opts->attr = attr_or_tag;
+ opts->key = realloc(opts->key, 3 * sizeof(char));
+ opts->key[0] = 'i';
+ opts->key[1] = 'd';
+ opts->key[2] = 0;
+ } else if (is_class_value) { /* ".value": same handling with key "class" */
+ free(opts->key);
+ opts->key = NULL;
+ free(opts->attr);
+ opts->attr = NULL;
+ opts->attr = attr_or_tag;
+ opts->key = realloc(opts->key, 6 * sizeof(char));
+ opts->key[0] = 'c';
+ opts->key[1] = 'l';
+ opts->key[2] = 'a';
+ opts->key[3] = 's';
+ opts->key[4] = 's';
+ opts->key[5] = 0;
} else {
- tag_find(root_tag, opts, found_tags);
- }
- tag_print_find_result(root_tag, opts, found_tags, text);
- // html_print(root_tag, -1);
- tag_free(root_tag);
-CLEAN:
- tag_list_free(tag_list);
- tag_list_free(found_tags);
+ free(opts->tag);
+ opts->tag = attr_or_tag;
+ if (av > 0) { /* terminate attr only if characters were stored; otherwise it is still "" */
+ opts->attr = realloc(opts->attr, (av+1) * sizeof(char));
+ opts->attr[av] = 0;
+ }
+ if (ak > 0) { /* likewise for key */
+ opts->key = realloc(opts->key, (ak+1) * sizeof(char));
+ opts->key[ak] = 0;
+ }
+ }
+ return opts;
}
-void html_print(struct Tag *tag, int indent)
+void find_opts_free(struct FindOpts *opts) /* frees opts->tag, opts->attr and opts->key, then opts itself */
{
- for (int i=0; i<indent; i++)
- putchar(' ');
- printf("%s", tag->name);
- for (int i=0; i<tag->attrs_len; i++)
- printf(" %s=%s", tag->attrs[i]->name, tag->attrs[i]->value);
- printf("\n");
- indent++;
- for (int i=tag->children_len-1; i>-1; i--)
- html_print(tag->children[i], indent);
+ free(opts->tag);
+ free(opts->attr);
+ free(opts->key);
+ free(opts);
}
-struct Tag *tag_init(void)
+enum OutType output_type_parse(const char *type) /*
+ * Map an output-type name to an OutType value:
+ * NULL or "outerhtml" -> OUT_OUTER_HTML, "innerhtml" -> OUT_INNER_HTML,
+ * "innertext" -> OUT_INNER_TEXT, "attr_value" -> OUT_ATTR_VALUE.
+ * Any other string returns -1; the comparison is case-sensitive (strcmp).
+ */
{
- struct Tag *t = malloc(sizeof(struct Tag));
- t->name = malloc(sizeof(char));
- t->name[0] = 0;
- t->inner_text = malloc(sizeof(char));
- t->inner_text[0] = 0;
- t->attrs = NULL;
- t->children = NULL;
- t->attrs_len = 0;
- t->children_len = 0;
- t->_is_void_element = false;
- t->_is_closed = false;
- t->_outer_html_begin_offset = 0;
- t->_outer_html_end_offset = 0;
- t->_inner_html_begin_offset = 0;
- t->_inner_html_end_offset = 0;
- return t;
+ if (type == NULL)
+ return OUT_OUTER_HTML;
+ if (strcmp(type, "outerhtml") == 0)
+ return OUT_OUTER_HTML;
+ if (strcmp(type, "innerhtml") == 0)
+ return OUT_INNER_HTML;
+ if (strcmp(type, "innertext") == 0)
+ return OUT_INNER_TEXT;
+ if (strcmp(type, "attr_value") == 0)
+ return OUT_ATTR_VALUE;
+ return -1;
}
-struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state)
+static struct Attr *attr_init(void) /* allocates an Attr whose name and value are separately allocated empty strings; allocation results are not checked */
{
- struct Tag *tag = tag_init();
- tag->_outer_html_begin_offset= offset-1;
- tag_list->tags = realloc(tag_list->tags, (tag_list->len+1) * sizeof(struct Tag));
- tag_list->tags[tag_list->len] = tag;
- tag_list->len++;
- struct Tag *still_open_tag = tag;
- char *end_tag = malloc(sizeof(char));
- end_tag[0] = 0;
- enum State return_to_state = STATE_INNER_TEXT;
- size_t a = 0;
- size_t attr_name_count = 0;
- enum AttrValueSyntax avs = AVS_NO;
- size_t hyphen_count = 0;
+ struct Attr *attr = malloc(sizeof(struct Attr));
+ attr->name = malloc(sizeof(char));
+ attr->name[0] = 0;
+ attr->value = malloc(sizeof(char));
+ attr->value[0] = 0;
+ return attr;
+}
+
+static char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base) /*
+ * Read the digits of a numeric character reference starting at
+ * text+offset, up to and including the next ';', convert them with
+ * strtol() in the given base and return the UTF-8 encoding of that code
+ * point as a newly allocated, NUL-terminated string (caller frees).
+ * *new_offset receives the number of bytes consumed from text.
+ *
+ * The result buffer has room for MAX_CODEPOINT_SIZE bytes plus the
+ * terminator, and scanning stops at the end of text when no ';' is
+ * present instead of looping forever on zero-length decodes.
+ *
+ * NOTE(review): everything up to the next ';' is consumed, even
+ * characters that are not digits; strtol() ignores the trailing part.
+ */
+{
+ size_t old_offset = offset;
+ char *character = malloc((MAX_CODEPOINT_SIZE+1) * sizeof(char));
+ char *numeric_charref = malloc(sizeof(char));
+ numeric_charref[0] = 0;
+ size_t ret;
uint_least32_t cp;
- size_t len = strlen(text);
- size_t ret, off;
- for (off = offset; off<len; off += ret) {
- if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) {
- fprintf(stderr, "htex: parseTag.grapheme_decode_utf8 failed.\n");
- } else {
- /* char *the_codepoint = cp_to_string(cp, ret);
- printf("cp: %02X, %s, %s\n", cp, the_codepoint, state_to_string(state));
- free(the_codepoint); */
- switch (state) {
- case STATE_INNER_TEXT:
- if (cp == LESS_THAN_SIGN) {
- state = STATE_TAG;
- break;
- }
- if (cp == AMPERSAND) {
- return_to_state = STATE_INNER_TEXT;
- state = STATE_CHAR_REF;
- break;
- }
- still_open_tag = tag_get_last_open(tag_list);
- still_open_tag->inner_text = string_concat(still_open_tag->inner_text, cp_to_string(cp, ret));
- break;
- case STATE_TAG:
- if (cp == SOLIDUS) {
- state = STATE_END_TAG_NAME;
- break;
- }
- if (cp == EXCLAMATION_MARK) {
- state = STATE_COMMENT;
- break;
- }
- still_open_tag = tag_get_last_open(tag_list);
- struct Tag *one_tag = tag_parse(tag_list, text, off, STATE_BEGIN_TAG_NAME);
- still_open_tag->children = realloc(
- still_open_tag->children,
- (still_open_tag->children_len+1) * sizeof(struct Tag)
- );
- still_open_tag->children[still_open_tag->children_len] = one_tag;
- still_open_tag->children_len++;
- free(end_tag);
- return tag;
- case STATE_BEGIN_TAG_NAME:
- if (cp == GREATER_THAN_SIGN) {
- state = tag_process_end_of_opening_tag(tag, off);
- break;
- }
- if (ascii_is_whitespace(cp)) {
- state = STATE_ATTR_NAME;
- break;
- }
- if (ascii_is_digit(cp) || ascii_is_alpha(cp)) {
- tag->name = string_concat(tag->name, cp_to_string(cp, ret));
- }
- break;
- case STATE_END_TAG_NAME:
- if (cp == GREATER_THAN_SIGN) {
- struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
- if (closed_tag != NULL)
- tag_set_inner_html_end_offset(closed_tag, text, off);
- free(end_tag);
- end_tag = malloc(sizeof(char));
- end_tag[0] = 0;
- state = STATE_INNER_TEXT;
- break;
- }
- if (!ascii_is_whitespace(cp))
- end_tag = string_concat(end_tag, cp_to_string(cp, ret));
- break;
- case STATE_ATTR_NAME:
- if (cp == GREATER_THAN_SIGN) {
- state = tag_process_end_of_opening_tag(tag, off);
- break;
- }
- if (ascii_is_whitespace(cp)) {
- if (attr_name_count == a+1)
- a++;
- break;
- }
- if (cp == EQUALS_SIGN) {
- state = STATE_ATTR_VALUE;
- break;
- }
- if (attr_name_char_is_valid(cp)) {
- if (attr_name_count != a+1) {
- tag->attrs = realloc(
- tag->attrs,
- (a+1) * sizeof(struct Attr)
- );
- tag->attrs[a] = attr_init();
- attr_name_count = a + 1;
- tag->attrs_len = attr_name_count;
- }
- tag->attrs[a]->name = string_concat(tag->attrs[a]->name, cp_to_string(cp, ret));
- }
- break;
- case STATE_ATTR_VALUE:
- if (ascii_is_whitespace(cp)) {
- if (avs == AVS_UNQUOTED) {
- avs = AVS_NO;
- state = STATE_ATTR_NAME;
- } else if (avs == AVS_QUOTATION_MARK || avs == AVS_APOSTROPHE) {
- if (
- strcmp("id", tag->attrs[a]->name) == 0 ||
- strcmp("class", tag->attrs[a]->name) == 0
- ) {
- char *tmp_name = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char));
- strcpy(tmp_name, tag->attrs[a]->name);
- tag->attrs = realloc(
- tag->attrs,
- (a+1) * sizeof(struct Attr)
- );
- a++;
- tag->attrs[a] = attr_init();
- free(tag->attrs[a]->name);
- tag->attrs[a]->name = tmp_name;
- tag->attrs_len++;
- attr_name_count = a + 1;
- } else {
- tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret));
- }
- }
- break;
- }
- if (cp == QUOTATION_MARK) {
- if (avs == AVS_NO) {
- avs = AVS_QUOTATION_MARK;
- break;
- }
- if (avs == AVS_QUOTATION_MARK) {
- avs = AVS_NO;
- state = STATE_ATTR_NAME;
- break;
- }
- }
- if (cp == APOSTROPHE) {
- if (avs == AVS_NO) {
- avs = AVS_APOSTROPHE;
- break;
- }
- if (avs == AVS_APOSTROPHE) {
- avs = AVS_NO;
- state = STATE_ATTR_NAME;
- break;
- }
- }
- if (cp == GREATER_THAN_SIGN) {
- state = tag_process_end_of_opening_tag(tag, off);
- break;
- }
- if (avs == AVS_NO && attr_value_unquoted_char_is_valid(cp)) {
- avs = AVS_UNQUOTED;
- }
- if (avs > AVS_NO) {
- if (cp == AMPERSAND) {
- state = STATE_CHAR_REF;
- return_to_state = STATE_ATTR_VALUE;
- break;
- }
- tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret));
- }
- break;
- case STATE_COMMENT:
- if (cp == GREATER_THAN_SIGN && hyphen_count >= 2) {
- state = STATE_INNER_TEXT;
- break;
- }
- if (cp == HYPHEN_MINUS)
- hyphen_count++;
- else
- hyphen_count = 0;
- break;
- case STATE_STYLE:
- if (cp == LESS_THAN_SIGN) {
- state = STATE_STYLE_POSSIBLE_END_TAG;
- break;
- }
- break;
- case STATE_STYLE_POSSIBLE_END_TAG:
- if (cp == SOLIDUS)
- state = STATE_STYLE_END_TAG;
- else
- state = STATE_STYLE;
- break;
- case STATE_STYLE_END_TAG:
- if (cp == GREATER_THAN_SIGN) {
- struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
- if (closed_tag != NULL)
- tag_set_inner_html_end_offset(closed_tag, text, off);
- free(end_tag);
- end_tag = malloc(sizeof(char));
- end_tag[0] = 0;
- state = STATE_INNER_TEXT;
- break;
- }
- if (!ascii_is_whitespace(cp))
- end_tag = string_concat(end_tag, cp_to_string(cp, ret));
- break;
- case STATE_SCRIPT:
- if (cp == LESS_THAN_SIGN) {
- state = STATE_SCRIPT_POSSIBLE_END_TAG;
- break;
- }
- break;
- case STATE_SCRIPT_POSSIBLE_END_TAG:
- if (cp == SOLIDUS)
- state = STATE_SCRIPT_END_TAG;
- else
- state = STATE_SCRIPT;
- break;
- case STATE_SCRIPT_END_TAG:
- if (cp == GREATER_THAN_SIGN) {
- struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
- if (closed_tag != NULL)
- tag_set_inner_html_end_offset(closed_tag, text, off);
- free(end_tag);
- end_tag = malloc(sizeof(char));
- end_tag[0] = 0;
- state = STATE_INNER_TEXT;
- break;
- }
- if (!ascii_is_whitespace(cp))
- end_tag = string_concat(end_tag, cp_to_string(cp, ret));
- break;
- case STATE_CHAR_REF:
- if (cp == NUMBER_SIGN) { /* hashtag */
- state = STATE_CHAR_REF_NUMERIC;
- break;
- }
- char *named_charref = charref_named_parse(text, off, len, avs);
- off += strlen(named_charref)-1;
- char *encoded_named_charref = charref_named_encode(named_charref);
- if (return_to_state == STATE_INNER_TEXT) {
- still_open_tag = tag_get_last_open(tag_list);
- still_open_tag->inner_text = string_concat(still_open_tag->inner_text, encoded_named_charref);
- } else if (return_to_state == STATE_ATTR_VALUE) {
- tag->attrs[a]->value = string_concat(tag->attrs[a]->value, encoded_named_charref);
- }
- free(named_charref);
- state = return_to_state;
- break;
- case STATE_CHAR_REF_NUMERIC:
- if (cp == SMALL_LETTER_X || cp == CAPITAL_LETTER_X) {
- size_t new_offset;
- char *numeric_charref = charref_numeric_parse_and_encode(text, off+1, &new_offset, 16);
- off += new_offset;
- if (return_to_state == STATE_INNER_TEXT) {
- still_open_tag = tag_get_last_open(tag_list);
- still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref);
- } else if (return_to_state == STATE_ATTR_VALUE) {
- tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref);
- }
- state = return_to_state;
- break;
- } else if (ascii_is_digit(cp)) {
- size_t new_offset;
- char *numeric_charref = charref_numeric_parse_and_encode(text, off, &new_offset, 10);
- off += new_offset-1;
- if (return_to_state == STATE_INNER_TEXT) {
- still_open_tag = tag_get_last_open(tag_list);
- still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref);
- } else if (return_to_state == STATE_ATTR_VALUE) {
- tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref);
- }
- state = return_to_state;
- break;
- }
- state = return_to_state;
- break;
- }
+ for (;;) {
+ ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp);
+ if (ret == 0) /* end of text reached before a ';' */
+ break;
+ numeric_charref = string_concat(numeric_charref, cp_to_string(cp, ret));
+ offset += ret;
+ if (cp == SEMICOLON)
+ break;
+ }
+ *new_offset = offset - old_offset;
+ long i = strtol(numeric_charref, NULL, base);
+ ret = grapheme_encode_utf8((uint_least32_t)i, character, MAX_CODEPOINT_SIZE);
+ if (ret > MAX_CODEPOINT_SIZE) /* encoding did not fit: return an empty string */
+ ret = 0;
+ character[ret] = 0;
+ free(numeric_charref);
+ return character;
+}
+
+static char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs) /*
+ * Copy the characters of a named character reference starting at
+ * text+offset into a newly allocated string and return it (caller frees).
+ *
+ * Copying stops before '&', before ASCII whitespace and, when avs is
+ * not AVS_NO, before the character that ends the attribute value
+ * ('"', '\'' or '>'). It stops after ';' or once the counter reaches
+ * LONGEST_NAMED_CHAR_REF. It also stops at the end of text, so no
+ * zero-length decodes are appended.
+ *
+ * len is not used by this function.
+ */
+{
+ uint_least32_t stop_at = 0;
+ switch(avs) {
+ case AVS_QUOTATION_MARK:
+ stop_at = QUOTATION_MARK;
+ break;
+ case AVS_APOSTROPHE:
+ stop_at = APOSTROPHE;
+ break;
+ case AVS_UNQUOTED:
+ stop_at = GREATER_THAN_SIGN;
+ break;
+ case AVS_NO: /* Just to silence the compiler warning */
+ break;
+ }
+ char *named_charref = malloc(sizeof(char));
+ named_charref[0] = 0;
+ size_t ret;
+ uint_least32_t cp;
+ int i = 0;
+ for (;;) {
+ ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp);
+ if (ret == 0) /* end of text: nothing left to decode */
+ break;
+ if (cp == AMPERSAND || ascii_is_whitespace(cp))
+ break;
+ if (avs > AVS_NO && cp == stop_at)
+ break;
+ named_charref = string_concat(named_charref, cp_to_string(cp, ret));
+ if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF)
+ break;
+ offset += ret;
+ i++;
+ }
+ return named_charref;
+}
+
+static void charref_named_concat_remaining_string(const char *parsed_string, const char *charref, char **buf) /*
+ * Append to *buf whatever follows the first strlen(charref) bytes of
+ * parsed_string. Nothing is appended when that remainder is empty or is
+ * exactly ";". *buf is grown with realloc(), so it must point to a
+ * NUL-terminated, realloc-able buffer.
+ */
+{
+ const char *remaining = &parsed_string[strlen(charref)];
+ size_t remaining_len = strlen(remaining);
+ size_t buf_len = strlen(*buf);
+ if (remaining_len > 0) {
+ if (remaining_len == 1 && remaining[0] == ';') /* a lone terminating ';' is dropped */
+ return;
+ *buf = realloc(*buf, buf_len+remaining_len+1);
+ strcat(*buf, remaining);
+ }
+}
+
+static char *charref_named_encode(const char *name) /*
+ * Turn a parsed named character reference into UTF-8 text and return it
+ * as a newly allocated string (caller frees).
+ *
+ * The first entry of single_cp_entities, then of double_cp_entities,
+ * whose name is a prefix of `name` is encoded; any text after that
+ * prefix is appended by charref_named_concat_remaining_string().
+ * When nothing matches, the result is "&" followed by name.
+ *
+ * The buffer pointer is never advanced: the second code point of a
+ * two-code-point entity is written at buf+buf_len, so the pointer
+ * passed to realloc() and returned to the caller is always the one
+ * obtained from realloc().
+ *
+ * NOTE(review): the first prefix match wins, so the result depends on
+ * the order of the entity tables - confirm longer names come first.
+ */
+{
+ char *buf = NULL;
+ size_t len;
+ int i;
+ for (i=0; i<2138; i++) {
+ if (string_starts_with(name, single_cp_entities[i].name)) {
+ buf = realloc(buf, MAX_CODEPOINT_SIZE+1);
+ len = grapheme_encode_utf8(single_cp_entities[i].cp, buf, MAX_CODEPOINT_SIZE);
+ buf[len] = 0;
+ charref_named_concat_remaining_string(name, single_cp_entities[i].name, &buf);
+ return buf;
+ }
+ }
+ for (i=0; i<93; i++) {
+ if (string_starts_with(name, double_cp_entities[i].name)) {
+ size_t buf_len = 0;
+ buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1);
+ len = grapheme_encode_utf8(double_cp_entities[i].cp[0], buf, MAX_CODEPOINT_SIZE);
+ buf_len += len;
+ len = grapheme_encode_utf8(double_cp_entities[i].cp[1], buf+buf_len, MAX_CODEPOINT_SIZE);
+ buf_len += len;
+ buf[buf_len] = 0;
+ charref_named_concat_remaining_string(name, double_cp_entities[i].name, &buf);
+ return buf;
}
}
- free(end_tag);
- return tag;
+ buf = realloc(buf, (strlen(name)+2) * sizeof(char));
+ buf[0] = '&';
+ buf[1] = 0;
+ strcat(buf, name);
+ return buf;
+}
+
+static struct Tag *tag_init(void) /*
+ * Allocate a Tag with empty name and inner_text strings, no attrs or
+ * children, both flags false and all four offsets set to 0.
+ * Allocation results are not checked.
+ */
+{
+ struct Tag *tag = malloc(sizeof(struct Tag));
+ tag->name = malloc(sizeof(char));
+ tag->name[0] = 0;
+ tag->inner_text = malloc(sizeof(char));
+ tag->inner_text[0] = 0;
+ tag->attrs = NULL;
+ tag->children = NULL;
+ tag->attrs_len = 0;
+ tag->children_len = 0;
+ tag->_is_void_element = false;
+ tag->_is_closed = false;
+ tag->_outer_html_begin_offset = 0;
+ tag->_outer_html_end_offset = 0;
+ tag->_inner_html_begin_offset = 0;
+ tag->_inner_html_end_offset = 0;
+ return tag;
}
-struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset)
+static struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset)
{
for (int i=tag_list->len-1; i>-1; i--) {
if (strcmp(tag_list->tags[i]->name, end_tag_name) == 0 && !tag_list->tags[i]->_is_closed) {
@@ -365,7 +444,7 @@ struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_ta
return NULL;
}
-struct Tag *tag_get_last_open(struct TagList *tag_list)
+static struct Tag *tag_get_last_open(struct TagList *tag_list)
{
for (int i=tag_list->len-1; i>-1; i--) {
if (!tag_list->tags[i]->_is_void_element && !tag_list->tags[i]->_is_closed) {
@@ -375,70 +454,7 @@ struct Tag *tag_get_last_open(struct TagList *tag_list)
return tag_list->tags[0];
}
-int tag_doctype_parse(const char *text)
-{
- size_t offset = 0;
- enum DoctypeState state = DSTATE_TEXT;
- char *doctype = NULL;
- char *lower_doctype = NULL;
- uint_least32_t cp;
- size_t len = strlen(text);
- size_t ret, off;
- for (off = 0; off<len; off += ret) {
- if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) {
- fprintf(stderr, "htex: parseDoctype.grapheme_decode_utf8 failed.\n");
- } else {
- switch (state) {
- case DSTATE_TEXT:
- if (cp == LESS_THAN_SIGN) {
- state = DSTATE_POSSIBLE_DTYPE;
- break;
- }
- if (cp == GREATER_THAN_SIGN) {
- offset = off;
- goto CLEANUP;
- }
- break;
- case DSTATE_POSSIBLE_DTYPE:
- if (cp == EXCLAMATION_MARK)
- state = DSTATE_DTYPE_OR_COMMENT;
- else
- goto CLEANUP;
- break;
- case DSTATE_DTYPE_OR_COMMENT:
- if (cp == HYPHEN_MINUS) {
- goto CLEANUP;
- } else {
- doctype = string_concat(doctype, cp_to_string(cp, ret));
- state = DSTATE_DTYPE;
- break;
- }
- break;
- case DSTATE_DTYPE:
- if (ascii_is_whitespace(cp)) {
- size_t dlen = strlen(doctype)+1;
- lower_doctype = malloc(dlen * sizeof(char));
- grapheme_to_lowercase_utf8(doctype, dlen, lower_doctype, dlen);
- if (strcmp(lower_doctype, "doctype") == 0) {
- state = DSTATE_TEXT;
- } else {
- offset = -1;
- goto CLEANUP;
- }
- break;
- }
- doctype = string_concat(doctype, cp_to_string(cp, ret));
- break;
- }
- }
- }
-CLEANUP:
- free(doctype);
- free(lower_doctype);
- return offset;
-}
-
-char *tag_get_outer_html(struct Tag *tag, char *text)
+static char *tag_get_outer_html(struct Tag *tag, char *text)
{
char *outer_html = NULL;
int o = 0;
@@ -452,7 +468,7 @@ char *tag_get_outer_html(struct Tag *tag, char *text)
return outer_html;
}
-char *tag_get_inner_html(struct Tag *tag, char *text)
+static char *tag_get_inner_html(struct Tag *tag, char *text)
{
char *inner_html = NULL;
int o = 0;
@@ -466,7 +482,7 @@ char *tag_get_inner_html(struct Tag *tag, char *text)
return inner_html;
}
-enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset)
+static enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset)
{
tag->_inner_html_begin_offset = offset+1;
tag->_is_void_element = tag_is_void_element(tag);
@@ -479,16 +495,7 @@ enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset)
return STATE_INNER_TEXT;
}
-static inline bool tag_is_void_element(struct Tag *tag)
-{
- for (int i=0; i<13; i++) {
- if (strcmp(tag->name, void_elements[i]) == 0)
- return true;
- }
- return false;
-}
-
-void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset)
+static void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset)
{
int i = offset;
while (text[i] != '<')
@@ -496,7 +503,7 @@ void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t of
closed_tag->_inner_html_end_offset = i;
}
-void tag_free(struct Tag *tag)
+static void tag_free(struct Tag *tag)
{
free(tag->name);
free(tag->inner_text);
@@ -514,7 +521,7 @@ void tag_free(struct Tag *tag)
free(tag);
}
-void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags)
+static void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags)
{
if (opts->limit > 0 && found_tags->len == opts->limit)
return;
@@ -571,327 +578,491 @@ void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags
}
}
-void tag_print_find_result(struct Tag *root_tag, struct FindOpts *opts, struct TagList *found_tags, char *text)
+static int tag_doctype_parse(const char *text)
{
- if (opts->is_except) {
- bool is_match = false;
- for (int i=0; i<strlen(text); i++) {
- is_match = false;
- for (int k=0; k<found_tags->len; k++) {
- if (
- found_tags->tags[k]->_outer_html_begin_offset <= i &&
- found_tags->tags[k]->_outer_html_end_offset > i
- )
- is_match = true;
- }
- if (!is_match)
- putchar(text[i]);
- }
- } else {
- char *requested_text = NULL;
- char *trimmed_text = NULL;
- for (int i=0; i<found_tags->len; i++) {
- switch (opts->out) {
- case OUT_INNER_HTML:
- requested_text = tag_get_inner_html(found_tags->tags[i], text);
- trimmed_text = string_trim(requested_text);
- free(requested_text);
+ size_t offset = 0;
+ enum DoctypeState state = DSTATE_TEXT;
+ char *doctype = NULL;
+ char *lower_doctype = NULL;
+ uint_least32_t cp;
+ size_t len = strlen(text);
+ size_t ret, off;
+ for (off = 0; off<len; off += ret) {
+ if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) {
+ fprintf(stderr, "htex: parseDoctype.grapheme_decode_utf8 failed.\n");
+ } else {
+ switch (state) {
+ case DSTATE_TEXT:
+ if (cp == LESS_THAN_SIGN) {
+ state = DSTATE_POSSIBLE_DTYPE;
+ break;
+ }
+ if (cp == GREATER_THAN_SIGN) {
+ offset = off;
+ goto CLEANUP;
+ }
break;
- case OUT_OUTER_HTML:
- requested_text = tag_get_outer_html(found_tags->tags[i], text);
- trimmed_text = string_trim(requested_text);
- free(requested_text);
+ case DSTATE_POSSIBLE_DTYPE:
+ if (cp == EXCLAMATION_MARK)
+ state = DSTATE_DTYPE_OR_COMMENT;
+ else
+ goto CLEANUP;
break;
- case OUT_INNER_TEXT:
- trimmed_text = string_trim(found_tags->tags[i]->inner_text);
+ case DSTATE_DTYPE_OR_COMMENT:
+ if (cp == HYPHEN_MINUS) {
+ goto CLEANUP;
+ } else {
+ doctype = string_concat(doctype, cp_to_string(cp, ret));
+ state = DSTATE_DTYPE;
+ break;
+ }
break;
- case OUT_ATTR_VALUE:
- if (strlen(opts->key) > 0 && strlen(opts->tag) > 0) {
- for (int k=0; k<found_tags->tags[i]->attrs_len; k++) {
- if (strcmp(found_tags->tags[i]->attrs[k]->name, opts->key) == 0)
- printf("%s\n", found_tags->tags[i]->attrs[k]->value);
+ case DSTATE_DTYPE:
+ if (ascii_is_whitespace(cp)) {
+ size_t dlen = strlen(doctype)+1;
+ lower_doctype = malloc(dlen * sizeof(char));
+ grapheme_to_lowercase_utf8(doctype, dlen, lower_doctype, dlen);
+ if (strcmp(lower_doctype, "doctype") == 0) {
+ state = DSTATE_TEXT;
+ } else {
+ offset = -1;
+ goto CLEANUP;
}
- } else if (strlen(opts->tag) > 0) {
- for (int k=0; k<found_tags->tags[i]->attrs_len; k++)
- printf("%s\n", found_tags->tags[i]->attrs[k]->value);
+ break;
}
+ doctype = string_concat(doctype, cp_to_string(cp, ret));
break;
}
- if (trimmed_text) {
- if (strlen(trimmed_text) > 0)
- printf("%s\n", trimmed_text);
- free(trimmed_text);
- }
}
}
+CLEANUP:
+ free(doctype);
+ free(lower_doctype);
+ return offset;
}
-struct TagList *tag_list_init(void)
-{
- struct TagList *tag_list = malloc(sizeof(struct TagList));
- tag_list->tags = NULL;
- tag_list->len = 0;
- return tag_list;
-}
-
-void tag_list_free(struct TagList *tag_list)
-{
- free(tag_list->tags);
- free(tag_list);
-}
-
-struct Attr *attr_init(void)
-{
- struct Attr *attr = malloc(sizeof(struct Attr));
- attr->name = malloc(sizeof(char));
- attr->name[0] = 0;
- attr->value = malloc(sizeof(char));
- attr->value[0] = 0;
- return attr;
-}
-
-static inline bool attr_name_char_is_valid(uint_least32_t cp)
-{
- if (is_control(cp))
- return false;
- if (is_non_char(cp))
- return false;
- if (
- cp == SPACE ||
- cp == QUOTATION_MARK ||
- cp == APOSTROPHE ||
- cp == GREATER_THAN_SIGN ||
- cp == SOLIDUS ||
- cp == EQUALS_SIGN
- )
- return false;
- return true;
-}
-
-static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp)
-{
- /*
- Not mentioned invalid characters.
- They are already handled before
- function call.
- */
- if (
- cp == EQUALS_SIGN ||
- cp == LESS_THAN_SIGN ||
- cp == GREATER_THAN_SIGN ||
- cp == GRAVE_ACCENT
- )
- return false;
- return true;
-}
-
-char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base)
-{
- size_t old_offset = offset;
- char *character = malloc(MAX_CODEPOINT_SIZE * sizeof(char));
- char *numeric_charref = malloc(sizeof(char));
- numeric_charref[0] = 0;
- size_t ret;
- uint_least32_t cp;
- do {
- ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp);
- numeric_charref = string_concat(numeric_charref, cp_to_string(cp, ret));
- offset += ret;
- } while (cp != SEMICOLON);
- *new_offset = offset - old_offset;
- long i = strtol(numeric_charref, NULL, base);
- ret = grapheme_encode_utf8((uint_least32_t)i, character, MAX_CODEPOINT_SIZE);
- character[ret] = 0;
- free(numeric_charref);
- return character;
-}
-
-char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs)
+static struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state)
{
- uint_least32_t stop_at = 0;
- switch(avs) {
- case AVS_QUOTATION_MARK:
- stop_at = QUOTATION_MARK;
- break;
- case AVS_APOSTROPHE:
- stop_at = APOSTROPHE;
- break;
- case AVS_UNQUOTED:
- stop_at = GREATER_THAN_SIGN;
- break;
- case AVS_NO: /* Just to silence the compilier warning */
- break;
- }
- char *named_charref = malloc(sizeof(char));
- named_charref[0] = 0;
- size_t ret;
+ struct Tag *tag = tag_init();
+ tag->_outer_html_begin_offset= offset-1;
+ tag_list->tags = realloc(tag_list->tags, (tag_list->len+1) * sizeof(struct Tag));
+ tag_list->tags[tag_list->len] = tag;
+ tag_list->len++;
+ struct Tag *still_open_tag = tag;
+ char *end_tag = malloc(sizeof(char));
+ end_tag[0] = 0;
+ enum State return_to_state = STATE_INNER_TEXT;
+ size_t a = 0;
+ size_t attr_name_count = 0;
+ enum AttrValueSyntax avs = AVS_NO;
+ size_t hyphen_count = 0;
uint_least32_t cp;
- int i = 0;
- for (;;) {
- ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp);
- if (cp == AMPERSAND || ascii_is_whitespace(cp))
- break;
- if (avs > AVS_NO && cp == stop_at)
- break;
- named_charref = string_concat(named_charref, cp_to_string(cp, ret));
- if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF)
- break;
- offset += ret;
- i++;
- }
- return named_charref;
-}
-
-char *charref_named_encode(const char *name)
-{
- char *buf = NULL;
- size_t len;
- int i;
- for (i=0; i<2138; i++) {
- if (string_starts_with(name, single_cp_entities[i].name)) {
- buf = realloc(buf, MAX_CODEPOINT_SIZE+1);
- len = grapheme_encode_utf8(single_cp_entities[i].cp, buf, MAX_CODEPOINT_SIZE);
- buf[len] = 0;
- charref_named_concat_remaining_string(name, single_cp_entities[i].name, &buf);
- return buf;
- }
- }
- for (i=0; i<93; i++) {
- if (string_starts_with(name, double_cp_entities[i].name)) {
- size_t buf_len = 0;
- buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1);
- len = grapheme_encode_utf8(double_cp_entities[i].cp[0], buf, MAX_CODEPOINT_SIZE);
- buf_len += len;
- buf += len;
- len = grapheme_encode_utf8(double_cp_entities[i].cp[1], buf, MAX_CODEPOINT_SIZE);
- buf_len += len;
- buf[buf_len] = 0;
- charref_named_concat_remaining_string(name, double_cp_entities[i].name, &buf);
- return buf;
+ size_t len = strlen(text);
+ size_t ret, off;
+ for (off = offset; off<len; off += ret) {
+ if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) {
+ fprintf(stderr, "htex: parseTag.grapheme_decode_utf8 failed.\n");
+ } else {
+ /* char *the_codepoint = cp_to_string(cp, ret);
+ printf("cp: %02X, %s, %s\n", cp, the_codepoint, state_to_string(state));
+ free(the_codepoint); */
+ switch (state) {
+ case STATE_INNER_TEXT:
+ if (cp == LESS_THAN_SIGN) {
+ state = STATE_TAG;
+ break;
+ }
+ if (cp == AMPERSAND) {
+ return_to_state = STATE_INNER_TEXT;
+ state = STATE_CHAR_REF;
+ break;
+ }
+ still_open_tag = tag_get_last_open(tag_list);
+ still_open_tag->inner_text = string_concat(still_open_tag->inner_text, cp_to_string(cp, ret));
+ break;
+ case STATE_TAG:
+ if (cp == SOLIDUS) {
+ state = STATE_END_TAG_NAME;
+ break;
+ }
+ if (cp == EXCLAMATION_MARK) {
+ state = STATE_COMMENT;
+ break;
+ }
+ still_open_tag = tag_get_last_open(tag_list);
+ struct Tag *one_tag = tag_parse(tag_list, text, off, STATE_BEGIN_TAG_NAME);
+ still_open_tag->children = realloc(
+ still_open_tag->children,
+ (still_open_tag->children_len+1) * sizeof(struct Tag)
+ );
+ still_open_tag->children[still_open_tag->children_len] = one_tag;
+ still_open_tag->children_len++;
+ free(end_tag);
+ return tag;
+ case STATE_BEGIN_TAG_NAME:
+ if (cp == GREATER_THAN_SIGN) {
+ state = tag_process_end_of_opening_tag(tag, off);
+ break;
+ }
+ if (ascii_is_whitespace(cp)) {
+ state = STATE_ATTR_NAME;
+ break;
+ }
+ if (ascii_is_digit(cp) || ascii_is_alpha(cp)) {
+ tag->name = string_concat(tag->name, cp_to_string(cp, ret));
+ }
+ break;
+ case STATE_END_TAG_NAME:
+ if (cp == GREATER_THAN_SIGN) {
+ struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
+ if (closed_tag != NULL)
+ tag_set_inner_html_end_offset(closed_tag, text, off);
+ free(end_tag);
+ end_tag = malloc(sizeof(char));
+ end_tag[0] = 0;
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (!ascii_is_whitespace(cp))
+ end_tag = string_concat(end_tag, cp_to_string(cp, ret));
+ break;
+ case STATE_ATTR_NAME:
+ if (cp == GREATER_THAN_SIGN) {
+ state = tag_process_end_of_opening_tag(tag, off);
+ break;
+ }
+ if (ascii_is_whitespace(cp)) {
+ if (attr_name_count == a+1)
+ a++;
+ break;
+ }
+ if (cp == EQUALS_SIGN) {
+ state = STATE_ATTR_VALUE;
+ break;
+ }
+ if (attr_name_char_is_valid(cp)) {
+ if (attr_name_count != a+1) {
+ tag->attrs = realloc(
+ tag->attrs,
+ (a+1) * sizeof(struct Attr)
+ );
+ tag->attrs[a] = attr_init();
+ attr_name_count = a + 1;
+ tag->attrs_len = attr_name_count;
+ }
+ tag->attrs[a]->name = string_concat(tag->attrs[a]->name, cp_to_string(cp, ret));
+ }
+ break;
+ case STATE_ATTR_VALUE:
+ if (ascii_is_whitespace(cp)) {
+ if (avs == AVS_UNQUOTED) {
+ avs = AVS_NO;
+ state = STATE_ATTR_NAME;
+ } else if (avs == AVS_QUOTATION_MARK || avs == AVS_APOSTROPHE) {
+ if (
+ strcmp("id", tag->attrs[a]->name) == 0 ||
+ strcmp("class", tag->attrs[a]->name) == 0
+ ) {
+ char *tmp_name = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char));
+ strcpy(tmp_name, tag->attrs[a]->name);
+ tag->attrs = realloc(
+ tag->attrs,
+ (a+1) * sizeof(struct Attr)
+ );
+ a++;
+ tag->attrs[a] = attr_init();
+ free(tag->attrs[a]->name);
+ tag->attrs[a]->name = tmp_name;
+ tag->attrs_len++;
+ attr_name_count = a + 1;
+ } else {
+ tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret));
+ }
+ }
+ break;
+ }
+ if (cp == QUOTATION_MARK) {
+ if (avs == AVS_NO) {
+ avs = AVS_QUOTATION_MARK;
+ break;
+ }
+ if (avs == AVS_QUOTATION_MARK) {
+ avs = AVS_NO;
+ state = STATE_ATTR_NAME;
+ break;
+ }
+ }
+ if (cp == APOSTROPHE) {
+ if (avs == AVS_NO) {
+ avs = AVS_APOSTROPHE;
+ break;
+ }
+ if (avs == AVS_APOSTROPHE) {
+ avs = AVS_NO;
+ state = STATE_ATTR_NAME;
+ break;
+ }
+ }
+ if (cp == GREATER_THAN_SIGN) {
+ state = tag_process_end_of_opening_tag(tag, off);
+ break;
+ }
+ if (avs == AVS_NO && attr_value_unquoted_char_is_valid(cp)) {
+ avs = AVS_UNQUOTED;
+ }
+ if (avs > AVS_NO) {
+ if (cp == AMPERSAND) {
+ state = STATE_CHAR_REF;
+ return_to_state = STATE_ATTR_VALUE;
+ break;
+ }
+ tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret));
+ }
+ break;
+ case STATE_COMMENT:
+ if (cp == GREATER_THAN_SIGN && hyphen_count >= 2) {
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (cp == HYPHEN_MINUS)
+ hyphen_count++;
+ else
+ hyphen_count = 0;
+ break;
+ case STATE_STYLE:
+ if (cp == LESS_THAN_SIGN) {
+ state = STATE_STYLE_POSSIBLE_END_TAG;
+ break;
+ }
+ break;
+ case STATE_STYLE_POSSIBLE_END_TAG:
+ if (cp == SOLIDUS)
+ state = STATE_STYLE_END_TAG;
+ else
+ state = STATE_STYLE;
+ break;
+ case STATE_STYLE_END_TAG:
+ if (cp == GREATER_THAN_SIGN) {
+ struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
+ if (closed_tag != NULL)
+ tag_set_inner_html_end_offset(closed_tag, text, off);
+ free(end_tag);
+ end_tag = malloc(sizeof(char));
+ end_tag[0] = 0;
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (!ascii_is_whitespace(cp))
+ end_tag = string_concat(end_tag, cp_to_string(cp, ret));
+ break;
+ case STATE_SCRIPT:
+ if (cp == LESS_THAN_SIGN) {
+ state = STATE_SCRIPT_POSSIBLE_END_TAG;
+ break;
+ }
+ break;
+ case STATE_SCRIPT_POSSIBLE_END_TAG:
+ if (cp == SOLIDUS)
+ state = STATE_SCRIPT_END_TAG;
+ else
+ state = STATE_SCRIPT;
+ break;
+ case STATE_SCRIPT_END_TAG:
+ if (cp == GREATER_THAN_SIGN) {
+ struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret);
+ if (closed_tag != NULL)
+ tag_set_inner_html_end_offset(closed_tag, text, off);
+ free(end_tag);
+ end_tag = malloc(sizeof(char));
+ end_tag[0] = 0;
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (!ascii_is_whitespace(cp))
+ end_tag = string_concat(end_tag, cp_to_string(cp, ret));
+ break;
+ case STATE_CHAR_REF:
+ if (cp == NUMBER_SIGN) { /* hashtag */
+ state = STATE_CHAR_REF_NUMERIC;
+ break;
+ }
+ char *named_charref = charref_named_parse(text, off, len, avs);
+ off += strlen(named_charref)-1;
+ char *encoded_named_charref = charref_named_encode(named_charref);
+ if (return_to_state == STATE_INNER_TEXT) {
+ still_open_tag = tag_get_last_open(tag_list);
+ still_open_tag->inner_text = string_concat(still_open_tag->inner_text, encoded_named_charref);
+ } else if (return_to_state == STATE_ATTR_VALUE) {
+ tag->attrs[a]->value = string_concat(tag->attrs[a]->value, encoded_named_charref);
+ }
+ free(named_charref);
+ state = return_to_state;
+ break;
+ case STATE_CHAR_REF_NUMERIC:
+ if (cp == SMALL_LETTER_X || cp == CAPITAL_LETTER_X) {
+ size_t new_offset;
+ char *numeric_charref = charref_numeric_parse_and_encode(text, off+1, &new_offset, 16);
+ off += new_offset;
+ if (return_to_state == STATE_INNER_TEXT) {
+ still_open_tag = tag_get_last_open(tag_list);
+ still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref);
+ } else if (return_to_state == STATE_ATTR_VALUE) {
+ tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref);
+ }
+ state = return_to_state;
+ break;
+ } else if (ascii_is_digit(cp)) {
+ size_t new_offset;
+ char *numeric_charref = charref_numeric_parse_and_encode(text, off, &new_offset, 10);
+ off += new_offset-1;
+ if (return_to_state == STATE_INNER_TEXT) {
+ still_open_tag = tag_get_last_open(tag_list);
+ still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref);
+ } else if (return_to_state == STATE_ATTR_VALUE) {
+ tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref);
+ }
+ state = return_to_state;
+ break;
+ }
+ state = return_to_state;
+ break;
+ }
}
}
- buf = realloc(buf, (strlen(name)+2) * sizeof(char));
- buf[0] = '&';
- buf[1] = 0;
- strcat(buf, name);
- return buf;
-}
-
-void charref_named_concat_remaining_string(const char *parsed_string, const char *charref, char **buf)
-{
- const char *remaining = &parsed_string[strlen(charref)];
- size_t remaining_len = strlen(remaining);
- size_t buf_len = strlen(*buf);
- if (remaining_len > 0) {
- if (remaining_len == 1 && remaining[0] == ';')
- return;
- *buf = realloc(*buf, buf_len+remaining_len+1);
- strcat(*buf, remaining);
- }
+ free(end_tag);
+ return tag;
}
-static inline bool ascii_is_digit(uint_least32_t cp)
+static void tag_debug_print(struct Tag *tag, int indent)
{
- if (cp >= 0x30 && cp <= 0x39)
- return true;
- return false;
+ for (int i=0; i<indent; i++)
+ putchar(' ');
+ printf("%s", tag->name);
+ for (int i=0; i<tag->attrs_len; i++)
+ printf(" %s=%s", tag->attrs[i]->name, tag->attrs[i]->value);
+ printf("\n");
+ indent++;
+ for (int i=tag->children_len-1; i>-1; i--)
+ tag_debug_print(tag->children[i], indent);
}
-static inline bool ascii_alpha_is_upper(uint_least32_t cp)
+static struct TagList *tag_list_init(void)
{
- if (cp >= 0x41 && cp <= 0x5A)
- return true;
- return false;
+ struct TagList *tag_list = malloc(sizeof(struct TagList));
+ tag_list->tags = NULL;
+ tag_list->len = 0;
+ return tag_list;
}
-static inline bool ascii_alpha_is_lower(uint_least32_t cp)
+void tag_list_free(struct TagList *tag_list)
{
- if (cp >= 0x61 && cp <= 0x7A)
- return true;
- return false;
+ free(tag_list->tags);
+ free(tag_list);
}
-static inline bool ascii_is_alpha(uint_least32_t cp)
+static struct HTMLDocument *html_document_init(void)
{
- if (ascii_alpha_is_lower(cp) || ascii_alpha_is_upper(cp))
- return true;
- return false;
+ struct HTMLDocument *document = malloc(sizeof(struct HTMLDocument));
+ document->buffer = NULL;
+ document->tag = NULL;
+ document->tag_list = NULL;
+ return document;
}
-static inline bool ascii_is_whitespace(uint_least32_t cp)
+void html_document_free(struct HTMLDocument *document)
{
- if (
- cp == TAB ||
- cp == LF ||
- cp == FF ||
- cp == CR ||
- cp == SPACE
- )
- return true;
- return false;
+ // free(doc->buffer);
+ tag_free(document->tag);
+ tag_list_free(document->tag_list);
+ free(document);
}
-static inline bool is_c0_control(uint_least32_t cp)
+struct HTMLDocument *html_document_parse(char *buffer)
{
- if (cp >= 0x00 && cp <= 0x1F)
- return true;
- return false;
+ struct HTMLDocument *document = html_document_init();
+ document->buffer = buffer;
+ document->tag_list = tag_list_init();
+ int len = tag_doctype_parse(document->buffer);
+ if (len == -1) {
+ fprintf(stderr, "htex: Error parsing <!DOCTYPE ....\n");
+ html_document_free(document);
+ return NULL;
+ } else {
+ document->buffer += len;
+ }
+ document->tag = tag_parse(document->tag_list, document->buffer, 0, STATE_INNER_TEXT);
+ return document;
}
-static inline bool is_control(uint_least32_t cp)
+struct TagList *html_document_find(struct HTMLDocument *document, struct FindOpts *opts)
{
- if (is_c0_control(cp))
- return true;
- if (cp >= 0x7F && cp <= 0x9F)
- return true;
- return false;
+ struct TagList *found_tags = tag_list_init();
+ if (!find_opts_exist(opts)) {
+ found_tags->tags = realloc(found_tags->tags, sizeof(struct Tag));
+ found_tags->tags[0] = document->tag->children[0];
+ found_tags->len = 1;
+ } else {
+ tag_find(document->tag, opts, found_tags);
+ }
+ return found_tags;
}
-static inline bool is_non_char(uint_least32_t cp)
+void html_document_print_find_result(struct HTMLDocument *document, struct TagList *found_tags, struct FindOpts *opts)
{
- if (cp >= 0xFDD0 && cp <= 0xFDEF)
- return true;
- if (
- cp == 0xFFFE || cp == 0xFFFF ||
- cp == 0x1FFFE || cp == 0x1FFFF ||
- cp == 0x2FFFE || cp == 0x2FFFF ||
- cp == 0x3FFFE || cp == 0x3FFFF ||
- cp == 0x4FFFE || cp == 0x4FFFF ||
- cp == 0x5FFFE || cp == 0x5FFFF ||
- cp == 0x6FFFE || cp == 0x6FFFF ||
- cp == 0x7FFFE || cp == 0x7FFFF ||
- cp == 0x8FFFE || cp == 0x8FFFF ||
- cp == 0x9FFFE || cp == 0x9FFFF ||
- cp == 0xAFFFE || cp == 0xAFFFF ||
- cp == 0xBFFFE || cp == 0xBFFFF ||
- cp == 0xCFFFE || cp == 0xCFFFF ||
- cp == 0xDFFFE || cp == 0xDFFFF ||
- cp == 0xEFFFE || cp == 0xEFFFF ||
- cp == 0xFFFFE || cp == 0xFFFFF ||
- cp == 0x10FFFE || cp == 0x10FFFF
- )
- return true;
- return false;
+ if (opts->is_except) {
+ bool is_match = false;
+ for (int i=0; i<strlen(document->buffer); i++) {
+ is_match = false;
+ for (int k=0; k<found_tags->len; k++) {
+ if (
+ found_tags->tags[k]->_outer_html_begin_offset <= i &&
+ found_tags->tags[k]->_outer_html_end_offset > i
+ )
+ is_match = true;
+ }
+ if (!is_match)
+ putchar(document->buffer[i]);
+ }
+ } else {
+ char *requested_text = NULL;
+ char *trimmed_text = NULL;
+ for (int i=0; i<found_tags->len; i++) {
+ switch (opts->out) {
+ case OUT_INNER_HTML:
+ requested_text = tag_get_inner_html(found_tags->tags[i], document->buffer);
+ trimmed_text = string_trim(requested_text);
+ free(requested_text);
+ break;
+ case OUT_OUTER_HTML:
+ requested_text = tag_get_outer_html(found_tags->tags[i], document->buffer);
+ trimmed_text = string_trim(requested_text);
+ free(requested_text);
+ break;
+ case OUT_INNER_TEXT:
+ trimmed_text = string_trim(found_tags->tags[i]->inner_text);
+ break;
+ case OUT_ATTR_VALUE:
+ if (strlen(opts->key) > 0 && strlen(opts->tag) > 0) {
+ for (int k=0; k<found_tags->tags[i]->attrs_len; k++) {
+ if (strcmp(found_tags->tags[i]->attrs[k]->name, opts->key) == 0)
+ printf("%s\n", found_tags->tags[i]->attrs[k]->value);
+ }
+ } else if (strlen(opts->tag) > 0) {
+ for (int k=0; k<found_tags->tags[i]->attrs_len; k++)
+ printf("%s\n", found_tags->tags[i]->attrs[k]->value);
+ }
+ break;
+ }
+ if (trimmed_text) {
+ if (strlen(trimmed_text) > 0)
+ printf("%s\n", trimmed_text);
+ free(trimmed_text);
+ }
+ }
+ }
}
-const char *state_to_string(enum State state)
+void html_document_debug_print_tree(struct HTMLDocument *document)
{
- switch(state) {
- case STATE_INNER_TEXT: return "STATE_INNER_TEXT";
- case STATE_TAG: return "STATE_TAG";
- case STATE_BEGIN_TAG_NAME: return "STATE_BEGIN_TAG_NAME";
- case STATE_END_TAG_NAME: return "STATE_END_TAG_NAME";
- case STATE_ATTR_NAME: return "STATE_ATTR_NAME";
- case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE";
- case STATE_COMMENT: return "STATE_COMMENT";
- case STATE_SCRIPT: return "STATE_SCRIPT";
- case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG";
- case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG";
- case STATE_STYLE: return "STATE_STYLE";
- case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG";
- case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG";
- case STATE_CHAR_REF: return "STATE_CHAR_REF";
- case STATE_CHAR_REF_NUMERIC: return "STATE_CHAR_REF_NUMERIC";
- }
- return "";
+ tag_debug_print(document->tag, -1);
}
diff --git a/src/html.h b/src/html.h
@@ -22,11 +22,6 @@
#define LONGEST_NAMED_CHAR_REF 32
#define MAX_CODEPOINT_SIZE 4
-static const char *void_elements[] = {
- "area", "base", "br", "col", "embed", "hr", "img",
- "input", "link", "meta", "source", "track", "wbr"
-};
-
struct Attr {
char *name;
char *value; // optional
@@ -52,6 +47,12 @@ struct TagList {
size_t len;
};
+struct HTMLDocument {
+ char *buffer;
+ struct Tag *tag;
+ struct TagList *tag_list;
+};
+
enum State {
STATE_INNER_TEXT,
STATE_TAG,
@@ -84,42 +85,30 @@ enum AttrValueSyntax {
AVS_UNQUOTED
};
-void html_filter(char *text, struct FindOpts *opts);
-void html_print(struct Tag *tag, int indent);
-
-struct Tag *tag_init(void);
-struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state);
-struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset);
-struct Tag *tag_get_last_open(struct TagList *tag_list);
-int tag_doctype_parse(const char *text);
-char *tag_get_outer_html(struct Tag *tag, char *text);
-char *tag_get_inner_html(struct Tag *tag, char *text);
-enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset);
-static inline bool tag_is_void_element(struct Tag *tag);
-void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset);
-void tag_free(struct Tag *tag);
-void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags);
-void tag_print_find_result(struct Tag *root_tag, struct FindOpts *opts, struct TagList *found_tags, char *text);
-
-struct TagList *tag_list_init(void);
-void tag_list_free(struct TagList *tag_list);
+enum OutType {
+ OUT_INNER_HTML,
+ OUT_OUTER_HTML,
+ OUT_INNER_TEXT,
+ OUT_ATTR_VALUE
+};
-struct Attr *attr_init(void);
-static inline bool attr_name_char_is_valid(uint_least32_t cp);
-static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp);
+struct FindOpts {
+ char *tag;
+ char *attr;
+ char *key;
+ enum OutType out;
+ bool is_except;
+ int limit;
+};
-char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base);
-char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs);
-char *charref_named_encode(const char *name);
-void charref_named_concat_remaining_string(const char *parsed_string, const char *charref, char **buf);
+struct FindOpts *find_opts_parse(const char *pattern);
+void find_opts_free(struct FindOpts *opts);
+enum OutType output_type_parse(const char *type);
-static inline bool ascii_is_digit(uint_least32_t cp);
-static inline bool ascii_alpha_is_upper(uint_least32_t cp);
-static inline bool ascii_alpha_is_lower(uint_least32_t cp);
-static inline bool ascii_is_alpha(uint_least32_t cp);
-static inline bool ascii_is_whitespace(uint_least32_t cp);
-static inline bool is_c0_control(uint_least32_t cp);
-static inline bool is_control(uint_least32_t cp);
-static inline bool is_non_char(uint_least32_t cp);
+struct HTMLDocument *html_document_parse(char *buffer);
+struct TagList *html_document_find(struct HTMLDocument *document, struct FindOpts *opts);
+void html_document_print_find_result(struct HTMLDocument *document, struct TagList *found_tags, struct FindOpts *opts);
+void html_document_free(struct HTMLDocument *document);
+void html_document_debug_print_tree(struct HTMLDocument *document);
-const char *state_to_string(enum State s);
+void tag_list_free(struct TagList *tag_list);
diff --git a/src/misc.c b/src/misc.c
@@ -1,3 +1,11 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <grapheme.h>
+#include "misc.h"
+
char *string_concat(char *str1, char *str2)
{
size_t len2 = strlen(str2);
diff --git a/src/misc.h b/src/misc.h
@@ -0,0 +1,6 @@
+char *string_concat(char *str1, char *str2);
+char *cp_to_string(uint_least32_t cp, size_t len);
+char *string_trim(char *text);
+bool string_starts_with(const char *string, const char *part);
+bool file_try_read(char *buf, FILE *fp);
+char *file_read(FILE *fp);
diff --git a/todo b/todo
@@ -1,3 +1,3 @@
-replace int,size_t with uint*
handle correctly when no search pattern was provided
-implement charref also for outerhtml,innerhtml
+implement charref also for outerhtml,innerhtml: but how?
+what about replacing FindOpts with CssSelector?