htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 6c04cb48bb011fd5e498f770447bbb1e3d98947d
parent 572566e1c67d1d3e41dc16dbce8b1cec56107ff9
Author: Robin <kroekerrobin@gmail.com>
Date:   Sat,  6 Apr 2024 18:29:17 +0200

Refactor

Diffstat:
M src/html.c | 145 +++++++++++++++++++++++++++++++++++++++++--------------------------------------
M src/misc.c | 24 ++++++++++++++++--------
M src/misc.h | 3 ++-
3 files changed, 93 insertions(+), 79 deletions(-)

diff --git a/src/html.c b/src/html.c @@ -179,12 +179,9 @@ struct FindOpts *find_opts_parse(const char *pattern) { struct FindOpts *opts = malloc(sizeof(struct FindOpts)); opts->out = OUT_OUTER_HTML; - opts->tag = malloc(sizeof(char)); - opts->tag[0] = 0; - opts->attr = malloc(sizeof(char)); - opts->attr[0] = 0; - opts->key = malloc(sizeof(char)); - opts->key[0] = 0; + opts->tag = NULL; + opts->attr = NULL; + opts->key = NULL; bool is_class_value = false; bool is_id_value = false; int i = 0; @@ -300,10 +297,8 @@ enum OutType output_type_parse(const char *type) static struct Attr *attr_init(void) { struct Attr *attr = malloc(sizeof(struct Attr)); - attr->name = malloc(sizeof(char)); - attr->name[0] = 0; - attr->value = malloc(sizeof(char)); - attr->value[0] = 0; + attr->name = NULL; + attr->value = NULL; return attr; } @@ -311,8 +306,7 @@ static char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t { size_t old_offset = offset; char *character = malloc(MAX_CODEPOINT_SIZE * sizeof(char)); - char *numeric_charref = malloc(sizeof(char)); - numeric_charref[0] = 0; + char *numeric_charref = NULL; size_t ret; uint_least32_t cp; do { @@ -344,8 +338,7 @@ static char *charref_named_parse(char *text, size_t offset, size_t len, enum Att case AVS_NO: /* Just to silence the compilier warning */ break; } - char *named_charref = malloc(sizeof(char)); - named_charref[0] = 0; + char *named_charref = NULL; size_t ret; uint_least32_t cp; int i = 0; @@ -382,43 +375,48 @@ static char *charref_named_encode(const char *name) char *buf = NULL; size_t len; int i; - for (i=0; i<2138; i++) { - if (string_starts_with(name, single_cp_entities[i].name)) { - buf = realloc(buf, MAX_CODEPOINT_SIZE+1); - len = grapheme_encode_utf8(single_cp_entities[i].cp, buf, MAX_CODEPOINT_SIZE); - buf[len] = 0; - charref_named_concat_remaining_string(name, single_cp_entities[i].name, &buf); - return buf; + if (name) { + for (i=0; i<2138; i++) { + if (string_starts_with(name, 
single_cp_entities[i].name)) { + buf = realloc(buf, MAX_CODEPOINT_SIZE+1); + len = grapheme_encode_utf8(single_cp_entities[i].cp, buf, MAX_CODEPOINT_SIZE); + buf[len] = 0; + charref_named_concat_remaining_string(name, single_cp_entities[i].name, &buf); + return buf; + } } - } - for (i=0; i<93; i++) { - if (string_starts_with(name, double_cp_entities[i].name)) { - size_t buf_len = 0; - buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1); - len = grapheme_encode_utf8(double_cp_entities[i].cp[0], buf, MAX_CODEPOINT_SIZE); - buf_len += len; - buf += len; - len = grapheme_encode_utf8(double_cp_entities[i].cp[1], buf, MAX_CODEPOINT_SIZE); - buf_len += len; - buf[buf_len] = 0; - charref_named_concat_remaining_string(name, double_cp_entities[i].name, &buf); - return buf; + for (i=0; i<93; i++) { + if (string_starts_with(name, double_cp_entities[i].name)) { + size_t buf_len = 0; + buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1); + len = grapheme_encode_utf8(double_cp_entities[i].cp[0], buf, MAX_CODEPOINT_SIZE); + buf_len += len; + buf += len; + len = grapheme_encode_utf8(double_cp_entities[i].cp[1], buf, MAX_CODEPOINT_SIZE); + buf_len += len; + buf[buf_len] = 0; + charref_named_concat_remaining_string(name, double_cp_entities[i].name, &buf); + return buf; + } } + buf = realloc(buf, (strlen(name)+2) * sizeof(char)); + buf[0] = '&'; + buf[1] = 0; + strcat(buf, name); + return buf; + } else { + buf = realloc(buf, 2 * sizeof(char)); + buf[0] = '&'; + buf[1] = 0; + return buf; } - buf = realloc(buf, (strlen(name)+2) * sizeof(char)); - buf[0] = '&'; - buf[1] = 0; - strcat(buf, name); - return buf; } static struct Tag *tag_init(void) { struct Tag *tag = malloc(sizeof(struct Tag)); - tag->name = malloc(sizeof(char)); - tag->name[0] = 0; - tag->inner_text = malloc(sizeof(char)); - tag->inner_text[0] = 0; + tag->name = NULL; + tag->inner_text = NULL; tag->attrs = NULL; tag->children = NULL; tag->attrs_len = 0; @@ -528,45 +526,53 @@ static void tag_find(struct Tag *tag, struct FindOpts *opts, 
struct TagList *fou bool matches_tag = false; bool matches_attr_key = false; bool matches_attr_value = false; - if (strcmp(tag->name, opts->tag) == 0) - matches_tag = true; - for (int i=0; i<tag->attrs_len; i++) { - if (strcmp(tag->attrs[i]->name, opts->key) == 0) - matches_attr_key = true; - if (strcmp(tag->attrs[i]->value, opts->attr) == 0) - matches_attr_value = true; - } - if (strlen(opts->tag) > 0 && strlen(opts->key) > 0 && strlen(opts->attr) > 0) { + if (!string_is_empty(opts->tag)) { + if (strcmp(tag->name, opts->tag) == 0) + matches_tag = true; + } + if (!string_is_empty(opts->key)) { + for (int i=0; i<tag->attrs_len; i++) { + if (strcmp(tag->attrs[i]->name, opts->key) == 0) + matches_attr_key = true; + } + } + if (!string_is_empty(opts->attr)) { + for (int i=0; i<tag->attrs_len; i++) { + if (strcmp(tag->attrs[i]->value, opts->attr) == 0) + matches_attr_value = true; + } + } + if (!string_is_empty(opts->tag) && !string_is_empty(opts->key) && !string_is_empty(opts->attr)) { if (matches_tag && matches_attr_key && matches_attr_value) { found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); found_tags->tags[found_tags->len] = tag; found_tags->len++; } - } else if (strlen(opts->tag) > 0 && strlen(opts->key) > 0) { + } else if (!string_is_empty(opts->tag) && !string_is_empty(opts->key)) { if (matches_tag && matches_attr_key) { found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); found_tags->tags[found_tags->len] = tag; found_tags->len++; } - } else if (strlen(opts->tag) > 0) { + } else if (!string_is_empty(opts->tag)) { if (matches_tag) { found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); found_tags->tags[found_tags->len] = tag; found_tags->len++; } - } else if (strlen(opts->key) > 0 && strlen(opts->attr) > 0) { + } else if (!string_is_empty(opts->key) && !string_is_empty(opts->attr)) { if (matches_attr_key && matches_attr_value) { found_tags->tags = 
realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); found_tags->tags[found_tags->len] = tag; found_tags->len++; } - } else if (strlen(opts->key) > 0) { + } else if (!string_is_empty(opts->key)) { if (matches_attr_key) { found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); found_tags->tags[found_tags->len] = tag; found_tags->len++; } - } else if (strlen(opts->attr) > 0) { + } else if (!string_is_empty(opts->attr)) { if (matches_attr_value) { found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); found_tags->tags[found_tags->len] = tag; @@ -649,8 +655,7 @@ static struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset tag_list->tags[tag_list->len] = tag; tag_list->len++; struct Tag *still_open_tag = tag; - char *end_tag = malloc(sizeof(char)); - end_tag[0] = 0; + char *end_tag = NULL; enum State return_to_state = STATE_INNER_TEXT; size_t a = 0; size_t attr_name_count = 0; @@ -718,8 +723,7 @@ static struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset if (closed_tag != NULL) tag_set_inner_html_end_offset(closed_tag, text, off); free(end_tag); - end_tag = malloc(sizeof(char)); - end_tag[0] = 0; + end_tag = NULL; state = STATE_INNER_TEXT; break; } @@ -742,10 +746,7 @@ static struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset } if (attr_name_char_is_valid(cp)) { if (attr_name_count != a+1) { - tag->attrs = realloc( - tag->attrs, - (a+1) * sizeof(struct Attr) - ); + tag->attrs = realloc(tag->attrs, (a+1) * sizeof(struct Attr)); tag->attrs[a] = attr_init(); attr_name_count = a + 1; tag->attrs_len = attr_name_count; @@ -847,8 +848,7 @@ static struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset if (closed_tag != NULL) tag_set_inner_html_end_offset(closed_tag, text, off); free(end_tag); - end_tag = malloc(sizeof(char)); - end_tag[0] = 0; + end_tag = NULL; state = STATE_INNER_TEXT; break; } @@ -873,8 +873,7 @@ 
static struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset if (closed_tag != NULL) tag_set_inner_html_end_offset(closed_tag, text, off); free(end_tag); - end_tag = malloc(sizeof(char)); - end_tag[0] = 0; + end_tag = NULL; state = STATE_INNER_TEXT; break; } @@ -887,7 +886,11 @@ static struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset break; } char *named_charref = charref_named_parse(text, off, len, avs); - off += strlen(named_charref)-1; + if (named_charref) { + off += strlen(named_charref)-1; + } else { + off--; + } char *encoded_named_charref = charref_named_encode(named_charref); if (return_to_state == STATE_INNER_TEXT) { still_open_tag = tag_get_last_open(tag_list); @@ -991,6 +994,8 @@ struct HTMLDocument *html_document_parse(char *buffer) document->buffer += len; } document->tag = tag_parse(document->tag_list, document->buffer, 0, STATE_INNER_TEXT); + document->tag->name = malloc(sizeof(char)); + document->tag->name[0] = 0; return document; } diff --git a/src/misc.c b/src/misc.c @@ -6,6 +6,14 @@ #include <grapheme.h> #include "misc.h" +char *cp_to_string(uint_least32_t cp, size_t len) +{ + char *str = malloc((len+1) * sizeof(char)); + grapheme_encode_utf8(cp, str, len); + str[len] = 0; + return str; +} + char *string_concat(char *str1, char *str2) { size_t len2 = strlen(str2); @@ -20,14 +28,6 @@ char *string_concat(char *str1, char *str2) return str1; } -char *cp_to_string(uint_least32_t cp, size_t len) -{ - char *str = malloc((len+1) * sizeof(char)); - grapheme_encode_utf8(cp, str, len); - str[len] = 0; - return str; -} - char *string_trim(char *text) { char *trimmed_text = NULL; @@ -81,6 +81,14 @@ bool string_starts_with(const char *string, const char *part) return true; } +bool string_is_empty(char *string) +{ + if (string && string[0] != 0) { + return false; + } + return true; +} + // Do not use for reading from a socket fd bool file_try_read(char *buf, FILE *fp) { diff --git a/src/misc.h b/src/misc.h @@ -1,6 
+1,7 @@ -char *string_concat(char *str1, char *str2); char *cp_to_string(uint_least32_t cp, size_t len); +char *string_concat(char *str1, char *str2); char *string_trim(char *text); bool string_starts_with(const char *string, const char *part); +bool string_is_empty(char *string); bool file_try_read(char *buf, FILE *fp); char *file_read(FILE *fp);