htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit ac663c0098d315558a6ff54de0e349e865a89f8f
parent 4a81391e98fb3dec17409c70feb98be905242433
Author: Robin <kroekerrobin@gmail.com>
Date:   Fri, 18 Aug 2023 15:53:50 +0200

Improve <!DOCTYPE ... parsing

Diffstat:
M html.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
M html.h | 8 ++++++++
2 files changed, 83 insertions(+), 23 deletions(-)

diff --git a/html.c b/html.c @@ -187,28 +187,75 @@ isValidUnquotedAttrValue(uint_least32_t cp) size_t parseDoctype(const char *text) { - char *firstLine = NULL; - int i = 0; - while (text[i] != '\n') - { - firstLine = realloc(firstLine, (i+1) * sizeof(char)); - firstLine[i] = text[i]; - i++; - } - firstLine = realloc(firstLine, (i+1) * sizeof(char)); - firstLine[i] = 0; - if (strcmp("<!DOCTYPE html>", firstLine) == 0) - { - free(firstLine); - return i+1; - } - if (strcmp("<!doctype html>", firstLine) == 0) + size_t offset = 0; + enum doctype_state state = DSTATE_TEXT; + char *doctype = NULL; + char *lowerDoctype = NULL; + uint_least32_t cp; + size_t len = strlen(text); + size_t ret, off; + for (off = 0; off<len; off += ret) { - free(firstLine); - return i+1; + if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) + { + printError("Something wrong with ending of text"); + } + else + { + switch (state) + { + case DSTATE_TEXT: + if (cp == LESS_THAN_SIGN) + { + state = DSTATE_POSSIBLE_DTYPE; + break; + } + if (cp == GREATER_THAN_SIGN) + { + offset = off; + goto CLEANUP; + } + break; + case DSTATE_POSSIBLE_DTYPE: + if (cp == EXCLAMATION_MARK) + state = DSTATE_DTYPE_OR_COMMENT; + else + goto CLEANUP; + break; + case DSTATE_DTYPE_OR_COMMENT: + if (cp == HYPHEN_MINUS) + goto CLEANUP; + else + { + doctype = stringCat(doctype, cpToChars(cp, ret)); + state = DSTATE_DTYPE; + break; + } + break; + case DSTATE_DTYPE: + if (isASCIIWhitespace(cp)) + { + size_t dlen = strlen(doctype)+1; + lowerDoctype = malloc(dlen * sizeof(char)); + grapheme_to_lowercase_utf8(doctype, dlen, lowerDoctype, dlen); + if (strcmp(lowerDoctype, "doctype") == 0) + state = DSTATE_TEXT; + else + { + offset = -1; + goto CLEANUP; + } + break; + } + doctype = stringCat(doctype, cpToChars(cp, ret)); + break; + } + } } - free(firstLine); - return 0; +CLEANUP: + free(doctype); + free(lowerDoctype); + return offset; } struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, 
size_t endOffset) @@ -766,8 +813,13 @@ void filterHtml(char *text, struct find_opts *opts) { struct tag_list *tagList = initTagList(); struct tag_list *foundTags = initTagList(); - size_t len = parseDoctype(text); // FIXME: not only look in first line - if (len) + size_t len = parseDoctype(text); + if (len == -1) + { + fprintf(stderr, "Error parsing <!DOCTYPE ....\n"); + goto CLEAN; + } + else text += len; struct tag *rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList); if (!existFindPattern(opts)) @@ -778,9 +830,9 @@ void filterHtml(char *text, struct find_opts *opts) } else findTag(rootTag, opts, foundTags); - // printf("len: %ld\n", foundTags->len); printResult(text, rootTag, opts, foundTags); freeTag(rootTag); +CLEAN: freeTagList(tagList); freeTagList(foundTags); } diff --git a/html.h b/html.h @@ -83,6 +83,14 @@ enum state STATE_STYLE_END_TAG }; +enum doctype_state +{ + DSTATE_TEXT, + DSTATE_POSSIBLE_DTYPE, + DSTATE_DTYPE_OR_COMMENT, + DSTATE_DTYPE +}; + enum attr_value_syntax { AVS_NO,