commit ac663c0098d315558a6ff54de0e349e865a89f8f
parent 4a81391e98fb3dec17409c70feb98be905242433
Author: Robin <kroekerrobin@gmail.com>
Date: Fri, 18 Aug 2023 15:53:50 +0200
Improve <!DOCTYPE ... parsing
Diffstat:
M html.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
M html.h |  8 ++++++++
2 files changed, 83 insertions(+), 23 deletions(-)
diff --git a/html.c b/html.c
@@ -187,28 +187,75 @@ isValidUnquotedAttrValue(uint_least32_t cp)
size_t parseDoctype(const char *text)
{
- char *firstLine = NULL;
- int i = 0;
- while (text[i] != '\n')
- {
- firstLine = realloc(firstLine, (i+1) * sizeof(char));
- firstLine[i] = text[i];
- i++;
- }
- firstLine = realloc(firstLine, (i+1) * sizeof(char));
- firstLine[i] = 0;
- if (strcmp("<!DOCTYPE html>", firstLine) == 0)
- {
- free(firstLine);
- return i+1;
- }
- if (strcmp("<!doctype html>", firstLine) == 0)
+ size_t offset = 0;
+ enum doctype_state state = DSTATE_TEXT;
+ char *doctype = NULL;
+ char *lowerDoctype = NULL;
+ uint_least32_t cp;
+ size_t len = strlen(text);
+ size_t ret, off;
+ for (off = 0; off<len; off += ret)
{
- free(firstLine);
- return i+1;
+ if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off)
+ {
+ printError("Something wrong with ending of text");
+ }
+ else
+ {
+ switch (state)
+ {
+ case DSTATE_TEXT:
+ if (cp == LESS_THAN_SIGN)
+ {
+ state = DSTATE_POSSIBLE_DTYPE;
+ break;
+ }
+ if (cp == GREATER_THAN_SIGN)
+ {
+ offset = off;
+ goto CLEANUP;
+ }
+ break;
+ case DSTATE_POSSIBLE_DTYPE:
+ if (cp == EXCLAMATION_MARK)
+ state = DSTATE_DTYPE_OR_COMMENT;
+ else
+ goto CLEANUP;
+ break;
+ case DSTATE_DTYPE_OR_COMMENT:
+ if (cp == HYPHEN_MINUS)
+ goto CLEANUP;
+ else
+ {
+ doctype = stringCat(doctype, cpToChars(cp, ret));
+ state = DSTATE_DTYPE;
+ break;
+ }
+ break;
+ case DSTATE_DTYPE:
+ if (isASCIIWhitespace(cp))
+ {
+ size_t dlen = strlen(doctype)+1;
+ lowerDoctype = malloc(dlen * sizeof(char));
+ grapheme_to_lowercase_utf8(doctype, dlen, lowerDoctype, dlen);
+ if (strcmp(lowerDoctype, "doctype") == 0)
+ state = DSTATE_TEXT;
+ else
+ {
+ offset = -1;
+ goto CLEANUP;
+ }
+ break;
+ }
+ doctype = stringCat(doctype, cpToChars(cp, ret));
+ break;
+ }
+ }
}
- free(firstLine);
- return 0;
+CLEANUP:
+ free(doctype);
+ free(lowerDoctype);
+ return offset;
}
struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, size_t endOffset)
@@ -766,8 +813,13 @@ void filterHtml(char *text, struct find_opts *opts)
{
struct tag_list *tagList = initTagList();
struct tag_list *foundTags = initTagList();
- size_t len = parseDoctype(text); // FIXME: not only look in first line
- if (len)
+ size_t len = parseDoctype(text);
+ if (len == -1)
+ {
+ fprintf(stderr, "Error parsing <!DOCTYPE ....\n");
+ goto CLEAN;
+ }
+ else
text += len;
struct tag *rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList);
if (!existFindPattern(opts))
@@ -778,9 +830,9 @@ void filterHtml(char *text, struct find_opts *opts)
}
else
findTag(rootTag, opts, foundTags);
- // printf("len: %ld\n", foundTags->len);
printResult(text, rootTag, opts, foundTags);
freeTag(rootTag);
+CLEAN:
freeTagList(tagList);
freeTagList(foundTags);
}
diff --git a/html.h b/html.h
@@ -83,6 +83,14 @@ enum state
STATE_STYLE_END_TAG
};
+enum doctype_state
+{
+ DSTATE_TEXT,
+ DSTATE_POSSIBLE_DTYPE,
+ DSTATE_DTYPE_OR_COMMENT,
+ DSTATE_DTYPE
+};
+
enum attr_value_syntax
{
AVS_NO,