htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 7a549bd99d0c278a0fcc199e920e709231ea5d25
parent de3711d4d4300d4b5294ec1a8ea4aac4994febde
Author: Robin <kroekerrobin@gmail.com>
Date:   Fri, 11 Aug 2023 15:49:21 +0200

Fix parsing tag and subtag with same name

e.g.:
<div>
  <div></div>
</div>

Diffstat:
M htex.c | 34 ----------------------------------
M html.c | 94 ++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
M html.h | 5 ++++-
M lib.c | 34 ++++++++++++++++++++++++++++++++++
4 files changed, 91 insertions(+), 76 deletions(-)

diff --git a/htex.c b/htex.c @@ -8,40 +8,6 @@ #include "lib.c" #include "html.c" -// Do not use for reading from a socket fd -bool tryRead(char *buf, FILE *fp) -{ - size_t bytesRead = fread(buf, 1, 1, fp); - if (feof(fp) != 0) - return false; - if (ferror(fp) != 0) - tryRead(buf, fp); - if (bytesRead != 1) - tryRead(buf, fp); - return true; -} - -char *readFile(FILE *fp) -{ - char *text = NULL; - int i = 0; - char buf; - while (1) - { - if (tryRead(&buf, fp)) - { - text = realloc(text, (i+1) * sizeof(char)); - text[i] = buf; - i++; - } - else - break; - } - text = realloc(text, (i+1) * sizeof(char)); - text[i] = 0; - return text; -} - struct filter_opts *parseFilterOpts(const char *pattern) { struct filter_opts *opt = malloc(sizeof(struct filter_opts)); diff --git a/html.c b/html.c @@ -14,6 +14,9 @@ const char *stateToString(enum state s) case STATE_SCRIPT: return "STATE_SCRIPT"; case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG"; case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG"; + case STATE_STYLE: return "STATE_STYLE"; + case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG"; + case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG"; } return ""; } @@ -210,6 +213,7 @@ size_t parseDoctype(const char *text) struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, size_t endOffset) { + printf("Closing %s\n", endTag); for (int i=tagList->len-1; i>-1; i--) { if (strcmp(tagList->tags[i]->name, endTag) == 0) @@ -232,18 +236,6 @@ struct tag *getLastOpenTag(struct tag_list *tagList) return tagList->tags[0]; } -/* void saveOuterHtml(struct tag *tag, char *text) -{ - int o = 0; - for (int i=tag->_outerHtmlBeginOffset; i<tag->_outerHtmlEndOffset; i++) - { - tag->outerHtml = realloc(tag->outerHtml, (o+1) * sizeof(char)); - tag->outerHtml[o] = text[i]; - o++; - } - tag->outerHtml = realloc(tag->outerHtml, (o+1) * sizeof(char)); - tag->outerHtml[o] = 0; -} */ char *getOuterHtml(char *text, struct tag *t) { 
char *outerHtml = NULL; @@ -274,19 +266,6 @@ char *getInnerHtml(char *text, struct tag *t) return innerHtml; } -/* void saveInnerHtml(struct tag *tag, char *text) -{ - int o = 0; - for (int i=tag->_innerHtmlBeginOffset; i<tag->_innerHtmlEndOffset; i++) - { - tag->innerHtml = realloc(tag->innerHtml, (o+1) * sizeof(char)); - tag->innerHtml[o] = text[i]; - o++; - } - tag->innerHtml = realloc(tag->innerHtml, (o+1) * sizeof(char)); - tag->innerHtml[o] = 0; -} */ - void setInnerHtmlEndOffset(struct tag *closedTag, char *text, size_t off) { int i = off; @@ -322,9 +301,9 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis } else { - // char *the_codepoint = cpToChars(cp, ret); - // printf("cp: %02X, %s, %s\n", cp, the_codepoint, stateToString(state)); - // free(the_codepoint); + char *the_codepoint = cpToChars(cp, ret); + printf("cp: %02X, %s, %s\n", cp, the_codepoint, stateToString(state)); + free(the_codepoint); switch (state) { case STATE_INNER_TEXT: @@ -333,7 +312,6 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis state = STATE_TAG; break; } - // stillOpenTag->innerText = stringCat(stillOpenTag->innerText, cpToChars(cp, ret)); break; case STATE_TAG: if (cp == SOLIDUS) @@ -346,8 +324,8 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis state = STATE_COMMENT; break; } - struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList); stillOpenTag = getLastOpenTag(tagList); + struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList); stillOpenTag->children = realloc( stillOpenTag->children, (stillOpenTag->childrenLen+1) * sizeof(struct tag) @@ -362,12 +340,11 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis tag->_innerHtmlBeginOffset = off+1; tag->_isVoidElement = isVoidElement(tag->name); if (tag->_isVoidElement) - { tag->_outerHtmlEndOffset = off+1; - // saveOuterHtml(tag, text); - } if (strcmp(tag->name, "script") 
== 0) state = STATE_SCRIPT; + else if (strcmp(tag->name, "style") == 0) + state = STATE_STYLE; else state = STATE_INNER_TEXT; break; @@ -387,8 +364,6 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis { struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); setInnerHtmlEndOffset(closedTag, text, off); - // saveOuterHtml(closedTag, text); - // saveInnerHtml(closedTag, text); free(endTag); endTag = malloc(sizeof(char)); endTag[0] = 0; @@ -406,9 +381,13 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis if (tag->_isVoidElement) { tag->_outerHtmlEndOffset = off+1; - // saveOuterHtml(tag, text); } - state = STATE_INNER_TEXT; + if (strcmp(tag->name, "script") == 0) + state = STATE_SCRIPT; + else if (strcmp(tag->name, "style") == 0) + state = STATE_STYLE; + else + state = STATE_INNER_TEXT; break; } if (isASCIIWhitespace(cp)) @@ -485,9 +464,13 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis if (tag->_isVoidElement) { tag->_outerHtmlEndOffset = off+1; - // saveOuterHtml(tag, text); } - state = STATE_INNER_TEXT; + if (strcmp(tag->name, "script") == 0) + state = STATE_SCRIPT; + else if (strcmp(tag->name, "style") == 0) + state = STATE_STYLE; + else + state = STATE_INNER_TEXT; break; } if ( @@ -516,6 +499,33 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis else hyphenCount = 0; break; + case STATE_STYLE: + if (cp == LESS_THAN_SIGN) + { + state = STATE_STYLE_POSSIBLE_END_TAG; + break; + } + break; + case STATE_STYLE_POSSIBLE_END_TAG: + if (cp == SOLIDUS) + state = STATE_STYLE_END_TAG; + else + state = STATE_STYLE; + break; + case STATE_STYLE_END_TAG: + if (cp == GREATER_THAN_SIGN) + { + struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); + setInnerHtmlEndOffset(closedTag, text, off); + free(endTag); + endTag = malloc(sizeof(char)); + endTag[0] = 0; + state = STATE_INNER_TEXT; + break; + } + if 
(!isASCIIWhitespace(cp)) + endTag = stringCat(endTag, cpToChars(cp, ret)); + break; case STATE_SCRIPT: if (cp == LESS_THAN_SIGN) { @@ -534,8 +544,6 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis { struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); setInnerHtmlEndOffset(closedTag, text, off); - // saveOuterHtml(closedTag, text); - // saveInnerHtml(closedTag, text); free(endTag); endTag = malloc(sizeof(char)); endTag[0] = 0; @@ -688,6 +696,10 @@ void filterHtml(char *text, struct filter_opts *opts) if (len) text = text + len; struct tag *rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList); + for (int i=0; i<tagList->len; i++) + { + printf("%s\n", tagList->tags[i]->name); + } findTag(rootTag, opts, foundTags); printResult(text, opts, foundTags); // printHtml(rootTag, 0); diff --git a/html.h b/html.h @@ -73,7 +73,10 @@ enum state STATE_COMMENT, STATE_SCRIPT, STATE_SCRIPT_POSSIBLE_END_TAG, - STATE_SCRIPT_END_TAG + STATE_SCRIPT_END_TAG, + STATE_STYLE, + STATE_STYLE_POSSIBLE_END_TAG, + STATE_STYLE_END_TAG }; enum attr_value_syntax diff --git a/lib.c b/lib.c @@ -77,3 +77,37 @@ char *trim(char *text) free(text); return trimmedText; } + +// Do not use for reading from a socket fd +bool tryRead(char *buf, FILE *fp) +{ + size_t bytesRead = fread(buf, 1, 1, fp); + if (feof(fp) != 0) + return false; + if (ferror(fp) != 0) + tryRead(buf, fp); + if (bytesRead != 1) + tryRead(buf, fp); + return true; +} + +char *readFile(FILE *fp) +{ + char *text = NULL; + int i = 0; + char buf; + while (1) + { + if (tryRead(&buf, fp)) + { + text = realloc(text, (i+1) * sizeof(char)); + text[i] = buf; + i++; + } + else + break; + } + text = realloc(text, (i+1) * sizeof(char)); + text[i] = 0; + return text; +}