htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 95542daf7c0bd7d289d61c34e6c2522c101e7a12
parent 8b7678b3f56e1b9d1a4fdc58b1d0635bfba473fe
Author: Robin <kroekerrobin@gmail.com>
Date:   Thu,  3 Aug 2023 22:47:22 +0200

Parse innerText correctly

Diffstat:
M html.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 46 insertions(+), 13 deletions(-)

diff --git a/html.c b/html.c @@ -197,18 +197,33 @@ size_t parseDOCTYPE(const char *text) return 0; } -struct tag *parseTag(const char *text, enum state state, struct tag_list **tagList) +void closeLastUnclosedTag(struct tag_list *tagList, const char *endTag) { - struct tag *tag = initTag(); - (*tagList)->tags = realloc((*tagList)->tags, ((*tagList)->len+1) * sizeof(struct tag)); - (*tagList)->tags[(*tagList)->len] = tag; - (*tagList)->len++; - /* printf("tagList: "); - for (int i=0; i<tagList->len; i++) + for (int i=tagList->len-1; i>-1; i--) + { + if (strcmp(tagList->tags[i]->name, endTag) == 0) + tagList->tags[i]->_isClosed = true; + } +} + +struct tag *getLastOpenTag(struct tag_list *tagList) +{ + for (int i=tagList->len-1; i>-1; i--) { - printf("%02X, ", tagList->tags[i]); + if (!tagList->tags[i]->_isVoidElement && !tagList->tags[i]->_isClosed) + return tagList->tags[i]; } - printf("\n"); */ +} + +struct tag *parseTag(const char *text, enum state state, struct tag_list *tagList) +{ + struct tag *tag = initTag(); + tagList->tags = realloc(tagList->tags, (tagList->len+1) * sizeof(struct tag)); + tagList->tags[tagList->len] = tag; + tagList->len++; + struct tag *innerTextTag = tag; + char *endTag = malloc(sizeof(char)); + endTag[0] = 0; size_t a = 0; size_t attrNameCount = 0; size_t attrValueCount = 0; @@ -236,7 +251,8 @@ struct tag *parseTag(const char *text, enum state state, struct tag_list **tagLi state = STATE_TAG; break; } - tag->innerText = stringCat(tag->innerText, cpToChars(cp, ret)); + innerTextTag = getLastOpenTag(tagList); + innerTextTag->innerText = stringCat(innerTextTag->innerText, cpToChars(cp, ret)); break; case STATE_TAG: if (cp == SOLIDUS) @@ -272,7 +288,16 @@ struct tag *parseTag(const char *text, enum state state, struct tag_list **tagLi break; case STATE_END_TAG_NAME: if (cp == GREATER_THAN_SIGN) + { + closeLastUnclosedTag(tagList, endTag); + free(endTag); + endTag = malloc(sizeof(char)); + endTag[0] = 0; state = STATE_INNER_TEXT; + break; + 
} + if (!isASCIIWhitespace(cp)) + endTag = stringCat(endTag, cpToChars(cp, ret)); break; case STATE_ATTR_NAME: if (cp == GREATER_THAN_SIGN) @@ -376,7 +401,6 @@ struct tag *parseTag(const char *text, enum state state, struct tag_list **tagLi void freeTag(struct tag *t) { - printf("freeing a tag...\n"); if (t->name != NULL) free(t->name); if (t->innerText != NULL) @@ -392,6 +416,14 @@ void freeTag(struct tag *t) free(t); } +void printHtml(struct tag *tag, int i) +{ + printf("%d: %s, %s\n", i, tag->name, tag->innerText); + // printf("%*s\n", width + strlen(tag->name), tag->name); + for (int i=0; i<tag->childrenLen; i++) + printHtml(tag->children[i], i); +} + void parseHtml(const char *text) { struct tag *rootTag; @@ -400,9 +432,10 @@ void parseHtml(const char *text) tagList->len = 0; size_t len = parseDOCTYPE(text); if (len) - rootTag = parseTag(text+len, STATE_INNER_TEXT, &tagList); + rootTag = parseTag(text+len, STATE_INNER_TEXT, tagList); else - rootTag = parseTag(text, STATE_INNER_TEXT, &tagList); + rootTag = parseTag(text, STATE_INNER_TEXT, tagList); + printHtml(rootTag, 0); freeTag(rootTag); free(tagList->tags); free(tagList);