htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 0240fb1173b0287cc14983e94c41003beba9f3b6
parent bcc4ec1b9dcc6d5133ad5be90a95c8de364c88df
Author: Robin <kroekerrobin@gmail.com>
Date:   Mon,  7 Aug 2023 21:45:42 +0200

Add support for parsing outerHtml

Diffstat:
M html.c | 45 +++++++++++++++++++++++++++++++++++++--------
M html.h | 4 ++++
M lib.c | 8 ++++++--
3 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/html.c b/html.c @@ -31,6 +31,8 @@ struct tag *initTag() t->name[0] = 0; t->innerText = malloc(sizeof(char)); t->innerText[0] = 0; + t->innerHtml = NULL; + t->outerHtml = NULL; t->attrs = NULL; t->children = NULL; t->attrsLen = 0; @@ -197,12 +199,16 @@ size_t parseDOCTYPE(const char *text) return 0; } -void closeLastUnclosedTag(struct tag_list *tagList, const char *endTag) +struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, size_t endOffset) { for (int i=tagList->len-1; i>-1; i--) { if (strcmp(tagList->tags[i]->name, endTag) == 0) + { tagList->tags[i]->_isClosed = true; + tagList->tags[i]->_endOffset = endOffset; + return tagList->tags[i]; + } } } @@ -216,9 +222,23 @@ struct tag *getLastOpenTag(struct tag_list *tagList) return tagList->tags[0]; } -struct tag *parseTag(const char *text, enum state state, struct tag_list *tagList) +void saveOuterAndInnerHtml(struct tag *tag, char *text) +{ + int o = 0; + for (int i=tag->_beginOffset; i<tag->_endOffset; i++) + { + tag->outerHtml = realloc(tag->outerHtml, (o+1) * sizeof(char)); + tag->outerHtml[o] = text[i]; + o++; + } + tag->outerHtml = realloc(tag->outerHtml, (o+1) * sizeof(char)); + tag->outerHtml[o] = 0; +} + +struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_list *tagList) { struct tag *tag = initTag(); + tag->_beginOffset = offset-1; tagList->tags = realloc(tagList->tags, (tagList->len+1) * sizeof(struct tag)); tagList->tags[tagList->len] = tag; tagList->len++; @@ -232,7 +252,7 @@ struct tag *parseTag(const char *text, enum state state, struct tag_list *tagLis uint_least32_t cp; size_t len = strlen(text); size_t ret, off; - for (off = 0; off<len; off += ret) + for (off = offset; off<len; off += ret) { if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { @@ -269,7 +289,7 @@ struct tag *parseTag(const char *text, enum state state, struct tag_list *tagLis stillOpenTag->children, (stillOpenTag->childrenLen+1) * sizeof(struct tag) ); - 
struct tag *oneTag = parseTag(text+off, STATE_BEGIN_TAG_NAME, tagList); + struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList); stillOpenTag->children[stillOpenTag->childrenLen] = oneTag; stillOpenTag->childrenLen++; free(endTag); @@ -294,7 +314,8 @@ struct tag *parseTag(const char *text, enum state state, struct tag_list *tagLis case STATE_END_TAG_NAME: if (cp == GREATER_THAN_SIGN) { - closeLastUnclosedTag(tagList, endTag); + struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); + saveOuterAndInnerHtml(closedTag, text); free(endTag); endTag = malloc(sizeof(char)); endTag[0] = 0; @@ -407,6 +428,8 @@ void freeTag(struct tag *t) { free(t->name); free(t->innerText); + free(t->innerHtml); + free(t->outerHtml); for (int i=0; i<t->attrsLen; i++) { free(t->attrs[i]->name); @@ -489,7 +512,7 @@ struct tag *findTag(struct tag *tag, struct tag_list *list, struct filter_opts * return NULL; } -void filterHtml(const char *text, struct filter_opts *opts) +void filterHtml(char *text, struct filter_opts *opts) { struct tag *rootTag; struct tag_list *tagList = malloc(sizeof(struct tag_list)); @@ -497,14 +520,20 @@ void filterHtml(const char *text, struct filter_opts *opts) tagList->len = 0; size_t len = parseDOCTYPE(text); if (len) - rootTag = parseTag(text+len, STATE_INNER_TEXT, tagList); + rootTag = parseTag(text+len, 0, STATE_INNER_TEXT, tagList); else - rootTag = parseTag(text, STATE_INNER_TEXT, tagList); + rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList); struct tag *result = findTag(rootTag, tagList, opts); if (result == NULL) printError("No tag found."); else + { printf("result: %s\n", result->name); + if (!result->_isVoidElement) + { + printf("outerHtml: %s\n", result->outerHtml); + } + } freeTag(rootTag); freeTagList(tagList); } diff --git a/html.h b/html.h @@ -36,12 +36,16 @@ struct tag { char *name; char *innerText; + char *innerHtml; + char *outerHtml; struct attr **attrs; struct tag **children; size_t attrsLen; size_t 
childrenLen; bool _isVoidElement; // means there is no closing tag bool _isClosed; + size_t _beginOffset; + size_t _endOffset; }; struct tag_list diff --git a/lib.c b/lib.c @@ -1,7 +1,11 @@ char *stringCat(char *str1,char *str2) { - int str1Len = strlen(str1); - int str2Len = strlen(str2); + int str1Len = 0; + int str2Len = 0; + if (str1) + str1Len = strlen(str1); + if (str2) + str2Len = strlen(str2); char *string = malloc((str1Len+str2Len+1) * sizeof(char)); int i = 0; int k = 0;