htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 0e33554f458efcf0b45d3985db29a333c2c35385
parent 0240fb1173b0287cc14983e94c41003beba9f3b6
Author: Robin <kroekerrobin@gmail.com>
Date:   Tue,  8 Aug 2023 21:30:09 +0200

Support outerHtml for void elements

Diffstat:
M htex.c | 7 +++++++
M html.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
M html.h | 8 ++++++--
M todo | 7 ++-----
4 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/htex.c b/htex.c @@ -45,6 +45,8 @@ char *readFile(FILE *fp) struct filter_opts *parseFilterOpts(const char *pattern) { struct filter_opts *opt = malloc(sizeof(struct filter_opts)); + opt->outerHtml = true; + opt->innerHtml = false; opt->tag = malloc(sizeof(char)); opt->tag[0] = 0; opt->attr = malloc(sizeof(char)); @@ -221,6 +223,11 @@ int main(int argc, char *argv[]) } } struct filter_opts *options = parseFilterOpts(searchPattern); + if (isInnerHtml) + { + options->innerHtml = true; + options->outerHtml = false; + } filterHtml(text, options); freeOpts(options); free(text); diff --git a/html.c b/html.c @@ -158,9 +158,9 @@ static inline bool isValidUnquotedAttrValue(uint_least32_t cp) { /* - Not mentioned invalid characters - are already handled before funtion - call. + Not mentioned invalid characters. + They are already handled before + funtion call. */ if ( cp == EQUALS_SIGN || @@ -206,7 +206,7 @@ struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, s if (strcmp(tagList->tags[i]->name, endTag) == 0) { tagList->tags[i]->_isClosed = true; - tagList->tags[i]->_endOffset = endOffset; + tagList->tags[i]->_outerHtmlEndOffset = endOffset; return tagList->tags[i]; } } @@ -222,10 +222,14 @@ struct tag *getLastOpenTag(struct tag_list *tagList) return tagList->tags[0]; } -void saveOuterAndInnerHtml(struct tag *tag, char *text) +/* char *getInnerHtml(struct tag *tag) +{ +} */ + +void saveOuterHtml(struct tag *tag, char *text) { int o = 0; - for (int i=tag->_beginOffset; i<tag->_endOffset; i++) + for (int i=tag->_outerHtmlBeginOffset; i<tag->_outerHtmlEndOffset; i++) { tag->outerHtml = realloc(tag->outerHtml, (o+1) * sizeof(char)); tag->outerHtml[o] = text[i]; @@ -235,10 +239,33 @@ void saveOuterAndInnerHtml(struct tag *tag, char *text) tag->outerHtml[o] = 0; } +void saveInnerHtml(struct tag *tag, char *text) +{ + int o = 0; + for (int i=tag->_innerHtmlBeginOffset; i<tag->_innerHtmlEndOffset; i++) + { + tag->innerHtml = 
realloc(tag->innerHtml, (o+1) * sizeof(char)); + tag->innerHtml[o] = text[i]; + o++; + } + tag->innerHtml = realloc(tag->innerHtml, (o+1) * sizeof(char)); + tag->innerHtml[o] = 0; +} + +void setInnerHtmlEndOffset(struct tag *closedTag, char *text, size_t off) +{ + int i = off; + while (text[i] != '<') + { + i--; + } + closedTag->_innerHtmlEndOffset = i; +} + struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_list *tagList) { struct tag *tag = initTag(); - tag->_beginOffset = offset-1; + tag->_outerHtmlBeginOffset= offset-1; tagList->tags = realloc(tagList->tags, (tagList->len+1) * sizeof(struct tag)); tagList->tags[tagList->len] = tag; tagList->len++; @@ -297,7 +324,13 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis case STATE_BEGIN_TAG_NAME: if (cp == GREATER_THAN_SIGN) { + tag->_innerHtmlBeginOffset = off+1; tag->_isVoidElement = isVoidElement(tag->name); + if (tag->_isVoidElement) + { + tag->_outerHtmlEndOffset = off+1; + saveOuterHtml(tag, text); + } state = STATE_INNER_TEXT; break; } @@ -315,7 +348,9 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis if (cp == GREATER_THAN_SIGN) { struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); - saveOuterAndInnerHtml(closedTag, text); + setInnerHtmlEndOffset(closedTag, text, off); + saveOuterHtml(closedTag, text); + saveInnerHtml(closedTag, text); free(endTag); endTag = malloc(sizeof(char)); endTag[0] = 0; @@ -328,7 +363,13 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis case STATE_ATTR_NAME: if (cp == GREATER_THAN_SIGN) { + tag->_innerHtmlBeginOffset = off+1; tag->_isVoidElement = isVoidElement(tag->name); + if (tag->_isVoidElement) + { + tag->_outerHtmlEndOffset = off+1; + saveOuterHtml(tag, text); + } state = STATE_INNER_TEXT; break; } @@ -399,6 +440,18 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis break; } } + if (cp == 
GREATER_THAN_SIGN) + { + tag->_innerHtmlBeginOffset = off+1; + tag->_isVoidElement = isVoidElement(tag->name); + if (tag->_isVoidElement) + { + tag->_outerHtmlEndOffset = off+1; + saveOuterHtml(tag, text); + } + state = STATE_INNER_TEXT; + break; + } if ( attrValueSyntax == AVS_NO && isValidUnquotedAttrValue(cp) @@ -529,10 +582,12 @@ void filterHtml(char *text, struct filter_opts *opts) else { printf("result: %s\n", result->name); + printf("%ld %ld\n", result->_outerHtmlBeginOffset, result->_outerHtmlEndOffset); if (!result->_isVoidElement) { - printf("outerHtml: %s\n", result->outerHtml); + printf("innerHtml: %s\n", result->innerHtml); } + printf("outerHtml: %s\n", result->outerHtml); } freeTag(rootTag); freeTagList(tagList); diff --git a/html.h b/html.h @@ -24,6 +24,8 @@ struct filter_opts char *tag; char *attr; char *key; + bool innerHtml; + bool outerHtml; }; struct attr @@ -44,8 +46,10 @@ struct tag size_t childrenLen; bool _isVoidElement; // means there is no closing tag bool _isClosed; - size_t _beginOffset; - size_t _endOffset; + size_t _outerHtmlBeginOffset; + size_t _outerHtmlEndOffset; + size_t _innerHtmlBeginOffset; + size_t _innerHtmlEndOffset; }; struct tag_list diff --git a/todo b/todo @@ -1,5 +1,2 @@ -refactor; heavy -implement find_attribute_value_by_* -implement filtering not only by class or id, also like this .test[data="asdf"] -implement finding tags that have no end tag, e.g. the img tag -Actually correctly parse html according to spec ;-) +strip beginning and ending whitespace of inner and outer html +find element by attr value (something's still wrong)