htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit de3711d4d4300d4b5294ec1a8ea4aac4994febde
parent 7ee9e2ffcc747ef4a0a14eeec8f55402d4653ca6
Author: Robin <kroekerrobin@gmail.com>
Date:   Thu, 10 Aug 2023 19:24:35 +0200

Don't save inner and outer html, only the offsets

Diffstat:
M Makefile | 4 ++--
M htex.c | 10 +---------
M html.c | 161 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
M html.h | 7 ++++---
M lib.c | 1 +
M todo | 1 +
6 files changed, 121 insertions(+), 63 deletions(-)

diff --git a/Makefile b/Makefile @@ -2,9 +2,9 @@ PREFIX = /usr/local MANPREFIX = $(PREFIX)/share/man all: - $(CC) -O -Werror -o htex htex.c -lgrapheme + $(CC) -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme debug: - $(CC) -fsanitize=address -O -Werror -o htex htex.c -lgrapheme + $(CC) -fsanitize=address -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme clean: rm htex install: all diff --git a/htex.c b/htex.c @@ -52,9 +52,7 @@ struct filter_opts *parseFilterOpts(const char *pattern) opt->attr[0] = 0; opt->key = malloc(sizeof(char)); opt->key[0] = 0; - char *classValue = NULL; bool isClassValue = false; - char *idValue = NULL; bool isIdValue = false; int i = 0; bool isAttrKey = false; @@ -73,7 +71,6 @@ struct filter_opts *parseFilterOpts(const char *pattern) isIdValue = true; i = 1; break; - default: } for (; i<strlen(pattern); i++) { @@ -170,22 +167,17 @@ int main(int argc, char *argv[]) int o = 0; int option_index = 0; bool isInnerHtml = false; - bool isExcept = false; char *text = NULL; char *searchPattern = NULL; static struct option long_options[] = { { "innerhtml", no_argument, 0, 'i' }, - { "except", no_argument, 0, 'e' }, { 0, 0, 0, 0 } }; - while ((o = getopt_long(argc, argv, "ie", long_options, &option_index)) != -1) { + while ((o = getopt_long(argc, argv, "i", long_options, &option_index)) != -1) { switch(o) { case 'i': isInnerHtml = true; break; - case 'e': - isExcept = true; - break; } } if (argc == optind) diff --git a/html.c b/html.c @@ -11,7 +11,11 @@ const char *stateToString(enum state s) case STATE_ATTR_NAME: return "STATE_ATTR_NAME"; case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE"; case STATE_COMMENT: return "STATE_COMMENT"; + case STATE_SCRIPT: return "STATE_SCRIPT"; + case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG"; + case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG"; } + return ""; } struct attr *initAttr() @@ -31,8 +35,6 @@ struct tag *initTag() t->name[0] = 0; t->innerText = malloc(sizeof(char)); 
t->innerText[0] = 0; - t->innerHtml = NULL; - t->outerHtml = NULL; t->attrs = NULL; t->children = NULL; t->attrsLen = 0; @@ -42,6 +44,14 @@ struct tag *initTag() return t; } +struct tag_list *initTagList() +{ + struct tag_list *t = malloc(sizeof(struct tag_list)); + t->tags = NULL; + t->len = 0; + return t; +} + static inline bool isASCIIDigit(uint_least32_t cp) { if (cp >= 0x30 && cp <= 0x39) @@ -172,10 +182,9 @@ isValidUnquotedAttrValue(uint_least32_t cp) return true; } -size_t parseDOCTYPE(const char *text) +size_t parseDoctype(const char *text) { char *firstLine = NULL; - char c; int i = 0; while (text[i] != '\n') { @@ -210,6 +219,7 @@ struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, s return tagList->tags[i]; } } + return NULL; } struct tag *getLastOpenTag(struct tag_list *tagList) @@ -222,11 +232,7 @@ struct tag *getLastOpenTag(struct tag_list *tagList) return tagList->tags[0]; } -/* char *getInnerHtml(struct tag *tag) -{ -} */ - -void saveOuterHtml(struct tag *tag, char *text) +/* void saveOuterHtml(struct tag *tag, char *text) { int o = 0; for (int i=tag->_outerHtmlBeginOffset; i<tag->_outerHtmlEndOffset; i++) @@ -237,9 +243,38 @@ void saveOuterHtml(struct tag *tag, char *text) } tag->outerHtml = realloc(tag->outerHtml, (o+1) * sizeof(char)); tag->outerHtml[o] = 0; +} */ +char *getOuterHtml(char *text, struct tag *t) +{ + char *outerHtml = NULL; + int o = 0; + for (int i=t->_outerHtmlBeginOffset; i<t->_outerHtmlEndOffset; i++) + { + outerHtml = realloc(outerHtml, (o+1) * sizeof(char)); + outerHtml[o] = text[i]; + o++; + } + outerHtml = realloc(outerHtml, (o+1) * sizeof(char)); + outerHtml[o] = 0; + return outerHtml; +} + +char *getInnerHtml(char *text, struct tag *t) +{ + char *innerHtml = NULL; + int o = 0; + for (int i=t->_innerHtmlBeginOffset; i<t->_innerHtmlEndOffset; i++) + { + innerHtml = realloc(innerHtml, (o+1) * sizeof(char)); + innerHtml[o] = text[i]; + o++; + } + innerHtml = realloc(innerHtml, (o+1) * sizeof(char)); 
+ innerHtml[o] = 0; + return innerHtml; } -void saveInnerHtml(struct tag *tag, char *text) +/* void saveInnerHtml(struct tag *tag, char *text) { int o = 0; for (int i=tag->_innerHtmlBeginOffset; i<tag->_innerHtmlEndOffset; i++) @@ -250,7 +285,7 @@ void saveInnerHtml(struct tag *tag, char *text) } tag->innerHtml = realloc(tag->innerHtml, (o+1) * sizeof(char)); tag->innerHtml[o] = 0; -} +} */ void setInnerHtmlEndOffset(struct tag *closedTag, char *text, size_t off) { @@ -274,7 +309,6 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis endTag[0] = 0; size_t a = 0; size_t attrNameCount = 0; - size_t attrValueCount = 0; enum attr_value_syntax attrValueSyntax = AVS_NO; size_t hyphenCount = 0; uint_least32_t cp; @@ -299,8 +333,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis state = STATE_TAG; break; } - stillOpenTag = getLastOpenTag(tagList); - stillOpenTag->innerText = stringCat(stillOpenTag->innerText, cpToChars(cp, ret)); + // stillOpenTag->innerText = stringCat(stillOpenTag->innerText, cpToChars(cp, ret)); break; case STATE_TAG: if (cp == SOLIDUS) @@ -313,11 +346,12 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis state = STATE_COMMENT; break; } + struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList); + stillOpenTag = getLastOpenTag(tagList); stillOpenTag->children = realloc( stillOpenTag->children, (stillOpenTag->childrenLen+1) * sizeof(struct tag) ); - struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList); stillOpenTag->children[stillOpenTag->childrenLen] = oneTag; stillOpenTag->childrenLen++; free(endTag); @@ -330,9 +364,12 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis if (tag->_isVoidElement) { tag->_outerHtmlEndOffset = off+1; - saveOuterHtml(tag, text); + // saveOuterHtml(tag, text); } - state = STATE_INNER_TEXT; + if (strcmp(tag->name, "script") == 0) + state = STATE_SCRIPT; + else + 
state = STATE_INNER_TEXT; break; } if (isASCIIWhitespace(cp)) @@ -350,8 +387,8 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis { struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); setInnerHtmlEndOffset(closedTag, text, off); - saveOuterHtml(closedTag, text); - saveInnerHtml(closedTag, text); + // saveOuterHtml(closedTag, text); + // saveInnerHtml(closedTag, text); free(endTag); endTag = malloc(sizeof(char)); endTag[0] = 0; @@ -369,7 +406,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis if (tag->_isVoidElement) { tag->_outerHtmlEndOffset = off+1; - saveOuterHtml(tag, text); + // saveOuterHtml(tag, text); } state = STATE_INNER_TEXT; break; @@ -448,7 +485,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis if (tag->_isVoidElement) { tag->_outerHtmlEndOffset = off+1; - saveOuterHtml(tag, text); + // saveOuterHtml(tag, text); } state = STATE_INNER_TEXT; break; @@ -469,7 +506,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis } break; case STATE_COMMENT: - if (cp == GREATER_THAN_SIGN && hyphenCount == 2) + if (cp == GREATER_THAN_SIGN && hyphenCount >= 2) { state = STATE_INNER_TEXT; break; @@ -479,18 +516,46 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis else hyphenCount = 0; break; + case STATE_SCRIPT: + if (cp == LESS_THAN_SIGN) + { + state = STATE_SCRIPT_POSSIBLE_END_TAG; + break; + } + break; + case STATE_SCRIPT_POSSIBLE_END_TAG: + if (cp == SOLIDUS) + state = STATE_SCRIPT_END_TAG; + else + state = STATE_SCRIPT; + break; + case STATE_SCRIPT_END_TAG: + if (cp == GREATER_THAN_SIGN) + { + struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); + setInnerHtmlEndOffset(closedTag, text, off); + // saveOuterHtml(closedTag, text); + // saveInnerHtml(closedTag, text); + free(endTag); + endTag = malloc(sizeof(char)); + endTag[0] = 0; + state = STATE_INNER_TEXT; + 
break; + } + if (!isASCIIWhitespace(cp)) + endTag = stringCat(endTag, cpToChars(cp, ret)); + break; } } } free(endTag); + return tag; } void freeTag(struct tag *t) { free(t->name); free(t->innerText); - free(t->innerHtml); - free(t->outerHtml); for (int i=0; i<t->attrsLen; i++) { free(t->attrs[i]->name); @@ -513,15 +578,6 @@ void freeTagList(struct tag_list *t) free(t); } -void printHtml(struct tag *t) -{ - printf("name: %s\n", t->name); - for (int i=0; i<t->childrenLen; i++) - { - printHtml(t->children[i]); - } -} - void findTag(struct tag *tag, struct filter_opts *opt, struct tag_list *foundTags) { bool matchesTag = false; @@ -596,17 +652,29 @@ void findTag(struct tag *tag, struct filter_opts *opt, struct tag_list *foundTag } } -void printResult(struct tag_list *foundTags, struct filter_opts *opts) +void printHtml(struct tag *t, int indent) +{ + for (int i=0; i<indent; i++) + putchar(' '); + printf("%s\n", t->name); + indent++; + for (int i=t->childrenLen-1; i>-1; i--) + { + printHtml(t->children[i], indent); + } +} + +void printResult(char *text, struct filter_opts *opts, struct tag_list *foundTags) { - char *trimmedOutput; + char *trimmedOutput = NULL; for (int i=0; i<foundTags->len; i++) { if (foundTags->tags[i]->_isVoidElement) opts->out = OUT_OUTER_HTML; if (opts->out == OUT_OUTER_HTML) - trimmedOutput = trim(foundTags->tags[i]->outerHtml); + trimmedOutput = trim(getOuterHtml(text, foundTags->tags[i])); else if (opts->out == OUT_INNER_HTML) - trimmedOutput = trim(foundTags->tags[i]->innerHtml); + trimmedOutput = trim(getInnerHtml(text, foundTags->tags[i])); printf("%s\n", trimmedOutput); free(trimmedOutput); } @@ -614,20 +682,15 @@ void printResult(struct tag_list *foundTags, struct filter_opts *opts) void filterHtml(char *text, struct filter_opts *opts) { - struct tag *rootTag; - struct tag_list *tagList = malloc(sizeof(struct tag_list)); - tagList->tags = NULL; - tagList->len = 0; - size_t len = parseDOCTYPE(text); + struct tag_list *tagList = 
initTagList(); + struct tag_list *foundTags = initTagList(); + size_t len = parseDoctype(text); if (len) - rootTag = parseTag(text+len, 0, STATE_INNER_TEXT, tagList); - else - rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList); - struct tag_list *foundTags = malloc(sizeof(struct tag_list)); - foundTags->tags = NULL; - foundTags->len = 0; + text = text + len; + struct tag *rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList); findTag(rootTag, opts, foundTags); - printResult(foundTags, opts); + printResult(text, opts, foundTags); + // printHtml(rootTag, 0); freeTag(rootTag); freeTagList(tagList); freeTagList(foundTags); diff --git a/html.h b/html.h @@ -44,8 +44,6 @@ struct tag { char *name; char *innerText; - char *innerHtml; - char *outerHtml; struct attr **attrs; struct tag **children; size_t attrsLen; @@ -72,7 +70,10 @@ enum state STATE_END_TAG_NAME, STATE_ATTR_NAME, STATE_ATTR_VALUE, - STATE_COMMENT + STATE_COMMENT, + STATE_SCRIPT, + STATE_SCRIPT_POSSIBLE_END_TAG, + STATE_SCRIPT_END_TAG }; enum attr_value_syntax diff --git a/lib.c b/lib.c @@ -74,5 +74,6 @@ char *trim(char *text) } trimmedText = realloc(trimmedText, (k+1) * sizeof(char)); trimmedText[k] = 0; + free(text); return trimmedText; } diff --git a/todo b/todo @@ -0,0 +1 @@ +only save offsets for inner and outer html and don't save it