htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 3bb5b904098dd95d37006dfef9bc9316cab2dc6d
parent 8a72c1ae03282373a398d2ed7ff0473cc1061352
Author: Robin <kroekerrobin@gmail.com>
Date:   Sat, 12 Aug 2023 08:06:08 +0200

Support --except argument

Diffstat:
M htex.c | 8 +++++++-
M html.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M html.h | 2 +-
3 files changed, 67 insertions(+), 16 deletions(-)

diff --git a/htex.c b/htex.c @@ -133,17 +133,22 @@ int main(int argc, char *argv[]) int o = 0; int option_index = 0; bool isInnerHtml = false; + bool isExcept = false; char *text = NULL; char *searchPattern = NULL; static struct option long_options[] = { { "innerhtml", no_argument, 0, 'i' }, + { "except", no_argument, 0, 'e' }, { 0, 0, 0, 0 } }; - while ((o = getopt_long(argc, argv, "i", long_options, &option_index)) != -1) { + while ((o = getopt_long(argc, argv, "ie", long_options, &option_index)) != -1) { switch(o) { case 'i': isInnerHtml = true; break; + case 'e': + isExcept = true; + break; } } if (argc == optind) @@ -180,6 +185,7 @@ int main(int argc, char *argv[]) } } struct filter_opts *options = parseFilterOpts(searchPattern); + options->isExcept = isExcept; if (isInnerHtml) options->out = OUT_INNER_HTML; filterHtml(text, options); diff --git a/html.c b/html.c @@ -36,8 +36,6 @@ struct tag *initTag() struct tag *t = malloc(sizeof(struct tag)); t->name = malloc(sizeof(char)); t->name[0] = 0; - t->innerText = malloc(sizeof(char)); - t->innerText[0] = 0; t->attrs = NULL; t->children = NULL; t->attrsLen = 0; @@ -564,7 +562,6 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis void freeTag(struct tag *t) { free(t->name); - free(t->innerText); for (int i=0; i<t->attrsLen; i++) { free(t->attrs[i]->name); @@ -673,19 +670,67 @@ void printHtml(struct tag *t, int indent) } } -void printResult(char *text, struct filter_opts *opts, struct tag_list *foundTags) +void printTag(char *text, struct tag *t, enum output_type out, struct tag_list *foundTags) { - char *trimmedOutput = NULL; + unsigned int p = *(unsigned int *)t; + unsigned int cp; + bool isMatch = false; for (int i=0; i<foundTags->len; i++) { - if (foundTags->tags[i]->_isVoidElement) - opts->out = OUT_OUTER_HTML; - if (opts->out == OUT_OUTER_HTML) - trimmedOutput = trim(getOuterHtml(text, foundTags->tags[i])); - else if (opts->out == OUT_INNER_HTML) - trimmedOutput = 
trim(getInnerHtml(text, foundTags->tags[i])); - printf("%s\n", trimmedOutput); - free(trimmedOutput); + cp = *(unsigned int *)foundTags->tags[i]; + if (p == cp) + isMatch = true; + } + if (!isMatch) + { + char *trimmedText = NULL; + switch (out) + { + case OUT_INNER_HTML: + trimmedText = trim(getInnerHtml(text, t)); + break; + case OUT_OUTER_HTML: + trimmedText = trim(getOuterHtml(text, t)); + break; + } + if (strlen(trimmedText) > 0) + printf("%s\n", trimmedText); + free(trimmedText); + } + for (int i=t->childrenLen-1; i>-1; i--) + { + printTag(text, t->children[i], out, foundTags); + } +} + +void printResult +( + char *text, + struct tag *rootTag, + struct filter_opts *opts, + struct tag_list *foundTags +) +{ + if (opts->isExcept) + printTag(text, rootTag, opts->out, foundTags); + else + { + char *trimmedText = NULL; + for (int i=0; i<foundTags->len; i++) + { + switch (opts->out) + { + case OUT_INNER_HTML: + trimmedText = trim(getInnerHtml(text, foundTags->tags[i])); + break; + case OUT_OUTER_HTML: + trimmedText = trim(getOuterHtml(text, foundTags->tags[i])); + break; + } + if (strlen(trimmedText) > 0) + printf("%s\n", trimmedText); + free(trimmedText); + } } } @@ -698,7 +743,7 @@ void filterHtml(char *text, struct filter_opts *opts) text = text + len; struct tag *rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList); findTag(rootTag, opts, foundTags); - printResult(text, opts, foundTags); + printResult(text, rootTag, opts, foundTags); freeTag(rootTag); freeTagList(tagList); freeTagList(foundTags); diff --git a/html.h b/html.h @@ -32,6 +32,7 @@ struct filter_opts char *attr; char *key; enum output_type out; + bool isExcept; }; struct attr @@ -43,7 +44,6 @@ struct attr struct tag { char *name; - char *innerText; struct attr **attrs; struct tag **children; size_t attrsLen;