htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 445f2fb06c2b8cac70ed2e4496d78f9b5523e3ee
parent 18e781e88b49526a72b1405d9b0baeeb13e4fc58
Author: Robin <kroekerrobin@gmail.com>
Date:   Wed, 16 Aug 2023 11:28:12 +0200

Support --except argument and get attr value

Diffstat:
M htex.c | 45 ++++++++++++++++++++++++++++-----------------
M html.c | 82 +++++++++++++++++++++++++++++++++++--------------------------------------------
M html.h | 3 ++-
M todo | 1 -
4 files changed, 66 insertions(+), 65 deletions(-)

diff --git a/htex.c b/htex.c @@ -128,37 +128,48 @@ void freeOpts(struct find_opts *opt) free(opt); } +enum output_type parseOutputArg(char *arg) +{ + if (arg == NULL) + return OUT_OUTER_HTML; + if (strcmp(arg, "innerhtml") == 0) + return OUT_INNER_HTML; + if (strcmp(arg, "innertext") == 0) + return OUT_INNER_TEXT; + if (strcmp(arg, "attr_value") == 0) + return OUT_ATTR_VALUE; + return -1; +} + int main(int argc, char *argv[]) { int o = 0; int option_index = 0; - bool isInnerHtml = false; - bool isInnerText = false; + char *output = NULL; bool isExcept = false; char *text = NULL; char *searchPattern = NULL; static struct option long_options[] = { - { "innerhtml", no_argument, 0, 'i' }, - { "innertext", no_argument, 0, 't' }, + { "output", required_argument, 0, 'o' }, { "except", no_argument, 0, 'e' }, { 0, 0, 0, 0 } }; - while ((o = getopt_long(argc, argv, "ite", long_options, &option_index)) != -1) { + while ((o = getopt_long(argc, argv, "o:e", long_options, &option_index)) != -1) { switch(o) { - case 'i': - isInnerHtml = true; - break; - case 't': - isInnerText = true; + case 'o': + output = realloc(output, (strlen(optarg)+1) * sizeof(char)); + strcpy(output, optarg); break; case 'e': isExcept = true; break; } } - if (isInnerHtml && isInnerText) + enum output_type out = parseOutputArg(output); + if (out == -1) { - fprintf(stderr, "Provide either --innerhtml or --innertext.\n"); + fprintf(stderr, "Provide a valid output type!\n"); + free(output); return -1; } if (argc == optind) @@ -190,17 +201,17 @@ int main(int argc, char *argv[]) fclose(fp); if (strlen(text) == 0) { - printf("No data in file.\n"); + fprintf(stderr, "No data in file.\n"); + free(output); + free(text); return 0; } } struct find_opts *options = parseFilterOpts(searchPattern); + options->out = out; options->isExcept = isExcept; - if (isInnerHtml) - options->out = OUT_INNER_HTML; - if (isInnerText) - options->out = OUT_INNER_TEXT; filterHtml(text, options); + free(output); freeOpts(options); 
free(text); return 0; diff --git a/html.c b/html.c @@ -676,48 +676,6 @@ void printHtml(struct tag *t, int indent) } } -void printTag(char *text, struct tag *t, enum output_type out, struct tag_list *foundTags) -{ - switch (out) - { - case OUT_INNER_HTML: - break; - case OUT_OUTER_HTML: - break; - case OUT_INNER_TEXT: - break; - } - /* unsigned int p = *(unsigned int *)t; - unsigned int cp; - bool isMatch = false; - for (int i=0; i<foundTags->len; i++) - { - cp = *(unsigned int *)foundTags->tags[i]; - if (p == cp) - isMatch = true; - } - if (!isMatch) - { - char *trimmedText = NULL; - switch (out) - { - case OUT_INNER_HTML: - trimmedText = trim(getInnerHtml(text, t)); - break; - case OUT_OUTER_HTML: - trimmedText = trim(getOuterHtml(text, t)); - break; - } - if (strlen(trimmedText) > 0) - printf("%s\n", trimmedText); - free(trimmedText); - } */ - for (int i=t->childrenLen-1; i>-1; i--) - { - printTag(text, t->children[i], out, foundTags); - } -} - void printResult ( char *text, @@ -728,7 +686,21 @@ void printResult { if (opts->isExcept) { - // printTag(text, rootTag, opts->out, foundTags); + bool isMatch = false; + for (int i=0; i<strlen(text); i++) + { + isMatch = false; + for (int k=0; k<foundTags->len; k++) + { + if ( + foundTags->tags[k]->_outerHtmlBeginOffset <= i && + foundTags->tags[k]->_outerHtmlEndOffset > i + ) + isMatch = true; + } + if (!isMatch) + putchar(text[i]); + } } else { @@ -751,10 +723,28 @@ void printResult case OUT_INNER_TEXT: trimmedText = trim(foundTags->tags[i]->innerText); break; + case OUT_ATTR_VALUE: + if (strlen(opts->key) > 0 && strlen(opts->tag) > 0) + { + for (int k=0; k<foundTags->tags[i]->attrsLen; k++) + { + if (strcmp(foundTags->tags[i]->attrs[k]->name, opts->key) == 0) + printf("%s\n", foundTags->tags[i]->attrs[k]->value); + } + } + else if (strlen(opts->tag) > 0) + { + for (int k=0; k<foundTags->tags[i]->attrsLen; k++) + printf("%s\n", foundTags->tags[i]->attrs[k]->value); + } + break; + } + if (trimmedText) + { + if 
(strlen(trimmedText) > 0) + printf("%s\n", trimmedText); + free(trimmedText); } - if (strlen(trimmedText) > 0) - printf("%s\n", trimmedText); - free(trimmedText); } } } diff --git a/html.h b/html.h @@ -24,7 +24,8 @@ enum output_type { OUT_INNER_HTML, OUT_OUTER_HTML, - OUT_INNER_TEXT + OUT_INNER_TEXT, + OUT_ATTR_VALUE }; struct find_opts diff --git a/todo b/todo @@ -1 +0,0 @@ -support --except argument