htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 0544bcf9a77afced6fe52c018fb1c4758ffde18c
parent f527a413ca2f869e114bee4d0b3db7f55dbfaf21
Author: Robin <kroekerrobin@gmail.com>
Date:   Sun, 13 Aug 2023 16:23:41 +0200

Awake innerText to life

Diffstat:
M htex.c | 14 +++++++++++++-
M html.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
M html.h | 4 +++-
M lib.c | 3 +--
4 files changed, 69 insertions(+), 8 deletions(-)

diff --git a/htex.c b/htex.c @@ -133,24 +133,34 @@ int main(int argc, char *argv[]) int o = 0; int option_index = 0; bool isInnerHtml = false; + bool isInnerText = false; bool isExcept = false; char *text = NULL; char *searchPattern = NULL; static struct option long_options[] = { { "innerhtml", no_argument, 0, 'i' }, + { "innertext", no_argument, 0, 't' }, { "except", no_argument, 0, 'e' }, { 0, 0, 0, 0 } }; - while ((o = getopt_long(argc, argv, "ie", long_options, &option_index)) != -1) { + while ((o = getopt_long(argc, argv, "ite", long_options, &option_index)) != -1) { switch(o) { case 'i': isInnerHtml = true; break; + case 't': + isInnerText = true; + break; case 'e': isExcept = true; break; } } + if (isInnerHtml && isInnerText) + { + fprintf(stderr, "Provide either --innerhtml or --innertext.\n"); + return -1; + } if (argc == optind) { fprintf(stderr, "Provide a search pattern!\n"); @@ -188,6 +198,8 @@ int main(int argc, char *argv[]) options->isExcept = isExcept; if (isInnerHtml) options->out = OUT_INNER_HTML; + if (isInnerText) + options->out = OUT_INNER_TEXT; filterHtml(text, options); freeOpts(options); free(text); diff --git a/html.c b/html.c @@ -36,6 +36,8 @@ struct tag *initTag() struct tag *t = malloc(sizeof(struct tag)); t->name = malloc(sizeof(char)); t->name[0] = 0; + t->innerText = malloc(sizeof(char)); + t->innerText[0] = 0; t->attrs = NULL; t->children = NULL; t->attrsLen = 0; @@ -325,6 +327,8 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis state = STATE_TAG; break; } + stillOpenTag = getLastOpenTag(tagList); + stillOpenTag->innerText = stringCat(stillOpenTag->innerText, cpToChars(cp, ret)); break; case STATE_TAG: if (cp == SOLIDUS) @@ -545,6 +549,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis void freeTag(struct tag *t) { free(t->name); + free(t->innerText); for (int i=0; i<t->attrsLen; i++) { free(t->attrs[i]->name); @@ -655,7 +660,32 @@ void printHtml(struct tag *t, int 
indent) void printTag(char *text, struct tag *t, enum output_type out, struct tag_list *foundTags) { - unsigned int p = *(unsigned int *)t; + // bool doPrint = true; + switch (out) + { + case OUT_INNER_HTML: + /* for (int i=t->_innerHtmlBeginOffset; i<t->_innerHtmlEndOffset; i++) + { + doPrint = true; + for (int k=0; k<foundTags->len; k++) + { + if ( + foundTags->tags[k]->_innerHtmlBeginOffset < i && + foundTags->tags[k]->_innerHtmlEndOffset > i + ) + doPrint = false; + } + if (doPrint) + putchar(text[i]); + } */ + break; + case OUT_OUTER_HTML: + break; + case OUT_INNER_TEXT: + break; + } + putchar('\n'); + /* unsigned int p = *(unsigned int *)t; unsigned int cp; bool isMatch = false; for (int i=0; i<foundTags->len; i++) @@ -679,7 +709,7 @@ void printTag(char *text, struct tag *t, enum output_type out, struct tag_list * if (strlen(trimmedText) > 0) printf("%s\n", trimmedText); free(trimmedText); - } + } */ for (int i=t->childrenLen-1; i>-1; i--) { printTag(text, t->children[i], out, foundTags); @@ -695,19 +725,37 @@ void printResult ) { if (opts->isExcept) + { + /* printf("len: %ld\n", foundTags->len); + char *trimmedText = NULL; + for (int i=0; i<foundTags->len; i++) + { + trimmedText = trim(getOuterHtml(text, foundTags->tags[i])); + printf("%s\n", trimmedText); + free(trimmedText); + } */ printTag(text, rootTag, opts->out, foundTags); + } else { + char *requestedText = NULL; char *trimmedText = NULL; for (int i=0; i<foundTags->len; i++) { switch (opts->out) { case OUT_INNER_HTML: - trimmedText = trim(getInnerHtml(text, foundTags->tags[i])); + requestedText = getInnerHtml(text, foundTags->tags[i]); + trimmedText = trim(requestedText); + free(requestedText); break; case OUT_OUTER_HTML: - trimmedText = trim(getOuterHtml(text, foundTags->tags[i])); + requestedText = getOuterHtml(text, foundTags->tags[i]); + trimmedText = trim(requestedText); + free(requestedText); + break; + case OUT_INNER_TEXT: + trimmedText = trim(foundTags->tags[i]->innerText); break; } if 
(strlen(trimmedText) > 0) diff --git a/html.h b/html.h @@ -23,7 +23,8 @@ const char *voidElements[] = { enum output_type { OUT_INNER_HTML, - OUT_OUTER_HTML + OUT_OUTER_HTML, + OUT_INNER_TEXT }; struct filter_opts @@ -46,6 +47,7 @@ struct tag char *name; struct attr **attrs; struct tag **children; + char *innerText; size_t attrsLen; size_t childrenLen; bool _isVoidElement; // means there is no closing tag diff --git a/lib.c b/lib.c @@ -1,4 +1,4 @@ -char *stringCat(char *str1,char *str2) +char *stringCat(char *str1, char *str2) { int str1Len = 0; int str2Len = 0; @@ -74,7 +74,6 @@ char *trim(char *text) } trimmedText = realloc(trimmedText, (k+1) * sizeof(char)); trimmedText[k] = 0; - free(text); return trimmedText; }