htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit bcc4ec1b9dcc6d5133ad5be90a95c8de364c88df
parent 633e945a287ee1fc0b03d7bab858bbc8e7cdba09
Author: Robin <kroekerrobin@gmail.com>
Date:   Sun,  6 Aug 2023 16:28:57 +0200

Add basic tag finding

syntax: tag[key=value]

Diffstat:
M htex.c | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M html.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M html.h | 14 ++++++++++++++
3 files changed, 208 insertions(+), 31 deletions(-)

diff --git a/htex.c b/htex.c @@ -42,38 +42,143 @@ char *readFile(FILE *fp) return text; } +struct filter_opts *parseFilterOpts(const char *pattern) +{ + struct filter_opts *opt = malloc(sizeof(struct filter_opts)); + opt->tag = malloc(sizeof(char)); + opt->tag[0] = 0; + opt->attr = malloc(sizeof(char)); + opt->attr[0] = 0; + opt->key = malloc(sizeof(char)); + opt->key[0] = 0; + char *classValue = NULL; + bool isClassValue = false; + char *idValue = NULL; + bool isIdValue = false; + int i = 0; + bool isAttrKey = false; + bool isAttrOrTag = true; + char *attrOrTag = NULL; + int aot = 0; + int ak = 0; + int av = 0; + switch (pattern[0]) + { + case '.': + isClassValue = true; + i = 1; + break; + case '#': + isIdValue = true; + i = 1; + break; + default: + } + for (; i<strlen(pattern); i++) + { + if (pattern[i] == ']') + break; + if ( + !isAttrKey && + !isAttrOrTag && + pattern[i] != ']' && + pattern[i] != '"' + ) + { + opt->attr = realloc(opt->attr, (av+1) * sizeof(char)); + opt->attr[av] = pattern[i]; + av++; + } + if (pattern[i] == '=') + isAttrKey = false; + if (isAttrKey && !isAttrOrTag) + { + opt->key = realloc(opt->key, (ak+1) * sizeof(char)); + opt->key[ak] = pattern[i]; + ak++; + } + if (pattern[i] == '[') + { + isAttrKey = true; + isAttrOrTag = false; + } + if (isAttrOrTag) + { + attrOrTag = realloc(attrOrTag, (aot+1) * sizeof(char)); + attrOrTag[aot] = pattern[i]; + aot++; + } + } + attrOrTag = realloc(attrOrTag, (aot+1) * sizeof(char)); + attrOrTag[aot] = 0; + if (isIdValue) + { + free(opt->key); + opt->key = NULL; + free(opt->attr); + opt->attr = NULL; + opt->attr = attrOrTag; + opt->key = realloc(opt->key, 3 * sizeof(char)); + opt->key[0] = 'i'; + opt->key[1] = 'd'; + opt->key[2] = 0; + } + else if (isClassValue) + { + free(opt->key); + opt->key = NULL; + free(opt->attr); + opt->attr = NULL; + opt->attr = attrOrTag; + opt->key = realloc(opt->key, 6 * sizeof(char)); + opt->key[0] = 'c'; + opt->key[1] = 'l'; + opt->key[2] = 'a'; + opt->key[3] = 's'; + 
opt->key[4] = 's'; + opt->key[5] = 0; + } + else + { + free(opt->tag); + opt->tag = attrOrTag; + if (av > 0) + { + opt->attr = realloc(opt->attr, (av+1) * sizeof(char)); + opt->attr[av] = 0; + } + if (ak > 0) + { + opt->key = realloc(opt->key, (ak+1) * sizeof(char)); + opt->key[ak] = 0; + } + } + return opt; +} + +void freeOpts(struct filter_opts *opt) +{ + free(opt->tag); + free(opt->attr); + free(opt->key); + free(opt); +} + int main(int argc, char *argv[]) { int o = 0; int option_index = 0; - char *tag = NULL; - char *attribute = NULL; - char *key = NULL; bool isInnerHtml = false; bool isExcept = false; char *text = NULL; + char *searchPattern = NULL; static struct option long_options[] = { - { "tag", required_argument, 0, 't' }, - { "attribute", required_argument, 0, 'a' }, - { "key", required_argument, 0, 'k' }, { "innerhtml", no_argument, 0, 'i' }, { "except", no_argument, 0, 'e' }, { 0, 0, 0, 0 } }; - while ((o = getopt_long(argc, argv, "t:a:k:ie", long_options, &option_index)) != -1) { + while ((o = getopt_long(argc, argv, "ie", long_options, &option_index)) != -1) { switch(o) { - case 't': - tag = malloc((strlen(optarg)+1) * sizeof(char)); - strcpy(tag, optarg); - break; - case 'a': - attribute = malloc((strlen(optarg)+1) * sizeof(char)); - strcpy(attribute, optarg); - break; - case 'k': - key = malloc((strlen(optarg)+1) * sizeof(char)); - strcpy(key, optarg); - break; case 'i': isInnerHtml = true; break; @@ -82,17 +187,24 @@ int main(int argc, char *argv[]) break; } } - if (argc > optind+1) + if (argc == optind) + { + fprintf(stderr, "Provide a search pattern!\n"); + return -1; + } + if (argc > optind+2) { fprintf(stderr, "Provide only one file!\n"); return -1; } - if (argc == optind) + if (argc == optind+1) { + searchPattern = argv[argc-1]; text = readFile(stdin); } - else + else if (argc == optind+2) { + searchPattern = argv[argc-2]; char *filepath = argv[argc-1]; FILE *fp = fopen(filepath, "r"); if (fp == NULL) @@ -108,10 +220,9 @@ int main(int argc, 
char *argv[]) return 0; } } - parseHtml(text); - free(tag); - free(attribute); - free(key); + struct filter_opts *options = parseFilterOpts(searchPattern); + filterHtml(text, options); + freeOpts(options); free(text); return 0; } diff --git a/html.c b/html.c @@ -438,7 +438,58 @@ void printHtml(struct tag *t) } } -void parseHtml(const char *text) +struct tag *findTag(struct tag *tag, struct tag_list *list, struct filter_opts *opt) +{ + bool matchesTag = false; + bool matchesAttrKey = false; + bool matchesAttrKeyAndValue = false; + if (strcmp(tag->name, opt->tag) == 0) + matchesTag = true; + for (int i=0; i<tag->attrsLen; i++) + { + if (strcmp(tag->attrs[i]->name, opt->key) == 0) + matchesAttrKey = true; + if (matchesAttrKey) + { + if (strcmp(tag->attrs[i]->value, opt->attr) == 0) + matchesAttrKeyAndValue = true; + } + } + if (strlen(opt->tag) > 0 && strlen(opt->key) > 0 && strlen(opt->attr) > 0) + { + if (matchesTag && matchesAttrKeyAndValue) + return tag; + } + else if (strlen(opt->tag) > 0 && strlen(opt->key) > 0) + { + if (matchesTag && matchesAttrKey) + return tag; + } + else if (strlen(opt->tag) > 0) + { + if (matchesTag) + return tag; + } + else if (strlen(opt->key) > 0 && strlen(opt->attr) > 0) + { + if (matchesAttrKeyAndValue) + return tag; + } + else if (strlen(opt->key) > 0) + { + if (matchesAttrKey) + return tag; + } + for (int i=0; i<tag->childrenLen; i++) + { + struct tag *foundTag = findTag(tag->children[i], list, opt); + if (foundTag != NULL) + return foundTag; + } + return NULL; +} + +void filterHtml(const char *text, struct filter_opts *opts) { struct tag *rootTag; struct tag_list *tagList = malloc(sizeof(struct tag_list)); @@ -449,10 +500,11 @@ void parseHtml(const char *text) rootTag = parseTag(text+len, STATE_INNER_TEXT, tagList); else rootTag = parseTag(text, STATE_INNER_TEXT, tagList); - printHtml(rootTag); - /* printf("%s\n", rootTag->children[0]->children[0]->name); - printf("%s\n", rootTag->children[0]->children[0]->children[1]->name); - 
printf("%s\n", rootTag->children[0]->children[0]->children[1]->attrs[0]->value); */ + struct tag *result = findTag(rootTag, tagList, opts); + if (result == NULL) + printError("No tag found."); + else + printf("result: %s\n", result->name); freeTag(rootTag); freeTagList(tagList); } diff --git a/html.h b/html.h @@ -19,6 +19,13 @@ const char *voidElements[] = { "input", "link", "meta", "source", "track", "wbr" }; +struct filter_opts +{ + char *tag; + char *attr; + char *key; +}; + struct attr { char *name; @@ -61,3 +68,10 @@ enum attr_value_syntax AVS_APOSTROPHE, AVS_UNQUOTED }; + +/* enum search_type +{ + ST_NO, + ST_LIST, + ST_HIERARCHY +}; */