htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 3be05c0cb6825138a7a7fdaa28466b90713d56e3
parent 54b54cb00e4500f10d254e7ec694f45f3fbeab58
Author: Robin <kroekerrobin@gmail.com>
Date:   Sun, 13 Aug 2023 21:40:21 +0200

Merge branch 'big_change'

Diffstat:
M .gitignore | 2 +-
M Makefile | 7 ++++---
M htex.c | 648 ++++++++++++++++++++++++-------------------------------------------------------
A html.c | 790 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A html.h | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lib.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M todo | 6 +-----
7 files changed, 1195 insertions(+), 460 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -1,2 +1,2 @@ -test*.html +test/* htex diff --git a/Makefile b/Makefile @@ -2,7 +2,9 @@ PREFIX = /usr/local MANPREFIX = $(PREFIX)/share/man all: - $(CC) -O -Wall -Werror -o htex htex.c + $(CC) -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme +debug: + $(CC) -fsanitize=address -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme clean: rm htex install: all @@ -14,4 +16,4 @@ install: all chmod 644 "$(MANPREFIX)/man1/htex.1" uninstall: rm "$(PREFIX)/bin/htex" - rm "$(MANPREFIX)/man1/htex.1" -\ No newline at end of file + rm "$(MANPREFIX)/man1/htex.1" diff --git a/htex.c b/htex.c @@ -1,461 +1,207 @@ #include <stdio.h> +#include <string.h> #include <stdbool.h> #include <stdlib.h> -#include <unistd.h> #include <getopt.h> -#include <string.h> -#include <fcntl.h> - -char *text; -char attribute_name[200]; -char tag_name[50]; -bool inner_html = false; -bool except = false; -struct match { - int start; - int end; -}; -struct match *matches; - -int find_start_of_opening_tag_pos(int class_position) { - int i = 1; - while (1) { - int pos = class_position - i; - if (pos < 0) - return -1; - if (text[pos] == '<') { - return pos; - } - i++; - } -} - -int find_end_of_opening_tag_pos(int class_position) { - int i = 1; - while (1) { - int pos = class_position + i; - if (pos < 0) - return -1; - if (text[pos] == '>') { - return pos + 1; - } - i++; - } -} - -/* - This function works only if the html tag - has attributes. 
-*/ -void find_tag_name(int open_tag_pos) { - int i = 1; - int end_of_tag_name = 0; - while (1) { - int pos = open_tag_pos + i; - if (pos > strlen(text)) - return; - if (text[pos] == ' ' || text[pos] == '\n') { - end_of_tag_name = pos-1; - break; - } - i++; - } - int length_tag_name = end_of_tag_name - open_tag_pos; - for (int k=0; k<length_tag_name; k++) { - tag_name[k] = text[open_tag_pos+k+1]; - } - tag_name[length_tag_name] = '\0'; -} - -int find_closing_tag_pos(int open_tag_pos, bool inner_html) { - int level = 1; - int failure = 0; - char close_tag[strlen(tag_name)+3]; - close_tag[0] = '<'; - close_tag[1] = '/'; - for (int k=0; k<strlen(tag_name); k++) { - close_tag[2+k] = tag_name[k]; - } - close_tag[sizeof(close_tag)-1] = '>'; - close_tag[sizeof(close_tag)] = '\0'; +#include <inttypes.h> +#include <grapheme.h> +#include "lib.c" +#include "html.c" - for (int l=open_tag_pos; l<strlen(text); l++) { // Could be more precise - if (text[l] == '<') { - for (int o=0; o<strlen(tag_name); o++) { - if (tag_name[o] != text[l+o+1]) { - failure = 1; - break; - } - } - if (failure == 0) { - if ( - text[l+strlen(tag_name)+1] != ' ' && - text[l+strlen(tag_name)+1] != '>' - ) { - failure = 1; - } - } - if (failure == 0) { - level++; - } - failure = 0; - if (text[l+1] == '/') { - for (int o=2; o<strlen(close_tag); o++) { - if (close_tag[o] != text[l+o]) { - failure = 1; - break; - } - } - if (failure == 0) { - level--; - if (level == 0) { - if (inner_html) { - return l; - } else { - return l + strlen(tag_name) + 3; - } - } - } - failure = 0; - } - } - } - return -1; +struct find_opts *parseFilterOpts(const char *pattern) +{ + struct find_opts *opt = malloc(sizeof(struct find_opts)); + opt->out = OUT_OUTER_HTML; + opt->tag = malloc(sizeof(char)); + opt->tag[0] = 0; + opt->attr = malloc(sizeof(char)); + opt->attr[0] = 0; + opt->key = malloc(sizeof(char)); + opt->key[0] = 0; + bool isClassValue = false; + bool isIdValue = false; + int i = 0; + bool isAttrKey = false; + bool 
isAttrOrTag = true; + char *attrOrTag = NULL; + int aot = 0; + int ak = 0; + int av = 0; + switch (pattern[0]) + { + case '.': + isClassValue = true; + i = 1; + break; + case '#': + isIdValue = true; + i = 1; + break; + } + for (; i<strlen(pattern); i++) + { + if (pattern[i] == ']') + break; + if ( + !isAttrKey && + !isAttrOrTag && + pattern[i] != ']' && + pattern[i] != '"' + ) + { + opt->attr = realloc(opt->attr, (av+1) * sizeof(char)); + opt->attr[av] = pattern[i]; + av++; + } + if (pattern[i] == '=') + isAttrKey = false; + if (isAttrKey && !isAttrOrTag) + { + opt->key = realloc(opt->key, (ak+1) * sizeof(char)); + opt->key[ak] = pattern[i]; + ak++; + } + if (pattern[i] == '[') + { + isAttrKey = true; + isAttrOrTag = false; + } + if (isAttrOrTag) + { + attrOrTag = realloc(attrOrTag, (aot+1) * sizeof(char)); + attrOrTag[aot] = pattern[i]; + aot++; + } + } + attrOrTag = realloc(attrOrTag, (aot+1) * sizeof(char)); + attrOrTag[aot] = 0; + if (isIdValue) + { + free(opt->key); + opt->key = NULL; + free(opt->attr); + opt->attr = NULL; + opt->attr = attrOrTag; + opt->key = realloc(opt->key, 3 * sizeof(char)); + opt->key[0] = 'i'; + opt->key[1] = 'd'; + opt->key[2] = 0; + } + else if (isClassValue) + { + free(opt->key); + opt->key = NULL; + free(opt->attr); + opt->attr = NULL; + opt->attr = attrOrTag; + opt->key = realloc(opt->key, 6 * sizeof(char)); + opt->key[0] = 'c'; + opt->key[1] = 'l'; + opt->key[2] = 'a'; + opt->key[3] = 's'; + opt->key[4] = 's'; + opt->key[5] = 0; + } + else + { + free(opt->tag); + opt->tag = attrOrTag; + if (av > 0) + { + opt->attr = realloc(opt->attr, (av+1) * sizeof(char)); + opt->attr[av] = 0; + } + if (ak > 0) + { + opt->key = realloc(opt->key, (ak+1) * sizeof(char)); + opt->key[ak] = 0; + } + } + return opt; } -bool correct_name_begin_or_end(char prev_char) { - switch(prev_char) { - case '"': - return true; - case '\'': - return true; - case ' ': - return true; - default: - return false; - } +void freeOpts(struct find_opts *opt) +{ + 
free(opt->tag); + free(opt->attr); + free(opt->key); + free(opt); } -void find_html_tag_by_class(char *class_name) { - int o = 0; - int failure = 0; - int counter = 0; - int is_not_quotation_mark = 1; - - for (int k=0; k<strlen(text); k++) { - if ( - text[k] == 'c' && - text[k+1] == 'l' && - text[k+2] == 'a' && - text[k+3] == 's' && - text[k+4] == 's' - ) { - while (is_not_quotation_mark == 1) { - if (text[k+7+o] == '"' || text[k+7+o] == '\'') { - is_not_quotation_mark = 0; - break; - } - if (class_name[0] == text[k+7+o]) { - for (int l=1; l<strlen(class_name); l++) { - if (class_name[l] != text[k+7+o+l]) { - failure = 1; - break; - } - } - if (failure == 0) { - if ( - !correct_name_begin_or_end(text[k+6+o]) || - !correct_name_begin_or_end(text[k+7+o+strlen(class_name)]) - ) { - failure = 1; - } - } - if (failure == 0) { - if (inner_html) { - int start_of_open_tag_pos = find_start_of_opening_tag_pos(k); - find_tag_name(start_of_open_tag_pos); - int end_of_open_tag_pos = find_end_of_opening_tag_pos(k); - int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, true); - for (int e=end_of_open_tag_pos; e<close_tag_pos; e++) { - printf("%c", text[e]); - } - printf("\n"); - } else { - int open_tag_pos = find_start_of_opening_tag_pos(k); - find_tag_name(open_tag_pos); - int end_of_open_tag_pos = find_end_of_opening_tag_pos(k); - int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, false); - if (except) { - matches = realloc(matches, (counter+1) * sizeof(struct match)); - matches[counter].start = open_tag_pos; - matches[counter].end = close_tag_pos; - counter++; - } else { - for (int e=open_tag_pos; e<close_tag_pos; e++) { - printf("%c", text[e]); - } - printf("\n"); - } - } - } - failure = 0; - } - o++; - } - is_not_quotation_mark = 1; - o = 0; - } - } - if (except) { - int start = 0; - for (int i=0; i<counter; i++) { - for (int e=start; e<matches[i].start; e++) { - printf("%c", text[e]); - } - start = matches[i].end; - } - for (int i=start; i<strlen(text); 
i++) { - printf("%c", text[i]); - } - printf("\n"); - free(matches); - } -} - -void find_html_tag_by_id(char *id_name) { - int o = 0; - int failure = 0; - int counter = 0; - int is_not_quotation_mark = 1; - - for (int k=0; k<strlen(text); k++) { - if ( - text[k] == 'i' && - text[k+1] == 'd' - ) { - while (is_not_quotation_mark == 1) { - if (text[k+4+o] == '"' || text[k+4+o] == '\'') { - is_not_quotation_mark = 0; - break; - } - if (id_name[0] == text[k+4+o]) { - for (int l=1; l<strlen(id_name); l++) { - if (id_name[l] != text[k+4+o+l]) { - failure = 1; - break; - } - } - if (failure == 0) { - if ( - !correct_name_begin_or_end(text[k+3+o]) || - !correct_name_begin_or_end(text[k+4+o+strlen(id_name)]) - ) { - failure = 1; - } - } - if (failure == 0) { - if (inner_html) { - int start_of_open_tag_pos = find_start_of_opening_tag_pos(k); - find_tag_name(start_of_open_tag_pos); - int end_of_open_tag_pos = find_end_of_opening_tag_pos(k); - int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, true); - for (int e=end_of_open_tag_pos; e<close_tag_pos; e++) { - printf("%c", text[e]); - } - printf("\n"); - } else { - int start_of_open_tag_pos = find_start_of_opening_tag_pos(k); - find_tag_name(start_of_open_tag_pos); - int end_of_open_tag_pos = find_end_of_opening_tag_pos(k); - int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, false); - if (except) { - matches = realloc(matches, (counter+1) * sizeof(struct match)); - matches[counter].start = start_of_open_tag_pos; - matches[counter].end = close_tag_pos; - counter++; - } else { - for (int e=start_of_open_tag_pos; e<close_tag_pos; e++) { - printf("%c", text[e]); - } - printf("\n"); - } - } - } - failure = 0; - } - o++; - } - is_not_quotation_mark = 1; - o = 0; - } - } - if (except) { - int start = 0; - for (int i=0; i<counter; i++) { - for (int e=start; e<matches[i].start; e++) { - printf("%c", text[e]); - } - start = matches[i].end; - } - for (int i=start; i<strlen(text); i++) { - printf("%c", text[i]); - } - 
printf("\n"); - free(matches); - } -} -void find_html_tag_by_tag() { - int failure = 0; - int counter = 0; - for (int k=0; k<strlen(text); k++) { - if (text[k] == '<' && text[k+1] != '/') { - for (int o=0; o<strlen(attribute_name); o++) { - if (attribute_name[o] != text[k+1+o]) { - failure = 1; - break; - } - } - if (failure == 0) { - if ( - text[k+1+strlen(attribute_name)] == '>' || - text[k+1+strlen(attribute_name)] == ' ' || - text[k+1+strlen(attribute_name)] == '\n' - ) { - int open_tag_pos = k; - int after_tag_pos = k+1+strlen(attribute_name)+1; - if (inner_html) { - int close_tag_pos = find_closing_tag_pos(after_tag_pos, true); - int end_of_open_tag_pos = find_end_of_opening_tag_pos(k+strlen(attribute_name)); - for (int e=end_of_open_tag_pos; e<close_tag_pos; e++) { - printf("%c", text[e]); - } - printf("\n"); - } else { - int close_tag_pos = find_closing_tag_pos(after_tag_pos, false); - if (except) { - matches = realloc(matches, (counter+1) * sizeof(struct match)); - matches[counter].start = open_tag_pos; - matches[counter].end = close_tag_pos; - counter++; - } else { - for (int e=open_tag_pos; e<close_tag_pos; e++) { - printf("%c", text[e]); - } - printf("\n"); - } - } - } - } - failure = 0; - } - } - if (except) { - int start = 0; - for (int i=0; i<counter; i++) { - for (int e=start; e<matches[i].start; e++) { - printf("%c", text[e]); - } - start = matches[i].end; - } - for (int i=start; i<strlen(text); i++) { - printf("%c", text[i]); - } - printf("\n"); - free(matches); - } -} - -void find_html_tag() { - char identifier[200]; - for (int i=0; i<strlen(attribute_name); i++) { - identifier[i] = attribute_name[i+1]; - } - switch(attribute_name[0]) { - case '.': - find_html_tag_by_class(identifier); - break; - case '#': - find_html_tag_by_id(identifier); - break; - default: - for (int i=0; i<strlen(attribute_name); i++) { - tag_name[i] = attribute_name[i]; - } - find_html_tag_by_tag(); - } -} - -int main(int argc, char *argv[]) { - int i = 0; - char buffer; - 
int o; - text = malloc(sizeof(char)); - if (!text) { - printf("malloc error.\n"); - return -1; - } - - static struct option long_options[] = { - { "attribute", required_argument, 0, 'a' }, - { "innerhtml", no_argument, 0, 'i' }, - { "except", no_argument, 0, 'e' }, - { 0, 0, 0, 0 } - }; - int option_index = 0; - while ((o = getopt_long(argc, argv, "eia:", long_options, &option_index)) != -1) { - switch(o) { - case 'a': - for (int j=0; j<strlen(optarg); j++) { - attribute_name[j] = optarg[j]; - } - break; - case 'i': - inner_html = true; - break; - case 'e': - except = true; - break; - } - } - if (inner_html && except) { - printf("You can't use the options -i (--innerhtml) and -e (--except) at the same time.\n"); - return -1; - } - if (argc == (optind + 1)) { - if (*argv[argc-1] == '-') { - while (read(0, &buffer, 1) > 0) { - text[i] = buffer; - i++; - text = realloc(text, (i+1) * sizeof(char)); - if (!text || text == NULL) { - printf("realloc error.\n"); - return -1; - } - } - text[i] = '\0'; - find_html_tag(); - free(text); - } else { - int fd = open(argv[argc-1], O_RDONLY); - if (fd != -1) { - while (read(fd, &buffer, 1) > 0) { - text[i] = buffer; - i++; - text = realloc(text, (i+1) * sizeof(char)); - if (!text || text == NULL) { - printf("realloc error.\n"); - return -1; - } - } - text[i] = '\0'; - find_html_tag(); - free(text); - } else { - printf("Couldn't read file \"%s\"\n", argv[argc-1]); - } - } - } else { - printf("Nothing to read from.\n"); - } - return 0; +int main(int argc, char *argv[]) +{ + int o = 0; + int option_index = 0; + bool isInnerHtml = false; + bool isInnerText = false; + bool isExcept = false; + char *text = NULL; + char *searchPattern = NULL; + static struct option long_options[] = { + { "innerhtml", no_argument, 0, 'i' }, + { "innertext", no_argument, 0, 't' }, + { "except", no_argument, 0, 'e' }, + { 0, 0, 0, 0 } + }; + while ((o = getopt_long(argc, argv, "ite", long_options, &option_index)) != -1) { + switch(o) { + case 'i': + 
isInnerHtml = true; + break; + case 't': + isInnerText = true; + break; + case 'e': + isExcept = true; + break; + } + } + if (isInnerHtml && isInnerText) + { + fprintf(stderr, "Provide either --innerhtml or --innertext.\n"); + return -1; + } + if (argc == optind) + { + fprintf(stderr, "Provide a search pattern!\n"); + return -1; + } + if (argc > optind+2) + { + fprintf(stderr, "Provide only one file!\n"); + return -1; + } + if (argc == optind+1) + { + searchPattern = argv[argc-1]; + text = readFile(stdin); + } + else if (argc == optind+2) + { + searchPattern = argv[argc-2]; + char *filepath = argv[argc-1]; + FILE *fp = fopen(filepath, "r"); + if (fp == NULL) + { + perror("fopen failed: "); + return -1; + } + text = readFile(fp); + fclose(fp); + if (strlen(text) == 0) + { + printf("No data in file.\n"); + return 0; + } + } + struct find_opts *options = parseFilterOpts(searchPattern); + options->isExcept = isExcept; + if (isInnerHtml) + options->out = OUT_INNER_HTML; + if (isInnerText) + options->out = OUT_INNER_TEXT; + filterHtml(text, options); + freeOpts(options); + free(text); + return 0; } diff --git a/html.c b/html.c @@ -0,0 +1,790 @@ +#include "html.h" + +const char *stateToString(enum state s) +{ + switch(s) + { + case STATE_INNER_TEXT: return "STATE_INNER_TEXT"; + case STATE_TAG: return "STATE_TAG"; + case STATE_BEGIN_TAG_NAME: return "STATE_BEGIN_TAG_NAME"; + case STATE_END_TAG_NAME: return "STATE_END_TAG_NAME"; + case STATE_ATTR_NAME: return "STATE_ATTR_NAME"; + case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE"; + case STATE_COMMENT: return "STATE_COMMENT"; + case STATE_SCRIPT: return "STATE_SCRIPT"; + case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG"; + case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG"; + case STATE_STYLE: return "STATE_STYLE"; + case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG"; + case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG"; + } + return ""; +} + +struct attr *initAttr() +{ 
+ struct attr *a = malloc(sizeof(struct attr)); + a->name = malloc(sizeof(char)); + a->name[0] = 0; + a->value = malloc(sizeof(char)); + a->value[0] = 0; + return a; +} + +struct tag *initTag() +{ + struct tag *t = malloc(sizeof(struct tag)); + t->name = malloc(sizeof(char)); + t->name[0] = 0; + t->innerText = malloc(sizeof(char)); + t->innerText[0] = 0; + t->attrs = NULL; + t->children = NULL; + t->attrsLen = 0; + t->childrenLen = 0; + t->_isVoidElement = false; + t->_isClosed = false; + return t; +} + +struct tag_list *initTagList() +{ + struct tag_list *t = malloc(sizeof(struct tag_list)); + t->tags = NULL; + t->len = 0; + return t; +} + +static inline bool isASCIIDigit(uint_least32_t cp) +{ + if (cp >= 0x30 && cp <= 0x39) + return true; + return false; +} + +static inline bool isASCIIAlphaUpper(uint_least32_t cp) +{ + if (cp >= 0x41 && cp <= 0x5A) + return true; + return false; +} + +static inline bool isASCIIAlphaLower(uint_least32_t cp) +{ + if (cp >= 0x61 && cp <= 0x7A) + return true; + return false; +} + +static inline bool isASCIIAlpha(uint_least32_t cp) +{ + if (isASCIIAlphaLower(cp) || isASCIIAlphaUpper(cp)) + return true; + return false; +} + +static inline bool isASCIIWhitespace(uint_least32_t cp) +{ + if ( + cp == TAB || + cp == LF || + cp == FF || + cp == CR || + cp == SPACE + ) + return true; + return false; +} + +static inline bool isVoidElement(const char *tagName) +{ + for (int i=0; i<13; i++) + { + if (strcmp(tagName, voidElements[i]) == 0) + return true; + } + return false; +} + +static inline bool isC0Control(uint_least32_t cp) +{ + if (cp >= 0x00 && cp <= 0x1F) + return true; + return false; +} + +static inline bool isControl(uint_least32_t cp) +{ + if (isC0Control(cp)) + return true; + if (cp >= 0x7F && cp <= 0x9F) + return true; + return false; +} + +static inline bool isNonChar(uint_least32_t cp) +{ + if (cp >= 0xFDD0 && cp <= 0xFDEF) + return true; + if ( + cp == 0xFFFE || cp == 0xFFFF || + cp == 0x1FFFE || cp == 0x1FFFF || + cp == 
0x2FFFE || cp == 0x2FFFF || + cp == 0x3FFFE || cp == 0x3FFFF || + cp == 0x4FFFE || cp == 0x4FFFF || + cp == 0x5FFFE || cp == 0x5FFFF || + cp == 0x6FFFE || cp == 0x6FFFF || + cp == 0x7FFFE || cp == 0x7FFFF || + cp == 0x8FFFE || cp == 0x8FFFF || + cp == 0x9FFFE || cp == 0x9FFFF || + cp == 0xAFFFE || cp == 0xAFFFF || + cp == 0xBFFFE || cp == 0xBFFFF || + cp == 0xCFFFE || cp == 0xCFFFF || + cp == 0xDFFFE || cp == 0xDFFFF || + cp == 0xEFFFE || cp == 0xEFFFF || + cp == 0xFFFFE || cp == 0xFFFFF || + cp == 0x10FFFE || cp == 0x10FFFF + ) + return true; + return false; +} + +static inline bool isValidAttrName(uint_least32_t cp) +{ + if (isControl(cp)) + return false; + if (isNonChar(cp)) + return false; + if ( + cp == SPACE || + cp == QUOTATION_MARK || + cp == APOSTROPHE || + cp == GREATER_THAN_SIGN || + cp == SOLIDUS || + cp == EQUALS_SIGN + ) + return false; + return true; +} + +static inline bool +isValidUnquotedAttrValue(uint_least32_t cp) +{ + /* + Not mentioned invalid characters. + They are already handled before + funtion call. 
+ */ + if ( + cp == EQUALS_SIGN || + cp == LESS_THAN_SIGN || + cp == GREATER_THAN_SIGN || + cp == GRAVE_ACCENT + ) + return false; + return true; +} + +size_t parseDoctype(const char *text) +{ + char *firstLine = NULL; + int i = 0; + while (text[i] != '\n') + { + firstLine = realloc(firstLine, (i+1) * sizeof(char)); + firstLine[i] = text[i]; + i++; + } + firstLine = realloc(firstLine, (i+1) * sizeof(char)); + firstLine[i] = 0; + if (strcmp("<!DOCTYPE html>", firstLine) == 0) + { + free(firstLine); + return i+1; + } + if (strcmp("<!doctype html>", firstLine) == 0) + { + free(firstLine); + return i+1; + } + free(firstLine); + return 0; +} + +struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, size_t endOffset) +{ + for (int i=tagList->len-1; i>-1; i--) + { + if (strcmp(tagList->tags[i]->name, endTag) == 0 && !tagList->tags[i]->_isClosed) + { + tagList->tags[i]->_isClosed = true; + tagList->tags[i]->_outerHtmlEndOffset = endOffset; + return tagList->tags[i]; + } + } + return NULL; +} + +struct tag *getLastOpenTag(struct tag_list *tagList) +{ + for (int i=tagList->len-1; i>-1; i--) + { + if (!tagList->tags[i]->_isVoidElement && !tagList->tags[i]->_isClosed) + { + return tagList->tags[i]; + } + } + return tagList->tags[0]; +} + +char *getOuterHtml(char *text, struct tag *t) +{ + char *outerHtml = NULL; + int o = 0; + for (int i=t->_outerHtmlBeginOffset; i<t->_outerHtmlEndOffset; i++) + { + outerHtml = realloc(outerHtml, (o+1) * sizeof(char)); + outerHtml[o] = text[i]; + o++; + } + outerHtml = realloc(outerHtml, (o+1) * sizeof(char)); + outerHtml[o] = 0; + return outerHtml; +} + +char *getInnerHtml(char *text, struct tag *t) +{ + char *innerHtml = NULL; + int o = 0; + for (int i=t->_innerHtmlBeginOffset; i<t->_innerHtmlEndOffset; i++) + { + innerHtml = realloc(innerHtml, (o+1) * sizeof(char)); + innerHtml[o] = text[i]; + o++; + } + innerHtml = realloc(innerHtml, (o+1) * sizeof(char)); + innerHtml[o] = 0; + return innerHtml; +} + +void 
setInnerHtmlEndOffset(struct tag *closedTag, char *text, size_t off) +{ + int i = off; + while (text[i] != '<') + { + i--; + } + closedTag->_innerHtmlEndOffset = i; +} + +enum state endOfBeginTag(struct tag *t, size_t offset) +{ + t->_innerHtmlBeginOffset = offset+1; + t->_isVoidElement = isVoidElement(t->name); + if (t->_isVoidElement) + t->_outerHtmlEndOffset = offset+1; + if (strcmp(t->name, "script") == 0) + return STATE_SCRIPT; + else if (strcmp(t->name, "style") == 0) + return STATE_STYLE; + else + return STATE_INNER_TEXT; +} + +struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_list *tagList) +{ + struct tag *tag = initTag(); + tag->_outerHtmlBeginOffset= offset-1; + tagList->tags = realloc(tagList->tags, (tagList->len+1) * sizeof(struct tag)); + tagList->tags[tagList->len] = tag; + tagList->len++; + struct tag *stillOpenTag = tag; + char *endTag = malloc(sizeof(char)); + endTag[0] = 0; + size_t a = 0; + size_t attrNameCount = 0; + enum attr_value_syntax attrValueSyntax = AVS_NO; + size_t hyphenCount = 0; + uint_least32_t cp; + size_t len = strlen(text); + size_t ret, off; + for (off = offset; off<len; off += ret) + { + if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) + { + printError("Something wrong with ending of text"); + } + else + { + // char *the_codepoint = cpToChars(cp, ret); + // printf("cp: %02X, %s, %s\n", cp, the_codepoint, stateToString(state)); + // free(the_codepoint); + switch (state) + { + case STATE_INNER_TEXT: + if (cp == LESS_THAN_SIGN) + { + state = STATE_TAG; + break; + } + stillOpenTag = getLastOpenTag(tagList); + stillOpenTag->innerText = stringCat(stillOpenTag->innerText, cpToChars(cp, ret)); + break; + case STATE_TAG: + if (cp == SOLIDUS) + { + state = STATE_END_TAG_NAME; + break; + } + if (cp == EXCLAMATION_MARK) + { + state = STATE_COMMENT; + break; + } + stillOpenTag = getLastOpenTag(tagList); + struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList); + 
stillOpenTag->children = realloc( + stillOpenTag->children, + (stillOpenTag->childrenLen+1) * sizeof(struct tag) + ); + stillOpenTag->children[stillOpenTag->childrenLen] = oneTag; + stillOpenTag->childrenLen++; + free(endTag); + return tag; + case STATE_BEGIN_TAG_NAME: + if (cp == GREATER_THAN_SIGN) + { + state = endOfBeginTag(tag, off); + break; + } + if (isASCIIWhitespace(cp)) + { + state = STATE_ATTR_NAME; + break; + } + if (isASCIIDigit(cp) || isASCIIAlpha(cp)) + { + tag->name = stringCat(tag->name, cpToChars(cp, ret)); + } + break; + case STATE_END_TAG_NAME: + if (cp == GREATER_THAN_SIGN) + { + struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); + setInnerHtmlEndOffset(closedTag, text, off); + free(endTag); + endTag = malloc(sizeof(char)); + endTag[0] = 0; + state = STATE_INNER_TEXT; + break; + } + if (!isASCIIWhitespace(cp)) + endTag = stringCat(endTag, cpToChars(cp, ret)); + break; + case STATE_ATTR_NAME: + if (cp == GREATER_THAN_SIGN) + { + state = endOfBeginTag(tag, off); + break; + } + if (isASCIIWhitespace(cp)) + { + if (attrNameCount == a+1) + a++; + break; + } + if (cp == EQUALS_SIGN) + { + state = STATE_ATTR_VALUE; + break; + } + if (isValidAttrName(cp)) + { + if (attrNameCount != a+1) + { + tag->attrs = realloc( + tag->attrs, + (a+1) * sizeof(struct attr) + ); + tag->attrs[a] = initAttr(); + attrNameCount = a + 1; + tag->attrsLen = attrNameCount; + } + tag->attrs[a]->name = stringCat( + tag->attrs[a]->name, + cpToChars(cp, ret) + ); + } + break; + case STATE_ATTR_VALUE: + if (isASCIIWhitespace(cp)) + { + if (attrValueSyntax == AVS_UNQUOTED) + { + attrValueSyntax = AVS_NO; + state = STATE_ATTR_NAME; + } + else if (attrValueSyntax == AVS_QUOTATION_MARK || attrValueSyntax == AVS_APOSTROPHE) + { + char *tmpName = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char)); + strcpy(tmpName, tag->attrs[a]->name); + tag->attrs = realloc( + tag->attrs, + (a+1) * sizeof(struct attr) + ); + a++; + tag->attrs[a] = initAttr(); + 
free(tag->attrs[a]->name); + tag->attrs[a]->name = tmpName; + tag->attrsLen++; + attrNameCount = a + 1; + } + break; + } + if (cp == QUOTATION_MARK) + { + if (attrValueSyntax == AVS_NO) + { + attrValueSyntax = AVS_QUOTATION_MARK; + break; + } + if (attrValueSyntax == AVS_QUOTATION_MARK) + { + attrValueSyntax = AVS_NO; + state = STATE_ATTR_NAME; + break; + } + } + if (cp == APOSTROPHE) + { + if (attrValueSyntax == AVS_NO) + { + attrValueSyntax = AVS_APOSTROPHE; + break; + } + if (attrValueSyntax == AVS_APOSTROPHE) + { + attrValueSyntax = AVS_NO; + state = STATE_ATTR_NAME; + break; + } + } + if (cp == GREATER_THAN_SIGN) + { + state = endOfBeginTag(tag, off); + break; + } + if ( + attrValueSyntax == AVS_NO && + isValidUnquotedAttrValue(cp) + ) + { + attrValueSyntax = AVS_UNQUOTED; + } + if (attrValueSyntax > AVS_NO) + { + tag->attrs[a]->value = stringCat( + tag->attrs[a]->value, + cpToChars(cp, ret) + ); + } + break; + case STATE_COMMENT: + if (cp == GREATER_THAN_SIGN && hyphenCount >= 2) + { + state = STATE_INNER_TEXT; + break; + } + if (cp == HYPHEN_MINUS) + hyphenCount++; + else + hyphenCount = 0; + break; + case STATE_STYLE: + if (cp == LESS_THAN_SIGN) + { + state = STATE_STYLE_POSSIBLE_END_TAG; + break; + } + break; + case STATE_STYLE_POSSIBLE_END_TAG: + if (cp == SOLIDUS) + state = STATE_STYLE_END_TAG; + else + state = STATE_STYLE; + break; + case STATE_STYLE_END_TAG: + if (cp == GREATER_THAN_SIGN) + { + struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); + setInnerHtmlEndOffset(closedTag, text, off); + free(endTag); + endTag = malloc(sizeof(char)); + endTag[0] = 0; + state = STATE_INNER_TEXT; + break; + } + if (!isASCIIWhitespace(cp)) + endTag = stringCat(endTag, cpToChars(cp, ret)); + break; + case STATE_SCRIPT: + if (cp == LESS_THAN_SIGN) + { + state = STATE_SCRIPT_POSSIBLE_END_TAG; + break; + } + break; + case STATE_SCRIPT_POSSIBLE_END_TAG: + if (cp == SOLIDUS) + state = STATE_SCRIPT_END_TAG; + else + state = STATE_SCRIPT; + break; + case 
STATE_SCRIPT_END_TAG: + if (cp == GREATER_THAN_SIGN) + { + struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); + setInnerHtmlEndOffset(closedTag, text, off); + free(endTag); + endTag = malloc(sizeof(char)); + endTag[0] = 0; + state = STATE_INNER_TEXT; + break; + } + if (!isASCIIWhitespace(cp)) + endTag = stringCat(endTag, cpToChars(cp, ret)); + break; + } + } + } + free(endTag); + return tag; +} + +void freeTag(struct tag *t) +{ + free(t->name); + free(t->innerText); + for (int i=0; i<t->attrsLen; i++) + { + free(t->attrs[i]->name); + free(t->attrs[i]->value); + free(t->attrs[i]); + } + free(t->attrs); + for (int i=0; i<t->childrenLen; i++) + { + if (t->children[i] != NULL) + freeTag(t->children[i]); + } + free(t->children); + free(t); +} + +void freeTagList(struct tag_list *t) +{ + free(t->tags); + free(t); +} + +void findTag(struct tag *tag, struct find_opts *opt, struct tag_list *foundTags) +{ + bool matchesTag = false; + bool matchesAttrKey = false; + bool matchesAttrValue = false; + if (strcmp(tag->name, opt->tag) == 0) + matchesTag = true; + for (int i=0; i<tag->attrsLen; i++) + { + if (strcmp(tag->attrs[i]->name, opt->key) == 0) + matchesAttrKey = true; + if (strcmp(tag->attrs[i]->value, opt->attr) == 0) + matchesAttrValue = true; + } + if (strlen(opt->tag) > 0 && strlen(opt->key) > 0 && strlen(opt->attr) > 0) + { + if (matchesTag && matchesAttrKey && matchesAttrValue) + { + foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag)); + foundTags->tags[foundTags->len] = tag; + foundTags->len++; + } + } + else if (strlen(opt->tag) > 0 && strlen(opt->key) > 0) + { + if (matchesTag && matchesAttrKey) + { + foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag)); + foundTags->tags[foundTags->len] = tag; + foundTags->len++; + } + } + else if (strlen(opt->tag) > 0) + { + if (matchesTag) + { + foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag)); + 
foundTags->tags[foundTags->len] = tag; + foundTags->len++; + } + } + else if (strlen(opt->key) > 0 && strlen(opt->attr) > 0) + { + if (matchesAttrKey && matchesAttrValue) + { + foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag)); + foundTags->tags[foundTags->len] = tag; + foundTags->len++; + } + } + else if (strlen(opt->key) > 0) + { + if (matchesAttrKey) + { + foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag)); + foundTags->tags[foundTags->len] = tag; + foundTags->len++; + } + } + else if (strlen(opt->attr) > 0) + { + if (matchesAttrValue) + { + foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag)); + foundTags->tags[foundTags->len] = tag; + foundTags->len++; + } + } + for (int i=tag->childrenLen-1; i>-1; i--) + { + findTag(tag->children[i], opt, foundTags); + } +} + +void printHtml(struct tag *t, int indent) +{ + for (int i=0; i<indent; i++) + putchar(' '); + printf("%s\n", t->name); + indent++; + for (int i=t->childrenLen-1; i>-1; i--) + { + printHtml(t->children[i], indent); + } +} + +void printTag(char *text, struct tag *t, enum output_type out, struct tag_list *foundTags) +{ + switch (out) + { + case OUT_INNER_HTML: + break; + case OUT_OUTER_HTML: + break; + case OUT_INNER_TEXT: + break; + } + /* unsigned int p = *(unsigned int *)t; + unsigned int cp; + bool isMatch = false; + for (int i=0; i<foundTags->len; i++) + { + cp = *(unsigned int *)foundTags->tags[i]; + if (p == cp) + isMatch = true; + } + if (!isMatch) + { + char *trimmedText = NULL; + switch (out) + { + case OUT_INNER_HTML: + trimmedText = trim(getInnerHtml(text, t)); + break; + case OUT_OUTER_HTML: + trimmedText = trim(getOuterHtml(text, t)); + break; + } + if (strlen(trimmedText) > 0) + printf("%s\n", trimmedText); + free(trimmedText); + } */ + for (int i=t->childrenLen-1; i>-1; i--) + { + printTag(text, t->children[i], out, foundTags); + } +} + +void printResult +( + char *text, + struct tag *rootTag, 
+ struct find_opts *opts, + struct tag_list *foundTags +) +{ + if (opts->isExcept) + { + // printTag(text, rootTag, opts->out, foundTags); + } + else + { + char *requestedText = NULL; + char *trimmedText = NULL; + for (int i=0; i<foundTags->len; i++) + { + switch (opts->out) + { + case OUT_INNER_HTML: + requestedText = getInnerHtml(text, foundTags->tags[i]); + trimmedText = trim(requestedText); + free(requestedText); + break; + case OUT_OUTER_HTML: + requestedText = getOuterHtml(text, foundTags->tags[i]); + trimmedText = trim(requestedText); + free(requestedText); + break; + case OUT_INNER_TEXT: + trimmedText = trim(foundTags->tags[i]->innerText); + break; + } + if (strlen(trimmedText) > 0) + printf("%s\n", trimmedText); + free(trimmedText); + } + } +} + +bool existFindPattern(struct find_opts *opts) +{ + if (strlen(opts->tag) > 0) + return true; + if (strlen(opts->attr) > 0) + return true; + if (strlen(opts->key) > 0) + return true; + return false; +} + +void filterHtml(char *text, struct find_opts *opts) +{ + struct tag_list *tagList = initTagList(); + struct tag_list *foundTags = initTagList(); + size_t len = parseDoctype(text); + if (len) + text += len; + struct tag *rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList); + if (!existFindPattern(opts)) + { + foundTags->tags = realloc(foundTags->tags, sizeof(struct tag)); + foundTags->tags[0] = rootTag; + foundTags->len = 1; + } + else + findTag(rootTag, opts, foundTags); + printResult(text, rootTag, opts, foundTags); + freeTag(rootTag); + freeTagList(tagList); + freeTagList(foundTags); +} diff --git a/html.h b/html.h @@ -0,0 +1,90 @@ +#define printError(msg) do { fprintf(stderr, "%s: %s\n", __func__, msg); } while (0) + +#define LESS_THAN_SIGN 0x3C +#define GREATER_THAN_SIGN 0x3E +#define EQUALS_SIGN 0x3D +#define TAB 0x09 +#define LF 0x0A +#define FF 0x0C +#define CR 0x0D +#define SPACE 0x20 +#define SOLIDUS 0x2F +#define EXCLAMATION_MARK 0x21 +#define QUOTATION_MARK 0x22 +#define APOSTROPHE 0x27 +#define 
GRAVE_ACCENT 0x60 +#define HYPHEN_MINUS 0x2D + +const char *voidElements[] = { + "area", "base", "br", "col", "embed", "hr", "img", + "input", "link", "meta", "source", "track", "wbr" +}; + +enum output_type +{ + OUT_INNER_HTML, + OUT_OUTER_HTML, + OUT_INNER_TEXT +}; + +struct find_opts +{ + char *tag; + char *attr; + char *key; + enum output_type out; + bool isExcept; +}; + +struct attr +{ + char *name; + char *value; // optional +}; + +struct tag +{ + char *name; + struct attr **attrs; + struct tag **children; + char *innerText; + size_t attrsLen; + size_t childrenLen; + bool _isVoidElement; // means there is no closing tag + bool _isClosed; + size_t _outerHtmlBeginOffset; + size_t _outerHtmlEndOffset; + size_t _innerHtmlBeginOffset; + size_t _innerHtmlEndOffset; +}; + +struct tag_list +{ + struct tag **tags; + size_t len; +}; + +enum state +{ + STATE_INNER_TEXT, + STATE_TAG, + STATE_BEGIN_TAG_NAME, + STATE_END_TAG_NAME, + STATE_ATTR_NAME, + STATE_ATTR_VALUE, + STATE_COMMENT, + STATE_SCRIPT, + STATE_SCRIPT_POSSIBLE_END_TAG, + STATE_SCRIPT_END_TAG, + STATE_STYLE, + STATE_STYLE_POSSIBLE_END_TAG, + STATE_STYLE_END_TAG +}; + +enum attr_value_syntax +{ + AVS_NO, + AVS_QUOTATION_MARK, + AVS_APOSTROPHE, + AVS_UNQUOTED +}; diff --git a/lib.c b/lib.c @@ -0,0 +1,112 @@ +char *stringCat(char *str1, char *str2) +{ + int str1Len = 0; + int str2Len = 0; + if (str1) + str1Len = strlen(str1); + if (str2) + str2Len = strlen(str2); + char *string = malloc((str1Len+str2Len+1) * sizeof(char)); + int i = 0; + int k = 0; + for (; i<str1Len; i++) + { + string[i] = str1[i]; + } + for (; k<str2Len; k++) + { + string[i+k] = str2[k]; + } + string[i+k] = '\0'; + free(str1); + free(str2); + return string; +} + +char *cpToChars(uint_least32_t cp, size_t len) +{ + char *str = malloc((len+1) * sizeof(char)); + grapheme_encode_utf8(cp, str, len); + str[len] = 0; + return str; +} + +char *trim(char *text) +{ + char *trimmedText = NULL; + int begin = 0; + int end = 0; + for (int i=0; 
i<strlen(text); i++) + { + if + ( + text[i] == ' ' || + text[i] == '\n' || + text[i] == '\t' || + text[i] == '\r' + ) + begin++; + else + break; + } + for (int i=strlen(text)-1; i>=0; i--) + { + if + ( + text[i] == ' '|| + text[i] == '\n' || + text[i] == '\t' || + text[i] == '\r' + ) + end++; + else + break; + } + int k = 0; + for (int i=0; i<strlen(text); i++) + { + if (i >= begin && i < strlen(text) - end) + { + trimmedText = realloc(trimmedText, (k+1) * sizeof(char)); + trimmedText[k] = text[i]; + k++; + } + } + trimmedText = realloc(trimmedText, (k+1) * sizeof(char)); + trimmedText[k] = 0; + return trimmedText; +} + +// Do not use for reading from a socket fd +bool tryRead(char *buf, FILE *fp) +{ + size_t bytesRead = fread(buf, 1, 1, fp); + if (feof(fp) != 0) + return false; + if (ferror(fp) != 0) + tryRead(buf, fp); + if (bytesRead != 1) + tryRead(buf, fp); + return true; +} + +char *readFile(FILE *fp) +{ + char *text = NULL; + int i = 0; + char buf; + while (1) + { + if (tryRead(&buf, fp)) + { + text = realloc(text, (i+1) * sizeof(char)); + text[i] = buf; + i++; + } + else + break; + } + text = realloc(text, (i+1) * sizeof(char)); + text[i] = 0; + return text; +} diff --git a/todo b/todo @@ -1,5 +1 @@ -refactor; heavy -implement find_attribute_value_by_* -implement filtering not only by class or id, also like this .test[data="asdf"] -implement finding tags that have no end tag, e.g. the img tag -Actually correctly parse html according to spec ;-) +support --except argument