commit 3be05c0cb6825138a7a7fdaa28466b90713d56e3
parent 54b54cb00e4500f10d254e7ec694f45f3fbeab58
Author: Robin <kroekerrobin@gmail.com>
Date: Sun, 13 Aug 2023 21:40:21 +0200
Merge branch 'big_change'
Diffstat:
| M | .gitignore | | | 2 | +- |
| M | Makefile | | | 7 | ++++--- |
| M | htex.c | | | 648 | ++++++++++++++++++++++++------------------------------------------------------- |
| A | html.c | | | 790 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | html.h | | | 90 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | lib.c | | | 112 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | todo | | | 6 | +----- |
7 files changed, 1195 insertions(+), 460 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,2 @@
-test*.html
+test/*
htex
diff --git a/Makefile b/Makefile
@@ -2,7 +2,9 @@ PREFIX = /usr/local
MANPREFIX = $(PREFIX)/share/man
all:
- $(CC) -O -Wall -Werror -o htex htex.c
+ $(CC) -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme
+debug:
+ $(CC) -fsanitize=address -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme
clean:
rm htex
install: all
@@ -14,4 +16,4 @@ install: all
chmod 644 "$(MANPREFIX)/man1/htex.1"
uninstall:
rm "$(PREFIX)/bin/htex"
- rm "$(MANPREFIX)/man1/htex.1"
-\ No newline at end of file
+ rm "$(MANPREFIX)/man1/htex.1"
diff --git a/htex.c b/htex.c
@@ -1,461 +1,207 @@
#include <stdio.h>
+#include <string.h>
#include <stdbool.h>
#include <stdlib.h>
-#include <unistd.h>
#include <getopt.h>
-#include <string.h>
-#include <fcntl.h>
-
-char *text;
-char attribute_name[200];
-char tag_name[50];
-bool inner_html = false;
-bool except = false;
-struct match {
- int start;
- int end;
-};
-struct match *matches;
-
-int find_start_of_opening_tag_pos(int class_position) {
- int i = 1;
- while (1) {
- int pos = class_position - i;
- if (pos < 0)
- return -1;
- if (text[pos] == '<') {
- return pos;
- }
- i++;
- }
-}
-
-int find_end_of_opening_tag_pos(int class_position) {
- int i = 1;
- while (1) {
- int pos = class_position + i;
- if (pos < 0)
- return -1;
- if (text[pos] == '>') {
- return pos + 1;
- }
- i++;
- }
-}
-
-/*
- This function works only if the html tag
- has attributes.
-*/
-void find_tag_name(int open_tag_pos) {
- int i = 1;
- int end_of_tag_name = 0;
- while (1) {
- int pos = open_tag_pos + i;
- if (pos > strlen(text))
- return;
- if (text[pos] == ' ' || text[pos] == '\n') {
- end_of_tag_name = pos-1;
- break;
- }
- i++;
- }
- int length_tag_name = end_of_tag_name - open_tag_pos;
- for (int k=0; k<length_tag_name; k++) {
- tag_name[k] = text[open_tag_pos+k+1];
- }
- tag_name[length_tag_name] = '\0';
-}
-
-int find_closing_tag_pos(int open_tag_pos, bool inner_html) {
- int level = 1;
- int failure = 0;
- char close_tag[strlen(tag_name)+3];
- close_tag[0] = '<';
- close_tag[1] = '/';
- for (int k=0; k<strlen(tag_name); k++) {
- close_tag[2+k] = tag_name[k];
- }
- close_tag[sizeof(close_tag)-1] = '>';
- close_tag[sizeof(close_tag)] = '\0';
+#include <inttypes.h>
+#include <grapheme.h>
+#include "lib.c"
+#include "html.c"
- for (int l=open_tag_pos; l<strlen(text); l++) { // Could be more precise
- if (text[l] == '<') {
- for (int o=0; o<strlen(tag_name); o++) {
- if (tag_name[o] != text[l+o+1]) {
- failure = 1;
- break;
- }
- }
- if (failure == 0) {
- if (
- text[l+strlen(tag_name)+1] != ' ' &&
- text[l+strlen(tag_name)+1] != '>'
- ) {
- failure = 1;
- }
- }
- if (failure == 0) {
- level++;
- }
- failure = 0;
- if (text[l+1] == '/') {
- for (int o=2; o<strlen(close_tag); o++) {
- if (close_tag[o] != text[l+o]) {
- failure = 1;
- break;
- }
- }
- if (failure == 0) {
- level--;
- if (level == 0) {
- if (inner_html) {
- return l;
- } else {
- return l + strlen(tag_name) + 3;
- }
- }
- }
- failure = 0;
- }
- }
- }
- return -1;
+struct find_opts *parseFilterOpts(const char *pattern)
+{
+ struct find_opts *opt = malloc(sizeof(struct find_opts));
+ opt->out = OUT_OUTER_HTML;
+ opt->tag = malloc(sizeof(char));
+ opt->tag[0] = 0;
+ opt->attr = malloc(sizeof(char));
+ opt->attr[0] = 0;
+ opt->key = malloc(sizeof(char));
+ opt->key[0] = 0;
+ bool isClassValue = false;
+ bool isIdValue = false;
+ int i = 0;
+ bool isAttrKey = false;
+ bool isAttrOrTag = true;
+ char *attrOrTag = NULL;
+ int aot = 0;
+ int ak = 0;
+ int av = 0;
+ switch (pattern[0])
+ {
+ case '.':
+ isClassValue = true;
+ i = 1;
+ break;
+ case '#':
+ isIdValue = true;
+ i = 1;
+ break;
+ }
+ for (; i<strlen(pattern); i++)
+ {
+ if (pattern[i] == ']')
+ break;
+ if (
+ !isAttrKey &&
+ !isAttrOrTag &&
+ pattern[i] != ']' &&
+ pattern[i] != '"'
+ )
+ {
+ opt->attr = realloc(opt->attr, (av+1) * sizeof(char));
+ opt->attr[av] = pattern[i];
+ av++;
+ }
+ if (pattern[i] == '=')
+ isAttrKey = false;
+ if (isAttrKey && !isAttrOrTag)
+ {
+ opt->key = realloc(opt->key, (ak+1) * sizeof(char));
+ opt->key[ak] = pattern[i];
+ ak++;
+ }
+ if (pattern[i] == '[')
+ {
+ isAttrKey = true;
+ isAttrOrTag = false;
+ }
+ if (isAttrOrTag)
+ {
+ attrOrTag = realloc(attrOrTag, (aot+1) * sizeof(char));
+ attrOrTag[aot] = pattern[i];
+ aot++;
+ }
+ }
+ attrOrTag = realloc(attrOrTag, (aot+1) * sizeof(char));
+ attrOrTag[aot] = 0;
+ if (isIdValue)
+ {
+ free(opt->key);
+ opt->key = NULL;
+ free(opt->attr);
+ opt->attr = NULL;
+ opt->attr = attrOrTag;
+ opt->key = realloc(opt->key, 3 * sizeof(char));
+ opt->key[0] = 'i';
+ opt->key[1] = 'd';
+ opt->key[2] = 0;
+ }
+ else if (isClassValue)
+ {
+ free(opt->key);
+ opt->key = NULL;
+ free(opt->attr);
+ opt->attr = NULL;
+ opt->attr = attrOrTag;
+ opt->key = realloc(opt->key, 6 * sizeof(char));
+ opt->key[0] = 'c';
+ opt->key[1] = 'l';
+ opt->key[2] = 'a';
+ opt->key[3] = 's';
+ opt->key[4] = 's';
+ opt->key[5] = 0;
+ }
+ else
+ {
+ free(opt->tag);
+ opt->tag = attrOrTag;
+ if (av > 0)
+ {
+ opt->attr = realloc(opt->attr, (av+1) * sizeof(char));
+ opt->attr[av] = 0;
+ }
+ if (ak > 0)
+ {
+ opt->key = realloc(opt->key, (ak+1) * sizeof(char));
+ opt->key[ak] = 0;
+ }
+ }
+ return opt;
}
-bool correct_name_begin_or_end(char prev_char) {
- switch(prev_char) {
- case '"':
- return true;
- case '\'':
- return true;
- case ' ':
- return true;
- default:
- return false;
- }
+void freeOpts(struct find_opts *opt)
+{
+ free(opt->tag);
+ free(opt->attr);
+ free(opt->key);
+ free(opt);
}
-void find_html_tag_by_class(char *class_name) {
- int o = 0;
- int failure = 0;
- int counter = 0;
- int is_not_quotation_mark = 1;
-
- for (int k=0; k<strlen(text); k++) {
- if (
- text[k] == 'c' &&
- text[k+1] == 'l' &&
- text[k+2] == 'a' &&
- text[k+3] == 's' &&
- text[k+4] == 's'
- ) {
- while (is_not_quotation_mark == 1) {
- if (text[k+7+o] == '"' || text[k+7+o] == '\'') {
- is_not_quotation_mark = 0;
- break;
- }
- if (class_name[0] == text[k+7+o]) {
- for (int l=1; l<strlen(class_name); l++) {
- if (class_name[l] != text[k+7+o+l]) {
- failure = 1;
- break;
- }
- }
- if (failure == 0) {
- if (
- !correct_name_begin_or_end(text[k+6+o]) ||
- !correct_name_begin_or_end(text[k+7+o+strlen(class_name)])
- ) {
- failure = 1;
- }
- }
- if (failure == 0) {
- if (inner_html) {
- int start_of_open_tag_pos = find_start_of_opening_tag_pos(k);
- find_tag_name(start_of_open_tag_pos);
- int end_of_open_tag_pos = find_end_of_opening_tag_pos(k);
- int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, true);
- for (int e=end_of_open_tag_pos; e<close_tag_pos; e++) {
- printf("%c", text[e]);
- }
- printf("\n");
- } else {
- int open_tag_pos = find_start_of_opening_tag_pos(k);
- find_tag_name(open_tag_pos);
- int end_of_open_tag_pos = find_end_of_opening_tag_pos(k);
- int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, false);
- if (except) {
- matches = realloc(matches, (counter+1) * sizeof(struct match));
- matches[counter].start = open_tag_pos;
- matches[counter].end = close_tag_pos;
- counter++;
- } else {
- for (int e=open_tag_pos; e<close_tag_pos; e++) {
- printf("%c", text[e]);
- }
- printf("\n");
- }
- }
- }
- failure = 0;
- }
- o++;
- }
- is_not_quotation_mark = 1;
- o = 0;
- }
- }
- if (except) {
- int start = 0;
- for (int i=0; i<counter; i++) {
- for (int e=start; e<matches[i].start; e++) {
- printf("%c", text[e]);
- }
- start = matches[i].end;
- }
- for (int i=start; i<strlen(text); i++) {
- printf("%c", text[i]);
- }
- printf("\n");
- free(matches);
- }
-}
-
-void find_html_tag_by_id(char *id_name) {
- int o = 0;
- int failure = 0;
- int counter = 0;
- int is_not_quotation_mark = 1;
-
- for (int k=0; k<strlen(text); k++) {
- if (
- text[k] == 'i' &&
- text[k+1] == 'd'
- ) {
- while (is_not_quotation_mark == 1) {
- if (text[k+4+o] == '"' || text[k+4+o] == '\'') {
- is_not_quotation_mark = 0;
- break;
- }
- if (id_name[0] == text[k+4+o]) {
- for (int l=1; l<strlen(id_name); l++) {
- if (id_name[l] != text[k+4+o+l]) {
- failure = 1;
- break;
- }
- }
- if (failure == 0) {
- if (
- !correct_name_begin_or_end(text[k+3+o]) ||
- !correct_name_begin_or_end(text[k+4+o+strlen(id_name)])
- ) {
- failure = 1;
- }
- }
- if (failure == 0) {
- if (inner_html) {
- int start_of_open_tag_pos = find_start_of_opening_tag_pos(k);
- find_tag_name(start_of_open_tag_pos);
- int end_of_open_tag_pos = find_end_of_opening_tag_pos(k);
- int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, true);
- for (int e=end_of_open_tag_pos; e<close_tag_pos; e++) {
- printf("%c", text[e]);
- }
- printf("\n");
- } else {
- int start_of_open_tag_pos = find_start_of_opening_tag_pos(k);
- find_tag_name(start_of_open_tag_pos);
- int end_of_open_tag_pos = find_end_of_opening_tag_pos(k);
- int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, false);
- if (except) {
- matches = realloc(matches, (counter+1) * sizeof(struct match));
- matches[counter].start = start_of_open_tag_pos;
- matches[counter].end = close_tag_pos;
- counter++;
- } else {
- for (int e=start_of_open_tag_pos; e<close_tag_pos; e++) {
- printf("%c", text[e]);
- }
- printf("\n");
- }
- }
- }
- failure = 0;
- }
- o++;
- }
- is_not_quotation_mark = 1;
- o = 0;
- }
- }
- if (except) {
- int start = 0;
- for (int i=0; i<counter; i++) {
- for (int e=start; e<matches[i].start; e++) {
- printf("%c", text[e]);
- }
- start = matches[i].end;
- }
- for (int i=start; i<strlen(text); i++) {
- printf("%c", text[i]);
- }
- printf("\n");
- free(matches);
- }
-}
-void find_html_tag_by_tag() {
- int failure = 0;
- int counter = 0;
- for (int k=0; k<strlen(text); k++) {
- if (text[k] == '<' && text[k+1] != '/') {
- for (int o=0; o<strlen(attribute_name); o++) {
- if (attribute_name[o] != text[k+1+o]) {
- failure = 1;
- break;
- }
- }
- if (failure == 0) {
- if (
- text[k+1+strlen(attribute_name)] == '>' ||
- text[k+1+strlen(attribute_name)] == ' ' ||
- text[k+1+strlen(attribute_name)] == '\n'
- ) {
- int open_tag_pos = k;
- int after_tag_pos = k+1+strlen(attribute_name)+1;
- if (inner_html) {
- int close_tag_pos = find_closing_tag_pos(after_tag_pos, true);
- int end_of_open_tag_pos = find_end_of_opening_tag_pos(k+strlen(attribute_name));
- for (int e=end_of_open_tag_pos; e<close_tag_pos; e++) {
- printf("%c", text[e]);
- }
- printf("\n");
- } else {
- int close_tag_pos = find_closing_tag_pos(after_tag_pos, false);
- if (except) {
- matches = realloc(matches, (counter+1) * sizeof(struct match));
- matches[counter].start = open_tag_pos;
- matches[counter].end = close_tag_pos;
- counter++;
- } else {
- for (int e=open_tag_pos; e<close_tag_pos; e++) {
- printf("%c", text[e]);
- }
- printf("\n");
- }
- }
- }
- }
- failure = 0;
- }
- }
- if (except) {
- int start = 0;
- for (int i=0; i<counter; i++) {
- for (int e=start; e<matches[i].start; e++) {
- printf("%c", text[e]);
- }
- start = matches[i].end;
- }
- for (int i=start; i<strlen(text); i++) {
- printf("%c", text[i]);
- }
- printf("\n");
- free(matches);
- }
-}
-
-void find_html_tag() {
- char identifier[200];
- for (int i=0; i<strlen(attribute_name); i++) {
- identifier[i] = attribute_name[i+1];
- }
- switch(attribute_name[0]) {
- case '.':
- find_html_tag_by_class(identifier);
- break;
- case '#':
- find_html_tag_by_id(identifier);
- break;
- default:
- for (int i=0; i<strlen(attribute_name); i++) {
- tag_name[i] = attribute_name[i];
- }
- find_html_tag_by_tag();
- }
-}
-
-int main(int argc, char *argv[]) {
- int i = 0;
- char buffer;
- int o;
- text = malloc(sizeof(char));
- if (!text) {
- printf("malloc error.\n");
- return -1;
- }
-
- static struct option long_options[] = {
- { "attribute", required_argument, 0, 'a' },
- { "innerhtml", no_argument, 0, 'i' },
- { "except", no_argument, 0, 'e' },
- { 0, 0, 0, 0 }
- };
- int option_index = 0;
- while ((o = getopt_long(argc, argv, "eia:", long_options, &option_index)) != -1) {
- switch(o) {
- case 'a':
- for (int j=0; j<strlen(optarg); j++) {
- attribute_name[j] = optarg[j];
- }
- break;
- case 'i':
- inner_html = true;
- break;
- case 'e':
- except = true;
- break;
- }
- }
- if (inner_html && except) {
- printf("You can't use the options -i (--innerhtml) and -e (--except) at the same time.\n");
- return -1;
- }
- if (argc == (optind + 1)) {
- if (*argv[argc-1] == '-') {
- while (read(0, &buffer, 1) > 0) {
- text[i] = buffer;
- i++;
- text = realloc(text, (i+1) * sizeof(char));
- if (!text || text == NULL) {
- printf("realloc error.\n");
- return -1;
- }
- }
- text[i] = '\0';
- find_html_tag();
- free(text);
- } else {
- int fd = open(argv[argc-1], O_RDONLY);
- if (fd != -1) {
- while (read(fd, &buffer, 1) > 0) {
- text[i] = buffer;
- i++;
- text = realloc(text, (i+1) * sizeof(char));
- if (!text || text == NULL) {
- printf("realloc error.\n");
- return -1;
- }
- }
- text[i] = '\0';
- find_html_tag();
- free(text);
- } else {
- printf("Couldn't read file \"%s\"\n", argv[argc-1]);
- }
- }
- } else {
- printf("Nothing to read from.\n");
- }
- return 0;
+int main(int argc, char *argv[])
+{
+ int o = 0;
+ int option_index = 0;
+ bool isInnerHtml = false;
+ bool isInnerText = false;
+ bool isExcept = false;
+ char *text = NULL;
+ char *searchPattern = NULL;
+ static struct option long_options[] = {
+ { "innerhtml", no_argument, 0, 'i' },
+ { "innertext", no_argument, 0, 't' },
+ { "except", no_argument, 0, 'e' },
+ { 0, 0, 0, 0 }
+ };
+ while ((o = getopt_long(argc, argv, "ite", long_options, &option_index)) != -1) {
+ switch(o) {
+ case 'i':
+ isInnerHtml = true;
+ break;
+ case 't':
+ isInnerText = true;
+ break;
+ case 'e':
+ isExcept = true;
+ break;
+ }
+ }
+ if (isInnerHtml && isInnerText)
+ {
+ fprintf(stderr, "Provide either --innerhtml or --innertext.\n");
+ return -1;
+ }
+ if (argc == optind)
+ {
+ fprintf(stderr, "Provide a search pattern!\n");
+ return -1;
+ }
+ if (argc > optind+2)
+ {
+ fprintf(stderr, "Provide only one file!\n");
+ return -1;
+ }
+ if (argc == optind+1)
+ {
+ searchPattern = argv[argc-1];
+ text = readFile(stdin);
+ }
+ else if (argc == optind+2)
+ {
+ searchPattern = argv[argc-2];
+ char *filepath = argv[argc-1];
+ FILE *fp = fopen(filepath, "r");
+ if (fp == NULL)
+ {
+ perror("fopen failed: ");
+ return -1;
+ }
+ text = readFile(fp);
+ fclose(fp);
+ if (strlen(text) == 0)
+ {
+ printf("No data in file.\n");
+ return 0;
+ }
+ }
+ struct find_opts *options = parseFilterOpts(searchPattern);
+ options->isExcept = isExcept;
+ if (isInnerHtml)
+ options->out = OUT_INNER_HTML;
+ if (isInnerText)
+ options->out = OUT_INNER_TEXT;
+ filterHtml(text, options);
+ freeOpts(options);
+ free(text);
+ return 0;
}
diff --git a/html.c b/html.c
@@ -0,0 +1,790 @@
+#include "html.h"
+
+const char *stateToString(enum state s)
+{
+ switch(s)
+ {
+ case STATE_INNER_TEXT: return "STATE_INNER_TEXT";
+ case STATE_TAG: return "STATE_TAG";
+ case STATE_BEGIN_TAG_NAME: return "STATE_BEGIN_TAG_NAME";
+ case STATE_END_TAG_NAME: return "STATE_END_TAG_NAME";
+ case STATE_ATTR_NAME: return "STATE_ATTR_NAME";
+ case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE";
+ case STATE_COMMENT: return "STATE_COMMENT";
+ case STATE_SCRIPT: return "STATE_SCRIPT";
+ case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG";
+ case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG";
+ case STATE_STYLE: return "STATE_STYLE";
+ case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG";
+ case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG";
+ }
+ return "";
+}
+
+struct attr *initAttr()
+{
+ struct attr *a = malloc(sizeof(struct attr));
+ a->name = malloc(sizeof(char));
+ a->name[0] = 0;
+ a->value = malloc(sizeof(char));
+ a->value[0] = 0;
+ return a;
+}
+
+struct tag *initTag()
+{
+ struct tag *t = malloc(sizeof(struct tag));
+ t->name = malloc(sizeof(char));
+ t->name[0] = 0;
+ t->innerText = malloc(sizeof(char));
+ t->innerText[0] = 0;
+ t->attrs = NULL;
+ t->children = NULL;
+ t->attrsLen = 0;
+ t->childrenLen = 0;
+ t->_isVoidElement = false;
+ t->_isClosed = false;
+ return t;
+}
+
+struct tag_list *initTagList()
+{
+ struct tag_list *t = malloc(sizeof(struct tag_list));
+ t->tags = NULL;
+ t->len = 0;
+ return t;
+}
+
+static inline bool isASCIIDigit(uint_least32_t cp)
+{
+ if (cp >= 0x30 && cp <= 0x39)
+ return true;
+ return false;
+}
+
+static inline bool isASCIIAlphaUpper(uint_least32_t cp)
+{
+ if (cp >= 0x41 && cp <= 0x5A)
+ return true;
+ return false;
+}
+
+static inline bool isASCIIAlphaLower(uint_least32_t cp)
+{
+ if (cp >= 0x61 && cp <= 0x7A)
+ return true;
+ return false;
+}
+
+static inline bool isASCIIAlpha(uint_least32_t cp)
+{
+ if (isASCIIAlphaLower(cp) || isASCIIAlphaUpper(cp))
+ return true;
+ return false;
+}
+
+static inline bool isASCIIWhitespace(uint_least32_t cp)
+{
+ if (
+ cp == TAB ||
+ cp == LF ||
+ cp == FF ||
+ cp == CR ||
+ cp == SPACE
+ )
+ return true;
+ return false;
+}
+
+static inline bool isVoidElement(const char *tagName)
+{
+ for (int i=0; i<13; i++)
+ {
+ if (strcmp(tagName, voidElements[i]) == 0)
+ return true;
+ }
+ return false;
+}
+
+static inline bool isC0Control(uint_least32_t cp)
+{
+ if (cp >= 0x00 && cp <= 0x1F)
+ return true;
+ return false;
+}
+
+static inline bool isControl(uint_least32_t cp)
+{
+ if (isC0Control(cp))
+ return true;
+ if (cp >= 0x7F && cp <= 0x9F)
+ return true;
+ return false;
+}
+
+static inline bool isNonChar(uint_least32_t cp)
+{
+ if (cp >= 0xFDD0 && cp <= 0xFDEF)
+ return true;
+ if (
+ cp == 0xFFFE || cp == 0xFFFF ||
+ cp == 0x1FFFE || cp == 0x1FFFF ||
+ cp == 0x2FFFE || cp == 0x2FFFF ||
+ cp == 0x3FFFE || cp == 0x3FFFF ||
+ cp == 0x4FFFE || cp == 0x4FFFF ||
+ cp == 0x5FFFE || cp == 0x5FFFF ||
+ cp == 0x6FFFE || cp == 0x6FFFF ||
+ cp == 0x7FFFE || cp == 0x7FFFF ||
+ cp == 0x8FFFE || cp == 0x8FFFF ||
+ cp == 0x9FFFE || cp == 0x9FFFF ||
+ cp == 0xAFFFE || cp == 0xAFFFF ||
+ cp == 0xBFFFE || cp == 0xBFFFF ||
+ cp == 0xCFFFE || cp == 0xCFFFF ||
+ cp == 0xDFFFE || cp == 0xDFFFF ||
+ cp == 0xEFFFE || cp == 0xEFFFF ||
+ cp == 0xFFFFE || cp == 0xFFFFF ||
+ cp == 0x10FFFE || cp == 0x10FFFF
+ )
+ return true;
+ return false;
+}
+
+static inline bool isValidAttrName(uint_least32_t cp)
+{
+ if (isControl(cp))
+ return false;
+ if (isNonChar(cp))
+ return false;
+ if (
+ cp == SPACE ||
+ cp == QUOTATION_MARK ||
+ cp == APOSTROPHE ||
+ cp == GREATER_THAN_SIGN ||
+ cp == SOLIDUS ||
+ cp == EQUALS_SIGN
+ )
+ return false;
+ return true;
+}
+
+static inline bool
+isValidUnquotedAttrValue(uint_least32_t cp)
+{
+ /*
+ Invalid characters not listed here
+ are already handled before the
+ function call.
+ */
+ if (
+ cp == EQUALS_SIGN ||
+ cp == LESS_THAN_SIGN ||
+ cp == GREATER_THAN_SIGN ||
+ cp == GRAVE_ACCENT
+ )
+ return false;
+ return true;
+}
+
+size_t parseDoctype(const char *text)
+{
+ char *firstLine = NULL;
+ int i = 0;
+ while (text[i] != '\n')
+ {
+ firstLine = realloc(firstLine, (i+1) * sizeof(char));
+ firstLine[i] = text[i];
+ i++;
+ }
+ firstLine = realloc(firstLine, (i+1) * sizeof(char));
+ firstLine[i] = 0;
+ if (strcmp("<!DOCTYPE html>", firstLine) == 0)
+ {
+ free(firstLine);
+ return i+1;
+ }
+ if (strcmp("<!doctype html>", firstLine) == 0)
+ {
+ free(firstLine);
+ return i+1;
+ }
+ free(firstLine);
+ return 0;
+}
+
+struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, size_t endOffset)
+{
+ for (int i=tagList->len-1; i>-1; i--)
+ {
+ if (strcmp(tagList->tags[i]->name, endTag) == 0 && !tagList->tags[i]->_isClosed)
+ {
+ tagList->tags[i]->_isClosed = true;
+ tagList->tags[i]->_outerHtmlEndOffset = endOffset;
+ return tagList->tags[i];
+ }
+ }
+ return NULL;
+}
+
+struct tag *getLastOpenTag(struct tag_list *tagList)
+{
+ for (int i=tagList->len-1; i>-1; i--)
+ {
+ if (!tagList->tags[i]->_isVoidElement && !tagList->tags[i]->_isClosed)
+ {
+ return tagList->tags[i];
+ }
+ }
+ return tagList->tags[0];
+}
+
+char *getOuterHtml(char *text, struct tag *t)
+{
+ char *outerHtml = NULL;
+ int o = 0;
+ for (int i=t->_outerHtmlBeginOffset; i<t->_outerHtmlEndOffset; i++)
+ {
+ outerHtml = realloc(outerHtml, (o+1) * sizeof(char));
+ outerHtml[o] = text[i];
+ o++;
+ }
+ outerHtml = realloc(outerHtml, (o+1) * sizeof(char));
+ outerHtml[o] = 0;
+ return outerHtml;
+}
+
+char *getInnerHtml(char *text, struct tag *t)
+{
+ char *innerHtml = NULL;
+ int o = 0;
+ for (int i=t->_innerHtmlBeginOffset; i<t->_innerHtmlEndOffset; i++)
+ {
+ innerHtml = realloc(innerHtml, (o+1) * sizeof(char));
+ innerHtml[o] = text[i];
+ o++;
+ }
+ innerHtml = realloc(innerHtml, (o+1) * sizeof(char));
+ innerHtml[o] = 0;
+ return innerHtml;
+}
+
+void setInnerHtmlEndOffset(struct tag *closedTag, char *text, size_t off)
+{
+ int i = off;
+ while (text[i] != '<')
+ {
+ i--;
+ }
+ closedTag->_innerHtmlEndOffset = i;
+}
+
+enum state endOfBeginTag(struct tag *t, size_t offset)
+{
+ t->_innerHtmlBeginOffset = offset+1;
+ t->_isVoidElement = isVoidElement(t->name);
+ if (t->_isVoidElement)
+ t->_outerHtmlEndOffset = offset+1;
+ if (strcmp(t->name, "script") == 0)
+ return STATE_SCRIPT;
+ else if (strcmp(t->name, "style") == 0)
+ return STATE_STYLE;
+ else
+ return STATE_INNER_TEXT;
+}
+
+struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_list *tagList)
+{
+ struct tag *tag = initTag();
+ tag->_outerHtmlBeginOffset= offset-1;
+ tagList->tags = realloc(tagList->tags, (tagList->len+1) * sizeof(struct tag));
+ tagList->tags[tagList->len] = tag;
+ tagList->len++;
+ struct tag *stillOpenTag = tag;
+ char *endTag = malloc(sizeof(char));
+ endTag[0] = 0;
+ size_t a = 0;
+ size_t attrNameCount = 0;
+ enum attr_value_syntax attrValueSyntax = AVS_NO;
+ size_t hyphenCount = 0;
+ uint_least32_t cp;
+ size_t len = strlen(text);
+ size_t ret, off;
+ for (off = offset; off<len; off += ret)
+ {
+ if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off)
+ {
+ printError("Something wrong with ending of text");
+ }
+ else
+ {
+ // char *the_codepoint = cpToChars(cp, ret);
+ // printf("cp: %02X, %s, %s\n", cp, the_codepoint, stateToString(state));
+ // free(the_codepoint);
+ switch (state)
+ {
+ case STATE_INNER_TEXT:
+ if (cp == LESS_THAN_SIGN)
+ {
+ state = STATE_TAG;
+ break;
+ }
+ stillOpenTag = getLastOpenTag(tagList);
+ stillOpenTag->innerText = stringCat(stillOpenTag->innerText, cpToChars(cp, ret));
+ break;
+ case STATE_TAG:
+ if (cp == SOLIDUS)
+ {
+ state = STATE_END_TAG_NAME;
+ break;
+ }
+ if (cp == EXCLAMATION_MARK)
+ {
+ state = STATE_COMMENT;
+ break;
+ }
+ stillOpenTag = getLastOpenTag(tagList);
+ struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList);
+ stillOpenTag->children = realloc(
+ stillOpenTag->children,
+ (stillOpenTag->childrenLen+1) * sizeof(struct tag)
+ );
+ stillOpenTag->children[stillOpenTag->childrenLen] = oneTag;
+ stillOpenTag->childrenLen++;
+ free(endTag);
+ return tag;
+ case STATE_BEGIN_TAG_NAME:
+ if (cp == GREATER_THAN_SIGN)
+ {
+ state = endOfBeginTag(tag, off);
+ break;
+ }
+ if (isASCIIWhitespace(cp))
+ {
+ state = STATE_ATTR_NAME;
+ break;
+ }
+ if (isASCIIDigit(cp) || isASCIIAlpha(cp))
+ {
+ tag->name = stringCat(tag->name, cpToChars(cp, ret));
+ }
+ break;
+ case STATE_END_TAG_NAME:
+ if (cp == GREATER_THAN_SIGN)
+ {
+ struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
+ setInnerHtmlEndOffset(closedTag, text, off);
+ free(endTag);
+ endTag = malloc(sizeof(char));
+ endTag[0] = 0;
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (!isASCIIWhitespace(cp))
+ endTag = stringCat(endTag, cpToChars(cp, ret));
+ break;
+ case STATE_ATTR_NAME:
+ if (cp == GREATER_THAN_SIGN)
+ {
+ state = endOfBeginTag(tag, off);
+ break;
+ }
+ if (isASCIIWhitespace(cp))
+ {
+ if (attrNameCount == a+1)
+ a++;
+ break;
+ }
+ if (cp == EQUALS_SIGN)
+ {
+ state = STATE_ATTR_VALUE;
+ break;
+ }
+ if (isValidAttrName(cp))
+ {
+ if (attrNameCount != a+1)
+ {
+ tag->attrs = realloc(
+ tag->attrs,
+ (a+1) * sizeof(struct attr)
+ );
+ tag->attrs[a] = initAttr();
+ attrNameCount = a + 1;
+ tag->attrsLen = attrNameCount;
+ }
+ tag->attrs[a]->name = stringCat(
+ tag->attrs[a]->name,
+ cpToChars(cp, ret)
+ );
+ }
+ break;
+ case STATE_ATTR_VALUE:
+ if (isASCIIWhitespace(cp))
+ {
+ if (attrValueSyntax == AVS_UNQUOTED)
+ {
+ attrValueSyntax = AVS_NO;
+ state = STATE_ATTR_NAME;
+ }
+ else if (attrValueSyntax == AVS_QUOTATION_MARK || attrValueSyntax == AVS_APOSTROPHE)
+ {
+ char *tmpName = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char));
+ strcpy(tmpName, tag->attrs[a]->name);
+ tag->attrs = realloc(
+ tag->attrs,
+ (a+1) * sizeof(struct attr)
+ );
+ a++;
+ tag->attrs[a] = initAttr();
+ free(tag->attrs[a]->name);
+ tag->attrs[a]->name = tmpName;
+ tag->attrsLen++;
+ attrNameCount = a + 1;
+ }
+ break;
+ }
+ if (cp == QUOTATION_MARK)
+ {
+ if (attrValueSyntax == AVS_NO)
+ {
+ attrValueSyntax = AVS_QUOTATION_MARK;
+ break;
+ }
+ if (attrValueSyntax == AVS_QUOTATION_MARK)
+ {
+ attrValueSyntax = AVS_NO;
+ state = STATE_ATTR_NAME;
+ break;
+ }
+ }
+ if (cp == APOSTROPHE)
+ {
+ if (attrValueSyntax == AVS_NO)
+ {
+ attrValueSyntax = AVS_APOSTROPHE;
+ break;
+ }
+ if (attrValueSyntax == AVS_APOSTROPHE)
+ {
+ attrValueSyntax = AVS_NO;
+ state = STATE_ATTR_NAME;
+ break;
+ }
+ }
+ if (cp == GREATER_THAN_SIGN)
+ {
+ state = endOfBeginTag(tag, off);
+ break;
+ }
+ if (
+ attrValueSyntax == AVS_NO &&
+ isValidUnquotedAttrValue(cp)
+ )
+ {
+ attrValueSyntax = AVS_UNQUOTED;
+ }
+ if (attrValueSyntax > AVS_NO)
+ {
+ tag->attrs[a]->value = stringCat(
+ tag->attrs[a]->value,
+ cpToChars(cp, ret)
+ );
+ }
+ break;
+ case STATE_COMMENT:
+ if (cp == GREATER_THAN_SIGN && hyphenCount >= 2)
+ {
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (cp == HYPHEN_MINUS)
+ hyphenCount++;
+ else
+ hyphenCount = 0;
+ break;
+ case STATE_STYLE:
+ if (cp == LESS_THAN_SIGN)
+ {
+ state = STATE_STYLE_POSSIBLE_END_TAG;
+ break;
+ }
+ break;
+ case STATE_STYLE_POSSIBLE_END_TAG:
+ if (cp == SOLIDUS)
+ state = STATE_STYLE_END_TAG;
+ else
+ state = STATE_STYLE;
+ break;
+ case STATE_STYLE_END_TAG:
+ if (cp == GREATER_THAN_SIGN)
+ {
+ struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
+ setInnerHtmlEndOffset(closedTag, text, off);
+ free(endTag);
+ endTag = malloc(sizeof(char));
+ endTag[0] = 0;
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (!isASCIIWhitespace(cp))
+ endTag = stringCat(endTag, cpToChars(cp, ret));
+ break;
+ case STATE_SCRIPT:
+ if (cp == LESS_THAN_SIGN)
+ {
+ state = STATE_SCRIPT_POSSIBLE_END_TAG;
+ break;
+ }
+ break;
+ case STATE_SCRIPT_POSSIBLE_END_TAG:
+ if (cp == SOLIDUS)
+ state = STATE_SCRIPT_END_TAG;
+ else
+ state = STATE_SCRIPT;
+ break;
+ case STATE_SCRIPT_END_TAG:
+ if (cp == GREATER_THAN_SIGN)
+ {
+ struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
+ setInnerHtmlEndOffset(closedTag, text, off);
+ free(endTag);
+ endTag = malloc(sizeof(char));
+ endTag[0] = 0;
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (!isASCIIWhitespace(cp))
+ endTag = stringCat(endTag, cpToChars(cp, ret));
+ break;
+ }
+ }
+ }
+ free(endTag);
+ return tag;
+}
+
+void freeTag(struct tag *t)
+{
+ free(t->name);
+ free(t->innerText);
+ for (int i=0; i<t->attrsLen; i++)
+ {
+ free(t->attrs[i]->name);
+ free(t->attrs[i]->value);
+ free(t->attrs[i]);
+ }
+ free(t->attrs);
+ for (int i=0; i<t->childrenLen; i++)
+ {
+ if (t->children[i] != NULL)
+ freeTag(t->children[i]);
+ }
+ free(t->children);
+ free(t);
+}
+
+void freeTagList(struct tag_list *t)
+{
+ free(t->tags);
+ free(t);
+}
+
+void findTag(struct tag *tag, struct find_opts *opt, struct tag_list *foundTags)
+{
+ bool matchesTag = false;
+ bool matchesAttrKey = false;
+ bool matchesAttrValue = false;
+ if (strcmp(tag->name, opt->tag) == 0)
+ matchesTag = true;
+ for (int i=0; i<tag->attrsLen; i++)
+ {
+ if (strcmp(tag->attrs[i]->name, opt->key) == 0)
+ matchesAttrKey = true;
+ if (strcmp(tag->attrs[i]->value, opt->attr) == 0)
+ matchesAttrValue = true;
+ }
+ if (strlen(opt->tag) > 0 && strlen(opt->key) > 0 && strlen(opt->attr) > 0)
+ {
+ if (matchesTag && matchesAttrKey && matchesAttrValue)
+ {
+ foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag));
+ foundTags->tags[foundTags->len] = tag;
+ foundTags->len++;
+ }
+ }
+ else if (strlen(opt->tag) > 0 && strlen(opt->key) > 0)
+ {
+ if (matchesTag && matchesAttrKey)
+ {
+ foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag));
+ foundTags->tags[foundTags->len] = tag;
+ foundTags->len++;
+ }
+ }
+ else if (strlen(opt->tag) > 0)
+ {
+ if (matchesTag)
+ {
+ foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag));
+ foundTags->tags[foundTags->len] = tag;
+ foundTags->len++;
+ }
+ }
+ else if (strlen(opt->key) > 0 && strlen(opt->attr) > 0)
+ {
+ if (matchesAttrKey && matchesAttrValue)
+ {
+ foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag));
+ foundTags->tags[foundTags->len] = tag;
+ foundTags->len++;
+ }
+ }
+ else if (strlen(opt->key) > 0)
+ {
+ if (matchesAttrKey)
+ {
+ foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag));
+ foundTags->tags[foundTags->len] = tag;
+ foundTags->len++;
+ }
+ }
+ else if (strlen(opt->attr) > 0)
+ {
+ if (matchesAttrValue)
+ {
+ foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag));
+ foundTags->tags[foundTags->len] = tag;
+ foundTags->len++;
+ }
+ }
+ for (int i=tag->childrenLen-1; i>-1; i--)
+ {
+ findTag(tag->children[i], opt, foundTags);
+ }
+}
+
+void printHtml(struct tag *t, int indent)
+{
+ for (int i=0; i<indent; i++)
+ putchar(' ');
+ printf("%s\n", t->name);
+ indent++;
+ for (int i=t->childrenLen-1; i>-1; i--)
+ {
+ printHtml(t->children[i], indent);
+ }
+}
+
+void printTag(char *text, struct tag *t, enum output_type out, struct tag_list *foundTags)
+{
+ switch (out)
+ {
+ case OUT_INNER_HTML:
+ break;
+ case OUT_OUTER_HTML:
+ break;
+ case OUT_INNER_TEXT:
+ break;
+ }
+ /* unsigned int p = *(unsigned int *)t;
+ unsigned int cp;
+ bool isMatch = false;
+ for (int i=0; i<foundTags->len; i++)
+ {
+ cp = *(unsigned int *)foundTags->tags[i];
+ if (p == cp)
+ isMatch = true;
+ }
+ if (!isMatch)
+ {
+ char *trimmedText = NULL;
+ switch (out)
+ {
+ case OUT_INNER_HTML:
+ trimmedText = trim(getInnerHtml(text, t));
+ break;
+ case OUT_OUTER_HTML:
+ trimmedText = trim(getOuterHtml(text, t));
+ break;
+ }
+ if (strlen(trimmedText) > 0)
+ printf("%s\n", trimmedText);
+ free(trimmedText);
+ } */
+ for (int i=t->childrenLen-1; i>-1; i--)
+ {
+ printTag(text, t->children[i], out, foundTags);
+ }
+}
+
+void printResult
+(
+ char *text,
+ struct tag *rootTag,
+ struct find_opts *opts,
+ struct tag_list *foundTags
+)
+{
+ if (opts->isExcept)
+ {
+ // printTag(text, rootTag, opts->out, foundTags);
+ }
+ else
+ {
+ char *requestedText = NULL;
+ char *trimmedText = NULL;
+ for (int i=0; i<foundTags->len; i++)
+ {
+ switch (opts->out)
+ {
+ case OUT_INNER_HTML:
+ requestedText = getInnerHtml(text, foundTags->tags[i]);
+ trimmedText = trim(requestedText);
+ free(requestedText);
+ break;
+ case OUT_OUTER_HTML:
+ requestedText = getOuterHtml(text, foundTags->tags[i]);
+ trimmedText = trim(requestedText);
+ free(requestedText);
+ break;
+ case OUT_INNER_TEXT:
+ trimmedText = trim(foundTags->tags[i]->innerText);
+ break;
+ }
+ if (strlen(trimmedText) > 0)
+ printf("%s\n", trimmedText);
+ free(trimmedText);
+ }
+ }
+}
+
+bool existFindPattern(struct find_opts *opts)
+{
+ if (strlen(opts->tag) > 0)
+ return true;
+ if (strlen(opts->attr) > 0)
+ return true;
+ if (strlen(opts->key) > 0)
+ return true;
+ return false;
+}
+
+void filterHtml(char *text, struct find_opts *opts)
+{
+ struct tag_list *tagList = initTagList();
+ struct tag_list *foundTags = initTagList();
+ size_t len = parseDoctype(text);
+ if (len)
+ text += len;
+ struct tag *rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList);
+ if (!existFindPattern(opts))
+ {
+ foundTags->tags = realloc(foundTags->tags, sizeof(struct tag));
+ foundTags->tags[0] = rootTag;
+ foundTags->len = 1;
+ }
+ else
+ findTag(rootTag, opts, foundTags);
+ printResult(text, rootTag, opts, foundTags);
+ freeTag(rootTag);
+ freeTagList(tagList);
+ freeTagList(foundTags);
+}
diff --git a/html.h b/html.h
@@ -0,0 +1,90 @@
+// Print msg (a C string) to stderr, prefixed with the name of the function
+// the macro is expanded in. The do { } while (0) wrapper makes the
+// expansion behave like a single statement.
+#define printError(msg) do { fprintf(stderr, "%s: %s\n", __func__, msg); } while (0)
+
+// Named ASCII code points (hexadecimal), so that character comparisons
+// can be written without magic numbers.
+// NOTE(review): presumably compared against decoded code points by the
+// parser in html.c - the users are not visible from this header.
+#define LESS_THAN_SIGN 0x3C
+#define GREATER_THAN_SIGN 0x3E
+#define EQUALS_SIGN 0x3D
+#define TAB 0x09
+#define LF 0x0A
+#define FF 0x0C
+#define CR 0x0D
+#define SPACE 0x20
+#define SOLIDUS 0x2F
+#define EXCLAMATION_MARK 0x21
+#define QUOTATION_MARK 0x22
+#define APOSTROPHE 0x27
+#define GRAVE_ACCENT 0x60
+#define HYPHEN_MINUS 0x2D
+
+// Names of the HTML void elements, i.e. tags that have no closing tag.
+const char *voidElements[] = {
+	"area",
+	"base",
+	"br",
+	"col",
+	"embed",
+	"hr",
+	"img",
+	"input",
+	"link",
+	"meta",
+	"source",
+	"track",
+	"wbr"
+};
+
+// Representation of a found tag that printResult() writes out.
+enum output_type
+{
+	OUT_INNER_HTML = 0, // result of getInnerHtml()
+	OUT_OUTER_HTML = 1, // result of getOuterHtml()
+	OUT_INNER_TEXT = 2  // the tag's innerText
+};
+
+// Search and output options consumed by filterHtml() and printResult().
+// tag, attr and key are C strings; existFindPattern() treats an empty
+// string as "criterion not set".
+struct find_opts
+{
+	char *tag; // search criterion - presumably the tag name to match, confirm against findTag()
+	char *attr; // search criterion - presumably an attribute to match, confirm against findTag()
+	char *key; // search criterion - exact meaning not visible here, confirm against findTag()
+	enum output_type out; // representation printResult() prints for each found tag
+	bool isExcept; // when set, printResult() currently prints nothing (--except is still on the todo list)
+};
+
+// A single attribute of a tag: a name and an optional value.
+struct attr
+{
+	char *name;
+	char *value; // optional
+};
+
+// One node of the parsed HTML tree. Fields starting with an underscore
+// look like parser bookkeeping rather than document data - TODO confirm.
+struct tag
+{
+	char *name;
+	struct attr **attrs; // presumably attrsLen entries - confirm against the parser
+	struct tag **children; // child nodes; childrenLen entries
+	char *innerText; // printed (trimmed) by printResult() for OUT_INNER_TEXT
+	size_t attrsLen;
+	size_t childrenLen;
+	bool _isVoidElement; // means there is no closing tag
+	bool _isClosed; // presumably set once the closing tag has been seen - confirm
+	// NOTE(review): the four offsets below presumably index into the parsed
+	// text and delimit what getOuterHtml()/getInnerHtml() return - confirm
+	size_t _outerHtmlBeginOffset;
+	size_t _outerHtmlEndOffset;
+	size_t _innerHtmlBeginOffset;
+	size_t _innerHtmlEndOffset;
+};
+
+// Growable array of tag pointers together with its element count.
+struct tag_list
+{
+	struct tag **tags; // heap array, resized with realloc() by its users
+	size_t len; // number of valid entries in tags
+};
+
+// Parser states handed to parseTag(); filterHtml() starts parsing in
+// STATE_INNER_TEXT.
+enum state
+{
+	STATE_INNER_TEXT              = 0,
+	STATE_TAG                     = 1,
+	STATE_BEGIN_TAG_NAME          = 2,
+	STATE_END_TAG_NAME            = 3,
+	STATE_ATTR_NAME               = 4,
+	STATE_ATTR_VALUE              = 5,
+	STATE_COMMENT                 = 6,
+	STATE_SCRIPT                  = 7,
+	STATE_SCRIPT_POSSIBLE_END_TAG = 8,
+	STATE_SCRIPT_END_TAG          = 9,
+	STATE_STYLE                   = 10,
+	STATE_STYLE_POSSIBLE_END_TAG  = 11,
+	STATE_STYLE_END_TAG           = 12
+};
+
+// Syntax variants of an attribute value.
+// NOTE(review): presumably records how the value being parsed is
+// delimited (none, "...", '...', unquoted) - confirm against the parser.
+enum attr_value_syntax
+{
+	AVS_NO             = 0,
+	AVS_QUOTATION_MARK = 1,
+	AVS_APOSTROPHE     = 2,
+	AVS_UNQUOTED       = 3
+};
diff --git a/lib.c b/lib.c
@@ -0,0 +1,112 @@
+/*
+ * Concatenate str1 and str2 into a newly allocated string.
+ *
+ * Takes ownership of both arguments: each one is freed before returning,
+ * so they must be heap pointers or NULL (NULL counts as an empty string).
+ * Passing the same pointer for both arguments is allowed.
+ *
+ * Returns the concatenation, which the caller has to free, or NULL if
+ * the allocation fails (the arguments are freed in that case as well).
+ */
+char *stringCat(char *str1, char *str2)
+{
+	size_t str1Len = str1 ? strlen(str1) : 0;
+	size_t str2Len = str2 ? strlen(str2) : 0;
+	char *string = malloc(str1Len + str2Len + 1);
+	if (string != NULL)
+	{
+		if (str1Len > 0)
+			memcpy(string, str1, str1Len);
+		if (str2Len > 0)
+			memcpy(string + str1Len, str2, str2Len);
+		string[str1Len + str2Len] = '\0';
+	}
+	// Free the second argument only if it is a distinct allocation,
+	// otherwise stringCat(s, s) would free the same block twice
+	if (str2 != str1)
+		free(str2);
+	free(str1);
+	return string;
+}
+
+/*
+ * Encode the code point cp as UTF-8 into a newly allocated,
+ * NUL-terminated string.
+ *
+ * cp  - code point to encode
+ * len - number of bytes reserved for the encoding (the buffer holds
+ *       len+1 bytes)
+ *
+ * Returns the string, which the caller has to free, or NULL if the
+ * allocation fails. If len is too small for the encoding the result is
+ * an empty string.
+ */
+char *cpToChars(uint_least32_t cp, size_t len)
+{
+	char *str = malloc(len + 1);
+	if (str == NULL)
+		return NULL;
+	// grapheme_encode_utf8() reports the length the encoding needs and
+	// writes nothing when that exceeds len, so terminate right after the
+	// bytes that were really written instead of blindly at str[len]
+	size_t needed = grapheme_encode_utf8(cp, str, len);
+	str[needed <= len ? needed : 0] = '\0';
+	return str;
+}
+
+/*
+ * Return a newly allocated copy of text without leading and trailing
+ * spaces, tabs, line feeds and carriage returns. Whitespace inside the
+ * text is kept. text itself is neither modified nor freed.
+ *
+ * A NULL text is treated like an empty string.
+ *
+ * Returns the trimmed copy, which the caller has to free, or NULL if the
+ * allocation fails.
+ */
+char *trim(char *text)
+{
+	const char *begin = text ? text : "";
+	const char *end = begin + strlen(begin);
+	// begin < end guarantees that the terminating NUL, which strchr()
+	// would also match, is never tested
+	while (begin < end && strchr(" \n\t\r", *begin) != NULL)
+		begin++;
+	while (end > begin && strchr(" \n\t\r", end[-1]) != NULL)
+		end--;
+	size_t len = (size_t)(end - begin);
+	char *trimmedText = malloc(len + 1);
+	if (trimmedText == NULL)
+		return NULL;
+	memcpy(trimmedText, begin, len);
+	trimmedText[len] = '\0';
+	return trimmedText;
+}
+
+// Read a single byte from fp into *buf.
+// Returns true if a byte was stored, false at end of file or on a read
+// error. A failed read is not retried: retrying a persistent error would
+// never terminate, and the caller could not tell a stale *buf from a
+// fresh byte.
+// Do not use for reading from a socket fd
+bool tryRead(char *buf, FILE *fp)
+{
+	return fread(buf, 1, 1, fp) == 1;
+}
+
+/*
+ * Read everything that is left in fp into a newly allocated,
+ * NUL-terminated buffer. Reading stops at end of file or at the first
+ * read error; whatever was read up to that point is returned.
+ *
+ * Returns the buffer, which the caller has to free, or NULL if memory
+ * runs out.
+ */
+char *readFile(FILE *fp)
+{
+	size_t capacity = 4096;
+	size_t len = 0;
+	char *text = malloc(capacity);
+	if (text == NULL)
+		return NULL;
+	for (;;)
+	{
+		// One byte always stays in reserve for the terminating NUL
+		len += fread(text + len, 1, capacity - len - 1, fp);
+		// A short read means end of file or a read error
+		if (len < capacity - 1)
+			break;
+		if (capacity > (size_t)-1 / 2)
+		{
+			free(text);
+			return NULL;
+		}
+		// Double the buffer so the number of reallocations stays
+		// logarithmic in the file size
+		capacity *= 2;
+		char *grown = realloc(text, capacity);
+		if (grown == NULL)
+		{
+			free(text);
+			return NULL;
+		}
+		text = grown;
+	}
+	text[len] = '\0';
+	return text;
+}
diff --git a/todo b/todo
@@ -1,5 +1 @@
-refactor; heavy
-implement find_attribute_value_by_*
-implement filtering not only by class or id, also like this .test[data="asdf"]
-implement finding tags that have no end tag, e.g. the img tag
-Actually correctly parse html according to spec ;-)
+support --except argument