Merge branch 'big_change' - htex - simple incorrect html parser

commit 3be05c0cb6825138a7a7fdaa28466b90713d56e3
parent 54b54cb00e4500f10d254e7ec694f45f3fbeab58
Author: Robin <kroekerrobin@gmail.com>
Date:   Sun, 13 Aug 2023 21:40:21 +0200

Merge branch 'big_change'

Diffstat:
M .gitignore  | 2 +-
M Makefile  | 7 ++++---
M htex.c  | 648 ++++++++++++++++++++++++-------------------------------------------------------
A html.c  | 790 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A html.h  | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lib.c  | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M todo  | 6 +-----

7 files changed, 1195 insertions(+), 460 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,2 @@
-test*.html
+test/*
 htex
diff --git a/Makefile b/Makefile
@@ -2,7 +2,9 @@ PREFIX = /usr/local
 MANPREFIX = $(PREFIX)/share/man
 
 all:
-	$(CC) -O -Wall -Werror -o htex htex.c
+	$(CC) -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme
+debug:
+	$(CC) -fsanitize=address -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme
 clean:
 	rm htex
 install: all
@@ -14,4 +16,4 @@ install: all
 	chmod 644 "$(MANPREFIX)/man1/htex.1"
 uninstall:
 	rm "$(PREFIX)/bin/htex"
-	rm "$(MANPREFIX)/man1/htex.1"
-\ No newline at end of file
+	rm "$(MANPREFIX)/man1/htex.1"
diff --git a/htex.c b/htex.c
@@ -1,461 +1,207 @@
 #include <stdio.h>
+#include <string.h>
 #include <stdbool.h>
 #include <stdlib.h>
-#include <unistd.h>
 #include <getopt.h>
-#include <string.h>
-#include <fcntl.h>
-
-char *text;
-char attribute_name[200];
-char tag_name[50];
-bool inner_html = false;
-bool except = false;
-struct match {
-    int start;
-    int end;
-};
-struct match *matches;
-
-int find_start_of_opening_tag_pos(int class_position) {
-    int i = 1;
-    while (1) {
-        int pos = class_position - i;
-        if (pos < 0)
-            return -1;
-        if (text[pos] == '<') {
-            return pos;
-        }
-        i++;
-    }
-}
-
-int find_end_of_opening_tag_pos(int class_position) {
-    int i = 1;
-    while (1) {
-        int pos = class_position + i;
-        if (pos < 0)
-            return -1;
-        if (text[pos] == '>') {
-            return pos + 1;
-        }
-        i++;
-    }
-}
-
-/*
-    This function works only if the html tag
-    has attributes.
-*/
-void find_tag_name(int open_tag_pos) {
-    int i = 1;
-    int end_of_tag_name = 0;
-    while (1) {
-        int pos = open_tag_pos + i;
-        if (pos > strlen(text))
-            return;
-        if (text[pos] == ' ' || text[pos] == '\n') {
-            end_of_tag_name = pos-1;
-            break;
-        }
-        i++;
-    }
-    int length_tag_name = end_of_tag_name - open_tag_pos;
-    for (int k=0; k<length_tag_name; k++) {
-        tag_name[k] = text[open_tag_pos+k+1];
-    }
-    tag_name[length_tag_name] = '\0';
-}
-
-int find_closing_tag_pos(int open_tag_pos, bool inner_html) {
-    int level = 1;
-    int failure = 0;
-    char close_tag[strlen(tag_name)+3];
-    close_tag[0] = '<';
-    close_tag[1] = '/';
-    for (int k=0; k<strlen(tag_name); k++) {
-        close_tag[2+k] = tag_name[k];
-    }
-    close_tag[sizeof(close_tag)-1] = '>';
-    close_tag[sizeof(close_tag)] = '\0';
+#include <inttypes.h>
+#include <grapheme.h>
+#include "lib.c"
+#include "html.c"
 
-    for (int l=open_tag_pos; l<strlen(text); l++) { // Could be more precise
-        if (text[l] == '<') {
-            for (int o=0; o<strlen(tag_name); o++) {
-                if (tag_name[o] != text[l+o+1]) {
-                    failure = 1;
-                    break;
-                }
-            }
-            if (failure == 0) {
-                if (
-                    text[l+strlen(tag_name)+1] != ' ' &&
-                    text[l+strlen(tag_name)+1] != '>'
-                ) {
-                    failure = 1;
-                }
-            }
-            if (failure == 0) {
-                level++;
-            }
-            failure = 0;
-            if (text[l+1] == '/') {
-                for (int o=2; o<strlen(close_tag); o++) {
-                    if (close_tag[o] != text[l+o]) {
-                        failure = 1;
-                        break;
-                    }
-                }
-                if (failure == 0) {
-                    level--;
-                    if (level == 0) {
-                        if (inner_html) {
-                            return l;
-                        } else {
-                            return l + strlen(tag_name) + 3;
-                        }
-                    }
-                }
-                failure = 0;
-            }
-        }
-    }
-    return -1;
+/*
+ * Turn a selector-like search pattern into a freshly allocated find_opts.
+ *
+ * Pattern forms handled by the scanner below:
+ *   ".name"          key = "class", attr = name, tag = ""
+ *   "#name"          key = "id",    attr = name, tag = ""
+ *   "tag[key=value]" tag, key and attr taken from the respective parts;
+ *                    '"' characters inside the value are dropped
+ * Scanning stops at the first ']'. opt->out starts as OUT_OUTER_HTML.
+ * This function does not set isExcept; the caller does.
+ * Allocation results are not checked. Release the result with freeOpts().
+ */
+struct find_opts *parseFilterOpts(const char *pattern)
+{
+	/* Which part of the pattern the scanner is currently collecting. */
+	enum { SCAN_NAME, SCAN_KEY, SCAN_VALUE } scan = SCAN_NAME;
+	const char *impliedKey = NULL;
+	char *name = NULL;
+	int nameLen = 0;
+	int keyLen = 0;
+	int valueLen = 0;
+	size_t pos = 0;
+	size_t patternLen = strlen(pattern);
+	struct find_opts *opt = malloc(sizeof(struct find_opts));
+
+	opt->out = OUT_OUTER_HTML;
+	opt->tag = malloc(sizeof(char));
+	opt->tag[0] = 0;
+	opt->attr = malloc(sizeof(char));
+	opt->attr[0] = 0;
+	opt->key = malloc(sizeof(char));
+	opt->key[0] = 0;
+
+	/* A leading '.' or '#' selects by class or id and is not part of the name. */
+	if (pattern[0] == '.')
+	{
+		impliedKey = "class";
+		pos = 1;
+	}
+	else if (pattern[0] == '#')
+	{
+		impliedKey = "id";
+		pos = 1;
+	}
+
+	for (; pos < patternLen; pos++)
+	{
+		char c = pattern[pos];
+		if (c == ']')
+			break;
+		switch (scan)
+		{
+			case SCAN_VALUE:
+				/* Everything except '"' belongs to the value, even '=' and '['. */
+				if (c != '"')
+				{
+					opt->attr = realloc(opt->attr, (valueLen+1) * sizeof(char));
+					opt->attr[valueLen] = c;
+					valueLen++;
+				}
+				if (c == '[')
+					scan = SCAN_KEY;
+				break;
+			case SCAN_KEY:
+				if (c == '=')
+				{
+					scan = SCAN_VALUE;
+					break;
+				}
+				opt->key = realloc(opt->key, (keyLen+1) * sizeof(char));
+				opt->key[keyLen] = c;
+				keyLen++;
+				break;
+			case SCAN_NAME:
+				if (c == '[')
+				{
+					scan = SCAN_KEY;
+					break;
+				}
+				name = realloc(name, (nameLen+1) * sizeof(char));
+				name[nameLen] = c;
+				nameLen++;
+				break;
+		}
+	}
+	name = realloc(name, (nameLen+1) * sizeof(char));
+	name[nameLen] = 0;
+
+	if (impliedKey != NULL)
+	{
+		/* Class/id form: the collected name is the attribute value to match. */
+		free(opt->key);
+		free(opt->attr);
+		opt->attr = name;
+		opt->key = malloc((strlen(impliedKey)+1) * sizeof(char));
+		strcpy(opt->key, impliedKey);
+		return opt;
+	}
+
+	/* Tag form: the collected name is the tag; terminate key/value if present. */
+	free(opt->tag);
+	opt->tag = name;
+	if (valueLen > 0)
+	{
+		opt->attr = realloc(opt->attr, (valueLen+1) * sizeof(char));
+		opt->attr[valueLen] = 0;
+	}
+	if (keyLen > 0)
+	{
+		opt->key = realloc(opt->key, (keyLen+1) * sizeof(char));
+		opt->key[keyLen] = 0;
+	}
+	return opt;
 }
 
-bool correct_name_begin_or_end(char prev_char) {
-    switch(prev_char) {
-        case '"':
-            return true;
-        case '\'':
-            return true;
-        case ' ':
-            return true;
-        default:
-            return false;
-    }
+/*
+ * Release a find_opts and the three strings it owns (tag, attr, key).
+ * Passing NULL is allowed and does nothing, mirroring free().
+ */
+void freeOpts(struct find_opts *opt)
+{
+	if (opt == NULL)
+		return;
+	free(opt->tag);
+	free(opt->attr);
+	free(opt->key);
+	free(opt);
 }
 
-void find_html_tag_by_class(char *class_name) {
-    int o = 0;
-    int failure = 0;
-    int counter = 0;
-    int is_not_quotation_mark = 1;
-
-    for (int k=0; k<strlen(text); k++) {
-        if (
-            text[k] == 'c' &&
-            text[k+1] == 'l' &&
-            text[k+2] == 'a' &&
-            text[k+3] == 's' &&
-            text[k+4] == 's'
-        ) {
-            while (is_not_quotation_mark == 1) {
-                if (text[k+7+o] == '"' || text[k+7+o] == '\'') {
-                    is_not_quotation_mark = 0;
-                    break;
-                }
-                if (class_name[0] == text[k+7+o]) {
-                    for (int l=1; l<strlen(class_name); l++) {
-                        if (class_name[l] != text[k+7+o+l]) {
-                            failure = 1;
-                            break;
-                        }
-                    }
-                    if (failure == 0) {
-                        if (
-                            !correct_name_begin_or_end(text[k+6+o]) ||
-                            !correct_name_begin_or_end(text[k+7+o+strlen(class_name)])
-                        ) {
-                            failure = 1;
-                        }
-                    }
-                    if (failure == 0) {
-                        if (inner_html) {
-                            int start_of_open_tag_pos = find_start_of_opening_tag_pos(k);
-                            find_tag_name(start_of_open_tag_pos);
-                            int end_of_open_tag_pos = find_end_of_opening_tag_pos(k);
-                            int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, true);
-                            for (int e=end_of_open_tag_pos; e<close_tag_pos; e++) {
-                                printf("%c", text[e]);
-                            }
-                            printf("\n");
-                        } else {
-                            int open_tag_pos = find_start_of_opening_tag_pos(k);
-                            find_tag_name(open_tag_pos);
-                            int end_of_open_tag_pos = find_end_of_opening_tag_pos(k);
-                            int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, false);
-                            if (except) {
-                                matches = realloc(matches, (counter+1) * sizeof(struct match));
-                                matches[counter].start = open_tag_pos;
-                                matches[counter].end = close_tag_pos;
-                                counter++;
-                            } else {
-                                for (int e=open_tag_pos; e<close_tag_pos; e++) {
-                                    printf("%c", text[e]);
-                                }
-                                printf("\n");
-                            }
-                        }
-                    }
-                    failure = 0;
-                }
-                o++;
-            }
-            is_not_quotation_mark = 1;
-            o = 0;
-        }
-    }
-    if (except) {
-        int start = 0;
-        for (int i=0; i<counter; i++) {
-            for (int e=start; e<matches[i].start; e++) {
-                printf("%c", text[e]);
-            }
-            start = matches[i].end;
-        }
-        for (int i=start; i<strlen(text); i++) {
-            printf("%c", text[i]);
-        }
-        printf("\n");
-        free(matches);
-    }
-}
-
-void find_html_tag_by_id(char *id_name) {
-    int o = 0;
-    int failure = 0;
-    int counter = 0;
-    int is_not_quotation_mark = 1;
-
-    for (int k=0; k<strlen(text); k++) {
-        if (
-            text[k] == 'i' &&
-            text[k+1] == 'd'
-        ) {
-            while (is_not_quotation_mark == 1) {
-                if (text[k+4+o] == '"' || text[k+4+o] == '\'') {
-                    is_not_quotation_mark = 0;
-                    break;
-                }
-                if (id_name[0] == text[k+4+o]) {
-                    for (int l=1; l<strlen(id_name); l++) {
-                        if (id_name[l] != text[k+4+o+l]) {
-                            failure = 1;
-                            break;
-                        }
-                    }
-                    if (failure == 0) {
-                        if (
-                            !correct_name_begin_or_end(text[k+3+o]) ||
-                            !correct_name_begin_or_end(text[k+4+o+strlen(id_name)])
-                        ) {
-                            failure = 1;
-                        }
-                    }
-                    if (failure == 0) {
-                        if (inner_html) {
-                            int start_of_open_tag_pos = find_start_of_opening_tag_pos(k);
-                            find_tag_name(start_of_open_tag_pos);
-                            int end_of_open_tag_pos = find_end_of_opening_tag_pos(k);
-                            int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, true);
-                            for (int e=end_of_open_tag_pos; e<close_tag_pos; e++) {
-                                printf("%c", text[e]);
-                            }
-                            printf("\n");
-                        } else {
-                            int start_of_open_tag_pos = find_start_of_opening_tag_pos(k);
-                            find_tag_name(start_of_open_tag_pos);
-                            int end_of_open_tag_pos = find_end_of_opening_tag_pos(k);
-                            int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, false);
-                            if (except) {
-                                matches = realloc(matches, (counter+1) * sizeof(struct match));
-                                matches[counter].start = start_of_open_tag_pos;
-                                matches[counter].end = close_tag_pos;
-                                counter++;
-                            } else {
-                                for (int e=start_of_open_tag_pos; e<close_tag_pos; e++) {
-                                    printf("%c", text[e]);
-                                }
-                                printf("\n");   
-                            }
-                        }
-                    }
-                    failure = 0;
-                }
-                o++;
-            }
-            is_not_quotation_mark = 1;
-            o = 0;
-        }
-    }
-    if (except) {
-        int start = 0;
-        for (int i=0; i<counter; i++) {
-            for (int e=start; e<matches[i].start; e++) {
-                printf("%c", text[e]);
-            }
-            start = matches[i].end;
-        }
-        for (int i=start; i<strlen(text); i++) {
-            printf("%c", text[i]);
-        }
-        printf("\n");
-        free(matches);
-    }
-}
-void find_html_tag_by_tag() {
-    int failure = 0;
-    int counter = 0;
-    for (int k=0; k<strlen(text); k++) {
-        if (text[k] == '<' && text[k+1] != '/') {
-            for (int o=0; o<strlen(attribute_name); o++) {
-                if (attribute_name[o] != text[k+1+o]) {
-                    failure = 1;
-                    break;
-                }
-            }
-            if (failure == 0) {
-                if (
-                    text[k+1+strlen(attribute_name)] == '>' ||
-                    text[k+1+strlen(attribute_name)] == ' ' ||
-                    text[k+1+strlen(attribute_name)] == '\n'
-                ) {
-                    int open_tag_pos = k;
-                    int after_tag_pos = k+1+strlen(attribute_name)+1;
-                    if (inner_html) {
-                        int close_tag_pos = find_closing_tag_pos(after_tag_pos, true);
-                        int end_of_open_tag_pos = find_end_of_opening_tag_pos(k+strlen(attribute_name));
-                        for (int e=end_of_open_tag_pos; e<close_tag_pos; e++) {
-                            printf("%c", text[e]);
-                        }
-                        printf("\n");
-                    } else {
-                        int close_tag_pos = find_closing_tag_pos(after_tag_pos, false);
-                        if (except) {
-                            matches = realloc(matches, (counter+1) * sizeof(struct match));
-                            matches[counter].start = open_tag_pos;
-                            matches[counter].end = close_tag_pos;
-                            counter++;
-                        } else {
-                            for (int e=open_tag_pos; e<close_tag_pos; e++) {
-                                printf("%c", text[e]);
-                            }
-                            printf("\n");
-                        }
-                    }
-                }
-            }
-            failure = 0;
-        }
-    }
-    if (except) {
-        int start = 0;
-        for (int i=0; i<counter; i++) {
-            for (int e=start; e<matches[i].start; e++) {
-                printf("%c", text[e]);
-            }
-            start = matches[i].end;
-        }
-        for (int i=start; i<strlen(text); i++) {
-            printf("%c", text[i]);
-        }
-        printf("\n");
-        free(matches);
-    }
-}
-
-void find_html_tag() {
-    char identifier[200];
-    for (int i=0; i<strlen(attribute_name); i++) {
-        identifier[i] = attribute_name[i+1];
-    }
-    switch(attribute_name[0]) {
-        case '.':
-            find_html_tag_by_class(identifier);
-            break;
-        case '#':
-            find_html_tag_by_id(identifier);
-            break;
-        default:
-            for (int i=0; i<strlen(attribute_name); i++) {
-                tag_name[i] = attribute_name[i];
-            }
-            find_html_tag_by_tag();
-    }
-}
-
-int main(int argc, char *argv[]) {
-    int i = 0;
-    char buffer;
-    int o;
-    text = malloc(sizeof(char));
-    if (!text) {
-        printf("malloc error.\n");
-        return -1;
-    }
-
-    static struct option long_options[] = {
-        { "attribute", required_argument, 0, 'a' },
-        { "innerhtml", no_argument, 0, 'i' },
-        { "except", no_argument, 0, 'e' },
-        { 0, 0, 0, 0 }
-    };
-    int option_index = 0;
-    while ((o = getopt_long(argc, argv, "eia:", long_options, &option_index)) != -1) {
-        switch(o) {
-            case 'a':
-                for (int j=0; j<strlen(optarg); j++) {
-                    attribute_name[j] = optarg[j];
-                }
-                break;
-            case 'i':
-                inner_html = true;
-                break;
-            case 'e':
-                except = true;
-                break;
-        }
-    }
-    if (inner_html && except) {
-        printf("You can't use the options -i (--innerhtml) and -e (--except) at the same time.\n");
-        return -1;
-    }
-    if (argc == (optind + 1)) {
-        if (*argv[argc-1] == '-') {
-            while (read(0, &buffer, 1) > 0) {
-                text[i] = buffer;
-                i++;
-                text = realloc(text, (i+1) * sizeof(char));
-                if (!text || text == NULL) {
-                    printf("realloc error.\n");
-                    return -1;
-                }
-            }
-            text[i] = '\0';
-            find_html_tag();
-            free(text);
-        } else {
-            int fd = open(argv[argc-1], O_RDONLY);
-            if (fd != -1) {
-                while (read(fd, &buffer, 1) > 0) {
-                    text[i] = buffer;
-                    i++;
-                    text = realloc(text, (i+1) * sizeof(char));
-                    if (!text || text == NULL) {
-                        printf("realloc error.\n");
-                        return -1;
-                    }
-                }
-                text[i] = '\0';
-                find_html_tag();
-                free(text);
-            } else {
-                printf("Couldn't read file \"%s\"\n", argv[argc-1]);
-            }
-        }
-    } else {
-        printf("Nothing to read from.\n");
-    }
-    return 0;
+/*
+ * Command-line entry point: htex [-i | -t] [-e] PATTERN [FILE]
+ *
+ * Reads HTML from FILE, or from stdin when no FILE is given, parses PATTERN
+ * into search options and hands both to filterHtml().
+ *   -i / --innerhtml   output mode OUT_INNER_HTML
+ *   -t / --innertext   output mode OUT_INNER_TEXT
+ *   -e / --except      sets isExcept on the options
+ * Returns 0 on success (including empty input) and -1 on usage or I/O errors.
+ */
+int main(int argc, char *argv[])
+{
+	int o = 0;
+	int option_index = 0;
+	bool isInnerHtml = false;
+	bool isInnerText = false;
+	bool isExcept = false;
+	char *text = NULL;
+	char *searchPattern = NULL;
+	static struct option long_options[] = {
+		{ "innerhtml", no_argument, 0, 'i' },
+		{ "innertext", no_argument, 0, 't' },
+		{ "except", no_argument, 0, 'e' },
+		{ 0, 0, 0, 0 }
+	};
+	while ((o = getopt_long(argc, argv, "ite", long_options, &option_index)) != -1) {
+		switch(o) {
+			case 'i':
+				isInnerHtml = true;
+				break;
+			case 't':
+				isInnerText = true;
+				break;
+			case 'e':
+				isExcept = true;
+				break;
+			default:
+				/* getopt_long has already reported the unknown option. */
+				return -1;
+		}
+	}
+	if (isInnerHtml && isInnerText)
+	{
+		fprintf(stderr, "Provide either --innerhtml or --innertext.\n");
+		return -1;
+	}
+	if (argc == optind)
+	{
+		fprintf(stderr, "Provide a search pattern!\n");
+		return -1;
+	}
+	if (argc > optind+2)
+	{
+		fprintf(stderr, "Provide only one file!\n");
+		return -1;
+	}
+	if (argc == optind+1)
+	{
+		/* Only a pattern was given: read the document from stdin. */
+		searchPattern = argv[argc-1];
+		text = readFile(stdin);
+	}
+	else
+	{
+		/* Pattern followed by a file path. */
+		searchPattern = argv[argc-2];
+		char *filepath = argv[argc-1];
+		FILE *fp = fopen(filepath, "r");
+		if (fp == NULL)
+		{
+			/* perror() appends ": <reason>" itself. */
+			perror("fopen failed");
+			return -1;
+		}
+		text = readFile(fp);
+		fclose(fp);
+	}
+	/* NOTE(review): readFile is defined outside this view; guard against a NULL result. */
+	if (text == NULL)
+	{
+		fprintf(stderr, "Could not read input.\n");
+		return -1;
+	}
+	/* Reject empty input from either source before parsing, and do not leak it. */
+	if (text[0] == 0)
+	{
+		printf("No data in file.\n");
+		free(text);
+		return 0;
+	}
+	struct find_opts *options = parseFilterOpts(searchPattern);
+	options->isExcept = isExcept;
+	if (isInnerHtml)
+		options->out = OUT_INNER_HTML;
+	if (isInnerText)
+		options->out = OUT_INNER_TEXT;
+	filterHtml(text, options);
+	freeOpts(options);
+	free(text);
+	return 0;
 }
diff --git a/html.c b/html.c
@@ -0,0 +1,790 @@
+#include "html.h"
+
+/*
+ * Map a parser state to its identifier spelled as a string (for debugging
+ * output). Values that are not a known state yield the empty string.
+ */
+const char *stateToString(enum state s)
+{
+	const char *label = "";
+	switch(s)
+	{
+		case STATE_INNER_TEXT:
+			label = "STATE_INNER_TEXT";
+			break;
+		case STATE_TAG:
+			label = "STATE_TAG";
+			break;
+		case STATE_BEGIN_TAG_NAME:
+			label = "STATE_BEGIN_TAG_NAME";
+			break;
+		case STATE_END_TAG_NAME:
+			label = "STATE_END_TAG_NAME";
+			break;
+		case STATE_ATTR_NAME:
+			label = "STATE_ATTR_NAME";
+			break;
+		case STATE_ATTR_VALUE:
+			label = "STATE_ATTR_VALUE";
+			break;
+		case STATE_COMMENT:
+			label = "STATE_COMMENT";
+			break;
+		case STATE_SCRIPT:
+			label = "STATE_SCRIPT";
+			break;
+		case STATE_SCRIPT_POSSIBLE_END_TAG:
+			label = "STATE_SCRIPT_POSSIBLE_END_TAG";
+			break;
+		case STATE_SCRIPT_END_TAG:
+			label = "STATE_SCRIPT_END_TAG";
+			break;
+		case STATE_STYLE:
+			label = "STATE_STYLE";
+			break;
+		case STATE_STYLE_POSSIBLE_END_TAG:
+			label = "STATE_STYLE_POSSIBLE_END_TAG";
+			break;
+		case STATE_STYLE_END_TAG:
+			label = "STATE_STYLE_END_TAG";
+			break;
+	}
+	return label;
+}
+
+/*
+ * Allocate an attribute whose name and value are both empty strings.
+ * Returns NULL (with nothing leaked) if any allocation fails. The caller
+ * owns the result and its two strings.
+ */
+struct attr *initAttr()
+{
+	struct attr *a = malloc(sizeof *a);
+	if (a == NULL)
+		return NULL;
+	/* calloc of one byte yields an already terminated empty string. */
+	a->name = calloc(1, sizeof(char));
+	a->value = calloc(1, sizeof(char));
+	if (a->name == NULL || a->value == NULL)
+	{
+		free(a->name);
+		free(a->value);
+		free(a);
+		return NULL;
+	}
+	return a;
+}
+
+/*
+ * Allocate a tag with an empty name, empty innerText, no attributes and no
+ * children. The struct is zero-filled first so that members not assigned
+ * below (such as the inner/outer HTML offsets) start at 0 instead of
+ * holding indeterminate values.
+ * Returns NULL (with nothing leaked) if any allocation fails.
+ */
+struct tag *initTag()
+{
+	struct tag *t = calloc(1, sizeof *t);
+	if (t == NULL)
+		return NULL;
+	/* calloc of one byte yields an already terminated empty string. */
+	t->name = calloc(1, sizeof(char));
+	t->innerText = calloc(1, sizeof(char));
+	if (t->name == NULL || t->innerText == NULL)
+	{
+		free(t->name);
+		free(t->innerText);
+		free(t);
+		return NULL;
+	}
+	t->attrs = NULL;
+	t->children = NULL;
+	t->attrsLen = 0;
+	t->childrenLen = 0;
+	t->_isVoidElement = false;
+	t->_isClosed = false;
+	return t;
+}
+
+/*
+ * Allocate an empty tag list (no storage, length 0).
+ * Returns NULL if the allocation fails.
+ */
+struct tag_list *initTagList()
+{
+	struct tag_list *t = malloc(sizeof *t);
+	if (t == NULL)
+		return NULL;
+	t->tags = NULL;
+	t->len = 0;
+	return t;
+}
+
+/* True when cp is one of the code points U+0030..U+0039 ('0'..'9'). */
+static inline bool isASCIIDigit(uint_least32_t cp)
+{
+	return cp >= 0x30 && cp <= 0x39;
+}
+
+/* True when cp is one of the code points U+0041..U+005A ('A'..'Z'). */
+static inline bool isASCIIAlphaUpper(uint_least32_t cp)
+{
+	return cp >= 0x41 && cp <= 0x5A;
+}
+
+/* True when cp is one of the code points U+0061..U+007A ('a'..'z'). */
+static inline bool isASCIIAlphaLower(uint_least32_t cp)
+{
+	return cp >= 0x61 && cp <= 0x7A;
+}
+
+/* True when cp is an ASCII letter of either case. */
+static inline bool isASCIIAlpha(uint_least32_t cp)
+{
+	return isASCIIAlphaLower(cp) || isASCIIAlphaUpper(cp);
+}
+
+/* True when cp equals one of TAB, LF, FF, CR or SPACE. */
+static inline bool isASCIIWhitespace(uint_least32_t cp)
+{
+	return cp == TAB
+		|| cp == LF
+		|| cp == FF
+		|| cp == CR
+		|| cp == SPACE;
+}
+
+/*
+ * True when tagName exactly matches one of the first 13 entries of the
+ * voidElements table (case-sensitive comparison).
+ */
+static inline bool isVoidElement(const char *tagName)
+{
+	int idx = 0;
+	while (idx < 13)
+	{
+		if (!strcmp(tagName, voidElements[idx]))
+			return true;
+		idx++;
+	}
+	return false;
+}
+
+/*
+ * True when cp lies in U+0000..U+001F. cp is unsigned, so only the upper
+ * bound needs to be tested.
+ */
+static inline bool isC0Control(uint_least32_t cp)
+{
+	return cp <= 0x1F;
+}
+
+/* True when cp is a C0 control or lies in U+007F..U+009F. */
+static inline bool isControl(uint_least32_t cp)
+{
+	return isC0Control(cp) || (cp >= 0x7F && cp <= 0x9F);
+}
+
+/*
+ * True when cp is in U+FDD0..U+FDEF, or is one of the two code points
+ * ending in FFFE / FFFF of any plane from 0 up to 0x10.
+ */
+static inline bool isNonChar(uint_least32_t cp)
+{
+	if (cp >= 0xFDD0 && cp <= 0xFDEF)
+		return true;
+	/* Masking off the lowest bit folds xxFFFF onto xxFFFE. */
+	return cp <= 0x10FFFF && (cp & 0xFFFE) == 0xFFFE;
+}
+
+/*
+ * True when cp may appear in an attribute name: it is neither a control
+ * nor a noncharacter, and is none of SPACE, QUOTATION_MARK, APOSTROPHE,
+ * GREATER_THAN_SIGN, SOLIDUS or EQUALS_SIGN.
+ */
+static inline bool isValidAttrName(uint_least32_t cp)
+{
+	if (isControl(cp) || isNonChar(cp))
+		return false;
+	return cp != SPACE
+		&& cp != QUOTATION_MARK
+		&& cp != APOSTROPHE
+		&& cp != GREATER_THAN_SIGN
+		&& cp != SOLIDUS
+		&& cp != EQUALS_SIGN;
+}
+
+/*
+ * True when cp may appear in an unquoted attribute value as far as this
+ * check is concerned: it rejects EQUALS_SIGN, LESS_THAN_SIGN,
+ * GREATER_THAN_SIGN and GRAVE_ACCENT. Other invalid characters are not
+ * listed here because they are handled before this function is called.
+ */
+static inline bool
+isValidUnquotedAttrValue(uint_least32_t cp)
+{
+	return cp != EQUALS_SIGN
+		&& cp != LESS_THAN_SIGN
+		&& cp != GREATER_THAN_SIGN
+		&& cp != GRAVE_ACCENT;
+}
+
+/*
+ * Recognise a doctype on the first line of text.
+ *
+ * The first line (everything before the first '\n', or the whole string
+ * when there is no newline) must be exactly "<!DOCTYPE html>" or
+ * "<!doctype html>".
+ *
+ * Returns the offset of the first byte after that line (the newline is
+ * consumed when present), or 0 when the first line is not a doctype.
+ * The scan stops at the terminating NUL, so input without any newline is
+ * handled without reading past the end of the string.
+ */
+size_t parseDoctype(const char *text)
+{
+	static const char upper[] = "<!DOCTYPE html>";
+	static const char lower[] = "<!doctype html>";
+	const size_t doctypeLen = sizeof upper - 1;
+	/* Length of the first line; strcspn also stops at the NUL terminator. */
+	size_t lineLen = strcspn(text, "\n");
+
+	if (lineLen != doctypeLen)
+		return 0;
+	if (
+		memcmp(text, upper, doctypeLen) != 0 &&
+		memcmp(text, lower, doctypeLen) != 0
+	)
+		return 0;
+	/* Skip the newline only if there is one, never the terminator. */
+	if (text[lineLen] == '\n')
+		return lineLen + 1;
+	return lineLen;
+}
+
+/*
+ * Walk the tag list from newest to oldest and close the first tag that is
+ * still open and whose name equals endTag: mark it closed and record
+ * endOffset as the end of its outer HTML.
+ * Returns the tag that was closed, or NULL when no such tag exists.
+ */
+struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, size_t endOffset)
+{
+	int idx = tagList->len;
+	while (--idx >= 0)
+	{
+		struct tag *candidate = tagList->tags[idx];
+		if (candidate->_isClosed)
+			continue;
+		if (strcmp(candidate->name, endTag) != 0)
+			continue;
+		candidate->_isClosed = true;
+		candidate->_outerHtmlEndOffset = endOffset;
+		return candidate;
+	}
+	return NULL;
+}
+
+/*
+ * Return the most recently added tag that is neither a void element nor
+ * closed. When every tag is void or closed, the first tag of the list is
+ * returned instead.
+ */
+struct tag *getLastOpenTag(struct tag_list *tagList)
+{
+	int idx = tagList->len;
+	while (--idx >= 0)
+	{
+		struct tag *candidate = tagList->tags[idx];
+		if (candidate->_isVoidElement || candidate->_isClosed)
+			continue;
+		return candidate;
+	}
+	return tagList->tags[0];
+}
+
+/*
+ * Copy the bytes of text between the tag's outer HTML begin offset
+ * (inclusive) and end offset (exclusive) into a new NUL-terminated string.
+ * The caller owns the returned buffer. Allocation results are not checked.
+ */
+char *getOuterHtml(char *text, struct tag *t)
+{
+	char *copy = NULL;
+	int used = 0;
+	int src = t->_outerHtmlBeginOffset;
+	while (src < t->_outerHtmlEndOffset)
+	{
+		copy = realloc(copy, (used+1) * sizeof(char));
+		copy[used] = text[src];
+		used++;
+		src++;
+	}
+	/* One more byte for the terminator (also covers the empty range). */
+	copy = realloc(copy, (used+1) * sizeof(char));
+	copy[used] = 0;
+	return copy;
+}
+
+/*
+ * Copy the bytes of text between the tag's inner HTML begin offset
+ * (inclusive) and end offset (exclusive) into a new NUL-terminated string.
+ * The caller owns the returned buffer. Allocation results are not checked.
+ */
+char *getInnerHtml(char *text, struct tag *t)
+{
+	char *copy = NULL;
+	int used = 0;
+	int src = t->_innerHtmlBeginOffset;
+	while (src < t->_innerHtmlEndOffset)
+	{
+		copy = realloc(copy, (used+1) * sizeof(char));
+		copy[used] = text[src];
+		used++;
+		src++;
+	}
+	/* One more byte for the terminator (also covers the empty range). */
+	copy = realloc(copy, (used+1) * sizeof(char));
+	copy[used] = 0;
+	return copy;
+}
+
+/*
+ * Record where the inner HTML of closedTag ends: the position of the
+ * nearest '<' found by scanning text backwards from off.
+ *
+ * closedTag may be NULL (closeLastUnclosedTag returns NULL for an end tag
+ * that matches no open tag); in that case nothing is done. The backward
+ * scan never moves below index 0; if no '<' is found the offset is 0.
+ */
+void setInnerHtmlEndOffset(struct tag *closedTag, char *text, size_t off)
+{
+	if (closedTag == NULL)
+		return;
+	size_t i = off;
+	while (i > 0 && text[i] != '<')
+	{
+		i--;
+	}
+	closedTag->_innerHtmlEndOffset = i;
+}
+
+/*
+ * Finish a begin tag whose closing '>' sits at offset.
+ * Inner HTML starts right after it; a void element's outer HTML also ends
+ * there. Returns the next parser state: STATE_SCRIPT for "script",
+ * STATE_STYLE for "style", STATE_INNER_TEXT for every other tag name.
+ */
+enum state endOfBeginTag(struct tag *t, size_t offset)
+{
+	size_t afterTag = offset + 1;
+
+	t->_innerHtmlBeginOffset = afterTag;
+	t->_isVoidElement = isVoidElement(t->name);
+	if (t->_isVoidElement)
+		t->_outerHtmlEndOffset = afterTag;
+
+	if (!strcmp(t->name, "script"))
+		return STATE_SCRIPT;
+	if (!strcmp(t->name, "style"))
+		return STATE_STYLE;
+	return STATE_INNER_TEXT;
+}
+
+struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_list *tagList)
+{
+	struct tag *tag = initTag();
+	tag->_outerHtmlBeginOffset= offset-1;
+	tagList->tags = realloc(tagList->tags, (tagList->len+1) * sizeof(struct tag));
+	tagList->tags[tagList->len] = tag;
+	tagList->len++;
+	struct tag *stillOpenTag = tag;
+	char *endTag = malloc(sizeof(char));
+	endTag[0] = 0;
+	size_t a = 0;
+	size_t attrNameCount = 0;
+	enum attr_value_syntax attrValueSyntax = AVS_NO;
+	size_t hyphenCount = 0;
+	uint_least32_t cp;
+	size_t len = strlen(text);
+	size_t ret, off;
+	for (off = offset; off<len; off += ret)
+	{
+		if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off)
+		{
+			printError("Something wrong with ending of text");
+		}
+		else
+		{
+			// char *the_codepoint = cpToChars(cp, ret);
+			// printf("cp: %02X, %s, %s\n", cp, the_codepoint, stateToString(state));
+			// free(the_codepoint);
+			switch (state)
+			{
+				case STATE_INNER_TEXT:
+					if (cp == LESS_THAN_SIGN)
+					{
+						state = STATE_TAG;
+						break;
+					}
+					stillOpenTag = getLastOpenTag(tagList);
+					stillOpenTag->innerText = stringCat(stillOpenTag->innerText, cpToChars(cp, ret));
+					break;
+				case STATE_TAG:
+					if (cp == SOLIDUS)
+					{
+						state = STATE_END_TAG_NAME;
+						break;
+					}
+					if (cp == EXCLAMATION_MARK)
+					{
+						state = STATE_COMMENT;
+						break;
+					}
+					stillOpenTag = getLastOpenTag(tagList);
+					struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList);
+					stillOpenTag->children = realloc(
+						stillOpenTag->children,
+						(stillOpenTag->childrenLen+1) * sizeof(struct tag)
+					);
+					stillOpenTag->children[stillOpenTag->childrenLen] = oneTag;
+					stillOpenTag->childrenLen++;
+					free(endTag);
+					return tag;
+				case STATE_BEGIN_TAG_NAME:
+					if (cp == GREATER_THAN_SIGN)
+					{
+						state = endOfBeginTag(tag, off);
+						break;
+					}
+					if (isASCIIWhitespace(cp))
+					{
+						state = STATE_ATTR_NAME;
+						break;
+					}
+					if (isASCIIDigit(cp) || isASCIIAlpha(cp))
+					{
+						tag->name = stringCat(tag->name, cpToChars(cp, ret));
+					}
+					break;
+				case STATE_END_TAG_NAME:
+					if (cp == GREATER_THAN_SIGN)
+					{
+						struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
+						setInnerHtmlEndOffset(closedTag, text, off);
+						free(endTag);
+						endTag = malloc(sizeof(char));
+						endTag[0] = 0;
+						state = STATE_INNER_TEXT;
+						break;
+					}
+					if (!isASCIIWhitespace(cp))
+						endTag = stringCat(endTag, cpToChars(cp, ret));
+					break;
+				case STATE_ATTR_NAME:
+					if (cp == GREATER_THAN_SIGN)
+					{
+						state = endOfBeginTag(tag, off);
+						break;
+					}
+					if (isASCIIWhitespace(cp))
+					{
+						if (attrNameCount == a+1)
+							a++;
+						break;
+					}
+					if (cp == EQUALS_SIGN)
+					{
+						state = STATE_ATTR_VALUE;
+						break;
+					}
+					if (isValidAttrName(cp))
+					{
+						if (attrNameCount != a+1)
+						{
+							tag->attrs = realloc(
+								tag->attrs,
+								(a+1) * sizeof(struct attr)
+							);
+							tag->attrs[a] = initAttr();
+							attrNameCount = a + 1;
+							tag->attrsLen = attrNameCount;
+						}
+						tag->attrs[a]->name = stringCat(
+							tag->attrs[a]->name,
+							cpToChars(cp, ret)
+						);
+					}
+					break;
+				case STATE_ATTR_VALUE:
+					if (isASCIIWhitespace(cp))
+					{
+						if (attrValueSyntax == AVS_UNQUOTED)
+						{
+							attrValueSyntax = AVS_NO;
+							state = STATE_ATTR_NAME;
+						}
+						else if (attrValueSyntax == AVS_QUOTATION_MARK || attrValueSyntax == AVS_APOSTROPHE)
+						{
+							char *tmpName = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char));
+							strcpy(tmpName, tag->attrs[a]->name);
+							tag->attrs = realloc(
+								tag->attrs,
+								(a+1) * sizeof(struct attr)
+							);
+							a++;
+							tag->attrs[a] = initAttr();
+							free(tag->attrs[a]->name);
+							tag->attrs[a]->name = tmpName;
+							tag->attrsLen++;
+							attrNameCount = a + 1;
+						}
+						break;
+					}
+					if (cp == QUOTATION_MARK)
+					{
+						if (attrValueSyntax == AVS_NO)
+						{
+							attrValueSyntax = AVS_QUOTATION_MARK;
+							break;
+						}
+						if (attrValueSyntax == AVS_QUOTATION_MARK)
+						{
+							attrValueSyntax = AVS_NO;
+							state = STATE_ATTR_NAME;
+							break;
+						}
+					}
+					if (cp == APOSTROPHE)
+					{
+						if (attrValueSyntax == AVS_NO)
+						{
+							attrValueSyntax = AVS_APOSTROPHE;
+							break;
+						}
+						if (attrValueSyntax == AVS_APOSTROPHE)
+						{
+							attrValueSyntax = AVS_NO;
+							state = STATE_ATTR_NAME;
+							break;
+						}
+					}
+					if (cp == GREATER_THAN_SIGN)
+					{
+						state = endOfBeginTag(tag, off);
+						break;
+					}
+					if (
+						attrValueSyntax == AVS_NO &&
+						isValidUnquotedAttrValue(cp)
+					)
+					{
+						attrValueSyntax = AVS_UNQUOTED;
+					}
+					if (attrValueSyntax > AVS_NO)
+					{
+						tag->attrs[a]->value = stringCat(
+							tag->attrs[a]->value,
+							cpToChars(cp, ret)
+						);
+					}
+					break;
+				case STATE_COMMENT:
+					if (cp == GREATER_THAN_SIGN && hyphenCount >= 2)
+					{
+						state = STATE_INNER_TEXT;
+						break;
+					}
+					if (cp == HYPHEN_MINUS)
+						hyphenCount++;
+					else
+						hyphenCount = 0;
+					break;
+				case STATE_STYLE:
+					if (cp == LESS_THAN_SIGN)
+					{
+						state = STATE_STYLE_POSSIBLE_END_TAG;
+						break;
+					}
+					break;
+				case STATE_STYLE_POSSIBLE_END_TAG:
+					if (cp == SOLIDUS)
+						state = STATE_STYLE_END_TAG;
+					else
+						state = STATE_STYLE;
+					break;
+				case STATE_STYLE_END_TAG:
+					if (cp == GREATER_THAN_SIGN)
+					{
+						struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
+						setInnerHtmlEndOffset(closedTag, text, off);
+						free(endTag);
+						endTag = malloc(sizeof(char));
+						endTag[0] = 0;
+						state = STATE_INNER_TEXT;
+						break;
+					}
+					if (!isASCIIWhitespace(cp))
+						endTag = stringCat(endTag, cpToChars(cp, ret));
+					break;
+				case STATE_SCRIPT:
+					if (cp == LESS_THAN_SIGN)
+					{
+						state = STATE_SCRIPT_POSSIBLE_END_TAG;
+						break;
+					}
+					break;
+				case STATE_SCRIPT_POSSIBLE_END_TAG:
+					if (cp == SOLIDUS)
+						state = STATE_SCRIPT_END_TAG;
+					else
+						state = STATE_SCRIPT;
+					break;
+				case STATE_SCRIPT_END_TAG:
+					if (cp == GREATER_THAN_SIGN)
+					{
+						struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
+						setInnerHtmlEndOffset(closedTag, text, off);
+						free(endTag);
+						endTag = malloc(sizeof(char));
+						endTag[0] = 0;
+						state = STATE_INNER_TEXT;
+						break;
+					}
+					if (!isASCIIWhitespace(cp))
+						endTag = stringCat(endTag, cpToChars(cp, ret));
+					break;
+			}
+		}
+	}
+	free(endTag);
+	return tag;
+}
+
+/*
+ * Recursively releases a tag: its name, its inner text, every attribute
+ * (name, value and the attr object itself), every non-NULL child and finally
+ * the tag structure. The pointer must not be used afterwards.
+ */
+void freeTag(struct tag *t)
+{
+	int idx;
+
+	free(t->name);
+	free(t->innerText);
+
+	/* The attrs array holds pointers to individually allocated attrs. */
+	idx = 0;
+	while (idx < t->attrsLen)
+	{
+		struct attr *current = t->attrs[idx];
+		free(current->name);
+		free(current->value);
+		free(current);
+		idx++;
+	}
+	free(t->attrs);
+
+	/* Children are owned by their parent; empty slots are skipped. */
+	for (idx = 0; idx < t->childrenLen; idx++)
+	{
+		struct tag *child = t->children[idx];
+		if (child == NULL)
+			continue;
+		freeTag(child);
+	}
+	free(t->children);
+	free(t);
+}
+
+/*
+ * Releases a tag list: the pointer array and the list object itself.
+ * The tags the array points to are not freed here.
+ */
+void freeTagList(struct tag_list *t)
+{
+	struct tag **slots = t->tags;
+
+	free(slots);
+	free(t);
+}
+
+/*
+ * Appends `tag` to `foundTags`, growing the pointer array by one slot.
+ * The list only stores the pointer; the tag itself is not copied.
+ */
+static void findTagAppendMatch(struct tag_list *foundTags, struct tag *tag)
+{
+	struct tag **grown = realloc(
+		foundTags->tags,
+		(foundTags->len+1) * sizeof *grown
+	);
+	if (grown == NULL)
+	{
+		/* The old array is still valid; keep the list as it was. */
+		fprintf(stderr, "%s: %s\n", __func__, "out of memory");
+		return;
+	}
+	grown[foundTags->len] = tag;
+	foundTags->tags = grown;
+	foundTags->len++;
+}
+
+/*
+ * Walks `tag` and all of its descendants and appends every tag that
+ * satisfies the search options to `foundTags`.
+ *
+ * opt->tag  - tag name to match
+ * opt->key  - attribute name to match
+ * opt->attr - attribute value to match
+ * An empty string disables the respective criterion. A tag matches when at
+ * least one criterion is set and every criterion that is set is satisfied.
+ * When both key and value are set they must match on the SAME attribute.
+ * If no criterion is set nothing is appended.
+ *
+ * Children are visited from the last array entry to the first; NULL tags
+ * are skipped.
+ */
+void findTag(struct tag *tag, struct find_opts *opt, struct tag_list *foundTags)
+{
+	if (tag == NULL)
+		return;
+	bool wantsTag = opt->tag[0] != '\0';
+	bool wantsKey = opt->key[0] != '\0';
+	bool wantsValue = opt->attr[0] != '\0';
+	if (wantsTag || wantsKey || wantsValue)
+	{
+		bool matchesTag =
+			!wantsTag ||
+			(tag->name != NULL && strcmp(tag->name, opt->tag) == 0);
+		/* Without attribute criteria every tag passes the attribute test. */
+		bool matchesAttr = !wantsKey && !wantsValue;
+		for (size_t i=0; i<tag->attrsLen && !matchesAttr; i++)
+		{
+			struct attr *candidate = tag->attrs[i];
+			bool keyOk =
+				!wantsKey ||
+				(candidate->name != NULL && strcmp(candidate->name, opt->key) == 0);
+			/* The value is optional and is therefore checked for NULL. */
+			bool valueOk =
+				!wantsValue ||
+				(candidate->value != NULL && strcmp(candidate->value, opt->attr) == 0);
+			matchesAttr = keyOk && valueOk;
+		}
+		if (matchesTag && matchesAttr)
+			findTagAppendMatch(foundTags, tag);
+	}
+	for (size_t i=tag->childrenLen; i-- > 0;)
+	{
+		findTag(tag->children[i], opt, foundTags);
+	}
+}
+
+/*
+ * Prints the name of `t` on its own line, preceded by `indent` spaces, then
+ * prints its children (last array entry first) one level deeper.
+ */
+void printHtml(struct tag *t, int indent)
+{
+	int pad = 0;
+	while (pad < indent)
+	{
+		putchar(' ');
+		pad++;
+	}
+	printf("%s\n", t->name);
+
+	int child = t->childrenLen-1;
+	while (child > -1)
+	{
+		printHtml(t->children[child], indent + 1);
+		child--;
+	}
+}
+
+/*
+ * Unfinished: currently this only walks `t` and its descendants (last child
+ * first) and prints nothing. The switch below has empty cases and the draft
+ * implementation is commented out.
+ * NOTE(review): presumably meant to print every tag that is NOT in
+ * `foundTags`, i.e. the output for opts->isExcept - confirm before enabling.
+ */
+void printTag(char *text, struct tag *t, enum output_type out, struct tag_list *foundTags)
+{
+	/* Placeholder: no output type is handled yet. */
+	switch (out)
+	{
+		case OUT_INNER_HTML:
+			break;
+		case OUT_OUTER_HTML:
+			break;
+		case OUT_INNER_TEXT:
+			break;
+	}
+	/* unsigned int p = *(unsigned int *)t;
+	unsigned int cp;
+	bool isMatch = false;
+	for (int i=0; i<foundTags->len; i++)
+	{
+		cp = *(unsigned int *)foundTags->tags[i];
+		if (p == cp)
+			isMatch = true;
+	}
+	if (!isMatch)
+	{
+		char *trimmedText = NULL;
+		switch (out)
+		{
+			case OUT_INNER_HTML:
+				trimmedText = trim(getInnerHtml(text, t));
+				break;
+			case OUT_OUTER_HTML:
+				trimmedText = trim(getOuterHtml(text, t));
+				break;
+		}
+		if (strlen(trimmedText) > 0)
+			printf("%s\n", trimmedText);
+		free(trimmedText);
+	} */
+	/* Recurse into the children, last array entry first. */
+	for (int i=t->childrenLen-1; i>-1; i--)
+	{
+		printTag(text, t->children[i], out, foundTags);
+	}
+}
+
+/*
+ * Prints every tag of `foundTags` in the form selected by opts->out
+ * (inner html, outer html or inner text). Each result is trimmed of
+ * surrounding whitespace and printed on its own line; results that are
+ * empty after trimming are skipped.
+ *
+ * When opts->isExcept is set nothing is printed: that mode is not
+ * implemented yet. `rootTag` is only needed by that disabled call.
+ *
+ * The per-tag buffers are reset on every iteration so that a result which
+ * could not be produced (NULL inner text, NULL helper result, unknown
+ * output type) is skipped instead of reusing an already freed pointer.
+ */
+void printResult
+(
+	char *text,
+	struct tag *rootTag,
+	struct find_opts *opts,
+	struct tag_list *foundTags
+)
+{
+	if (opts->isExcept)
+	{
+		// printTag(text, rootTag, opts->out, foundTags);
+		return;
+	}
+	for (size_t i=0; i<foundTags->len; i++)
+	{
+		struct tag *found = foundTags->tags[i];
+		char *requestedText = NULL;
+		char *trimmedText = NULL;
+		switch (opts->out)
+		{
+			case OUT_INNER_HTML:
+				requestedText = getInnerHtml(text, found);
+				break;
+			case OUT_OUTER_HTML:
+				requestedText = getOuterHtml(text, found);
+				break;
+			case OUT_INNER_TEXT:
+				/* innerText belongs to the tag and must not be freed here. */
+				if (found->innerText != NULL)
+					trimmedText = trim(found->innerText);
+				break;
+			default:
+				break;
+		}
+		if (requestedText != NULL)
+		{
+			trimmedText = trim(requestedText);
+			free(requestedText);
+		}
+		if (trimmedText == NULL)
+			continue;
+		if (strlen(trimmedText) > 0)
+			printf("%s\n", trimmedText);
+		free(trimmedText);
+	}
+}
+
+/*
+ * Returns true when at least one search criterion (tag name, attribute
+ * value or attribute key) is a non-empty string.
+ */
+bool existFindPattern(struct find_opts *opts)
+{
+	return opts->tag[0] != '\0'
+		|| opts->attr[0] != '\0'
+		|| opts->key[0] != '\0';
+}
+
+/*
+ * Parses the html in `text`, selects the tags described by `opts` and
+ * prints them via printResult(). Without any search criterion the root tag
+ * itself is the single result. All structures created here are released
+ * before returning; `text` stays owned by the caller.
+ */
+void filterHtml(char *text, struct find_opts *opts)
+{
+	struct tag_list *tagList = initTagList();
+	struct tag_list *foundTags = initTagList();
+	/* The result of parseDoctype() is the number of bytes to skip (0: none). */
+	text += parseDoctype(text);
+	struct tag *rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList);
+	if (existFindPattern(opts))
+		findTag(rootTag, opts, foundTags);
+	else
+	{
+		/* One slot holding a pointer to the root tag. */
+		struct tag **slot = realloc(foundTags->tags, sizeof *slot);
+		if (slot == NULL)
+			fprintf(stderr, "%s: %s\n", __func__, "out of memory");
+		else
+		{
+			slot[0] = rootTag;
+			foundTags->tags = slot;
+			foundTags->len = 1;
+		}
+	}
+	printResult(text, rootTag, opts, foundTags);
+	freeTag(rootTag);
+	freeTagList(tagList);
+	freeTagList(foundTags);
+}
diff --git a/html.h b/html.h
@@ -0,0 +1,90 @@
+/* Writes "<calling function>: <msg>" followed by a newline to stderr. */
+#define printError(msg) do { fprintf(stderr, "%s: %s\n", __func__, msg); } while (0)
+
+/* Code points of the ASCII characters the parser compares against. */
+#define LESS_THAN_SIGN			0x3C
+#define GREATER_THAN_SIGN		0x3E
+#define EQUALS_SIGN					0x3D
+#define TAB									0x09
+#define LF									0x0A
+#define FF									0x0C
+#define CR									0x0D
+#define SPACE								0x20
+#define SOLIDUS							0x2F
+#define EXCLAMATION_MARK		0x21
+#define QUOTATION_MARK			0x22
+#define APOSTROPHE					0x27
+#define GRAVE_ACCENT				0x60
+#define HYPHEN_MINUS				0x2D
+
+/*
+ * Names of the html void elements, i.e. elements without a closing tag.
+ * NOTE(review): presumably consulted when a tag's _isVoidElement flag is
+ * set - the code using it is not part of this header.
+ */
+const char *voidElements[] = {
+	"area", "base", "br", "col", "embed", "hr", "img",
+	"input", "link", "meta", "source", "track", "wbr"
+};
+
+/* Selects which representation of a found tag printResult() prints. */
+enum output_type
+{
+	OUT_INNER_HTML, /* result of getInnerHtml() */
+	OUT_OUTER_HTML, /* result of getOuterHtml() */
+	OUT_INNER_TEXT  /* the tag's innerText */
+};
+
+/*
+ * Search and output options. The three strings are compared by findTag();
+ * an empty string means the criterion is not set.
+ */
+struct find_opts
+{
+	char *tag;  /* tag name to match */
+	char *attr; /* attribute VALUE to match */
+	char *key;  /* attribute NAME to match */
+	enum output_type out;
+	bool isExcept; /* printResult() prints nothing when set (not implemented) */
+};
+
+/* One attribute of a tag; both strings are freed by freeTag(). */
+struct attr
+{
+	char *name;
+	char *value; // optional
+};
+
+/*
+ * One node of the parsed html tree. freeTag() releases a tag together
+ * with its attributes and children.
+ * Members starting with an underscore are parser bookkeeping.
+ */
+struct tag
+{
+	char *name;
+	struct attr **attrs;   /* attrsLen pointers to separately allocated attrs */
+	struct tag **children; /* childrenLen pointers; traversed last to first */
+	char *innerText;
+	size_t attrsLen;
+	size_t childrenLen;
+	bool _isVoidElement; // means there is no closing tag
+	bool _isClosed;
+	/* NOTE(review): presumably byte offsets into the parsed text - confirm. */
+	size_t _outerHtmlBeginOffset;
+	size_t _outerHtmlEndOffset;
+	size_t _innerHtmlBeginOffset;
+	size_t _innerHtmlEndOffset;
+};
+
+/*
+ * Growable array of tag pointers. freeTagList() frees only the array and
+ * the list object, not the tags it points to.
+ */
+struct tag_list
+{
+	struct tag **tags;
+	size_t len; /* number of entries in tags */
+};
+
+/* States of the parser state machine in parseTag(). */
+enum state
+{
+	STATE_INNER_TEXT, /* entered again after an end tag or comment is finished */
+	STATE_TAG,
+	STATE_BEGIN_TAG_NAME,
+	STATE_END_TAG_NAME, /* collecting the name of an end tag until '>' */
+	STATE_ATTR_NAME,
+	STATE_ATTR_VALUE,
+	STATE_COMMENT, /* left on '>' preceded by at least two '-' */
+	STATE_SCRIPT,
+	STATE_SCRIPT_POSSIBLE_END_TAG, /* saw '<'; '/' starts the end tag */
+	STATE_SCRIPT_END_TAG,
+	STATE_STYLE,
+	STATE_STYLE_POSSIBLE_END_TAG, /* saw '<'; '/' starts the end tag */
+	STATE_STYLE_END_TAG
+};
+
+/*
+ * How the attribute value currently being parsed is delimited.
+ * AVS_NO must stay the first (lowest) entry: the parser tests
+ * "attrValueSyntax > AVS_NO" to decide whether it is inside a value.
+ */
+enum attr_value_syntax
+{
+	AVS_NO,             /* not inside a value */
+	AVS_QUOTATION_MARK, /* value opened with a quotation mark */
+	AVS_APOSTROPHE,     /* value opened with an apostrophe */
+	AVS_UNQUOTED        /* unquoted value, ended by whitespace */
+};
diff --git a/lib.c b/lib.c
@@ -0,0 +1,112 @@
+/*
+ * Returns a newly allocated string holding str1 followed by str2.
+ *
+ * Takes ownership of both arguments: each one is freed before returning,
+ * so they must be heap pointers or NULL (NULL is treated as ""). The caller
+ * frees the result. Returns NULL if the allocation fails; the arguments
+ * are freed in that case as well.
+ */
+char *stringCat(char *str1, char *str2)
+{
+	size_t str1Len = str1 ? strlen(str1) : 0;
+	size_t str2Len = str2 ? strlen(str2) : 0;
+	char *string = malloc(str1Len + str2Len + 1);
+	if (string != NULL)
+	{
+		/* memcpy must not be handed a NULL source, hence the length checks. */
+		if (str1Len > 0)
+			memcpy(string, str1, str1Len);
+		if (str2Len > 0)
+			memcpy(string + str1Len, str2, str2Len);
+		string[str1Len + str2Len] = '\0';
+	}
+	free(str1);
+	free(str2);
+	return string;
+}
+
+/*
+ * Returns the UTF-8 encoding of `cp` as a newly allocated, NUL-terminated
+ * string (caller frees), or NULL if the allocation fails.
+ *
+ * `len` is the byte count the caller got from decoding and is kept for
+ * interface compatibility only. The buffer is sized from the length
+ * grapheme_encode_utf8() reports for `cp`: when the two differ (e.g. an
+ * invalid input byte decodes to U+FFFD, which needs three bytes) a buffer
+ * of `len` bytes would be left unwritten and returned uninitialised.
+ */
+char *cpToChars(uint_least32_t cp, size_t len)
+{
+	(void)len;
+	/* With a NULL buffer only the required length is returned. */
+	size_t needed = grapheme_encode_utf8(cp, NULL, 0);
+	char *str = malloc((needed+1) * sizeof(char));
+	if (str == NULL)
+		return NULL;
+	grapheme_encode_utf8(cp, str, needed);
+	str[needed] = 0;
+	return str;
+}
+
+/*
+ * Returns a newly allocated copy of `text` without leading and trailing
+ * spaces, newlines, tabs and carriage returns (caller frees). `text` itself
+ * is not modified. A NULL `text` yields an empty string. Returns NULL if
+ * the allocation fails.
+ */
+char *trim(char *text)
+{
+	const char *blanks = " \n\t\r";
+	size_t len = text ? strlen(text) : 0;
+	size_t begin = 0;
+	size_t end = len;
+	/* The bounds checks keep the terminating NUL away from strchr(). */
+	while (begin < len && strchr(blanks, text[begin]) != NULL)
+		begin++;
+	while (end > begin && strchr(blanks, text[end-1]) != NULL)
+		end--;
+	size_t trimmedLen = end - begin;
+	char *trimmedText = malloc((trimmedLen+1) * sizeof(char));
+	if (trimmedText == NULL)
+		return NULL;
+	if (trimmedLen > 0)
+		memcpy(trimmedText, text + begin, trimmedLen);
+	trimmedText[trimmedLen] = 0;
+	return trimmedText;
+}
+
+// Do not use for reading from a socket fd
+/*
+ * Reads a single byte from `fp` into `*buf`.
+ * Returns true when a byte was read and false on end of file or on a read
+ * error. A read error is not retried: the stream's error indicator stays
+ * set, so a retry would fail again and never terminate.
+ */
+bool tryRead(char *buf, FILE *fp)
+{
+	return fread(buf, 1, 1, fp) == 1;
+}
+
+/*
+ * Reads `fp` byte by byte via tryRead() until it reports no more data and
+ * returns the content as a newly allocated, NUL-terminated string (caller
+ * frees). Returns NULL if memory runs out.
+ *
+ * The buffer is doubled whenever it is full instead of being reallocated
+ * for every single byte.
+ */
+char *readFile(FILE *fp)
+{
+	size_t capacity = 4096;
+	size_t used = 0;
+	char *text = malloc(capacity * sizeof(char));
+	if (text == NULL)
+		return NULL;
+	char buf;
+	while (tryRead(&buf, fp))
+	{
+		/* Always keep one byte in reserve for the terminating NUL. */
+		if (used + 1 == capacity)
+		{
+			size_t grownCapacity = capacity * 2;
+			char *grown = realloc(text, grownCapacity * sizeof(char));
+			if (grown == NULL)
+			{
+				free(text);
+				return NULL;
+			}
+			text = grown;
+			capacity = grownCapacity;
+		}
+		text[used] = buf;
+		used++;
+	}
+	text[used] = 0;
+	return text;
+}
diff --git a/todo b/todo
@@ -1,5 +1 @@
-refactor; heavy
-implement find_attribute_value_by_*
-implement filtering not only by class or id, also like this .test[data="asdf"]
-implement finding tags that have no end tag, e.g. the img tag
-Actually correctly parse html according to spec ;-)
+support --except argument

	htex simple incorrect html parser
	git clone git://git.relim.de/htex.git
	Log \| Files \| Refs \| README

M	.gitignore	\|	2	+-
M	Makefile	\|	7	++++---
M	htex.c	\|	648	++++++++++++++++++++++++-------------------------------------------------------
A	html.c	\|	790	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	html.h	\|	90	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	lib.c	\|	112	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	todo	\|	6	+-----