Initial commit - htex - simple incorrect html parser

commit 4007840a37366244a2a5b5041d0ebb2853863eaf
Author: Robin <kroekerrobin@gmail.com>
Date:   Sat, 27 Aug 2022 21:55:33 +0200

Initial commit

Diffstat:
A Makefile  | 18 ++++++++++++++++++
A htex.1  | 34 ++++++++++++++++++++++++++++++++++
A htex.c  | 244 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 296 insertions(+), 0 deletions(-)
diff --git a/Makefile b/Makefile
@@ -0,0 +1,17 @@
+PREFIX = /usr/local
+MANPREFIX = $(PREFIX)/share/man
+
+# None of these targets names a file it creates, so mark them phony;
+# otherwise a stray file called "clean" or "install" would disable the rule.
+.PHONY: all clean install uninstall
+
+all: htex
+
+# Rebuild the binary only when the source is newer than it.
+htex: htex.c
+	$(CC) -O -Wall -Werror -o htex htex.c
+
+# -f keeps clean from failing when there is nothing to remove.
+clean:
+	rm -f htex
+install: all
+	mkdir -p "$(PREFIX)/bin"
+	cp -f htex "$(PREFIX)/bin"
+	chmod 755 "$(PREFIX)/bin/htex"
+	mkdir -p "$(MANPREFIX)/man1"
+	cp -f htex.1 "$(MANPREFIX)/man1/htex.1"
+	chmod 644 "$(MANPREFIX)/man1/htex.1"
+# -f makes uninstall succeed even when only part of the program is installed.
+uninstall:
+	rm -f "$(PREFIX)/bin/htex"
+	rm -f "$(MANPREFIX)/man1/htex.1"
+\ No newline at end of file
diff --git a/htex.1 b/htex.1
@@ -0,0 +1,33 @@
+.TH HTEX "1" "August 2022" "User Commands"
+.SH NAME
+htex \- \fI\,ex\/\fRtract \fI\,ht\/\fRml
+.SH SYNOPSIS
+.B htex
+-c \fI\,class_name\/\fR [-i] -|\fI\,filename\/\fR
+.SH DESCRIPTION
+.PP
+Reads text from stdin or from a file
+and interprets it as HTML. You provide
+a \fI\,class_name\/\fR via the
+.B -c
+option, and htex writes every HTML tag
+whose class matches \fI\,class_name\/\fR to stdout.
+Pass the
+.B -i
+option to output only the content (innerHTML) of the
+HTML tag.
+.TP
+\fB\,-c\/\fR, \fB\,--class\/\fR \fI\,class_name\/\fR
+Select HTML tags by their class name
+.TP
+\fB\,-i\/\fR, \fB\,--innerhtml\/\fR
+Instead of the whole HTML tag, output only
+the content (innerHTML) of the tag
+.SH EXAMPLES
+.sp
+.RS 4
+cat test.html | htex -i -c "o-headline" -
+
+htex -c "o-headline" test.html
+
+htex --class "o-headline" test.html
+\ No newline at end of file
diff --git a/htex.c b/htex.c
@@ -0,0 +1,243 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <string.h>
+#include <stdbool.h>
+#include <fcntl.h>
+
+#define ONE_MILLION 1000000 /* size in bytes of the text[] input buffer */
+char text[ONE_MILLION]; /* input document; main() fills it and the find_* functions scan it */
+char class_name[200]; /* class to search for, copied from the -c/--class argument in main() */
+char tag_name[50]; /* tag name written by find_tag_name() and read by find_closing_tag_pos() */
+
+/*
+ * Walk backwards through text, beginning one character before
+ * class_position, looking for the '<' that opens the surrounding tag.
+ *
+ * Returns the index of that '<', or -1 when the start of the buffer is
+ * reached without finding one.
+ */
+int find_start_of_opening_tag_pos(int class_position) {
+    for (int pos = class_position - 1; pos >= 0; pos--) {
+        if (text[pos] == '<')
+            return pos;
+    }
+    return -1;
+}
+
+/*
+ * Walk forwards through text, beginning one character after
+ * class_position, looking for the '>' that ends the surrounding opening tag.
+ *
+ * Returns the index just past that '>' (the first character of the
+ * element's content), or -1 when the scan would start at a negative index
+ * or the NUL terminator of text is reached without finding a '>'.
+ */
+int find_end_of_opening_tag_pos(int class_position) {
+    int pos = class_position + 1;
+
+    if (pos < 0)
+        return -1;
+    /* Stop at the terminator: scanning further would leave the input data. */
+    for (; text[pos] != '\0'; pos++) {
+        if (text[pos] == '>')
+            return pos + 1;
+    }
+    return -1;
+}
+
+/*
+ * Store in tag_name the name of the tag whose '<' sits at open_tag_pos.
+ *
+ * The name runs from the character after the '<' up to the first
+ * whitespace character, '/', '>' or the end of text. It is truncated to fit
+ * tag_name and always NUL-terminated. A negative open_tag_pos (no opening
+ * tag found) leaves tag_name empty.
+ */
+void find_tag_name(int open_tag_pos) {
+    /* Clear first so a failed lookup never leaves the previous name behind. */
+    tag_name[0] = '\0';
+    if (open_tag_pos < 0)
+        return;
+
+    const char *name = &text[open_tag_pos + 1];
+    /* strcspn also stops at the terminating NUL, so it cannot leave text. */
+    size_t length = strcspn(name, " \t\r\n\f\v/>");
+
+    if (length >= sizeof(tag_name))
+        length = sizeof(tag_name) - 1;
+    memcpy(tag_name, name, length);
+    tag_name[length] = '\0';
+}
+
+/*
+ * Locate the closing tag that belongs to the element named in tag_name.
+ *
+ * open_tag_pos: index in text where the scan starts. The caller in this
+ *               file passes the '<' of the opening tag when inner_html is
+ *               false and the first character after the opening tag's '>'
+ *               when inner_html is true.
+ * inner_html:   selects which end of the closing tag is reported.
+ *
+ * Returns the index of the '<' of the matching "</name>" when inner_html is
+ * true, the index just past its '>' otherwise, and -1 when open_tag_pos is
+ * negative or no matching closing tag is found.
+ */
+int find_closing_tag_pos(int open_tag_pos, bool inner_html) {
+    const int name_length = (int)strlen(tag_name);
+    const int text_length = (int)strlen(text);
+    /*
+     * In inner-HTML mode the scan starts behind the opening tag, so that
+     * tag is counted up front; otherwise a nested element of the same name
+     * would end the match at the nested closing tag.
+     */
+    int level = inner_html ? 1 : 0;
+
+    if (open_tag_pos < 0)
+        return -1;
+
+    for (int l = open_tag_pos; l < text_length; l++) {
+        if (text[l] != '<')
+            continue;
+
+        /* strncmp stops at the terminating NUL, so it never leaves text. */
+        const char *after = &text[l + 1];
+
+        /* Opening tag of the same name: "<name" then whitespace or '>'. */
+        if (strncmp(after, tag_name, (size_t)name_length) == 0) {
+            char next = after[name_length];
+            /* strchr would also match the NUL terminator, so exclude it. */
+            if (next == '>' || (next != '\0' && strchr(" \t\r\n\f", next) != NULL))
+                level++;
+        }
+
+        /* Closing tag of the same name: exactly "</name>". */
+        if (after[0] == '/' &&
+            strncmp(after + 1, tag_name, (size_t)name_length) == 0 &&
+            after[1 + name_length] == '>') {
+            if (level > 0)
+                level--;
+            if (level == 0) {
+                /* "</" + name + ">" is name_length + 3 characters long. */
+                return inner_html ? l : l + name_length + 3;
+            }
+        }
+    }
+    return -1;
+}
+
+/*
+ * Tell whether prev_char may directly precede or follow a class name inside
+ * a class attribute: a double quote, a single quote or a space.
+ */
+bool correct_class_name_begin_or_end(char prev_char) {
+    return prev_char == '"' || prev_char == '\'' || prev_char == ' ';
+}
+
+/*
+ * Print every element in text whose class attribute contains class_name.
+ *
+ * Each quoted attribute of the form class="..." or class='...' is split at
+ * the delimiters accepted by correct_class_name_begin_or_end(). For every
+ * attribute that lists class_name, the enclosing element is written to
+ * stdout followed by a newline.
+ *
+ * inner_html: when true only the text between the opening and the closing
+ *             tag is printed, otherwise the tags are included.
+ *
+ * Elements whose opening or closing tag cannot be located are skipped.
+ */
+void find_html_tag_by_class_name(bool inner_html) {
+    const size_t text_length = strlen(text);
+    const size_t class_length = strlen(class_name);
+
+    /* An empty class name can never match a token. */
+    if (class_length == 0)
+        return;
+
+    for (size_t k = 0; k < text_length; k++) {
+        /* strncmp stops at the terminating NUL, so it never leaves text. */
+        if (strncmp(&text[k], "class=", 6) != 0)
+            continue;
+        const char quote = text[k + 6];
+        if (quote != '"' && quote != '\'')
+            continue;
+
+        /* Walk the attribute value up to its closing quote or end of input. */
+        for (size_t v = k + 7; text[v] != '\0' && text[v] != quote; v++) {
+            if (strncmp(&text[v], class_name, class_length) != 0)
+                continue;
+            /* Require whole-token matches: "foo" must not match "foobar". */
+            if (!correct_class_name_begin_or_end(text[v - 1]) ||
+                !correct_class_name_begin_or_end(text[v + class_length]))
+                continue;
+
+            int start = find_start_of_opening_tag_pos((int)k);
+            if (start < 0)
+                break;
+            find_tag_name(start);
+            if (inner_html) {
+                /* The content begins right after the opening tag's '>'. */
+                start = find_end_of_opening_tag_pos((int)k);
+                if (start < 0)
+                    break;
+            }
+            int close_tag_pos = find_closing_tag_pos(start, inner_html);
+            if (close_tag_pos >= start)
+                printf("%.*s\n", close_tag_pos - start, &text[start]);
+            /* One element per attribute, even if the class is listed twice. */
+            break;
+        }
+    }
+}
+
+/*
+ * Entry point: htex -c class_name [-i] -|filename
+ *
+ * Parses the options, loads the input (stdin when the operand is "-",
+ * otherwise the named file) into text and prints the matching elements.
+ * Input beyond the capacity of text is ignored.
+ *
+ * Returns 0 on success and 1 on a usage or read error; error messages go
+ * to stderr so they cannot be mistaken for extracted HTML.
+ */
+int main(int argc, char *argv[]) {
+    bool inner_html = false;
+    int o;
+
+    static struct option long_options[] = {
+        { "class", required_argument, 0, 'c' },
+        { "innerhtml", no_argument, 0, 'i' },
+        { 0, 0, 0, 0 }
+    };
+    int option_index = 0;
+    while ((o = getopt_long(argc, argv, "ic:", long_options, &option_index)) != -1) {
+        switch(o) {
+            case 'c': {
+                /* snprintf bounds the copy and always NUL-terminates. */
+                int written = snprintf(class_name, sizeof(class_name), "%s", optarg);
+                if (written < 0 || (size_t)written >= sizeof(class_name)) {
+                    fprintf(stderr, "Class name is too long.\n");
+                    return 1;
+                }
+                break;
+            }
+            case 'i':
+                inner_html = true;
+                break;
+            default:
+                /* getopt_long has already printed a diagnostic. */
+                return 1;
+        }
+    }
+    if (argc != (optind + 1)) {
+        fprintf(stderr, "Nothing to read from.\n");
+        return 1;
+    }
+
+    const char *path = argv[optind];
+    int fd = STDIN_FILENO;
+    /* Only the exact operand "-" selects stdin. */
+    if (strcmp(path, "-") != 0) {
+        fd = open(path, O_RDONLY);
+        if (fd == -1) {
+            fprintf(stderr, "Couldn't read file \"%s\"\n", path);
+            return 1;
+        }
+    }
+
+    /* Read in large chunks and keep one byte free for the terminator. */
+    size_t length = 0;
+    ssize_t got = 0;
+    while (length < sizeof(text) - 1) {
+        got = read(fd, text + length, sizeof(text) - 1 - length);
+        if (got <= 0)
+            break;
+        length += (size_t)got;
+    }
+    text[length] = '\0';
+    if (fd != STDIN_FILENO)
+        close(fd);
+    if (got < 0) {
+        fprintf(stderr, "Couldn't read file \"%s\"\n", path);
+        return 1;
+    }
+
+    find_html_tag_by_class_name(inner_html);
+    return 0;
+}
+\ No newline at end of file

	htex simple incorrect html parser
	git clone git://git.relim.de/htex.git
	Log \| Files \| Refs \| README

A	Makefile	\|	18	++++++++++++++++++
A	htex.1	\|	34	++++++++++++++++++++++++++++++++++
A	htex.c	\|	244	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++