htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 4007840a37366244a2a5b5041d0ebb2853863eaf
Author: Robin <kroekerrobin@gmail.com>
Date:   Sat, 27 Aug 2022 21:55:33 +0200

Initial commit

Diffstat:
A Makefile | 18 ++++++++++++++++++
A htex.1 | 34 ++++++++++++++++++++++++++++++++++
A htex.c | 244 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 296 insertions(+), 0 deletions(-)

diff --git a/Makefile b/Makefile @@ -0,0 +1,17 @@ +PREFIX = /usr/local +MANPREFIX = $(PREFIX)/share/man + +all: + $(CC) -O -Wall -Werror -o htex htex.c +clean: + rm htex +install: all + mkdir -p "$(PREFIX)/bin" + cp -f htex "$(PREFIX)/bin" + chmod 755 "$(PREFIX)/bin/htex" + mkdir -p "$(MANPREFIX)/man1" + cp -f htex.1 "$(MANPREFIX)/man1/htex.1" + chmod 644 "$(MANPREFIX)/man1/htex.1" +uninstall: + rm "$(PREFIX)/bin/htex" + rm "$(MANPREFIX)/man1/htex.1" +\ No newline at end of file diff --git a/htex.1 b/htex.1 @@ -0,0 +1,33 @@ +.TH HTEX "1" "August 2022" "User Commands" +.SH NAME +htex \- \fI\,ex\/\fRtract \fI\,ht\/\fRml +.SH SYNOPSIS +.B htex +-c \fI\,class_name\/\fR [-i] -|\fI\,filename\/\fR +.SH DESCRIPTION +.PP +Receives text from stdin or a file +and interprets it as html. You provide +a \fI\,class_name\/\fR via the +.B -c +option and htex will write the html tag +found by \fI\,class_name\/\fR to stdout. +Pass the +.B -i +option to only output the content (innerHTML) of the +html tag. 
+.TP +\fB\,-c\/\fR, \fB\,--class\/\fR \fI\,class_name\/\fR +Filter html by the class name of a html tag +.TP +\fB\,-i\/\fR, \fB\,--innerhtml\/\fR +Instead of returning the html tag only return +the content (innerHTML) of the tag +.SH EXAMPLES +.sp +.RS 4 +cat test.html | htex -i -c "o-headline" - + +htex -c "o-headline" test.html + +htex --class "o-headline" test.html +\ No newline at end of file diff --git a/htex.c b/htex.c @@ -0,0 +1,243 @@ +#include <stdio.h> +#include <unistd.h> +#include <getopt.h> +#include <string.h> +#include <stdbool.h> +#include <fcntl.h> + +#define ONE_MILLION 1000000 +char text[ONE_MILLION]; +char class_name[200]; +char tag_name[50]; + +int find_start_of_opening_tag_pos(int class_position) { + int i = 1; + while (1) { + int pos = class_position - i; + if (pos < 0) + return -1; + if (text[pos] == '<') { + return pos; + } + i++; + } +} + +int find_end_of_opening_tag_pos(int class_position) { + int i = 1; + while (1) { + int pos = class_position + i; + if (pos < 0) + return -1; + if (text[pos] == '>') { + return pos + 1; + } + i++; + } +} + +void find_tag_name(int open_tag_pos) { + int i = 1; + int end_of_tag_name = 0; + while (1) { + int pos = open_tag_pos + i; + if (pos > strlen(text)) + return; + if (text[pos] == ' ') { + end_of_tag_name = pos-1; + break; + } + i++; + } + int length_tag_name = end_of_tag_name - open_tag_pos; + for (int k=0; k<length_tag_name; k++) { + tag_name[k] = text[open_tag_pos+k+1]; + } + tag_name[length_tag_name] = '\0'; +} + +int find_closing_tag_pos(int open_tag_pos, bool inner_html) { + int level = 0; + int failure = 0; + char close_tag[strlen(tag_name)+3]; + close_tag[0] = '<'; + close_tag[1] = '/'; + for (int k=0; k<strlen(tag_name); k++) { + close_tag[2+k] = tag_name[k]; + } + close_tag[sizeof(close_tag)-1] = '>'; + close_tag[sizeof(close_tag)] = '\0'; + + for (int l=open_tag_pos; l<strlen(text); l++) { // Could be more precise + if (text[l] == '<') { + for (int o=0; o<strlen(tag_name); o++) { + if 
(tag_name[o] != text[l+o+1]) { + failure = 1; + break; + } + } + if (failure == 0) { + if ( + text[l+strlen(tag_name)+1] != ' ' && + text[l+strlen(tag_name)+1] != '>' + ) { + failure = 1; + } + } + if (failure == 0) { + level++; + } + failure = 0; + if (text[l+1] == '/') { + for (int o=2; o<strlen(close_tag); o++) { + if (close_tag[o] != text[l+o]) { + failure = 1; + break; + } + } + if (failure == 0) { + if (level > 0) { + level--; + } + if (level == 0) { + if (inner_html) { + return l; + } else { + return l + strlen(tag_name) + 3; + } + } + } + failure = 0; + } + } + } + return -1; +} + +bool correct_class_name_begin_or_end(char prev_char) { + switch(prev_char) { + case '"': + return true; + case '\'': + return true; + case ' ': + return true; + default: + return false; + } +} + +void find_html_tag_by_class_name(bool inner_html) { + int o = 0; + int failure = 0; + // int counter = 0; + int is_not_quotation_mark = 1; + + for (int k=0; k<strlen(text); k++) { + if ( + text[k] == 'c' && + text[k+1] == 'l' && + text[k+2] == 'a' && + text[k+3] == 's' && + text[k+4] == 's' + ) { + while (is_not_quotation_mark == 1) { + if (text[k+7+o] == '"' || text[k+7+o] == '\'') { + is_not_quotation_mark = 0; + break; + } + if (class_name[0] == text[k+7+o]) { + for (int l=1; l<strlen(class_name); l++) { + if (class_name[l] != text[k+7+o+l]) { + failure = 1; + break; + } + } + if (failure == 0) { + if ( + !correct_class_name_begin_or_end(text[k+6+o]) || + !correct_class_name_begin_or_end(text[k+7+o+strlen(class_name)]) + ) { + failure = 1; + } + } + if (failure == 0) { + // counter++; + if (inner_html) { + int start_of_open_tag_pos = find_start_of_opening_tag_pos(k); + find_tag_name(start_of_open_tag_pos); + int end_of_open_tag_pos = find_end_of_opening_tag_pos(k); + int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, true); + for (int e=end_of_open_tag_pos; e<close_tag_pos; e++) { + printf("%c", text[e]); + } + printf("\n"); + } else { + int open_tag_pos = 
find_start_of_opening_tag_pos(k); + find_tag_name(open_tag_pos); + int close_tag_pos = find_closing_tag_pos(open_tag_pos, false); + for (int e=open_tag_pos; e<close_tag_pos; e++) { + printf("%c", text[e]); + } + printf("\n"); + } + } + failure = 0; + } + o++; + } + is_not_quotation_mark = 1; + o = 0; + } + } + // printf("counter: %d\n", counter); +} + +int main(int argc, char *argv[]) { + int i = 0; + bool inner_html = false; + char buffer; + int o; + + static struct option long_options[] = { + { "class", required_argument, 0, 'c' }, + { "innerhtml", no_argument, 0, 'i' }, + { 0, 0, 0, 0 } + }; + int option_index = 0; + while ((o = getopt_long(argc, argv, "ic:", long_options, &option_index)) != -1) { + switch(o) { + case 'c': + for (int j=0; j<strlen(optarg); j++) { + class_name[j] = optarg[j]; + } + break; + case 'i': + inner_html = true; + break; + } + } + if (argc == (optind + 1)) { + if (*argv[argc-1] == '-') { + while (read(0, &buffer, 1) > 0) { + text[i] = buffer; + i++; + } + find_html_tag_by_class_name(inner_html); + } else { + int fd = open(argv[argc-1], O_RDONLY); + if (fd != -1) { + while (read(fd, &buffer, 1) > 0) { + text[i] = buffer; + i++; + } + find_html_tag_by_class_name(inner_html); + } else { + printf("Couldn't read file \"%s\"\n", argv[argc-1]); + } + } + } else { + printf("Nothing to read from.\n"); + } + return 0; +} +\ No newline at end of file