commit 4007840a37366244a2a5b5041d0ebb2853863eaf
Author: Robin <kroekerrobin@gmail.com>
Date: Sat, 27 Aug 2022 21:55:33 +0200
Initial commit
Diffstat:
| A | Makefile | | | 18 | ++++++++++++++++++ |
| A | htex.1 | | | 34 | ++++++++++++++++++++++++++++++++++ |
| A | htex.c | | | 244 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
3 files changed, 296 insertions(+), 0 deletions(-)
diff --git a/Makefile b/Makefile
@@ -0,0 +1,17 @@
+# Installation roots; override on the command line, e.g. `make PREFIX=/usr install`.
+PREFIX = /usr/local
+MANPREFIX = $(PREFIX)/share/man
+
+# These targets name actions, not files, so a stray file called "all",
+# "clean", "install" or "uninstall" must never make them look up to date.
+.PHONY: all clean install uninstall
+
+# Build the htex binary from its single source file.
+all:
+	$(CC) -O -Wall -Werror -o htex htex.c
+# Remove the build output; -f keeps this from failing when nothing was built.
+clean:
+	rm -f htex
+# Install the binary and the man page below PREFIX / MANPREFIX.
+install: all
+	mkdir -p "$(PREFIX)/bin"
+	cp -f htex "$(PREFIX)/bin"
+	chmod 755 "$(PREFIX)/bin/htex"
+	mkdir -p "$(MANPREFIX)/man1"
+	cp -f htex.1 "$(MANPREFIX)/man1/htex.1"
+	chmod 644 "$(MANPREFIX)/man1/htex.1"
+# Remove the installed files; -f lets both removals run even if one file is already gone.
+uninstall:
+	rm -f "$(PREFIX)/bin/htex"
+	rm -f "$(MANPREFIX)/man1/htex.1"
+\ No newline at end of file
diff --git a/htex.1 b/htex.1
@@ -0,0 +1,33 @@
+.TH HTEX "1" "August 2022" "User Commands"
+.SH NAME
+htex \- \fI\,ex\/\fRtract \fI\,ht\/\fRml
+.SH SYNOPSIS
+.B htex
+-c \fI\,class_name\/\fR [-i] -|\fI\,filename\/\fR
+.SH DESCRIPTION
+.PP
+Reads HTML from stdin (if the operand is \-) or from
+\fI\,filename\/\fR. You provide
+a \fI\,class_name\/\fR via the
+.B -c
+option and htex writes each HTML tag
+whose class attribute contains \fI\,class_name\/\fR to stdout.
+Pass the
+.B -i
+option to output only the content (innerHTML) of each
+matching tag.
+.TP
+\fB\,-c\/\fR, \fB\,--class\/\fR \fI\,class_name\/\fR
+Select HTML tags by their class name
+.TP
+\fB\,-i\/\fR, \fB\,--innerhtml\/\fR
+Instead of writing the whole HTML tag, write only
+its content (innerHTML)
+.SH EXAMPLES
+.sp
+.RS 4
+cat test.html | htex -i -c "o-headline" -
+
+htex -c "o-headline" test.html
+
+htex --class "o-headline" test.html
+\ No newline at end of file
diff --git a/htex.c b/htex.c
@@ -0,0 +1,243 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <string.h>
+#include <stdbool.h>
+#include <fcntl.h>
+
+#define ONE_MILLION 1000000 /* size, in bytes, of the text buffer below */
+char text[ONE_MILLION]; // the input document; main() fills it from stdin or from a file
+char class_name[200]; // class name to search for; main() copies it from the -c/--class argument
+char tag_name[50]; // tag name written by find_tag_name() and read by find_closing_tag_pos()
+
+/*
+ * Walk backwards through the global text buffer, starting at the byte just
+ * before class_position, and return the index of the nearest '<'.
+ *
+ * Returns -1 when the start of the buffer is reached without finding one.
+ */
+int find_start_of_opening_tag_pos(int class_position) {
+    for (int cursor = class_position - 1; cursor >= 0; cursor--) {
+        if (text[cursor] == '<') {
+            return cursor;
+        }
+    }
+    return -1;
+}
+
+/*
+ * Scan forward through the global text buffer, starting at the byte after
+ * class_position, and return the index one past the nearest '>' (that is,
+ * the first byte following the opening tag).
+ *
+ * Returns -1 when the starting index is negative, or when the terminating
+ * NUL or the end of the buffer is reached before a '>' is seen. Without
+ * that bound the scan would read past the end of text on input that has no
+ * '>' after class_position.
+ */
+int find_end_of_opening_tag_pos(int class_position) {
+    int pos = class_position + 1;
+
+    if (pos < 0) {
+        return -1;
+    }
+    for (; pos < (int)sizeof text && text[pos] != '\0'; pos++) {
+        if (text[pos] == '>') {
+            return pos + 1;
+        }
+    }
+    return -1;
+}
+
+/* True for the bytes that terminate a tag name: HTML whitespace or '>'. */
+static bool ends_tag_name(char c) {
+    return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '>';
+}
+
+/*
+ * Store the name of the tag whose '<' is at index open_tag_pos of the
+ * global text buffer into the global tag_name buffer (NUL-terminated).
+ *
+ * The name runs from the byte after '<' up to the first whitespace
+ * character or '>'. tag_name is set to the empty string when no name can be
+ * extracted: open_tag_pos is negative, the text ends before the name is
+ * terminated, or the name does not fit into tag_name. Clearing it prevents
+ * a name left over from an earlier match from being reused, and the length
+ * check prevents writing past the end of tag_name.
+ */
+void find_tag_name(int open_tag_pos) {
+    tag_name[0] = '\0';
+    if (open_tag_pos < 0) {
+        return;
+    }
+
+    const char *name = text + open_tag_pos + 1;
+    size_t length = 0;
+    while (name[length] != '\0' && !ends_tag_name(name[length])) {
+        length++;
+    }
+    /* Hitting the NUL means the tag was never terminated. */
+    if (name[length] == '\0' || length >= sizeof tag_name) {
+        return;
+    }
+
+    memcpy(tag_name, name, length);
+    tag_name[length] = '\0';
+}
+
+/* True for the bytes that may directly follow the name in an opening tag. */
+static bool follows_open_tag_name(char c) {
+    return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '>';
+}
+
+/*
+ * Find the closing tag that balances the element named by the global
+ * tag_name, scanning the global text buffer from index open_tag_pos.
+ *
+ * inner_html == false: open_tag_pos is the index of the element's own '<';
+ *   the return value is the index one past the '>' of the matching closing
+ *   tag.
+ * inner_html == true: open_tag_pos is the first byte after the element's
+ *   opening tag; the return value is the index of the '<' of the matching
+ *   closing tag.
+ *
+ * Nested elements with the same name are balanced by counting levels.
+ * Returns -1 when open_tag_pos is negative or no matching closing tag
+ * exists.
+ */
+int find_closing_tag_pos(int open_tag_pos, bool inner_html) {
+    if (open_tag_pos < 0) {
+        return -1;
+    }
+
+    /* Lengths are loop invariants; compute them once. */
+    const size_t text_len = strlen(text);
+    const size_t name_len = strlen(tag_name);
+
+    /*
+     * In inner-HTML mode the scan starts inside the element, so its opening
+     * tag is never seen and must be counted up front. Starting at 0 would
+     * let the closing tag of the first nested same-name element end the
+     * match too early.
+     */
+    int level = inner_html ? 1 : 0;
+
+    for (size_t l = (size_t)open_tag_pos; l < text_len; l++) {
+        if (text[l] != '<') {
+            continue;
+        }
+        const char *after = text + l + 1;
+
+        if (*after == '/') {
+            /* "</" + tag_name + ">" closes one level. */
+            if (strncmp(after + 1, tag_name, name_len) == 0 && after[1 + name_len] == '>') {
+                if (level > 0) {
+                    level--;
+                }
+                if (level == 0) {
+                    return inner_html ? (int)l : (int)(l + name_len + 3);
+                }
+            }
+        } else if (strncmp(after, tag_name, name_len) == 0 &&
+                   follows_open_tag_name(after[name_len])) {
+            /* "<" + tag_name followed by whitespace or '>' opens one level. */
+            level++;
+        }
+    }
+    return -1;
+}
+
+/*
+ * Report whether prev_char may sit directly before or directly after a
+ * class name inside a class attribute value: a double quote, a single
+ * quote, or a space.
+ */
+bool correct_class_name_begin_or_end(char prev_char) {
+    return prev_char == '"' || prev_char == '\'' || prev_char == ' ';
+}
+
+/*
+ * Search the global text buffer for class attributes that contain the
+ * global class_name and print every matching element to stdout, each
+ * followed by a newline.
+ *
+ * inner_html == false prints the element including its opening and closing
+ * tags; inner_html == true prints only what lies between them.
+ *
+ * An attribute is recognised by the literal word "class"; its value is
+ * taken to start two bytes later (after '=' and the opening quote) and to
+ * end at the next quote. class_name matches only as a whole word, i.e.
+ * delimited by a quote or a space on both sides. The value scan is bounded
+ * by the length of the text, so a "class" that is never followed by a quote
+ * cannot make it read past the end of the buffer.
+ */
+void find_html_tag_by_class_name(bool inner_html) {
+    static const char attr[] = "class";
+    const size_t attr_len = sizeof attr - 1;
+    /* Lengths are loop invariants; compute them once. */
+    const size_t text_len = strlen(text);
+    const size_t class_len = strlen(class_name);
+
+    /* An empty class name can never match a byte of the text. */
+    if (class_len == 0) {
+        return;
+    }
+
+    for (size_t k = 0; k < text_len; k++) {
+        if (strncmp(text + k, attr, attr_len) != 0) {
+            continue;
+        }
+
+        for (size_t v = k + attr_len + 2; v < text_len; v++) {
+            if (text[v] == '"' || text[v] == '\'') {
+                break; /* end of the attribute value */
+            }
+            if (strncmp(text + v, class_name, class_len) != 0) {
+                continue;
+            }
+            /* Reject partial hits such as "foo" inside "foobar". */
+            if (!correct_class_name_begin_or_end(text[v - 1]) ||
+                !correct_class_name_begin_or_end(text[v + class_len])) {
+                continue;
+            }
+
+            int tag_start = find_start_of_opening_tag_pos((int)k);
+            find_tag_name(tag_start);
+            int from = inner_html ? find_end_of_opening_tag_pos((int)k) : tag_start;
+            int to = find_closing_tag_pos(from, inner_html);
+            if (from >= 0 && to > from) {
+                fwrite(text + from, 1, (size_t)(to - from), stdout);
+            }
+            putchar('\n');
+        }
+    }
+}
+
+/*
+ * Fill the global text buffer with the bytes available on fd and
+ * NUL-terminate it. At most sizeof text - 1 bytes are stored; if more input
+ * is available a warning is written to stderr and the rest is ignored.
+ *
+ * Returns 0 on success and -1 when read() fails.
+ */
+static int read_text_from_fd(int fd) {
+    const size_t capacity = sizeof text - 1; /* keep room for the NUL */
+    size_t used = 0;
+
+    while (used < capacity) {
+        ssize_t got = read(fd, text + used, capacity - used);
+        if (got < 0) {
+            return -1;
+        }
+        if (got == 0) {
+            break;
+        }
+        used += (size_t)got;
+    }
+    text[used] = '\0';
+
+    char extra;
+    if (used == capacity && read(fd, &extra, 1) > 0) {
+        fprintf(stderr, "Input truncated to %zu bytes.\n", capacity);
+    }
+    return 0;
+}
+
+/*
+ * Entry point: htex -c class_name [-i] -|filename
+ *
+ * Parses -c/--class (the class name to search for) and -i/--innerhtml,
+ * loads the single operand ("-" means stdin, anything else is a file name)
+ * into the global text buffer and prints the matching elements.
+ *
+ * Returns 0 on success. Returns 1, with a message on stderr, for an unknown
+ * option, an over-long class name, a missing operand, or an input that
+ * cannot be opened or read.
+ */
+int main(int argc, char *argv[]) {
+    bool inner_html = false;
+    int o;
+
+    static struct option long_options[] = {
+        { "class", required_argument, 0, 'c' },
+        { "innerhtml", no_argument, 0, 'i' },
+        { 0, 0, 0, 0 }
+    };
+    int option_index = 0;
+    while ((o = getopt_long(argc, argv, "ic:", long_options, &option_index)) != -1) {
+        switch (o) {
+        case 'c':
+            /* class_name is a fixed-size buffer; refuse what does not fit. */
+            if (strlen(optarg) >= sizeof class_name) {
+                fprintf(stderr, "Class name is too long.\n");
+                return 1;
+            }
+            snprintf(class_name, sizeof class_name, "%s", optarg);
+            break;
+        case 'i':
+            inner_html = true;
+            break;
+        default:
+            /* getopt_long has already reported the offending option. */
+            return 1;
+        }
+    }
+
+    if (argc != optind + 1) {
+        fprintf(stderr, "Nothing to read from.\n");
+        return 1;
+    }
+
+    const char *source = argv[optind];
+    int fd = STDIN_FILENO;
+    /* Only the exact operand "-" selects stdin; "-foo" is a file name. */
+    if (strcmp(source, "-") != 0) {
+        fd = open(source, O_RDONLY);
+        if (fd == -1) {
+            fprintf(stderr, "Couldn't read file \"%s\"\n", source);
+            return 1;
+        }
+    }
+
+    int status = read_text_from_fd(fd);
+    if (fd != STDIN_FILENO) {
+        close(fd);
+    }
+    if (status != 0) {
+        fprintf(stderr, "Couldn't read file \"%s\"\n", source);
+        return 1;
+    }
+
+    find_html_tag_by_class_name(inner_html);
+    return 0;
+}
+\ No newline at end of file