Start implementing finding by attribute name - htex

commit 4873756e4ae3c76f0f90da7f3feb61442dd3f4ef
parent 4007840a37366244a2a5b5041d0ebb2853863eaf
Author: Robin <kroekerrobin@gmail.com>
Date:   Sat, 27 Aug 2022 22:43:53 +0200

Start implementing finding by attribute name

Instead of only being able to find a tag by its class
name, allow finding it by either its id or its class name

Diffstat:
A .gitignore  | 3 +++
M htex.1  | 18 ++++++++++++------
M htex.c  | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
A todo  | 2 ++

4 files changed, 112 insertions(+), 18 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+test.html
+htex
+\ No newline at end of file
diff --git a/htex.1 b/htex.1
@@ -3,22 +3,28 @@
 htex \- \fI\,ex\/\fRtract \fI\,ht\/\fRml
 .SH SYNOPSIS
 .B htex
--c \fI\,class_name\/\fR [-i] -|\fI\,filename\/\fR
+-a \fI\,attribute_name\/\fR [-i] -|\fI\,filename\/\fR
 .SH DESCRIPTION
 .PP
 Receives text from stdin or a file
 and interprets it as html. You provide
-a \fI\,class_name\/\fR via the
-.B -c
+an \fI\,attribute_name\/\fR via the
+.B -a
 option and htex will write the html tag
-found by \fI\,class_name\/\fR to stdout.
+found by \fI\,attribute_name\/\fR to stdout.
 Pass the
 .B -i
 option to only output the content (innerHTML) of the
 html tag.
 .TP
-\fB\,-c\/\fR, \fB\,--class\/\fR \fI\,class_name\/\fR
-Filter html by the class name of a html tag
+\fB\,-a\/\fR, \fB\,--attribute\/\fR \fI\,attribute_name\/\fR
+Filter html by the attribute name of a html tag.
+If \fI\,attribute_name\/\fR starts with a dot (.) then
+the following characters will be taken as the class name
+of a tag. If \fI\,attribute_name\/\fR starts with a hashtag (#)
+the following characters will be taken as the id name of a tag.
+If \fI\,attribute_name\/\fR starts with neither a dot nor a hashtag,
+the whole name will be taken as a tag name.
 .TP
 \fB\,-i\/\fR, \fB\,--innerhtml\/\fR
 Instead of returning the html tag only return
diff --git a/htex.c b/htex.c
@@ -7,8 +7,9 @@
 
 #define ONE_MILLION 1000000
 char text[ONE_MILLION];
-char class_name[200];
+char attribute_name[200];
 char tag_name[50];
+bool inner_html = false;
 
 int find_start_of_opening_tag_pos(int class_position) {
     int i = 1;
@@ -114,7 +115,7 @@ int find_closing_tag_pos(int open_tag_pos, bool inner_html) {
     return -1;
 }
 
-bool correct_class_name_begin_or_end(char prev_char) {
+bool correct_name_begin_or_end(char prev_char) {
     switch(prev_char) {
         case '"':
             return true;
@@ -127,7 +128,7 @@ bool correct_class_name_begin_or_end(char prev_char) {
     }
 }
 
-void find_html_tag_by_class_name(bool inner_html) {
+void find_html_tag_by_class(char *class_name) {
     int o = 0;
     int failure = 0;
     // int counter = 0;
@@ -155,8 +156,8 @@ void find_html_tag_by_class_name(bool inner_html) {
                     }
                     if (failure == 0) {
                         if (
-                            !correct_class_name_begin_or_end(text[k+6+o]) ||
-                            !correct_class_name_begin_or_end(text[k+7+o+strlen(class_name)])
+                            !correct_name_begin_or_end(text[k+6+o]) ||
+                            !correct_name_begin_or_end(text[k+7+o+strlen(class_name)])
                         ) {
                             failure = 1;
                         }
@@ -193,23 +194,105 @@ void find_html_tag_by_class_name(bool inner_html) {
     // printf("counter: %d\n", counter);
 }
 
+void find_html_tag_by_id(char *id_name) {
+    int o = 0;
+    int failure = 0;
+    // int counter = 0;
+    int is_not_quotation_mark = 1;
+
+    for (int k=0; k<strlen(text); k++) {
+        if (
+            text[k] == 'i' &&
+            text[k+1] == 'd'
+        ) {
+            while (is_not_quotation_mark == 1) {
+                if (text[k+4+o] == '"' || text[k+4+o] == '\'') {
+                    is_not_quotation_mark = 0;
+                    break;
+                }
+                if (id_name[0] == text[k+4+o]) {
+                    for (int l=1; l<strlen(id_name); l++) {
+                        if (id_name[l] != text[k+4+o+l]) {
+                            failure = 1;
+                            break;
+                        }
+                    }
+                    if (failure == 0) {
+                        if (
+                            !correct_name_begin_or_end(text[k+3+o]) ||
+                            !correct_name_begin_or_end(text[k+4+o+strlen(id_name)])
+                        ) {
+                            failure = 1;
+                        }
+                    }
+                    if (failure == 0) {
+                        // counter++;
+                        if (inner_html) {
+                            int start_of_open_tag_pos = find_start_of_opening_tag_pos(k);
+                            find_tag_name(start_of_open_tag_pos);
+                            int end_of_open_tag_pos = find_end_of_opening_tag_pos(k);
+                            int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, true);
+                            for (int e=end_of_open_tag_pos; e<close_tag_pos; e++) {
+                                printf("%c", text[e]);
+                            }
+                            printf("\n");
+                        } else {
+                            int open_tag_pos = find_start_of_opening_tag_pos(k);
+                            find_tag_name(open_tag_pos);
+                            int close_tag_pos = find_closing_tag_pos(open_tag_pos, false);
+                            for (int e=open_tag_pos; e<close_tag_pos; e++) {
+                                printf("%c", text[e]);
+                            }
+                            printf("\n");
+                        }
+                    }
+                    failure = 0;
+                }
+                o++;
+            }
+            is_not_quotation_mark = 1;
+            o = 0;
+        }
+    }
+    // printf("counter: %d\n", counter);
+}
+void find_html_tag_by_tag() {
+    printf("Not yet implemented.\n");
+}
+
+void find_html_tag() {
+    char identifier[200];
+    for (int i=0; i<strlen(attribute_name); i++) {
+        identifier[i] = attribute_name[i+1];
+    }
+    switch(attribute_name[0]) {
+        case '.':
+            find_html_tag_by_class(identifier);
+            break;
+        case '#':
+            find_html_tag_by_id(identifier);
+            break;
+        default:
+            find_html_tag_by_tag(); // it uses attribute_name directly
+    }
+}
+
 int main(int argc, char *argv[]) {
     int i = 0;
-    bool inner_html = false;
     char buffer;
     int o;
 
     static struct option long_options[] = {
-        { "class", required_argument, 0, 'c' },
+        { "attribute", required_argument, 0, 'a' },
         { "innerhtml", no_argument, 0, 'i' },
         { 0, 0, 0, 0 }
     };
     int option_index = 0;
-    while ((o = getopt_long(argc, argv, "ic:", long_options, &option_index)) != -1) {
+    while ((o = getopt_long(argc, argv, "ia:", long_options, &option_index)) != -1) {
         switch(o) {
-            case 'c':
+            case 'a':
                 for (int j=0; j<strlen(optarg); j++) {
-                    class_name[j] = optarg[j];
+                    attribute_name[j] = optarg[j];
                 }
                 break;
             case 'i':
@@ -223,7 +306,7 @@ int main(int argc, char *argv[]) {
                 text[i] = buffer;
                 i++;
             }
-            find_html_tag_by_class_name(inner_html);
+            find_html_tag();
         } else {
             int fd = open(argv[argc-1], O_RDONLY);
             if (fd != -1) {
@@ -231,7 +314,7 @@ int main(int argc, char *argv[]) {
                     text[i] = buffer;
                     i++;
                 }
-                find_html_tag_by_class_name(inner_html);
+                find_html_tag();
             } else {
                 printf("Couldn't read file \"%s\"\n", argv[argc-1]);
             }
diff --git a/todo b/todo
@@ -0,0 +1 @@
+doesn't find the tag if class="" is on a different line than the start of the opening tag.
+\ No newline at end of file

	htex simple incorrect html parser
	git clone git://git.relim.de/htex.git
	Log \| Files \| Refs \| README

A	.gitignore	\|	3	+++
M	htex.1	\|	18	++++++++++++------
M	htex.c	\|	107	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
A	todo	\|	2	++