Add -e/--except option - htex - simple incorrect html parser

commit 05d47383d977a9cb3bd69ac2c9636cc0ca9e06f6
parent 52dd773f9344d02753b2b9b4dce1fa8d44f415f9
Author: Robin <kroekerrobin@gmail.com>
Date:   Thu, 15 Sep 2022 22:49:26 +0200

Add -e/--except option

Diffstat:
M htex.1  | 14 ++++++++++----
M htex.c  | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
M todo  | 4 +---

3 files changed, 105 insertions(+), 22 deletions(-)
diff --git a/htex.1 b/htex.1
@@ -3,7 +3,7 @@
 htex \- \fI\,ex\/\fRtract \fI\,ht\/\fRml
 .SH SYNOPSIS
 .B htex
--a \fI\,attribute_name\/\fR [-i] -|\fI\,filename\/\fR
+-a \fI\,attribute_name\/\fR [-e] [-i] -|\fI\,filename\/\fR
 .SH DESCRIPTION
 .PP
 Receives text from stdin or a file
@@ -28,7 +28,11 @@ will be taken as a tag name.
 .TP
 \fB\,-i\/\fR, \fB\,--innerhtml\/\fR
 Instead of returning the html tag only return
-the content (innerHTML) of the tag
+the content (innerHTML) of the tag. Cannot be used together with the -e option.
+.TP
+\fB\,-e\/\fR, \fB\,--except\/\fR
+Output everything except the html tag specified in -a.
+Cannot be used together with the -i option.
 .SH EXAMPLES
 .sp
 .RS 4
@@ -36,4 +40,6 @@ cat test.html | htex -i -a ".o-headline" -
 
 htex -a span test.html
 
-htex --innerhtml --attribute "#container" test.html
-\ No newline at end of file
+htex --innerhtml --attribute "#container" test.html
+
+htex -e -a ".unnecessary-class" test.html
+\ No newline at end of file
diff --git a/htex.c b/htex.c
@@ -10,6 +10,12 @@ char *text;
 char attribute_name[200];
 char tag_name[50];
 bool inner_html = false;
+bool except = false;
+struct match {
+    int start;
+    int end;
+};
+struct match *matches;
 
 int find_start_of_opening_tag_pos(int class_position) {
     int i = 1;
@@ -133,7 +139,7 @@ bool correct_name_begin_or_end(char prev_char) {
 void find_html_tag_by_class(char *class_name) {
     int o = 0;
     int failure = 0;
-    // int counter = 0;
+    int counter = 0;
     int is_not_quotation_mark = 1;
 
     for (int k=0; k<strlen(text); k++) {
@@ -178,11 +184,19 @@ void find_html_tag_by_class(char *class_name) {
                         } else {
                             int open_tag_pos = find_start_of_opening_tag_pos(k);
                             find_tag_name(open_tag_pos);
-                            int close_tag_pos = find_closing_tag_pos(open_tag_pos, false);
-                            for (int e=open_tag_pos; e<close_tag_pos; e++) {
-                                printf("%c", text[e]);
+                            int end_of_open_tag_pos = find_end_of_opening_tag_pos(k);
+                            int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, false);
+                            if (except) {
+                                matches = realloc(matches, (counter+1) * sizeof(struct match));
+                                matches[counter].start = open_tag_pos;
+                                matches[counter].end = close_tag_pos;
+                                counter++;
+                            } else {
+                                for (int e=open_tag_pos; e<close_tag_pos; e++) {
+                                    printf("%c", text[e]);
+                                }
+                                printf("\n");
                             }
-                            printf("\n");
                         }
                     }
                     failure = 0;
@@ -193,13 +207,26 @@ void find_html_tag_by_class(char *class_name) {
             o = 0;
         }
     }
-    // printf("counter: %d\n", counter);
+    if (except) {
+        int start = 0;
+        for (int i=0; i<counter; i++) {
+            for (int e=start; e<matches[i].start; e++) {
+                printf("%c", text[e]);
+            }
+            start = matches[i].end;
+        }
+        for (int i=start; i<strlen(text); i++) {
+            printf("%c", text[i]);
+        }
+        printf("\n");
+        free(matches);
+    }
 }
 
 void find_html_tag_by_id(char *id_name) {
     int o = 0;
     int failure = 0;
-    // int counter = 0;
+    int counter = 0;
     int is_not_quotation_mark = 1;
 
     for (int k=0; k<strlen(text); k++) {
@@ -243,10 +270,17 @@ void find_html_tag_by_id(char *id_name) {
                             find_tag_name(start_of_open_tag_pos);
                             int end_of_open_tag_pos = find_end_of_opening_tag_pos(k);
                             int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, false);
-                            for (int e=start_of_open_tag_pos; e<close_tag_pos; e++) {
-                                printf("%c", text[e]);
+                            if (except) {
+                                matches = realloc(matches, (counter+1) * sizeof(struct match));
+                                matches[counter].start = start_of_open_tag_pos;
+                                matches[counter].end = close_tag_pos;
+                                counter++;
+                            } else {
+                                for (int e=start_of_open_tag_pos; e<close_tag_pos; e++) {
+                                    printf("%c", text[e]);
+                                }
+                                printf("\n");   
                             }
-                            printf("\n");
                         }
                     }
                     failure = 0;
@@ -257,10 +291,24 @@ void find_html_tag_by_id(char *id_name) {
             o = 0;
         }
     }
-    // printf("counter: %d\n", counter);
+    if (except) {
+        int start = 0;
+        for (int i=0; i<counter; i++) {
+            for (int e=start; e<matches[i].start; e++) {
+                printf("%c", text[e]);
+            }
+            start = matches[i].end;
+        }
+        for (int i=start; i<strlen(text); i++) {
+            printf("%c", text[i]);
+        }
+        printf("\n");
+        free(matches);
+    }
 }
 void find_html_tag_by_tag() {
     int failure = 0;
+    int counter = 0;
     for (int k=0; k<strlen(text); k++) {
         if (text[k] == '<' && text[k+1] != '/') {
             for (int o=0; o<strlen(attribute_name); o++) {
@@ -286,16 +334,37 @@ void find_html_tag_by_tag() {
                         printf("\n");
                     } else {
                         int close_tag_pos = find_closing_tag_pos(after_tag_pos, false);
-                        for (int e=open_tag_pos; e<close_tag_pos; e++) {
-                            printf("%c", text[e]);
+                        if (except) {
+                            matches = realloc(matches, (counter+1) * sizeof(struct match));
+                            matches[counter].start = open_tag_pos;
+                            matches[counter].end = close_tag_pos;
+                            counter++;
+                        } else {
+                            for (int e=open_tag_pos; e<close_tag_pos; e++) {
+                                printf("%c", text[e]);
+                            }
+                            printf("\n");
                         }
-                        printf("\n");
                     }
                 }
             }
             failure = 0;
         }
     }
+    if (except) {
+        int start = 0;
+        for (int i=0; i<counter; i++) {
+            for (int e=start; e<matches[i].start; e++) {
+                printf("%c", text[e]);
+            }
+            start = matches[i].end;
+        }
+        for (int i=start; i<strlen(text); i++) {
+            printf("%c", text[i]);
+        }
+        printf("\n");
+        free(matches);
+    }
 }
 
 void find_html_tag() {
@@ -331,10 +400,11 @@ int main(int argc, char *argv[]) {
     static struct option long_options[] = {
         { "attribute", required_argument, 0, 'a' },
         { "innerhtml", no_argument, 0, 'i' },
+        { "except", no_argument, 0, 'e' },
         { 0, 0, 0, 0 }
     };
     int option_index = 0;
-    while ((o = getopt_long(argc, argv, "ia:", long_options, &option_index)) != -1) {
+    while ((o = getopt_long(argc, argv, "eia:", long_options, &option_index)) != -1) {
         switch(o) {
             case 'a':
                 for (int j=0; j<strlen(optarg); j++) {
@@ -344,8 +414,15 @@ int main(int argc, char *argv[]) {
             case 'i':
                 inner_html = true;
                 break;
+            case 'e':
+                except = true;
+                break;
         }
     }
+    if (inner_html && except) {
+        printf("You can't use the options -i (--innerhtml) and -e (--except) at the same time.\n");
+        return -1;
+    }
     if (argc == (optind + 1)) {
         if (*argv[argc-1] == '-') {
             while (read(0, &buffer, 1) > 0) {
@@ -357,6 +434,7 @@ int main(int argc, char *argv[]) {
                     return -1;
                 }
             }
+            text[i] = '\0';
             find_html_tag();
             free(text);
         } else {
@@ -371,6 +449,7 @@ int main(int argc, char *argv[]) {
                         return -1;
                     }
                 }
+                text[i] = '\0';
                 find_html_tag();
                 free(text);
             } else {
diff --git a/todo b/todo
@@ -1,4 +1,3 @@
+refactor; heavy
 implement find_attribute_value_by_*
 implement filtering not only by class or id, also like this .test[data="asdf"]
-improve structure of code
-make it better man 
-\ No newline at end of file

	htex simple incorrect html parser
	git clone git://git.relim.de/htex.git
	Log | Files | Refs | README

M htex.1  | 14 ++++++++++----
M htex.c  | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
M todo  | 4 +---