htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 05d47383d977a9cb3bd69ac2c9636cc0ca9e06f6
parent 52dd773f9344d02753b2b9b4dce1fa8d44f415f9
Author: Robin <kroekerrobin@gmail.com>
Date:   Thu, 15 Sep 2022 22:49:26 +0200

Add -e/--except option

Diffstat:
Mhtex.1 | 14++++++++++----
Mhtex.c | 109++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
Mtodo | 4+---
3 files changed, 105 insertions(+), 22 deletions(-)

diff --git a/htex.1 b/htex.1 @@ -3,7 +3,7 @@ htex \- \fI\,ex\/\fRtract \fI\,ht\/\fRml .SH SYNOPSIS .B htex --a \fI\,attribute_name\/\fR [-i] -|\fI\,filename\/\fR +-a \fI\,attribute_name\/\fR [-e] [-i] -|\fI\,filename\/\fR .SH DESCRIPTION .PP Receives text from stdin or a file @@ -28,7 +28,11 @@ will be taken as a tag name. .TP \fB\,-i\/\fR, \fB\,--innerhtml\/\fR Instead of returning the html tag only return -the content (innerHTML) of the tag +the content (innerHTML) of the tag. Cannot be used together with the -e option. +.TP +\fB\,-e\/\fR, \fB\,--except\/\fR +Output everything except the html tag specified in -a. +Cannot be used together with the -i option. .SH EXAMPLES .sp .RS 4 @@ -36,4 +40,6 @@ cat test.html | htex -i -a ".o-headline" - htex -a span test.html -htex --innerhtml --attribute "#container" test.html -\ No newline at end of file +htex --innerhtml --attribute "#container" test.html + +htex -e -a ".unnecessary-class" test.html +\ No newline at end of file diff --git a/htex.c b/htex.c @@ -10,6 +10,12 @@ char *text; char attribute_name[200]; char tag_name[50]; bool inner_html = false; +bool except = false; +struct match { + int start; + int end; +}; +struct match *matches; int find_start_of_opening_tag_pos(int class_position) { int i = 1; @@ -133,7 +139,7 @@ bool correct_name_begin_or_end(char prev_char) { void find_html_tag_by_class(char *class_name) { int o = 0; int failure = 0; - // int counter = 0; + int counter = 0; int is_not_quotation_mark = 1; for (int k=0; k<strlen(text); k++) { @@ -178,11 +184,19 @@ void find_html_tag_by_class(char *class_name) { } else { int open_tag_pos = find_start_of_opening_tag_pos(k); find_tag_name(open_tag_pos); - int close_tag_pos = find_closing_tag_pos(open_tag_pos, false); - for (int e=open_tag_pos; e<close_tag_pos; e++) { - printf("%c", text[e]); + int end_of_open_tag_pos = find_end_of_opening_tag_pos(k); + int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, false); + if (except) { + matches = realloc(matches, (counter+1) * sizeof(struct match)); + matches[counter].start = open_tag_pos; + matches[counter].end = close_tag_pos; + counter++; + } else { + for (int e=open_tag_pos; e<close_tag_pos; e++) { + printf("%c", text[e]); + } + printf("\n"); } - printf("\n"); } } failure = 0; @@ -193,13 +207,26 @@ void find_html_tag_by_class(char *class_name) { o = 0; } } - // printf("counter: %d\n", counter); + if (except) { + int start = 0; + for (int i=0; i<counter; i++) { + for (int e=start; e<matches[i].start; e++) { + printf("%c", text[e]); + } + start = matches[i].end; + } + for (int i=start; i<strlen(text); i++) { + printf("%c", text[i]); + } + printf("\n"); + free(matches); + } } void find_html_tag_by_id(char *id_name) { int o = 0; int failure = 0; - // int counter = 0; + int counter = 0; int is_not_quotation_mark = 1; for (int k=0; k<strlen(text); k++) { @@ -243,10 +270,17 @@ void find_html_tag_by_id(char *id_name) { find_tag_name(start_of_open_tag_pos); int end_of_open_tag_pos = find_end_of_opening_tag_pos(k); int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, false); - for (int e=start_of_open_tag_pos; e<close_tag_pos; e++) { - printf("%c", text[e]); + if (except) { + matches = realloc(matches, (counter+1) * sizeof(struct match)); + matches[counter].start = start_of_open_tag_pos; + matches[counter].end = close_tag_pos; + counter++; + } else { + for (int e=start_of_open_tag_pos; e<close_tag_pos; e++) { + printf("%c", text[e]); + } + printf("\n"); } - printf("\n"); } } failure = 0; @@ -257,10 +291,24 @@ void find_html_tag_by_id(char *id_name) { o = 0; } } - // printf("counter: %d\n", counter); + if (except) { + int start = 0; + for (int i=0; i<counter; i++) { + for (int e=start; e<matches[i].start; e++) { + printf("%c", text[e]); + } + start = matches[i].end; + } + for (int i=start; i<strlen(text); i++) { + printf("%c", text[i]); + } + printf("\n"); + free(matches); + } } void find_html_tag_by_tag() { int failure = 0; + int counter = 0; for (int k=0; k<strlen(text); k++) { if (text[k] == '<' && text[k+1] != '/') { for (int o=0; o<strlen(attribute_name); o++) { @@ -286,16 +334,37 @@ void find_html_tag_by_tag() { printf("\n"); } else { int close_tag_pos = find_closing_tag_pos(after_tag_pos, false); - for (int e=open_tag_pos; e<close_tag_pos; e++) { - printf("%c", text[e]); + if (except) { + matches = realloc(matches, (counter+1) * sizeof(struct match)); + matches[counter].start = open_tag_pos; + matches[counter].end = close_tag_pos; + counter++; + } else { + for (int e=open_tag_pos; e<close_tag_pos; e++) { + printf("%c", text[e]); + } + printf("\n"); } - printf("\n"); } } } failure = 0; } } + if (except) { + int start = 0; + for (int i=0; i<counter; i++) { + for (int e=start; e<matches[i].start; e++) { + printf("%c", text[e]); + } + start = matches[i].end; + } + for (int i=start; i<strlen(text); i++) { + printf("%c", text[i]); + } + printf("\n"); + free(matches); + } } void find_html_tag() { @@ -331,10 +400,11 @@ int main(int argc, char *argv[]) { static struct option long_options[] = { { "attribute", required_argument, 0, 'a' }, { "innerhtml", no_argument, 0, 'i' }, + { "except", no_argument, 0, 'e' }, { 0, 0, 0, 0 } }; int option_index = 0; - while ((o = getopt_long(argc, argv, "ia:", long_options, &option_index)) != -1) { + while ((o = getopt_long(argc, argv, "eia:", long_options, &option_index)) != -1) { switch(o) { case 'a': for (int j=0; j<strlen(optarg); j++) { @@ -344,8 +414,15 @@ int main(int argc, char *argv[]) { case 'i': inner_html = true; break; + case 'e': + except = true; + break; } } + if (inner_html && except) { + printf("You can't use the options -i (--innerhtml) and -e (--except) at the same time.\n"); + return -1; + } if (argc == (optind + 1)) { if (*argv[argc-1] == '-') { while (read(0, &buffer, 1) > 0) { @@ -357,6 +434,7 @@ int main(int argc, char *argv[]) { return -1; } } + text[i] = '\0'; find_html_tag(); free(text); } else { @@ -371,6 +449,7 @@ int main(int argc, char *argv[]) { return -1; } } + text[i] = '\0'; find_html_tag(); free(text); } else { diff --git a/todo b/todo @@ -1,4 +1,3 @@ +refactor; heavy implement find_attribute_value_by_* implement filtering not only by class or id, also like this .test[data="asdf"] -improve structure of code -make it better man -\ No newline at end of file