htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 4873756e4ae3c76f0f90da7f3feb61442dd3f4ef
parent 4007840a37366244a2a5b5041d0ebb2853863eaf
Author: Robin <kroekerrobin@gmail.com>
Date:   Sat, 27 Aug 2022 22:43:53 +0200

Start implementing finding by attribute name

Instead of only being able to find a tag by its class
name, allow finding it by either its id or its class name.

Diffstat:
A .gitignore | 3 +++
M htex.1 | 18 ++++++++++++------
M htex.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
A todo | 2 ++
4 files changed, 112 insertions(+), 18 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -0,0 +1,2 @@ +test.html +htex +\ No newline at end of file diff --git a/htex.1 b/htex.1 @@ -3,22 +3,28 @@ htex \- \fI\,ex\/\fRtract \fI\,ht\/\fRml .SH SYNOPSIS .B htex --c \fI\,class_name\/\fR [-i] -|\fI\,filename\/\fR +-a \fI\,attribute_name\/\fR [-i] -|\fI\,filename\/\fR .SH DESCRIPTION .PP Receives text from stdin or a file and interprets it as html. You provide -a \fI\,class_name\/\fR via the -.B -c +an \fI\,attribute_name\/\fR via the +.B -a option and htex will write the html tag -found by \fI\,class_name\/\fR to stdout. +found by \fI\,attribute_name\/\fR to stdout. Pass the .B -i option to only output the content (innerHTML) of the html tag. .TP -\fB\,-c\/\fR, \fB\,--class\/\fR \fI\,class_name\/\fR -Filter html by the class name of a html tag +\fB\,-a\/\fR, \fB\,--attribute\/\fR \fI\,attribute_name\/\fR +Filter html by the attribute name of a html tag. +If \fI\,attribute_name\/\fR starts with a dot (.) then +the following characters will be taken as the class name +of a tag. If \fI\,attribute_name\/\fR starts with a hashtag (#) +the following characters will be taken as the id name of a tag. +If \fI\,attribute_name\/\fR is neither a dot nor a hashtag it +will be taken as a tag name. 
.TP \fB\,-i\/\fR, \fB\,--innerhtml\/\fR Instead of returning the html tag only return diff --git a/htex.c b/htex.c @@ -7,8 +7,9 @@ #define ONE_MILLION 1000000 char text[ONE_MILLION]; -char class_name[200]; +char attribute_name[200]; char tag_name[50]; +bool inner_html = false; int find_start_of_opening_tag_pos(int class_position) { int i = 1; @@ -114,7 +115,7 @@ int find_closing_tag_pos(int open_tag_pos, bool inner_html) { return -1; } -bool correct_class_name_begin_or_end(char prev_char) { +bool correct_name_begin_or_end(char prev_char) { switch(prev_char) { case '"': return true; @@ -127,7 +128,7 @@ bool correct_class_name_begin_or_end(char prev_char) { } } -void find_html_tag_by_class_name(bool inner_html) { +void find_html_tag_by_class(char *class_name) { int o = 0; int failure = 0; // int counter = 0; @@ -155,8 +156,8 @@ void find_html_tag_by_class_name(bool inner_html) { } if (failure == 0) { if ( - !correct_class_name_begin_or_end(text[k+6+o]) || - !correct_class_name_begin_or_end(text[k+7+o+strlen(class_name)]) + !correct_name_begin_or_end(text[k+6+o]) || + !correct_name_begin_or_end(text[k+7+o+strlen(class_name)]) ) { failure = 1; } @@ -193,23 +194,105 @@ void find_html_tag_by_class_name(bool inner_html) { // printf("counter: %d\n", counter); } +void find_html_tag_by_id(char *id_name) { + int o = 0; + int failure = 0; + // int counter = 0; + int is_not_quotation_mark = 1; + + for (int k=0; k<strlen(text); k++) { + if ( + text[k] == 'i' && + text[k+1] == 'd' + ) { + while (is_not_quotation_mark == 1) { + if (text[k+4+o] == '"' || text[k+4+o] == '\'') { + is_not_quotation_mark = 0; + break; + } + if (id_name[0] == text[k+4+o]) { + for (int l=1; l<strlen(id_name); l++) { + if (id_name[l] != text[k+4+o+l]) { + failure = 1; + break; + } + } + if (failure == 0) { + if ( + !correct_name_begin_or_end(text[k+3+o]) || + !correct_name_begin_or_end(text[k+4+o+strlen(id_name)]) + ) { + failure = 1; + } + } + if (failure == 0) { + // counter++; + if (inner_html) { + int 
start_of_open_tag_pos = find_start_of_opening_tag_pos(k); + find_tag_name(start_of_open_tag_pos); + int end_of_open_tag_pos = find_end_of_opening_tag_pos(k); + int close_tag_pos = find_closing_tag_pos(end_of_open_tag_pos, true); + for (int e=end_of_open_tag_pos; e<close_tag_pos; e++) { + printf("%c", text[e]); + } + printf("\n"); + } else { + int open_tag_pos = find_start_of_opening_tag_pos(k); + find_tag_name(open_tag_pos); + int close_tag_pos = find_closing_tag_pos(open_tag_pos, false); + for (int e=open_tag_pos; e<close_tag_pos; e++) { + printf("%c", text[e]); + } + printf("\n"); + } + } + failure = 0; + } + o++; + } + is_not_quotation_mark = 1; + o = 0; + } + } + // printf("counter: %d\n", counter); +} +void find_html_tag_by_tag() { + printf("Not yet implemented.\n"); +} + +void find_html_tag() { + char identifier[200]; + for (int i=0; i<strlen(attribute_name); i++) { + identifier[i] = attribute_name[i+1]; + } + switch(attribute_name[0]) { + case '.': + find_html_tag_by_class(identifier); + break; + case '#': + find_html_tag_by_id(identifier); + break; + default: + find_html_tag_by_tag(); // it uses attribute_name directly + } +} + int main(int argc, char *argv[]) { int i = 0; - bool inner_html = false; char buffer; int o; static struct option long_options[] = { - { "class", required_argument, 0, 'c' }, + { "attribute", required_argument, 0, 'a' }, { "innerhtml", no_argument, 0, 'i' }, { 0, 0, 0, 0 } }; int option_index = 0; - while ((o = getopt_long(argc, argv, "ic:", long_options, &option_index)) != -1) { + while ((o = getopt_long(argc, argv, "ia:", long_options, &option_index)) != -1) { switch(o) { - case 'c': + case 'a': for (int j=0; j<strlen(optarg); j++) { - class_name[j] = optarg[j]; + attribute_name[j] = optarg[j]; } break; case 'i': @@ -223,7 +306,7 @@ int main(int argc, char *argv[]) { text[i] = buffer; i++; } - find_html_tag_by_class_name(inner_html); + find_html_tag(); } else { int fd = open(argv[argc-1], O_RDONLY); if (fd != -1) { @@ -231,7 +314,7 
@@ int main(int argc, char *argv[]) { text[i] = buffer; i++; } - find_html_tag_by_class_name(inner_html); + find_html_tag(); } else { printf("Couldn't read file \"%s\"\n", argv[argc-1]); } diff --git a/todo b/todo @@ -0,0 +1 @@ +doesn't find tag if class="" is in a new line different from the tag. +\ No newline at end of file