htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 6d4549a5f6312fc8770145fdf9e99ecf3ff1cef1
parent 610d5dd2650669c481d7be1ea630844d26cdf79b
Author: Robin <kroekerrobin@gmail.com>
Date:   Mon,  1 Apr 2024 17:02:39 +0200

Improve ampersand handling

Not only parse and encode named character
references but also try to be forgiving of
mistakes related to the ampersand character.

Diffstat:
M htex.c | 2 +-
M html.c | 29 ++++++++++++++++++++++++-----
M html.h | 1 +
D lib.c | 108 -------------------------------------------------------------------------------
A misc.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 149 insertions(+), 114 deletions(-)

diff --git a/htex.c b/htex.c @@ -5,7 +5,7 @@ #include <getopt.h> #include <inttypes.h> #include <grapheme.h> -#include "lib.c" +#include "misc.c" #include "html.c" struct find_opts *parseFilterOpts(const char *pattern) diff --git a/html.c b/html.c @@ -369,12 +369,18 @@ char *parseNamedCharRef(char *text, size_t off, size_t len) namedCharRef[0] = 0; size_t ret; uint_least32_t cp; - do { + int i = 0; + for (;;) + { ret = grapheme_decode_utf8(text+off, strlen(text+off), &cp); + if (cp == AMPERSAND || isASCIIWhitespace(cp)) + break; namedCharRef = stringCat(namedCharRef, cpToChars(cp, ret)); + if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF) + break; off += ret; + i++; } - while (cp != SEMICOLON); return namedCharRef; } @@ -386,7 +392,7 @@ char *encodeNamedCharRef(const char *name) size_t len; for (int i=0; i<NAMED_CHAR_REF_COUNT; i++) { - if (strcmp(entities[i].name, name) == 0) + if (startsWith(name, entities[i].name)) { len = grapheme_encode_utf8(entities[i].cp[0], cp, MAX_CODEPOINT_SIZE); strcpy(buf, cp); @@ -396,10 +402,24 @@ char *encodeNamedCharRef(const char *name) strcat(buf, cp); } buf[len] = 0; + const char *part = &name[strlen(entities[i].name)]; + size_t partLen = strlen(part); + if (partLen > 0) + { + if (partLen == 1 && part[0] == ';') + return buf; + buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1+partLen); + strcat(buf, &name[strlen(entities[i].name)]); + buf[len+partLen] = 0; + } return buf; } } - return NULL; + buf = realloc(buf, (strlen(name)+2) * sizeof(char)); + buf[0] = '&'; + buf[1] = 0; + strcat(buf, name); + return buf; } struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_list *tagList) @@ -680,7 +700,6 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis { state = STATE_CHAR_REF_NUMERIC; break; - // handle decimal and hexadecimal numeric character reference } char *namedCharRef = parseNamedCharRef(text, off, len); stillOpenTag = getLastOpenTag(tagList); diff --git a/html.h b/html.h @@ -21,6 
+21,7 @@ #define CAPITAL_LETTER_X 0x58 #define NAMED_CHAR_REF_COUNT 2231 +#define LONGEST_NAMED_CHAR_REF 32 #define MAX_CODEPOINT_SIZE 4 static const char *voidElements[] = { diff --git a/lib.c b/lib.c @@ -1,108 +0,0 @@ -char *stringCat(char *str1, char *str2) -{ - int str1Len = 0; - int str2Len = 0; - if (str1) - str1Len = strlen(str1); - if (str2) - str2Len = strlen(str2); - char *string = malloc((str1Len+str2Len+1) * sizeof(char)); - int i = 0; - int k = 0; - for (; i<str1Len; i++) - string[i] = str1[i]; - for (; k<str2Len; k++) - string[i+k] = str2[k]; - string[i+k] = '\0'; - free(str1); - free(str2); - return string; -} - -char *cpToChars(uint_least32_t cp, size_t len) -{ - char *str = malloc((len+1) * sizeof(char)); - grapheme_encode_utf8(cp, str, len); - str[len] = 0; - return str; -} - -char *trim(char *text) -{ - char *trimmedText = NULL; - int begin = 0; - int end = 0; - for (int i=0; i<strlen(text); i++) - { - if - ( - text[i] == ' ' || - text[i] == '\n' || - text[i] == '\t' || - text[i] == '\r' - ) - begin++; - else - break; - } - for (int i=strlen(text)-1; i>=0; i--) - { - if - ( - text[i] == ' '|| - text[i] == '\n' || - text[i] == '\t' || - text[i] == '\r' - ) - end++; - else - break; - } - int k = 0; - for (int i=0; i<strlen(text); i++) - { - if (i >= begin && i < strlen(text) - end) - { - trimmedText = realloc(trimmedText, (k+1) * sizeof(char)); - trimmedText[k] = text[i]; - k++; - } - } - trimmedText = realloc(trimmedText, (k+1) * sizeof(char)); - trimmedText[k] = 0; - return trimmedText; -} - -// Do not use for reading from a socket fd -bool tryRead(char *buf, FILE *fp) -{ - size_t bytesRead = fread(buf, 1, 1, fp); - if (feof(fp) != 0) - return false; - if (ferror(fp) != 0) - tryRead(buf, fp); - if (bytesRead != 1) - tryRead(buf, fp); - return true; -} - -char *readFile(FILE *fp) -{ - char *text = NULL; - int i = 0; - char buf; - while (1) - { - if (tryRead(&buf, fp)) - { - text = realloc(text, (i+1) * sizeof(char)); - text[i] = buf; - i++; - } - 
else - break; - } - text = realloc(text, (i+1) * sizeof(char)); - text[i] = 0; - return text; -} diff --git a/misc.c b/misc.c @@ -0,0 +1,123 @@ +char *stringCat(char *str1, char *str2) +{ + int str1Len = 0; + int str2Len = 0; + if (str1) + str1Len = strlen(str1); + if (str2) + str2Len = strlen(str2); + char *string = malloc((str1Len+str2Len+1) * sizeof(char)); + int i = 0; + int k = 0; + for (; i<str1Len; i++) + string[i] = str1[i]; + for (; k<str2Len; k++) + string[i+k] = str2[k]; + string[i+k] = '\0'; + free(str1); + free(str2); + return string; +} + +char *cpToChars(uint_least32_t cp, size_t len) +{ + char *str = malloc((len+1) * sizeof(char)); + grapheme_encode_utf8(cp, str, len); + str[len] = 0; + return str; +} + +char *trim(char *text) +{ + char *trimmedText = NULL; + int begin = 0; + int end = 0; + for (int i=0; i<strlen(text); i++) + { + if + ( + text[i] == ' ' || + text[i] == '\n' || + text[i] == '\t' || + text[i] == '\r' + ) + begin++; + else + break; + } + for (int i=strlen(text)-1; i>=0; i--) + { + if + ( + text[i] == ' '|| + text[i] == '\n' || + text[i] == '\t' || + text[i] == '\r' + ) + end++; + else + break; + } + int k = 0; + for (int i=0; i<strlen(text); i++) + { + if (i >= begin && i < strlen(text) - end) + { + trimmedText = realloc(trimmedText, (k+1) * sizeof(char)); + trimmedText[k] = text[i]; + k++; + } + } + trimmedText = realloc(trimmedText, (k+1) * sizeof(char)); + trimmedText[k] = 0; + return trimmedText; +} + +bool startsWith(const char *string, const char *part) +{ + size_t partLen = strlen(part); + if (partLen > strlen(string)) + return false; + for (int i=0; i<partLen; i++) + { + if (string[i] != part[i]) + { + return false; + } + } + return true; +} + +// Do not use for reading from a socket fd +bool tryRead(char *buf, FILE *fp) +{ + size_t bytesRead = fread(buf, 1, 1, fp); + if (feof(fp) != 0) + return false; + if (ferror(fp) != 0) + tryRead(buf, fp); + if (bytesRead != 1) + tryRead(buf, fp); + return true; +} + +char *readFile(FILE 
*fp) +{ + char *text = NULL; + int i = 0; + char buf; + while (1) + { + if (tryRead(&buf, fp)) + { + text = realloc(text, (i+1) * sizeof(char)); + text[i] = buf; + i++; + } + else + break; + } + text = realloc(text, (i+1) * sizeof(char)); + text[i] = 0; + return text; +}