commit 6d4549a5f6312fc8770145fdf9e99ecf3ff1cef1
parent 610d5dd2650669c481d7be1ea630844d26cdf79b
Author: Robin <kroekerrobin@gmail.com>
Date: Mon, 1 Apr 2024 17:02:39 +0200
Improve ampersand handling
Not only parse and encode named character
references but also try to be merciful of
mistakes related to the ampersand character.
Diffstat:
| M | htex.c | | | 2 | +- |
| M | html.c | | | 29 | ++++++++++++++++++++++++----- |
| M | html.h | | | 1 | + |
| D | lib.c | | | 108 | ------------------------------------------------------------------------------- |
| A | misc.c | | | 123 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
5 files changed, 149 insertions(+), 114 deletions(-)
diff --git a/htex.c b/htex.c
@@ -5,7 +5,7 @@
#include <getopt.h>
#include <inttypes.h>
#include <grapheme.h>
-#include "lib.c"
+#include "misc.c"
#include "html.c"
struct find_opts *parseFilterOpts(const char *pattern)
diff --git a/html.c b/html.c
@@ -369,12 +369,18 @@ char *parseNamedCharRef(char *text, size_t off, size_t len)
namedCharRef[0] = 0;
size_t ret;
uint_least32_t cp;
- do {
+ int i = 0;
+ for (;;)
+ {
ret = grapheme_decode_utf8(text+off, strlen(text+off), &cp);
+ if (cp == AMPERSAND || isASCIIWhitespace(cp))
+ break;
namedCharRef = stringCat(namedCharRef, cpToChars(cp, ret));
+ if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF)
+ break;
off += ret;
+ i++;
}
- while (cp != SEMICOLON);
return namedCharRef;
}
@@ -386,7 +392,7 @@ char *encodeNamedCharRef(const char *name)
size_t len;
for (int i=0; i<NAMED_CHAR_REF_COUNT; i++)
{
- if (strcmp(entities[i].name, name) == 0)
+ if (startsWith(name, entities[i].name))
{
len = grapheme_encode_utf8(entities[i].cp[0], cp, MAX_CODEPOINT_SIZE);
strcpy(buf, cp);
@@ -396,10 +402,24 @@ char *encodeNamedCharRef(const char *name)
strcat(buf, cp);
}
buf[len] = 0;
+ const char *part = &name[strlen(entities[i].name)];
+ size_t partLen = strlen(part);
+ if (partLen > 0)
+ {
+ if (partLen == 1 && part[0] == ';')
+ return buf;
+ buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1+partLen);
+ strcat(buf, &name[strlen(entities[i].name)]);
+ buf[len+partLen] = 0;
+ }
return buf;
}
}
- return NULL;
+ buf = realloc(buf, (strlen(name)+2) * sizeof(char));
+ buf[0] = '&';
+ buf[1] = 0;
+ strcat(buf, name);
+ return buf;
}
struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_list *tagList)
@@ -680,7 +700,6 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
{
state = STATE_CHAR_REF_NUMERIC;
break;
- // handle decimal and hexadecimal numeric character reference
}
char *namedCharRef = parseNamedCharRef(text, off, len);
stillOpenTag = getLastOpenTag(tagList);
diff --git a/html.h b/html.h
@@ -21,6 +21,7 @@
#define CAPITAL_LETTER_X 0x58
#define NAMED_CHAR_REF_COUNT 2231
+#define LONGEST_NAMED_CHAR_REF 32
#define MAX_CODEPOINT_SIZE 4
static const char *voidElements[] = {
diff --git a/lib.c b/lib.c
@@ -1,108 +0,0 @@
-char *stringCat(char *str1, char *str2)
-{
- int str1Len = 0;
- int str2Len = 0;
- if (str1)
- str1Len = strlen(str1);
- if (str2)
- str2Len = strlen(str2);
- char *string = malloc((str1Len+str2Len+1) * sizeof(char));
- int i = 0;
- int k = 0;
- for (; i<str1Len; i++)
- string[i] = str1[i];
- for (; k<str2Len; k++)
- string[i+k] = str2[k];
- string[i+k] = '\0';
- free(str1);
- free(str2);
- return string;
-}
-
-char *cpToChars(uint_least32_t cp, size_t len)
-{
- char *str = malloc((len+1) * sizeof(char));
- grapheme_encode_utf8(cp, str, len);
- str[len] = 0;
- return str;
-}
-
-char *trim(char *text)
-{
- char *trimmedText = NULL;
- int begin = 0;
- int end = 0;
- for (int i=0; i<strlen(text); i++)
- {
- if
- (
- text[i] == ' ' ||
- text[i] == '\n' ||
- text[i] == '\t' ||
- text[i] == '\r'
- )
- begin++;
- else
- break;
- }
- for (int i=strlen(text)-1; i>=0; i--)
- {
- if
- (
- text[i] == ' '||
- text[i] == '\n' ||
- text[i] == '\t' ||
- text[i] == '\r'
- )
- end++;
- else
- break;
- }
- int k = 0;
- for (int i=0; i<strlen(text); i++)
- {
- if (i >= begin && i < strlen(text) - end)
- {
- trimmedText = realloc(trimmedText, (k+1) * sizeof(char));
- trimmedText[k] = text[i];
- k++;
- }
- }
- trimmedText = realloc(trimmedText, (k+1) * sizeof(char));
- trimmedText[k] = 0;
- return trimmedText;
-}
-
-// Do not use for reading from a socket fd
-bool tryRead(char *buf, FILE *fp)
-{
- size_t bytesRead = fread(buf, 1, 1, fp);
- if (feof(fp) != 0)
- return false;
- if (ferror(fp) != 0)
- tryRead(buf, fp);
- if (bytesRead != 1)
- tryRead(buf, fp);
- return true;
-}
-
-char *readFile(FILE *fp)
-{
- char *text = NULL;
- int i = 0;
- char buf;
- while (1)
- {
- if (tryRead(&buf, fp))
- {
- text = realloc(text, (i+1) * sizeof(char));
- text[i] = buf;
- i++;
- }
- else
- break;
- }
- text = realloc(text, (i+1) * sizeof(char));
- text[i] = 0;
- return text;
-}
diff --git a/misc.c b/misc.c
@@ -0,0 +1,123 @@
+char *stringCat(char *str1, char *str2) // Returns a new heap-allocated string holding str1 followed by str2; both arguments are freed.
+{
+ int str1Len = 0;
+ int str2Len = 0;
+ if (str1) // a NULL argument keeps length 0, i.e. it is treated as an empty string
+ str1Len = strlen(str1);
+ if (str2)
+ str2Len = strlen(str2);
+ char *string = malloc((str1Len+str2Len+1) * sizeof(char)); // +1 for the terminator; NOTE(review): the malloc result is not checked for NULL
+ int i = 0;
+ int k = 0;
+ for (; i<str1Len; i++) // copy str1 first
+ string[i] = str1[i];
+ for (; k<str2Len; k++) // then place str2 directly behind it (i now equals str1Len)
+ string[i+k] = str2[k];
+ string[i+k] = '\0';
+ free(str1); // both inputs are released here, so they must be free()-able pointers (or NULL) and must not be used by the caller afterwards
+ free(str2);
+ return string;
+}
+
+char *cpToChars(uint_least32_t cp, size_t len) // Returns a new heap buffer of len+1 bytes filled by grapheme_encode_utf8(cp, ...) and terminated at index len.
+{
+ char *str = malloc((len+1) * sizeof(char)); // NOTE(review): the malloc result is not checked for NULL
+ grapheme_encode_utf8(cp, str, len); // assumes len is the UTF-8 byte length of cp (the visible caller passes the decoder return value) - TODO confirm
+ str[len] = 0;
+ return str; // the caller owns the buffer
+}
+
+char *trim(char *text) // Returns a new heap-allocated copy of text without leading and trailing space, newline, tab and carriage return; text is only read.
+{
+ char *trimmedText = NULL;
+ int begin = 0; // number of leading whitespace bytes
+ int end = 0; // number of trailing whitespace bytes
+ for (int i=0; i<strlen(text); i++) // count leading whitespace, stopping at the first other byte
+ {
+ if
+ (
+ text[i] == ' ' ||
+ text[i] == '\n' ||
+ text[i] == '\t' ||
+ text[i] == '\r'
+ )
+ begin++;
+ else
+ break;
+ }
+ for (int i=strlen(text)-1; i>=0; i--) // count trailing whitespace, scanning backwards from the last byte
+ {
+ if
+ (
+ text[i] == ' '||
+ text[i] == '\n' ||
+ text[i] == '\t' ||
+ text[i] == '\r'
+ )
+ end++;
+ else
+ break;
+ }
+ int k = 0; // number of bytes copied so far
+ for (int i=0; i<strlen(text); i++)
+ {
+ if (i >= begin && i < strlen(text) - end) // keep only the bytes between the two whitespace runs
+ {
+ trimmedText = realloc(trimmedText, (k+1) * sizeof(char)); // grows by one byte per kept character; NOTE(review): realloc result is not checked
+ trimmedText[k] = text[i];
+ k++;
+ }
+ }
+ trimmedText = realloc(trimmedText, (k+1) * sizeof(char)); // make room for the terminator; with k == 0 this is the first allocation and the result is an empty string
+ trimmedText[k] = 0;
+ return trimmedText; // the caller owns the returned buffer
+}
+
+bool startsWith(const char *string, const char *part) // Returns true when string begins with part, byte for byte; an empty part always matches.
+{
+ size_t partLen = strlen(part);
+ if (partLen > strlen(string)) // a prefix longer than the string can never match
+ return false;
+ for (int i=0; i<partLen; i++) // NOTE(review): int index is compared against the size_t partLen
+ {
+ if (string[i] != part[i])
+ {
+ return false; // first differing byte decides
+ }
+ }
+ return true;
+}
+
+// Do not use for reading from a socket fd
+bool tryRead(char *buf, FILE *fp) // Reads one byte from fp into *buf; returns false when the end-of-file indicator is set after the read, true otherwise.
+{
+ size_t bytesRead = fread(buf, 1, 1, fp);
+ if (feof(fp) != 0) // end of file reached: report that no byte is available
+ return false;
+ if (ferror(fp) != 0) // stream error: retry recursively; NOTE(review): the retry result is discarded and the error indicator is not cleared in this function
+ tryRead(buf, fp);
+ if (bytesRead != 1) // nothing was read by the first fread: retry; NOTE(review): this may run in addition to the retry above
+ tryRead(buf, fp);
+ return true; // reached whenever end-of-file was not flagged by the first read, regardless of what the retries returned
+}
+
+char *readFile(FILE *fp) // Reads fp byte by byte until tryRead reports false and returns the bytes as a new NUL-terminated heap buffer.
+{
+ char *text = NULL;
+ int i = 0; // number of bytes stored so far
+ char buf; // single-byte read target handed to tryRead
+ while (1)
+ {
+ if (tryRead(&buf, fp))
+ {
+ text = realloc(text, (i+1) * sizeof(char)); // buffer grows by one byte per byte read; NOTE(review): realloc result is not checked
+ text[i] = buf;
+ i++;
+ }
+ else
+ break; // tryRead returned false (end of file)
+ }
+ text = realloc(text, (i+1) * sizeof(char)); // room for the terminator; for an empty stream this is the first allocation
+ text[i] = 0;
+ return text; // the caller owns the returned buffer
+}