Improve ampersand handling - htex - simple incorrect html parser

commit 6d4549a5f6312fc8770145fdf9e99ecf3ff1cef1
parent 610d5dd2650669c481d7be1ea630844d26cdf79b
Author: Robin <kroekerrobin@gmail.com>
Date:   Mon,  1 Apr 2024 17:02:39 +0200

Improve ampersand handling

Not only parse and encode named character
references but also try to be merciful of
mistakes related to the ampersand character.

Diffstat:
M htex.c  | 2 +-
M html.c  | 29 ++++++++++++++++++++++++-----
M html.h  | 1 +
D lib.c  | 108 -------------------------------------------------------------------------------
A misc.c  | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

5 files changed, 149 insertions(+), 114 deletions(-)
diff --git a/htex.c b/htex.c
@@ -5,7 +5,7 @@
 #include <getopt.h>
 #include <inttypes.h>
 #include <grapheme.h>
-#include "lib.c"
+#include "misc.c"
 #include "html.c"
 
 struct find_opts *parseFilterOpts(const char *pattern)
diff --git a/html.c b/html.c
@@ -369,12 +369,18 @@ char *parseNamedCharRef(char *text, size_t off, size_t len)
   namedCharRef[0] = 0;
   size_t ret;
   uint_least32_t cp;
-  do {
+  int i = 0;
+  for (;;)
+  {
     ret = grapheme_decode_utf8(text+off, strlen(text+off), &cp);
+    if (cp == AMPERSAND || isASCIIWhitespace(cp))
+      break;
     namedCharRef = stringCat(namedCharRef, cpToChars(cp, ret));
+    if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF)
+      break;
     off += ret;
+    i++;
   }
-  while (cp != SEMICOLON);
   return namedCharRef;
 }
 
@@ -386,7 +392,7 @@ char *encodeNamedCharRef(const char *name)
   size_t len;
   for (int i=0; i<NAMED_CHAR_REF_COUNT; i++)
   {
-    if (strcmp(entities[i].name, name) == 0)
+    if (startsWith(name, entities[i].name))
     {
       len = grapheme_encode_utf8(entities[i].cp[0], cp, MAX_CODEPOINT_SIZE);
       strcpy(buf, cp);
@@ -396,10 +402,24 @@ char *encodeNamedCharRef(const char *name)
         strcat(buf, cp);
       }
       buf[len] = 0;
+      const char *part = &name[strlen(entities[i].name)];
+      size_t partLen = strlen(part);
+      if (partLen > 0)
+      {
+        if (partLen == 1 && part[0] == ';')
+          return buf;
+        buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1+partLen);
+        strcat(buf, &name[strlen(entities[i].name)]);
+        buf[len+partLen] = 0;
+      }
       return buf;
     }
   }
-  return NULL;
+  buf = realloc(buf, (strlen(name)+2) * sizeof(char));
+  buf[0] = '&';
+  buf[1] = 0;
+  strcat(buf, name);
+  return buf;
 }
 
 struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_list *tagList)
@@ -680,7 +700,6 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
           {
             state = STATE_CHAR_REF_NUMERIC;
             break;
-            // handle decimal and hexadecimal numeric character reference
           }
           char *namedCharRef = parseNamedCharRef(text, off, len);
 					stillOpenTag = getLastOpenTag(tagList);
diff --git a/html.h b/html.h
@@ -21,6 +21,7 @@
 #define CAPITAL_LETTER_X    0x58
 
 #define NAMED_CHAR_REF_COUNT 2231
+#define LONGEST_NAMED_CHAR_REF 32
 #define MAX_CODEPOINT_SIZE 4
 
 static const char *voidElements[] = {
diff --git a/lib.c b/lib.c
@@ -1,108 +0,0 @@
-char *stringCat(char *str1, char *str2)
-{
-	int str1Len = 0;
-	int str2Len = 0;
-	if (str1)
-		str1Len = strlen(str1);
-	if (str2)
-  	str2Len = strlen(str2);
-  char *string = malloc((str1Len+str2Len+1) * sizeof(char));
-  int i = 0;
-  int k = 0;
-  for (; i<str1Len; i++)
-    string[i] = str1[i];
-  for (; k<str2Len; k++)
-    string[i+k] = str2[k];
-  string[i+k] = '\0';
-	free(str1);
-	free(str2);
-  return string;
-}
-
-char *cpToChars(uint_least32_t cp, size_t len)
-{
-	char *str = malloc((len+1) * sizeof(char));
-	grapheme_encode_utf8(cp, str, len);
-	str[len] = 0;
-	return str;
-}
-
-char *trim(char *text)
-{
-	char *trimmedText = NULL;
-	int begin = 0;
-	int end = 0;
-	for (int i=0; i<strlen(text); i++)
-	{
-		if
-		(
-				text[i] == ' ' ||
-				text[i] == '\n' ||
-				text[i] == '\t' ||
-				text[i] == '\r'
-		)
-			begin++;
-		else
-			break;
-	}
-	for (int i=strlen(text)-1; i>=0; i--)
-	{
-		if
-		(
-			text[i] == ' '||
-			text[i] == '\n' ||
-			text[i] == '\t' ||
-			text[i] == '\r'
-		)
-			end++;
-		else
-			break;
-	}
-	int k = 0;
-	for (int i=0; i<strlen(text); i++)
-	{
-		if (i >= begin && i < strlen(text) - end)
-		{
-			trimmedText = realloc(trimmedText, (k+1) * sizeof(char));
-			trimmedText[k] = text[i];
-			k++;
-		}
-	}
-	trimmedText = realloc(trimmedText, (k+1) * sizeof(char));
-	trimmedText[k] = 0;
-	return trimmedText;
-}
-
-// Do not use for reading from a socket fd
-bool tryRead(char *buf, FILE *fp)
-{
-	size_t bytesRead = fread(buf, 1, 1, fp);
-	if (feof(fp) != 0)
-		return false;
-	if (ferror(fp) != 0)
-		tryRead(buf, fp);
-	if (bytesRead != 1)
-		tryRead(buf, fp);
-	return true;
-}
-
-char *readFile(FILE *fp)
-{
-	char *text = NULL;
-	int i = 0;
-	char buf;
-	while (1)
-	{
-		if (tryRead(&buf, fp))
-		{
-			text = realloc(text, (i+1) * sizeof(char));
-			text[i] = buf;
-			i++;
-		}
-		else
-			break;
-	}
-	text = realloc(text, (i+1) * sizeof(char));
-	text[i] = 0;
-	return text;
-}
diff --git a/misc.c b/misc.c
@@ -0,0 +1,123 @@
+char *stringCat(char *str1, char *str2)
+{
+	int str1Len = 0;
+	int str2Len = 0;
+	if (str1)
+		str1Len = strlen(str1);
+	if (str2)
+  	str2Len = strlen(str2);
+  char *string = malloc((str1Len+str2Len+1) * sizeof(char));
+  int i = 0;
+  int k = 0;
+  for (; i<str1Len; i++)
+    string[i] = str1[i];
+  for (; k<str2Len; k++)
+    string[i+k] = str2[k];
+  string[i+k] = '\0';
+	free(str1);
+	free(str2);
+  return string;
+}
+
+char *cpToChars(uint_least32_t cp, size_t len)
+{
+	char *str = malloc((len+1) * sizeof(char));
+	grapheme_encode_utf8(cp, str, len);
+	str[len] = 0;
+	return str;
+}
+
+char *trim(char *text)
+{
+	char *trimmedText = NULL;
+	int begin = 0;
+	int end = 0;
+	for (int i=0; i<strlen(text); i++)
+	{
+		if
+		(
+				text[i] == ' ' ||
+				text[i] == '\n' ||
+				text[i] == '\t' ||
+				text[i] == '\r'
+		)
+			begin++;
+		else
+			break;
+	}
+	for (int i=strlen(text)-1; i>=0; i--)
+	{
+		if
+		(
+			text[i] == ' '||
+			text[i] == '\n' ||
+			text[i] == '\t' ||
+			text[i] == '\r'
+		)
+			end++;
+		else
+			break;
+	}
+	int k = 0;
+	for (int i=0; i<strlen(text); i++)
+	{
+		if (i >= begin && i < strlen(text) - end)
+		{
+			trimmedText = realloc(trimmedText, (k+1) * sizeof(char));
+			trimmedText[k] = text[i];
+			k++;
+		}
+	}
+	trimmedText = realloc(trimmedText, (k+1) * sizeof(char));
+	trimmedText[k] = 0;
+	return trimmedText;
+}
+
+bool startsWith(const char *string, const char *part)
+{
+  size_t partLen = strlen(part);
+  if (partLen > strlen(string))
+    return false;
+  for (int i=0; i<partLen; i++)
+  {
+    if (string[i] != part[i])
+    {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Do not use for reading from a socket fd
+bool tryRead(char *buf, FILE *fp)
+{
+	size_t bytesRead = fread(buf, 1, 1, fp);
+	if (feof(fp) != 0)
+		return false;
+	if (ferror(fp) != 0)
+		tryRead(buf, fp);
+	if (bytesRead != 1)
+		tryRead(buf, fp);
+	return true;
+}
+
+char *readFile(FILE *fp)
+{
+	char *text = NULL;
+	int i = 0;
+	char buf;
+	while (1)
+	{
+		if (tryRead(&buf, fp))
+		{
+			text = realloc(text, (i+1) * sizeof(char));
+			text[i] = buf;
+			i++;
+		}
+		else
+			break;
+	}
+	text = realloc(text, (i+1) * sizeof(char));
+	text[i] = 0;
+	return text;
+}

	htex simple incorrect html parser
	git clone git://git.relim.de/htex.git
	Log \| Files \| Refs \| README

M	htex.c	\|	2	+-
M	html.c	\|	29	++++++++++++++++++++++++-----
M	html.h	\|	1	+
D	lib.c	\|	108	-------------------------------------------------------------------------------
A	misc.c	\|	123	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++