commit aff92fa043b11f5e9e4d4157dae664da03262400
parent b0b78096a20ab409a268ec2fb603ec99feccd2c7
Author: Robin <kroekerrobin@gmail.com>
Date: Tue, 2 Apr 2024 18:36:56 +0200
Parse char refs in attr values
Diffstat:
| M | html.c | | | 109 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------- |
1 file changed, 88 insertions(+), 21 deletions(-)
diff --git a/html.c b/html.c
@@ -363,8 +363,23 @@ char *parseNumericCharRef(char *text, size_t off, int base, size_t *newOffset)
return character;
}
-char *parseNamedCharRef(char *text, size_t off, size_t len)
+char *parseNamedCharRef(char *text, size_t off, size_t len, enum attr_value_syntax avs)
{
+ uint_least32_t stopAt = 0;
+ switch(avs)
+ {
+ case AVS_QUOTATION_MARK:
+ stopAt = QUOTATION_MARK;
+ break;
+ case AVS_APOSTROPHE:
+ stopAt = APOSTROPHE;
+ break;
+ case AVS_UNQUOTED:
+ stopAt = GREATER_THAN_SIGN;
+ break;
+ case AVS_NO: /* Just to silence the compiler warning */
+ break;
+ }
char *namedCharRef = malloc(sizeof(char));
namedCharRef[0] = 0;
size_t ret;
@@ -375,6 +390,8 @@ char *parseNamedCharRef(char *text, size_t off, size_t len)
ret = grapheme_decode_utf8(text+off, strlen(text+off), &cp);
if (cp == AMPERSAND || isASCIIWhitespace(cp))
break;
+ if (avs > AVS_NO && cp == stopAt)
+ break;
namedCharRef = stringCat(namedCharRef, cpToChars(cp, ret));
if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF)
break;
@@ -432,7 +449,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
struct tag *stillOpenTag = tag;
char *endTag = malloc(sizeof(char));
endTag[0] = 0;
- enum state returnToState;
+ enum state returnToState = STATE_INNER_TEXT;
size_t a = 0;
size_t attrNameCount = 0;
enum attr_value_syntax attrValueSyntax = AVS_NO;
@@ -565,18 +582,31 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
}
else if (attrValueSyntax == AVS_QUOTATION_MARK || attrValueSyntax == AVS_APOSTROPHE)
{
- char *tmpName = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char));
- strcpy(tmpName, tag->attrs[a]->name);
- tag->attrs = realloc(
- tag->attrs,
- (a+1) * sizeof(struct attr)
- );
- a++;
- tag->attrs[a] = initAttr();
- free(tag->attrs[a]->name);
- tag->attrs[a]->name = tmpName;
- tag->attrsLen++;
- attrNameCount = a + 1;
+ if (
+ strcmp("id", tag->attrs[a]->name) == 0 ||
+ strcmp("class", tag->attrs[a]->name) == 0
+ )
+ {
+ char *tmpName = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char));
+ strcpy(tmpName, tag->attrs[a]->name);
+ tag->attrs = realloc(
+ tag->attrs,
+ (a+1) * sizeof(struct attr)
+ );
+ a++;
+ tag->attrs[a] = initAttr();
+ free(tag->attrs[a]->name);
+ tag->attrs[a]->name = tmpName;
+ tag->attrsLen++;
+ attrNameCount = a + 1;
+ }
+ else
+ {
+ tag->attrs[a]->value = stringCat(
+ tag->attrs[a]->value,
+ cpToChars(cp, ret)
+ );
+ }
}
break;
}
@@ -622,6 +652,12 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
}
if (attrValueSyntax > AVS_NO)
{
+ if (cp == AMPERSAND)
+ {
+ state = STATE_CHAR_REF;
+ returnToState = STATE_ATTR_VALUE;
+ break;
+ }
tag->attrs[a]->value = stringCat(
tag->attrs[a]->value,
cpToChars(cp, ret)
@@ -701,10 +737,21 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
state = STATE_CHAR_REF_NUMERIC;
break;
}
- char *namedCharRef = parseNamedCharRef(text, off, len);
- stillOpenTag = getLastOpenTag(tagList);
- stillOpenTag->innerText = stringCat(stillOpenTag->innerText, encodeNamedCharRef(namedCharRef));
+ char *namedCharRef = parseNamedCharRef(text, off, len, attrValueSyntax);
off += strlen(namedCharRef)-1;
+ char *encodedNamedCharRef = encodeNamedCharRef(namedCharRef);
+ if (returnToState == STATE_INNER_TEXT)
+ {
+ stillOpenTag = getLastOpenTag(tagList);
+ stillOpenTag->innerText = stringCat(stillOpenTag->innerText, encodedNamedCharRef);
+ }
+ else if (returnToState == STATE_ATTR_VALUE)
+ {
+ tag->attrs[a]->value = stringCat(
+ tag->attrs[a]->value,
+ encodedNamedCharRef
+ );
+ }
free(namedCharRef);
state = returnToState;
break;
@@ -713,9 +760,19 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
{
size_t newOffset;
char *numericCharRef = parseNumericCharRef(text, off+1, 16, &newOffset);
- stillOpenTag = getLastOpenTag(tagList);
- stillOpenTag->innerText = stringCat(stillOpenTag->innerText, numericCharRef);
off += newOffset;
+ if (returnToState == STATE_INNER_TEXT)
+ {
+ stillOpenTag = getLastOpenTag(tagList);
+ stillOpenTag->innerText = stringCat(stillOpenTag->innerText, numericCharRef);
+ }
+ else if (returnToState == STATE_ATTR_VALUE)
+ {
+ tag->attrs[a]->value = stringCat(
+ tag->attrs[a]->value,
+ numericCharRef
+ );
+ }
state = returnToState;
break;
}
@@ -723,9 +780,19 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
{
size_t newOffset;
char *numericCharRef = parseNumericCharRef(text, off, 10, &newOffset);
- stillOpenTag = getLastOpenTag(tagList);
- stillOpenTag->innerText = stringCat(stillOpenTag->innerText, numericCharRef);
off += newOffset-1;
+ if (returnToState == STATE_INNER_TEXT)
+ {
+ stillOpenTag = getLastOpenTag(tagList);
+ stillOpenTag->innerText = stringCat(stillOpenTag->innerText, numericCharRef);
+ }
+ else if (returnToState == STATE_ATTR_VALUE)
+ {
+ tag->attrs[a]->value = stringCat(
+ tag->attrs[a]->value,
+ numericCharRef
+ );
+ }
state = returnToState;
break;
}