htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit aff92fa043b11f5e9e4d4157dae664da03262400
parent b0b78096a20ab409a268ec2fb603ec99feccd2c7
Author: Robin <kroekerrobin@gmail.com>
Date:   Tue,  2 Apr 2024 18:36:56 +0200

Parse char refs in attr values

Diffstat:
M html.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 88 insertions(+), 21 deletions(-)

diff --git a/html.c b/html.c @@ -363,8 +363,23 @@ char *parseNumericCharRef(char *text, size_t off, int base, size_t *newOffset) return character; } -char *parseNamedCharRef(char *text, size_t off, size_t len) +char *parseNamedCharRef(char *text, size_t off, size_t len, enum attr_value_syntax avs) { + uint_least32_t stopAt = 0; + switch(avs) + { + case AVS_QUOTATION_MARK: + stopAt = QUOTATION_MARK; + break; + case AVS_APOSTROPHE: + stopAt = APOSTROPHE; + break; + case AVS_UNQUOTED: + stopAt = GREATER_THAN_SIGN; + break; + case AVS_NO: /* Just to silence the compilier warning */ + break; + } char *namedCharRef = malloc(sizeof(char)); namedCharRef[0] = 0; size_t ret; @@ -375,6 +390,8 @@ char *parseNamedCharRef(char *text, size_t off, size_t len) ret = grapheme_decode_utf8(text+off, strlen(text+off), &cp); if (cp == AMPERSAND || isASCIIWhitespace(cp)) break; + if (avs > AVS_NO && cp == stopAt) + break; namedCharRef = stringCat(namedCharRef, cpToChars(cp, ret)); if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF) break; @@ -432,7 +449,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis struct tag *stillOpenTag = tag; char *endTag = malloc(sizeof(char)); endTag[0] = 0; - enum state returnToState; + enum state returnToState = STATE_INNER_TEXT; size_t a = 0; size_t attrNameCount = 0; enum attr_value_syntax attrValueSyntax = AVS_NO; @@ -565,18 +582,31 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis } else if (attrValueSyntax == AVS_QUOTATION_MARK || attrValueSyntax == AVS_APOSTROPHE) { - char *tmpName = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char)); - strcpy(tmpName, tag->attrs[a]->name); - tag->attrs = realloc( - tag->attrs, - (a+1) * sizeof(struct attr) - ); - a++; - tag->attrs[a] = initAttr(); - free(tag->attrs[a]->name); - tag->attrs[a]->name = tmpName; - tag->attrsLen++; - attrNameCount = a + 1; + if ( + strcmp("id", tag->attrs[a]->name) == 0 || + strcmp("class", tag->attrs[a]->name) == 0 + ) 
+ { + char *tmpName = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char)); + strcpy(tmpName, tag->attrs[a]->name); + tag->attrs = realloc( + tag->attrs, + (a+1) * sizeof(struct attr) + ); + a++; + tag->attrs[a] = initAttr(); + free(tag->attrs[a]->name); + tag->attrs[a]->name = tmpName; + tag->attrsLen++; + attrNameCount = a + 1; + } + else + { + tag->attrs[a]->value = stringCat( + tag->attrs[a]->value, + cpToChars(cp, ret) + ); + } } break; } @@ -622,6 +652,12 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis } if (attrValueSyntax > AVS_NO) { + if (cp == AMPERSAND) + { + state = STATE_CHAR_REF; + returnToState = STATE_ATTR_VALUE; + break; + } tag->attrs[a]->value = stringCat( tag->attrs[a]->value, cpToChars(cp, ret) @@ -701,10 +737,21 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis state = STATE_CHAR_REF_NUMERIC; break; } - char *namedCharRef = parseNamedCharRef(text, off, len); - stillOpenTag = getLastOpenTag(tagList); - stillOpenTag->innerText = stringCat(stillOpenTag->innerText, encodeNamedCharRef(namedCharRef)); + char *namedCharRef = parseNamedCharRef(text, off, len, attrValueSyntax); off += strlen(namedCharRef)-1; + char *encodedNamedCharRef = encodeNamedCharRef(namedCharRef); + if (returnToState == STATE_INNER_TEXT) + { + stillOpenTag = getLastOpenTag(tagList); + stillOpenTag->innerText = stringCat(stillOpenTag->innerText, encodedNamedCharRef); + } + else if (returnToState == STATE_ATTR_VALUE) + { + tag->attrs[a]->value = stringCat( + tag->attrs[a]->value, + encodedNamedCharRef + ); + } free(namedCharRef); state = returnToState; break; @@ -713,9 +760,19 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis { size_t newOffset; char *numericCharRef = parseNumericCharRef(text, off+1, 16, &newOffset); - stillOpenTag = getLastOpenTag(tagList); - stillOpenTag->innerText = stringCat(stillOpenTag->innerText, numericCharRef); off += newOffset; + if (returnToState == 
STATE_INNER_TEXT) + { + stillOpenTag = getLastOpenTag(tagList); + stillOpenTag->innerText = stringCat(stillOpenTag->innerText, numericCharRef); + } + else if (returnToState == STATE_ATTR_VALUE) + { + tag->attrs[a]->value = stringCat( + tag->attrs[a]->value, + numericCharRef + ); + } state = returnToState; break; } @@ -723,9 +780,19 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis { size_t newOffset; char *numericCharRef = parseNumericCharRef(text, off, 10, &newOffset); - stillOpenTag = getLastOpenTag(tagList); - stillOpenTag->innerText = stringCat(stillOpenTag->innerText, numericCharRef); off += newOffset-1; + if (returnToState == STATE_INNER_TEXT) + { + stillOpenTag = getLastOpenTag(tagList); + stillOpenTag->innerText = stringCat(stillOpenTag->innerText, numericCharRef); + } + else if (returnToState == STATE_ATTR_VALUE) + { + tag->attrs[a]->value = stringCat( + tag->attrs[a]->value, + numericCharRef + ); + } state = returnToState; break; }