htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit d847e830279219425e57ef45bde04eed13f2ad4b
parent 275940c7345d05564943a2bb14dfe0ccc306a95e
Author: Robin <kroekerrobin@gmail.com>
Date:   Tue,  8 Aug 2023 22:18:02 +0200

Improve comment parsing

Diffstat:
M html.c | 16 ++++++++++++----
M html.h |  3 ++-
M todo   |  1 -
3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/html.c b/html.c
@@ -10,7 +10,7 @@ const char *stateToString(enum state s)
 		case STATE_END_TAG_NAME: return "STATE_END_TAG_NAME";
 		case STATE_ATTR_NAME: return "STATE_ATTR_NAME";
 		case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE";
-		case STATE_COMMENT_ETC: return "STATE_COMMENT_ETC";
+		case STATE_COMMENT: return "STATE_COMMENT";
 	}
 }
 
@@ -276,6 +276,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
 	size_t attrNameCount = 0;
 	size_t attrValueCount = 0;
 	enum attr_value_syntax attrValueSyntax = AVS_NO;
+	size_t hyphenCount = 0;
 	uint_least32_t cp;
 	size_t len = strlen(text);
 	size_t ret, off;
@@ -309,7 +310,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
 				}
 				if (cp == EXCLAMATION_MARK)
 				{
-					state = STATE_COMMENT_ETC;
+					state = STATE_COMMENT;
 					break;
 				}
 				stillOpenTag->children = realloc(
@@ -467,9 +468,16 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
 					);
 				}
 				break;
-			case STATE_COMMENT_ETC:
-				if (cp == GREATER_THAN_SIGN)
+			case STATE_COMMENT:
+				if (cp == GREATER_THAN_SIGN && hyphenCount == 2)
+				{
 					state = STATE_INNER_TEXT;
+					break;
+				}
+				if (cp == HYPHEN_MINUS)
+					hyphenCount++;
+				else
+					hyphenCount = 0;
 				break;
 		}
 	}
diff --git a/html.h b/html.h
@@ -13,6 +13,7 @@
 #define QUOTATION_MARK 0x22
 #define APOSTROPHE 0x27
 #define GRAVE_ACCENT 0x60
+#define HYPHEN_MINUS 0x2D
 
 const char *voidElements[] = {
 	"area", "base", "br", "col", "embed", "hr", "img",
@@ -66,7 +67,7 @@ enum state
 	STATE_END_TAG_NAME,
 	STATE_ATTR_NAME,
 	STATE_ATTR_VALUE,
-	STATE_COMMENT_ETC
+	STATE_COMMENT
 };
 
 enum attr_value_syntax
diff --git a/todo b/todo
@@ -1,2 +1 @@
 strip beginning and ending whitespace of inner and outer html
-parse html comments right