commit 7a549bd99d0c278a0fcc199e920e709231ea5d25
parent de3711d4d4300d4b5294ec1a8ea4aac4994febde
Author: Robin <kroekerrobin@gmail.com>
Date: Fri, 11 Aug 2023 15:49:21 +0200
Fix parsing tag and subtag with same name
e.g.:
<div>
<div></div>
</div>
Diffstat:
| M | htex.c | | | 34 | ---------------------------------- |
| M | html.c | | | 94 | ++++++++++++++++++++++++++++++++++++++++++++----------------------------------- |
| M | html.h | | | 5 | ++++- |
| M | lib.c | | | 34 | ++++++++++++++++++++++++++++++++++ |
4 files changed, 91 insertions(+), 76 deletions(-)
diff --git a/htex.c b/htex.c
@@ -8,40 +8,6 @@
#include "lib.c"
#include "html.c"
-// Do not use for reading from a socket fd
-bool tryRead(char *buf, FILE *fp)
-{
- size_t bytesRead = fread(buf, 1, 1, fp);
- if (feof(fp) != 0)
- return false;
- if (ferror(fp) != 0)
- tryRead(buf, fp);
- if (bytesRead != 1)
- tryRead(buf, fp);
- return true;
-}
-
-char *readFile(FILE *fp)
-{
- char *text = NULL;
- int i = 0;
- char buf;
- while (1)
- {
- if (tryRead(&buf, fp))
- {
- text = realloc(text, (i+1) * sizeof(char));
- text[i] = buf;
- i++;
- }
- else
- break;
- }
- text = realloc(text, (i+1) * sizeof(char));
- text[i] = 0;
- return text;
-}
-
struct filter_opts *parseFilterOpts(const char *pattern)
{
struct filter_opts *opt = malloc(sizeof(struct filter_opts));
diff --git a/html.c b/html.c
@@ -14,6 +14,9 @@ const char *stateToString(enum state s)
case STATE_SCRIPT: return "STATE_SCRIPT";
case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG";
case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG";
+ case STATE_STYLE: return "STATE_STYLE";
+ case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG";
+ case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG";
}
return "";
}
@@ -210,6 +213,7 @@ size_t parseDoctype(const char *text)
struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, size_t endOffset)
{
+ printf("Closing %s\n", endTag);
for (int i=tagList->len-1; i>-1; i--)
{
if (strcmp(tagList->tags[i]->name, endTag) == 0)
@@ -232,18 +236,6 @@ struct tag *getLastOpenTag(struct tag_list *tagList)
return tagList->tags[0];
}
-/* void saveOuterHtml(struct tag *tag, char *text)
-{
- int o = 0;
- for (int i=tag->_outerHtmlBeginOffset; i<tag->_outerHtmlEndOffset; i++)
- {
- tag->outerHtml = realloc(tag->outerHtml, (o+1) * sizeof(char));
- tag->outerHtml[o] = text[i];
- o++;
- }
- tag->outerHtml = realloc(tag->outerHtml, (o+1) * sizeof(char));
- tag->outerHtml[o] = 0;
-} */
char *getOuterHtml(char *text, struct tag *t)
{
char *outerHtml = NULL;
@@ -274,19 +266,6 @@ char *getInnerHtml(char *text, struct tag *t)
return innerHtml;
}
-/* void saveInnerHtml(struct tag *tag, char *text)
-{
- int o = 0;
- for (int i=tag->_innerHtmlBeginOffset; i<tag->_innerHtmlEndOffset; i++)
- {
- tag->innerHtml = realloc(tag->innerHtml, (o+1) * sizeof(char));
- tag->innerHtml[o] = text[i];
- o++;
- }
- tag->innerHtml = realloc(tag->innerHtml, (o+1) * sizeof(char));
- tag->innerHtml[o] = 0;
-} */
-
void setInnerHtmlEndOffset(struct tag *closedTag, char *text, size_t off)
{
int i = off;
@@ -322,9 +301,9 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
}
else
{
- // char *the_codepoint = cpToChars(cp, ret);
- // printf("cp: %02X, %s, %s\n", cp, the_codepoint, stateToString(state));
- // free(the_codepoint);
+ char *the_codepoint = cpToChars(cp, ret);
+ printf("cp: %02X, %s, %s\n", cp, the_codepoint, stateToString(state));
+ free(the_codepoint);
switch (state)
{
case STATE_INNER_TEXT:
@@ -333,7 +312,6 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
state = STATE_TAG;
break;
}
- // stillOpenTag->innerText = stringCat(stillOpenTag->innerText, cpToChars(cp, ret));
break;
case STATE_TAG:
if (cp == SOLIDUS)
@@ -346,8 +324,8 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
state = STATE_COMMENT;
break;
}
- struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList);
stillOpenTag = getLastOpenTag(tagList);
+ struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList);
stillOpenTag->children = realloc(
stillOpenTag->children,
(stillOpenTag->childrenLen+1) * sizeof(struct tag)
@@ -362,12 +340,11 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
tag->_innerHtmlBeginOffset = off+1;
tag->_isVoidElement = isVoidElement(tag->name);
if (tag->_isVoidElement)
- {
tag->_outerHtmlEndOffset = off+1;
- // saveOuterHtml(tag, text);
- }
if (strcmp(tag->name, "script") == 0)
state = STATE_SCRIPT;
+ else if (strcmp(tag->name, "style") == 0)
+ state = STATE_STYLE;
else
state = STATE_INNER_TEXT;
break;
@@ -387,8 +364,6 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
{
struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
setInnerHtmlEndOffset(closedTag, text, off);
- // saveOuterHtml(closedTag, text);
- // saveInnerHtml(closedTag, text);
free(endTag);
endTag = malloc(sizeof(char));
endTag[0] = 0;
@@ -406,9 +381,13 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
if (tag->_isVoidElement)
{
tag->_outerHtmlEndOffset = off+1;
- // saveOuterHtml(tag, text);
}
- state = STATE_INNER_TEXT;
+ if (strcmp(tag->name, "script") == 0)
+ state = STATE_SCRIPT;
+ else if (strcmp(tag->name, "style") == 0)
+ state = STATE_STYLE;
+ else
+ state = STATE_INNER_TEXT;
break;
}
if (isASCIIWhitespace(cp))
@@ -485,9 +464,13 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
if (tag->_isVoidElement)
{
tag->_outerHtmlEndOffset = off+1;
- // saveOuterHtml(tag, text);
}
- state = STATE_INNER_TEXT;
+ if (strcmp(tag->name, "script") == 0)
+ state = STATE_SCRIPT;
+ else if (strcmp(tag->name, "style") == 0)
+ state = STATE_STYLE;
+ else
+ state = STATE_INNER_TEXT;
break;
}
if (
@@ -516,6 +499,33 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
else
hyphenCount = 0;
break;
+ case STATE_STYLE:
+ if (cp == LESS_THAN_SIGN)
+ {
+ state = STATE_STYLE_POSSIBLE_END_TAG;
+ break;
+ }
+ break;
+ case STATE_STYLE_POSSIBLE_END_TAG:
+ if (cp == SOLIDUS)
+ state = STATE_STYLE_END_TAG;
+ else
+ state = STATE_STYLE;
+ break;
+ case STATE_STYLE_END_TAG:
+ if (cp == GREATER_THAN_SIGN)
+ {
+ struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
+ setInnerHtmlEndOffset(closedTag, text, off);
+ free(endTag);
+ endTag = malloc(sizeof(char));
+ endTag[0] = 0;
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (!isASCIIWhitespace(cp))
+ endTag = stringCat(endTag, cpToChars(cp, ret));
+ break;
case STATE_SCRIPT:
if (cp == LESS_THAN_SIGN)
{
@@ -534,8 +544,6 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
{
struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
setInnerHtmlEndOffset(closedTag, text, off);
- // saveOuterHtml(closedTag, text);
- // saveInnerHtml(closedTag, text);
free(endTag);
endTag = malloc(sizeof(char));
endTag[0] = 0;
@@ -688,6 +696,10 @@ void filterHtml(char *text, struct filter_opts *opts)
if (len)
text = text + len;
struct tag *rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList);
+ for (int i=0; i<tagList->len; i++)
+ {
+ printf("%s\n", tagList->tags[i]->name);
+ }
findTag(rootTag, opts, foundTags);
printResult(text, opts, foundTags);
// printHtml(rootTag, 0);
diff --git a/html.h b/html.h
@@ -73,7 +73,10 @@ enum state
STATE_COMMENT,
STATE_SCRIPT,
STATE_SCRIPT_POSSIBLE_END_TAG,
- STATE_SCRIPT_END_TAG
+ STATE_SCRIPT_END_TAG,
+ STATE_STYLE,
+ STATE_STYLE_POSSIBLE_END_TAG,
+ STATE_STYLE_END_TAG
};
enum attr_value_syntax
diff --git a/lib.c b/lib.c
@@ -77,3 +77,37 @@ char *trim(char *text)
free(text);
return trimmedText;
}
+
+// Do not use for reading from a socket fd
+bool tryRead(char *buf, FILE *fp)
+{
+ size_t bytesRead = fread(buf, 1, 1, fp);
+ if (feof(fp) != 0)
+ return false;
+ if (ferror(fp) != 0)
+ tryRead(buf, fp);
+ if (bytesRead != 1)
+ tryRead(buf, fp);
+ return true;
+}
+
+char *readFile(FILE *fp)
+{
+ char *text = NULL;
+ int i = 0;
+ char buf;
+ while (1)
+ {
+ if (tryRead(&buf, fp))
+ {
+ text = realloc(text, (i+1) * sizeof(char));
+ text[i] = buf;
+ i++;
+ }
+ else
+ break;
+ }
+ text = realloc(text, (i+1) * sizeof(char));
+ text[i] = 0;
+ return text;
+}