commit 0e33554f458efcf0b45d3985db29a333c2c35385
parent 0240fb1173b0287cc14983e94c41003beba9f3b6
Author: Robin <kroekerrobin@gmail.com>
Date: Tue, 8 Aug 2023 21:30:09 +0200
Support outerHtml for void elements
Diffstat:
| M | htex.c | | | 7 | +++++++ |
| M | html.c | | | 73 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- |
| M | html.h | | | 8 | ++++++-- |
| M | todo | | | 7 | ++----- |
4 files changed, 79 insertions(+), 16 deletions(-)
diff --git a/htex.c b/htex.c
@@ -45,6 +45,8 @@ char *readFile(FILE *fp)
struct filter_opts *parseFilterOpts(const char *pattern)
{
struct filter_opts *opt = malloc(sizeof(struct filter_opts));
+ opt->outerHtml = true;
+ opt->innerHtml = false;
opt->tag = malloc(sizeof(char));
opt->tag[0] = 0;
opt->attr = malloc(sizeof(char));
@@ -221,6 +223,11 @@ int main(int argc, char *argv[])
}
}
struct filter_opts *options = parseFilterOpts(searchPattern);
+ if (isInnerHtml)
+ {
+ options->innerHtml = true;
+ options->outerHtml = false;
+ }
filterHtml(text, options);
freeOpts(options);
free(text);
diff --git a/html.c b/html.c
@@ -158,9 +158,9 @@ static inline bool
isValidUnquotedAttrValue(uint_least32_t cp)
{
/*
- Not mentioned invalid characters
- are already handled before funtion
- call.
+ Invalid characters not listed here
+ are already handled before the
+ function call.
*/
if (
cp == EQUALS_SIGN ||
@@ -206,7 +206,7 @@ struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, s
if (strcmp(tagList->tags[i]->name, endTag) == 0)
{
tagList->tags[i]->_isClosed = true;
- tagList->tags[i]->_endOffset = endOffset;
+ tagList->tags[i]->_outerHtmlEndOffset = endOffset;
return tagList->tags[i];
}
}
@@ -222,10 +222,14 @@ struct tag *getLastOpenTag(struct tag_list *tagList)
return tagList->tags[0];
}
-void saveOuterAndInnerHtml(struct tag *tag, char *text)
+/* char *getInnerHtml(struct tag *tag)
+{
+} */
+
+void saveOuterHtml(struct tag *tag, char *text)
{
int o = 0;
- for (int i=tag->_beginOffset; i<tag->_endOffset; i++)
+ for (int i=tag->_outerHtmlBeginOffset; i<tag->_outerHtmlEndOffset; i++)
{
tag->outerHtml = realloc(tag->outerHtml, (o+1) * sizeof(char));
tag->outerHtml[o] = text[i];
@@ -235,10 +239,33 @@ void saveOuterAndInnerHtml(struct tag *tag, char *text)
tag->outerHtml[o] = 0;
}
+/*
+ * Copy the inner HTML of `tag` out of `text` into tag->innerHtml as a
+ * NUL-terminated string.
+ *
+ * The copied range is
+ * [tag->_innerHtmlBeginOffset, tag->_innerHtmlEndOffset).
+ * An empty or inverted range yields an empty string.
+ *
+ * The existing tag->innerHtml pointer is resized with realloc. If the
+ * allocation fails, the previous buffer is left untouched and the
+ * function returns without copying anything.
+ */
+void saveInnerHtml(struct tag *tag, char *text)
+{
+    size_t begin = tag->_innerHtmlBeginOffset;
+    size_t end = tag->_innerHtmlEndOffset;
+    size_t len = end > begin ? end - begin : 0;
+
+    /* Size the buffer once instead of growing it byte by byte. */
+    char *buf = realloc(tag->innerHtml, len + 1);
+    if (buf == NULL)
+    {
+        return;
+    }
+
+    for (size_t i = 0; i < len; i++)
+    {
+        buf[i] = text[begin + i];
+    }
+    buf[len] = 0;
+    tag->innerHtml = buf;
+}
+
+/*
+ * Set closedTag->_innerHtmlEndOffset to the index of the nearest '<'
+ * found at or before `off` in `text`.
+ *
+ * NOTE(review): the visible caller passes the offset at which the end
+ * tag's '>' was read, so the '<' found is presumably the one that opens
+ * that end tag -- confirm against parseTag.
+ *
+ * If no '<' exists at or before `off`, the scan stops at index 0
+ * instead of reading in front of the buffer.
+ */
+void setInnerHtmlEndOffset(struct tag *closedTag, char *text, size_t off)
+{
+    /* size_t index: an int would truncate large offsets. */
+    size_t i = off;
+    while (i > 0 && text[i] != '<')
+    {
+        i--;
+    }
+    closedTag->_innerHtmlEndOffset = i;
+}
+
struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_list *tagList)
{
struct tag *tag = initTag();
- tag->_beginOffset = offset-1;
+ tag->_outerHtmlBeginOffset= offset-1;
tagList->tags = realloc(tagList->tags, (tagList->len+1) * sizeof(struct tag));
tagList->tags[tagList->len] = tag;
tagList->len++;
@@ -297,7 +324,13 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
case STATE_BEGIN_TAG_NAME:
if (cp == GREATER_THAN_SIGN)
{
+ tag->_innerHtmlBeginOffset = off+1;
tag->_isVoidElement = isVoidElement(tag->name);
+ if (tag->_isVoidElement)
+ {
+ tag->_outerHtmlEndOffset = off+1;
+ saveOuterHtml(tag, text);
+ }
state = STATE_INNER_TEXT;
break;
}
@@ -315,7 +348,9 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
if (cp == GREATER_THAN_SIGN)
{
struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
- saveOuterAndInnerHtml(closedTag, text);
+ setInnerHtmlEndOffset(closedTag, text, off);
+ saveOuterHtml(closedTag, text);
+ saveInnerHtml(closedTag, text);
free(endTag);
endTag = malloc(sizeof(char));
endTag[0] = 0;
@@ -328,7 +363,13 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
case STATE_ATTR_NAME:
if (cp == GREATER_THAN_SIGN)
{
+ tag->_innerHtmlBeginOffset = off+1;
tag->_isVoidElement = isVoidElement(tag->name);
+ if (tag->_isVoidElement)
+ {
+ tag->_outerHtmlEndOffset = off+1;
+ saveOuterHtml(tag, text);
+ }
state = STATE_INNER_TEXT;
break;
}
@@ -399,6 +440,18 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
break;
}
}
+ if (cp == GREATER_THAN_SIGN)
+ {
+ tag->_innerHtmlBeginOffset = off+1;
+ tag->_isVoidElement = isVoidElement(tag->name);
+ if (tag->_isVoidElement)
+ {
+ tag->_outerHtmlEndOffset = off+1;
+ saveOuterHtml(tag, text);
+ }
+ state = STATE_INNER_TEXT;
+ break;
+ }
if (
attrValueSyntax == AVS_NO &&
isValidUnquotedAttrValue(cp)
@@ -529,10 +582,12 @@ void filterHtml(char *text, struct filter_opts *opts)
else
{
printf("result: %s\n", result->name);
+ printf("%ld %ld\n", result->_outerHtmlBeginOffset, result->_outerHtmlEndOffset);
if (!result->_isVoidElement)
{
- printf("outerHtml: %s\n", result->outerHtml);
+ printf("innerHtml: %s\n", result->innerHtml);
}
+ printf("outerHtml: %s\n", result->outerHtml);
}
freeTag(rootTag);
freeTagList(tagList);
diff --git a/html.h b/html.h
@@ -24,6 +24,8 @@ struct filter_opts
char *tag;
char *attr;
char *key;
+ bool innerHtml;
+ bool outerHtml;
};
struct attr
@@ -44,8 +46,10 @@ struct tag
size_t childrenLen;
bool _isVoidElement; // means there is no closing tag
bool _isClosed;
- size_t _beginOffset;
- size_t _endOffset;
+ size_t _outerHtmlBeginOffset;
+ size_t _outerHtmlEndOffset;
+ size_t _innerHtmlBeginOffset;
+ size_t _innerHtmlEndOffset;
};
struct tag_list
diff --git a/todo b/todo
@@ -1,5 +1,2 @@
-refactor; heavy
-implement find_attribute_value_by_*
-implement filtering not only by class or id, also like this .test[data="asdf"]
-implement finding tags that have no end tag, e.g. the img tag
-Actually correctly parse html according to spec ;-)
+strip beginning and ending whitespace of inner and outer html
+find element by attr value (something's still wrong)