commit 0240fb1173b0287cc14983e94c41003beba9f3b6
parent bcc4ec1b9dcc6d5133ad5be90a95c8de364c88df
Author: Robin <kroekerrobin@gmail.com>
Date: Mon, 7 Aug 2023 21:45:42 +0200
Add support for parsing outerHtml
Diffstat:
| M | html.c | | | 45 | +++++++++++++++++++++++++++++++++++++-------- |
| M | html.h | | | 4 | ++++ |
| M | lib.c | | | 8 | ++++++-- |
3 files changed, 47 insertions(+), 10 deletions(-)
diff --git a/html.c b/html.c
@@ -31,6 +31,8 @@ struct tag *initTag()
t->name[0] = 0;
t->innerText = malloc(sizeof(char));
t->innerText[0] = 0;
+ t->innerHtml = NULL;
+ t->outerHtml = NULL;
t->attrs = NULL;
t->children = NULL;
t->attrsLen = 0;
@@ -197,12 +199,16 @@ size_t parseDOCTYPE(const char *text)
return 0;
}
-void closeLastUnclosedTag(struct tag_list *tagList, const char *endTag)
+struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, size_t endOffset) // scans tagList from its last entry backwards for a tag named endTag, marks it closed, records endOffset and returns it
{
for (int i=tagList->len-1; i>-1; i--)
{
if (strcmp(tagList->tags[i]->name, endTag) == 0)
+ { // first name match wins; _isClosed is not consulted, so a tag that was already closed can match again
tagList->tags[i]->_isClosed = true;
+ tagList->tags[i]->_endOffset = endOffset; // stored exactly as passed in by the caller
+ return tagList->tags[i]; // NOTE(review): this is the only return statement; when no tag name equals endTag the function ends without returning a value
+ }
}
}
@@ -216,9 +222,23 @@ struct tag *getLastOpenTag(struct tag_list *tagList)
return tagList->tags[0];
}
-struct tag *parseTag(const char *text, enum state state, struct tag_list *tagList)
+void saveOuterAndInnerHtml(struct tag *tag, char *text)
+{
+    /* Store text[tag->_beginOffset, tag->_endOffset) in tag->outerHtml (NUL-terminated); an empty or inverted range gives ""; innerHtml is not touched. */
+    size_t span = tag->_endOffset > tag->_beginOffset ? tag->_endOffset - tag->_beginOffset : 0;
+    char *copy = realloc(tag->outerHtml, span + 1); /* one resize instead of one per byte; reuses the buffer if the tag was closed before */
+    if (copy == NULL)
+        return; /* allocation failed: keep the previous outerHtml instead of losing it */
+    if (span > 0)
+        memcpy(copy, text + tag->_beginOffset, span); /* skipped for span 0 so an out-of-range _beginOffset is never used as an address */
+    copy[span] = 0;
+    tag->outerHtml = copy;
+}
+
+struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_list *tagList)
{
struct tag *tag = initTag();
+ tag->_beginOffset = offset-1;
tagList->tags = realloc(tagList->tags, (tagList->len+1) * sizeof(struct tag));
tagList->tags[tagList->len] = tag;
tagList->len++;
@@ -232,7 +252,7 @@ struct tag *parseTag(const char *text, enum state state, struct tag_list *tagLis
uint_least32_t cp;
size_t len = strlen(text);
size_t ret, off;
- for (off = 0; off<len; off += ret)
+ for (off = offset; off<len; off += ret)
{
if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off)
{
@@ -269,7 +289,7 @@ struct tag *parseTag(const char *text, enum state state, struct tag_list *tagLis
stillOpenTag->children,
(stillOpenTag->childrenLen+1) * sizeof(struct tag)
);
- struct tag *oneTag = parseTag(text+off, STATE_BEGIN_TAG_NAME, tagList);
+ struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList);
stillOpenTag->children[stillOpenTag->childrenLen] = oneTag;
stillOpenTag->childrenLen++;
free(endTag);
@@ -294,7 +314,8 @@ struct tag *parseTag(const char *text, enum state state, struct tag_list *tagLis
case STATE_END_TAG_NAME:
if (cp == GREATER_THAN_SIGN)
{
- closeLastUnclosedTag(tagList, endTag);
+ struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
+ saveOuterAndInnerHtml(closedTag, text);
free(endTag);
endTag = malloc(sizeof(char));
endTag[0] = 0;
@@ -407,6 +428,8 @@ void freeTag(struct tag *t)
{
free(t->name);
free(t->innerText);
+ free(t->innerHtml);
+ free(t->outerHtml);
for (int i=0; i<t->attrsLen; i++)
{
free(t->attrs[i]->name);
@@ -489,7 +512,7 @@ struct tag *findTag(struct tag *tag, struct tag_list *list, struct filter_opts *
return NULL;
}
-void filterHtml(const char *text, struct filter_opts *opts)
+void filterHtml(char *text, struct filter_opts *opts)
{
struct tag *rootTag;
struct tag_list *tagList = malloc(sizeof(struct tag_list));
@@ -497,14 +520,20 @@ void filterHtml(const char *text, struct filter_opts *opts)
tagList->len = 0;
size_t len = parseDOCTYPE(text);
if (len)
- rootTag = parseTag(text+len, STATE_INNER_TEXT, tagList);
+ rootTag = parseTag(text+len, 0, STATE_INNER_TEXT, tagList);
else
- rootTag = parseTag(text, STATE_INNER_TEXT, tagList);
+ rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList);
struct tag *result = findTag(rootTag, tagList, opts);
if (result == NULL)
printError("No tag found.");
else
+ {
printf("result: %s\n", result->name);
+ if (!result->_isVoidElement)
+ {
+ printf("outerHtml: %s\n", result->outerHtml);
+ }
+ }
freeTag(rootTag);
freeTagList(tagList);
}
diff --git a/html.h b/html.h
@@ -36,12 +36,16 @@ struct tag
{
char *name;
char *innerText;
+ char *innerHtml; // NULL after initTag() and released by freeTag(); no code in this change assigns it
+ char *outerHtml; // NUL-terminated copy of the parsed text from _beginOffset up to (not including) _endOffset, filled by saveOuterAndInnerHtml(); NULL until then
struct attr **attrs;
struct tag **children;
size_t attrsLen;
size_t childrenLen;
bool _isVoidElement; // means there is no closing tag
bool _isClosed;
+ size_t _beginOffset; // parseTag() stores its offset argument minus 1 here; NOTE(review): filterHtml() parses the root with offset 0, so this wraps to SIZE_MAX for the root tag
+ size_t _endOffset; // stored by closeLastUnclosedTag(); parseTag() passes off+ret, the position just past the '>' that ends the closing tag
};
struct tag_list
diff --git a/lib.c b/lib.c
@@ -1,7 +1,11 @@
char *stringCat(char *str1,char *str2)
{
- int str1Len = strlen(str1);
- int str2Len = strlen(str2);
+ int str1Len = 0;
+ int str2Len = 0;
+ if (str1)
+ str1Len = strlen(str1);
+ if (str2)
+ str2Len = strlen(str2);
char *string = malloc((str1Len+str2Len+1) * sizeof(char));
int i = 0;
int k = 0;