commit de3711d4d4300d4b5294ec1a8ea4aac4994febde
parent 7ee9e2ffcc747ef4a0a14eeec8f55402d4653ca6
Author: Robin <kroekerrobin@gmail.com>
Date: Thu, 10 Aug 2023 19:24:35 +0200
Don't save inner and outer html, only the offsets
Diffstat:
| M | Makefile | | | 4 | ++-- |
| M | htex.c | | | 10 | +--------- |
| M | html.c | | | 161 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------ |
| M | html.h | | | 7 | ++++--- |
| M | lib.c | | | 1 | + |
| M | todo | | | 1 | + |
6 files changed, 121 insertions(+), 63 deletions(-)
diff --git a/Makefile b/Makefile
@@ -2,9 +2,9 @@ PREFIX = /usr/local
MANPREFIX = $(PREFIX)/share/man
all:
- $(CC) -O -Werror -o htex htex.c -lgrapheme
+ $(CC) -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme
debug:
- $(CC) -fsanitize=address -O -Werror -o htex htex.c -lgrapheme
+ $(CC) -fsanitize=address -O -pedantic -Werror -Wall -o htex htex.c -lgrapheme
clean:
rm htex
install: all
diff --git a/htex.c b/htex.c
@@ -52,9 +52,7 @@ struct filter_opts *parseFilterOpts(const char *pattern)
opt->attr[0] = 0;
opt->key = malloc(sizeof(char));
opt->key[0] = 0;
- char *classValue = NULL;
bool isClassValue = false;
- char *idValue = NULL;
bool isIdValue = false;
int i = 0;
bool isAttrKey = false;
@@ -73,7 +71,6 @@ struct filter_opts *parseFilterOpts(const char *pattern)
isIdValue = true;
i = 1;
break;
- default:
}
for (; i<strlen(pattern); i++)
{
@@ -170,22 +167,17 @@ int main(int argc, char *argv[])
int o = 0;
int option_index = 0;
bool isInnerHtml = false;
- bool isExcept = false;
char *text = NULL;
char *searchPattern = NULL;
static struct option long_options[] = {
{ "innerhtml", no_argument, 0, 'i' },
- { "except", no_argument, 0, 'e' },
{ 0, 0, 0, 0 }
};
- while ((o = getopt_long(argc, argv, "ie", long_options, &option_index)) != -1) {
+ while ((o = getopt_long(argc, argv, "i", long_options, &option_index)) != -1) {
switch(o) {
case 'i':
isInnerHtml = true;
break;
- case 'e':
- isExcept = true;
- break;
}
}
if (argc == optind)
diff --git a/html.c b/html.c
@@ -11,7 +11,11 @@ const char *stateToString(enum state s)
case STATE_ATTR_NAME: return "STATE_ATTR_NAME";
case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE";
case STATE_COMMENT: return "STATE_COMMENT";
+ case STATE_SCRIPT: return "STATE_SCRIPT";
+ case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG";
+ case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG";
}
+ return "";
}
struct attr *initAttr()
@@ -31,8 +35,6 @@ struct tag *initTag()
t->name[0] = 0;
t->innerText = malloc(sizeof(char));
t->innerText[0] = 0;
- t->innerHtml = NULL;
- t->outerHtml = NULL;
t->attrs = NULL;
t->children = NULL;
t->attrsLen = 0;
@@ -42,6 +44,14 @@ struct tag *initTag()
return t;
}
+/*
+ * Allocate an empty tag list: tags is NULL and len is 0.
+ * The caller owns the returned list. Returns NULL if the allocation fails.
+ */
+struct tag_list *initTagList()
+{
+    struct tag_list *list = malloc(sizeof *list);
+    /* Do not write through a failed allocation. */
+    if (list == NULL)
+        return NULL;
+    list->tags = NULL;
+    list->len = 0;
+    return list;
+}
+
static inline bool isASCIIDigit(uint_least32_t cp)
{
if (cp >= 0x30 && cp <= 0x39)
@@ -172,10 +182,9 @@ isValidUnquotedAttrValue(uint_least32_t cp)
return true;
}
-size_t parseDOCTYPE(const char *text)
+size_t parseDoctype(const char *text)
{
char *firstLine = NULL;
- char c;
int i = 0;
while (text[i] != '\n')
{
@@ -210,6 +219,7 @@ struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, s
return tagList->tags[i];
}
}
+ return NULL;
}
struct tag *getLastOpenTag(struct tag_list *tagList)
@@ -222,11 +232,7 @@ struct tag *getLastOpenTag(struct tag_list *tagList)
return tagList->tags[0];
}
-/* char *getInnerHtml(struct tag *tag)
-{
-} */
-
-void saveOuterHtml(struct tag *tag, char *text)
+/* void saveOuterHtml(struct tag *tag, char *text)
{
int o = 0;
for (int i=tag->_outerHtmlBeginOffset; i<tag->_outerHtmlEndOffset; i++)
@@ -237,9 +243,38 @@ void saveOuterHtml(struct tag *tag, char *text)
}
tag->outerHtml = realloc(tag->outerHtml, (o+1) * sizeof(char));
tag->outerHtml[o] = 0;
+} */
+/*
+ * Return a newly allocated, NUL-terminated copy of the bytes of text in
+ * [t->_outerHtmlBeginOffset, t->_outerHtmlEndOffset).
+ * An end offset at or before the begin offset yields an empty string.
+ * The caller owns the result and must free() it.
+ * Returns NULL if the allocation fails.
+ * NOTE(review): assumes both offsets are non-negative indices into text.
+ */
+char *getOuterHtml(char *text, struct tag *t)
+{
+    size_t begin = t->_outerHtmlBeginOffset;
+    size_t end = t->_outerHtmlEndOffset;
+    size_t len = end > begin ? end - begin : 0;
+    /* The length is known up front: allocate once instead of once per byte. */
+    char *outerHtml = malloc(len + 1);
+    if (outerHtml == NULL)
+        return NULL;
+    if (len > 0)
+        memcpy(outerHtml, text + begin, len);
+    outerHtml[len] = 0;
+    return outerHtml;
+}
+
+/*
+ * Return a newly allocated, NUL-terminated copy of the bytes of text in
+ * [t->_innerHtmlBeginOffset, t->_innerHtmlEndOffset).
+ * An end offset at or before the begin offset yields an empty string.
+ * The caller owns the result and must free() it.
+ * Returns NULL if the allocation fails.
+ * NOTE(review): assumes both offsets are non-negative indices into text.
+ */
+char *getInnerHtml(char *text, struct tag *t)
+{
+    size_t begin = t->_innerHtmlBeginOffset;
+    size_t end = t->_innerHtmlEndOffset;
+    size_t len = end > begin ? end - begin : 0;
+    /* The length is known up front: allocate once instead of once per byte. */
+    char *innerHtml = malloc(len + 1);
+    if (innerHtml == NULL)
+        return NULL;
+    if (len > 0)
+        memcpy(innerHtml, text + begin, len);
+    innerHtml[len] = 0;
+    return innerHtml;
}
-void saveInnerHtml(struct tag *tag, char *text)
+/* void saveInnerHtml(struct tag *tag, char *text)
{
int o = 0;
for (int i=tag->_innerHtmlBeginOffset; i<tag->_innerHtmlEndOffset; i++)
@@ -250,7 +285,7 @@ void saveInnerHtml(struct tag *tag, char *text)
}
tag->innerHtml = realloc(tag->innerHtml, (o+1) * sizeof(char));
tag->innerHtml[o] = 0;
-}
+} */
void setInnerHtmlEndOffset(struct tag *closedTag, char *text, size_t off)
{
@@ -274,7 +309,6 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
endTag[0] = 0;
size_t a = 0;
size_t attrNameCount = 0;
- size_t attrValueCount = 0;
enum attr_value_syntax attrValueSyntax = AVS_NO;
size_t hyphenCount = 0;
uint_least32_t cp;
@@ -299,8 +333,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
state = STATE_TAG;
break;
}
- stillOpenTag = getLastOpenTag(tagList);
- stillOpenTag->innerText = stringCat(stillOpenTag->innerText, cpToChars(cp, ret));
+ // stillOpenTag->innerText = stringCat(stillOpenTag->innerText, cpToChars(cp, ret));
break;
case STATE_TAG:
if (cp == SOLIDUS)
@@ -313,11 +346,12 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
state = STATE_COMMENT;
break;
}
+ struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList);
+ stillOpenTag = getLastOpenTag(tagList);
stillOpenTag->children = realloc(
stillOpenTag->children,
(stillOpenTag->childrenLen+1) * sizeof(struct tag)
);
- struct tag *oneTag = parseTag(text, off, STATE_BEGIN_TAG_NAME, tagList);
stillOpenTag->children[stillOpenTag->childrenLen] = oneTag;
stillOpenTag->childrenLen++;
free(endTag);
@@ -330,9 +364,12 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
if (tag->_isVoidElement)
{
tag->_outerHtmlEndOffset = off+1;
- saveOuterHtml(tag, text);
+ // saveOuterHtml(tag, text);
}
- state = STATE_INNER_TEXT;
+ if (strcmp(tag->name, "script") == 0)
+ state = STATE_SCRIPT;
+ else
+ state = STATE_INNER_TEXT;
break;
}
if (isASCIIWhitespace(cp))
@@ -350,8 +387,8 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
{
struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
setInnerHtmlEndOffset(closedTag, text, off);
- saveOuterHtml(closedTag, text);
- saveInnerHtml(closedTag, text);
+ // saveOuterHtml(closedTag, text);
+ // saveInnerHtml(closedTag, text);
free(endTag);
endTag = malloc(sizeof(char));
endTag[0] = 0;
@@ -369,7 +406,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
if (tag->_isVoidElement)
{
tag->_outerHtmlEndOffset = off+1;
- saveOuterHtml(tag, text);
+ // saveOuterHtml(tag, text);
}
state = STATE_INNER_TEXT;
break;
@@ -448,7 +485,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
if (tag->_isVoidElement)
{
tag->_outerHtmlEndOffset = off+1;
- saveOuterHtml(tag, text);
+ // saveOuterHtml(tag, text);
}
state = STATE_INNER_TEXT;
break;
@@ -469,7 +506,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
}
break;
case STATE_COMMENT:
- if (cp == GREATER_THAN_SIGN && hyphenCount == 2)
+ if (cp == GREATER_THAN_SIGN && hyphenCount >= 2)
{
state = STATE_INNER_TEXT;
break;
@@ -479,18 +516,46 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis
else
hyphenCount = 0;
break;
+ case STATE_SCRIPT:
+ if (cp == LESS_THAN_SIGN)
+ {
+ state = STATE_SCRIPT_POSSIBLE_END_TAG;
+ break;
+ }
+ break;
+ case STATE_SCRIPT_POSSIBLE_END_TAG:
+ if (cp == SOLIDUS)
+ state = STATE_SCRIPT_END_TAG;
+ else
+ state = STATE_SCRIPT;
+ break;
+ case STATE_SCRIPT_END_TAG:
+ if (cp == GREATER_THAN_SIGN)
+ {
+ struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret);
+ setInnerHtmlEndOffset(closedTag, text, off);
+ // saveOuterHtml(closedTag, text);
+ // saveInnerHtml(closedTag, text);
+ free(endTag);
+ endTag = malloc(sizeof(char));
+ endTag[0] = 0;
+ state = STATE_INNER_TEXT;
+ break;
+ }
+ if (!isASCIIWhitespace(cp))
+ endTag = stringCat(endTag, cpToChars(cp, ret));
+ break;
}
}
}
free(endTag);
+ return tag;
}
void freeTag(struct tag *t)
{
free(t->name);
free(t->innerText);
- free(t->innerHtml);
- free(t->outerHtml);
for (int i=0; i<t->attrsLen; i++)
{
free(t->attrs[i]->name);
@@ -513,15 +578,6 @@ void freeTagList(struct tag_list *t)
free(t);
}
-void printHtml(struct tag *t)
-{
- printf("name: %s\n", t->name);
- for (int i=0; i<t->childrenLen; i++)
- {
- printHtml(t->children[i]);
- }
-}
-
void findTag(struct tag *tag, struct filter_opts *opt, struct tag_list *foundTags)
{
bool matchesTag = false;
@@ -596,17 +652,29 @@ void findTag(struct tag *tag, struct filter_opts *opt, struct tag_list *foundTag
}
}
-void printResult(struct tag_list *foundTags, struct filter_opts *opts)
+/*
+ * Debug helper: print t's name on its own line, preceded by `indent` spaces,
+ * then recurse into t's children from the last index down to the first,
+ * each one level deeper.
+ */
+void printHtml(struct tag *t, int indent)
+{
+    int pad;
+    int remaining;
+
+    for (pad = indent; pad > 0; pad--)
+        putchar(' ');
+    printf("%s\n", t->name);
+
+    /* Walk the children array back to front. */
+    remaining = t->childrenLen;
+    while (remaining > 0)
+    {
+        remaining--;
+        printHtml(t->children[remaining], indent + 1);
+    }
+}
+
+void printResult(char *text, struct filter_opts *opts, struct tag_list *foundTags)
{
- char *trimmedOutput;
+ char *trimmedOutput = NULL;
for (int i=0; i<foundTags->len; i++)
{
if (foundTags->tags[i]->_isVoidElement)
opts->out = OUT_OUTER_HTML;
if (opts->out == OUT_OUTER_HTML)
- trimmedOutput = trim(foundTags->tags[i]->outerHtml);
+ trimmedOutput = trim(getOuterHtml(text, foundTags->tags[i]));
else if (opts->out == OUT_INNER_HTML)
- trimmedOutput = trim(foundTags->tags[i]->innerHtml);
+ trimmedOutput = trim(getInnerHtml(text, foundTags->tags[i]));
printf("%s\n", trimmedOutput);
free(trimmedOutput);
}
@@ -614,20 +682,15 @@ void printResult(struct tag_list *foundTags, struct filter_opts *opts)
void filterHtml(char *text, struct filter_opts *opts)
{
- struct tag *rootTag;
- struct tag_list *tagList = malloc(sizeof(struct tag_list));
- tagList->tags = NULL;
- tagList->len = 0;
- size_t len = parseDOCTYPE(text);
+ struct tag_list *tagList = initTagList();
+ struct tag_list *foundTags = initTagList();
+ size_t len = parseDoctype(text);
if (len)
- rootTag = parseTag(text+len, 0, STATE_INNER_TEXT, tagList);
- else
- rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList);
- struct tag_list *foundTags = malloc(sizeof(struct tag_list));
- foundTags->tags = NULL;
- foundTags->len = 0;
+ text = text + len;
+ struct tag *rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList);
findTag(rootTag, opts, foundTags);
- printResult(foundTags, opts);
+ printResult(text, opts, foundTags);
+ // printHtml(rootTag, 0);
freeTag(rootTag);
freeTagList(tagList);
freeTagList(foundTags);
diff --git a/html.h b/html.h
@@ -44,8 +44,6 @@ struct tag
{
char *name;
char *innerText;
- char *innerHtml;
- char *outerHtml;
struct attr **attrs;
struct tag **children;
size_t attrsLen;
@@ -72,7 +70,10 @@ enum state
STATE_END_TAG_NAME,
STATE_ATTR_NAME,
STATE_ATTR_VALUE,
- STATE_COMMENT
+ STATE_COMMENT,
+ STATE_SCRIPT,
+ STATE_SCRIPT_POSSIBLE_END_TAG,
+ STATE_SCRIPT_END_TAG
};
enum attr_value_syntax
diff --git a/lib.c b/lib.c
@@ -74,5 +74,6 @@ char *trim(char *text)
}
trimmedText = realloc(trimmedText, (k+1) * sizeof(char));
trimmedText[k] = 0;
+ free(text);
return trimmedText;
}
diff --git a/todo b/todo
@@ -0,0 +1 @@
+only save the offsets for inner and outer html; don't store the html strings themselves