htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

commit 468e6772f71842a17b2013db6f92f8ca908d6648
parent 29cc7cac94d871dc422a3e2b42c3138f359bb275
Author: Robin <kroekerrobin@gmail.com>
Date:   Tue,  2 Apr 2024 22:28:00 +0200

Change curly brace style

Diffstat:
M htex.c | 59 ++++++++++++++++++++---------------------------------------
M html.c | 331 ++++++++++++++++++++++++++-----------------------------------------------------
M html.h | 54 ++++++++++++++++++++++--------------------------------
M misc.c | 27 +++++++++------------------
4 files changed, 159 insertions(+), 312 deletions(-)

diff --git a/htex.c b/htex.c @@ -27,8 +27,7 @@ struct find_opts *parseFilterOpts(const char *pattern) int aot = 0; int ak = 0; int av = 0; - switch (pattern[0]) - { + switch (pattern[0]) { case '.': isClassValue = true; i = 1; @@ -38,36 +37,31 @@ struct find_opts *parseFilterOpts(const char *pattern) i = 1; break; } - for (; i<strlen(pattern); i++) - { + for (; i<strlen(pattern); i++) { if (pattern[i] == ']') break; if ( - !isAttrKey && + !isAttrKey && !isAttrOrTag && pattern[i] != ']' && pattern[i] != '"' - ) - { + ) { opt->attr = realloc(opt->attr, (av+1) * sizeof(char)); opt->attr[av] = pattern[i]; av++; } if (pattern[i] == '=') isAttrKey = false; - if (isAttrKey && !isAttrOrTag) - { + if (isAttrKey && !isAttrOrTag) { opt->key = realloc(opt->key, (ak+1) * sizeof(char)); opt->key[ak] = pattern[i]; ak++; } - if (pattern[i] == '[') - { + if (pattern[i] == '[') { isAttrKey = true; isAttrOrTag = false; } - if (isAttrOrTag) - { + if (isAttrOrTag) { attrOrTag = realloc(attrOrTag, (aot+1) * sizeof(char)); attrOrTag[aot] = pattern[i]; aot++; @@ -75,8 +69,7 @@ struct find_opts *parseFilterOpts(const char *pattern) } attrOrTag = realloc(attrOrTag, (aot+1) * sizeof(char)); attrOrTag[aot] = 0; - if (isIdValue) - { + if (isIdValue) { free(opt->key); opt->key = NULL; free(opt->attr); @@ -87,8 +80,7 @@ struct find_opts *parseFilterOpts(const char *pattern) opt->key[1] = 'd'; opt->key[2] = 0; } - else if (isClassValue) - { + else if (isClassValue) { free(opt->key); opt->key = NULL; free(opt->attr); @@ -102,17 +94,14 @@ struct find_opts *parseFilterOpts(const char *pattern) opt->key[4] = 's'; opt->key[5] = 0; } - else - { + else { free(opt->tag); opt->tag = attrOrTag; - if (av > 0) - { + if (av > 0) { opt->attr = realloc(opt->attr, (av+1) * sizeof(char)); opt->attr[av] = 0; } - if (ak > 0) - { + if (ak > 0) { opt->key = realloc(opt->key, (ak+1) * sizeof(char)); opt->key[ak] = 0; } @@ -173,47 +162,39 @@ int main(int argc, char *argv[]) } } enum output_type out = 
parseOutputArg(output); - if (out == -1) - { + if (out == -1) { fprintf(stderr, "Provide a valid output type!\n"); free(output); return -1; } - if (limit == 0) - { + if (limit == 0) { fprintf(stderr, "Provide a valid limit value.\n"); free(output); return -1; } - if (argc == optind) - { + if (argc == optind) { fprintf(stderr, "Provide a search pattern!\n"); return -1; } - if (argc > optind+2) - { + if (argc > optind+2) { fprintf(stderr, "Provide only one file!\n"); return -1; } - if (argc == optind+1) - { + if (argc == optind+1) { searchPattern = argv[argc-1]; text = readFile(stdin); } - else if (argc == optind+2) - { + else if (argc == optind+2) { searchPattern = argv[argc-2]; char *filepath = argv[argc-1]; FILE *fp = fopen(filepath, "r"); - if (fp == NULL) - { + if (fp == NULL) { perror("fopen failed: "); return -1; } text = readFile(fp); fclose(fp); - if (strlen(text) == 0) - { + if (strlen(text) == 0) { fprintf(stderr, "No data in file.\n"); free(output); free(text); diff --git a/html.c b/html.c @@ -2,8 +2,7 @@ const char *stateToString(enum state s) { - switch(s) - { + switch(s) { case STATE_INNER_TEXT: return "STATE_INNER_TEXT"; case STATE_TAG: return "STATE_TAG"; case STATE_BEGIN_TAG_NAME: return "STATE_BEGIN_TAG_NAME"; @@ -104,8 +103,7 @@ static inline bool isASCIIWhitespace(uint_least32_t cp) static inline bool isVoidElement(const char *tagName) { - for (int i=0; i<13; i++) - { + for (int i=0; i<13; i++) { if (strcmp(tagName, voidElements[i]) == 0) return true; } @@ -199,24 +197,18 @@ size_t parseDoctype(const char *text) uint_least32_t cp; size_t len = strlen(text); size_t ret, off; - for (off = 0; off<len; off += ret) - { - if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) - { + for (off = 0; off<len; off += ret) { + if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { printError("Something wrong with ending of text"); } - else - { - switch (state) - { + else { + switch (state) { case DSTATE_TEXT: - if (cp == 
LESS_THAN_SIGN) - { + if (cp == LESS_THAN_SIGN) { state = DSTATE_POSSIBLE_DTYPE; break; } - if (cp == GREATER_THAN_SIGN) - { + if (cp == GREATER_THAN_SIGN) { offset = off; goto CLEANUP; } @@ -228,25 +220,24 @@ size_t parseDoctype(const char *text) goto CLEANUP; break; case DSTATE_DTYPE_OR_COMMENT: - if (cp == HYPHEN_MINUS) + if (cp == HYPHEN_MINUS) { goto CLEANUP; - else - { + } + else { doctype = stringCat(doctype, cpToChars(cp, ret)); state = DSTATE_DTYPE; break; } break; case DSTATE_DTYPE: - if (isASCIIWhitespace(cp)) - { + if (isASCIIWhitespace(cp)) { size_t dlen = strlen(doctype)+1; lowerDoctype = malloc(dlen * sizeof(char)); grapheme_to_lowercase_utf8(doctype, dlen, lowerDoctype, dlen); - if (strcmp(lowerDoctype, "doctype") == 0) + if (strcmp(lowerDoctype, "doctype") == 0) { state = DSTATE_TEXT; - else - { + } + else { offset = -1; goto CLEANUP; } @@ -265,10 +256,8 @@ CLEANUP: struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, size_t endOffset) { - for (int i=tagList->len-1; i>-1; i--) - { - if (strcmp(tagList->tags[i]->name, endTag) == 0 && !tagList->tags[i]->_isClosed) - { + for (int i=tagList->len-1; i>-1; i--) { + if (strcmp(tagList->tags[i]->name, endTag) == 0 && !tagList->tags[i]->_isClosed) { tagList->tags[i]->_isClosed = true; tagList->tags[i]->_outerHtmlEndOffset = endOffset; return tagList->tags[i]; @@ -279,10 +268,8 @@ struct tag *closeLastUnclosedTag(struct tag_list *tagList, const char *endTag, s struct tag *getLastOpenTag(struct tag_list *tagList) { - for (int i=tagList->len-1; i>-1; i--) - { - if (!tagList->tags[i]->_isVoidElement && !tagList->tags[i]->_isClosed) - { + for (int i=tagList->len-1; i>-1; i--) { + if (!tagList->tags[i]->_isVoidElement && !tagList->tags[i]->_isClosed) { return tagList->tags[i]; } } @@ -293,8 +280,7 @@ char *getOuterHtml(char *text, struct tag *t) { char *outerHtml = NULL; int o = 0; - for (int i=t->_outerHtmlBeginOffset; i<t->_outerHtmlEndOffset; i++) - { + for (int 
i=t->_outerHtmlBeginOffset; i<t->_outerHtmlEndOffset; i++) { outerHtml = realloc(outerHtml, (o+1) * sizeof(char)); outerHtml[o] = text[i]; o++; @@ -308,8 +294,7 @@ char *getInnerHtml(char *text, struct tag *t) { char *innerHtml = NULL; int o = 0; - for (int i=t->_innerHtmlBeginOffset; i<t->_innerHtmlEndOffset; i++) - { + for (int i=t->_innerHtmlBeginOffset; i<t->_innerHtmlEndOffset; i++) { innerHtml = realloc(innerHtml, (o+1) * sizeof(char)); innerHtml[o] = text[i]; o++; @@ -337,8 +322,7 @@ enum state endOfBeginTag(struct tag *t, size_t offset) return STATE_SCRIPT; else if (strcmp(t->name, "style") == 0) return STATE_STYLE; - else - return STATE_INNER_TEXT; + return STATE_INNER_TEXT; } char *parseNumericCharRef(char *text, size_t off, int base, size_t *newOffset) @@ -353,8 +337,7 @@ char *parseNumericCharRef(char *text, size_t off, int base, size_t *newOffset) ret = grapheme_decode_utf8(text+off, strlen(text+off), &cp); numericCharRef = stringCat(numericCharRef, cpToChars(cp, ret)); off += ret; - } - while (cp != SEMICOLON); + } while (cp != SEMICOLON); *newOffset = off - oldOffset; long i = strtol(numericCharRef, NULL, base); ret = grapheme_encode_utf8((uint_least32_t)i, character, MAX_CODEPOINT_SIZE); @@ -366,8 +349,7 @@ char *parseNumericCharRef(char *text, size_t off, int base, size_t *newOffset) char *parseNamedCharRef(char *text, size_t off, size_t len, enum attr_value_syntax avs) { uint_least32_t stopAt = 0; - switch(avs) - { + switch(avs) { case AVS_QUOTATION_MARK: stopAt = QUOTATION_MARK; break; @@ -385,8 +367,7 @@ char *parseNamedCharRef(char *text, size_t off, size_t len, enum attr_value_synt size_t ret; uint_least32_t cp; int i = 0; - for (;;) - { + for (;;) { ret = grapheme_decode_utf8(text+off, strlen(text+off), &cp); if (cp == AMPERSAND || isASCIIWhitespace(cp)) break; @@ -407,22 +388,18 @@ char *encodeNamedCharRef(const char *name) char cp[MAX_CODEPOINT_SIZE]; memset(&cp, 0, MAX_CODEPOINT_SIZE); size_t len; - for (int i=0; i<NAMED_CHAR_REF_COUNT; 
i++) - { - if (startsWith(name, entities[i].name)) - { + for (int i=0; i<NAMED_CHAR_REF_COUNT; i++) { + if (startsWith(name, entities[i].name)) { len = grapheme_encode_utf8(entities[i].cp[0], cp, MAX_CODEPOINT_SIZE); strcpy(buf, cp); - if (entities[i].cp[1] != 0) - { + if (entities[i].cp[1] != 0) { len += grapheme_encode_utf8(entities[i].cp[1], cp, MAX_CODEPOINT_SIZE); strcat(buf, cp); } buf[len] = 0; const char *part = &name[strlen(entities[i].name)]; size_t partLen = strlen(part); - if (partLen > 0) - { + if (partLen > 0) { if (partLen == 1 && part[0] == ';') return buf; buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1+partLen); @@ -457,27 +434,20 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis uint_least32_t cp; size_t len = strlen(text); size_t ret, off; - for (off = offset; off<len; off += ret) - { - if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) - { - printError("Something wrong with ending of text"); - } - else - { + for (off = offset; off<len; off += ret) { + if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { + fprintf(stderr, "parseTag.grapheme_decode_utf8 failed.\n"); + } else { // char *the_codepoint = cpToChars(cp, ret); // printf("cp: %02X, %s, %s\n", cp, the_codepoint, stateToString(state)); // free(the_codepoint); - switch (state) - { + switch (state) { case STATE_INNER_TEXT: - if (cp == LESS_THAN_SIGN) - { + if (cp == LESS_THAN_SIGN) { state = STATE_TAG; break; } - if (cp == AMPERSAND) - { + if (cp == AMPERSAND) { returnToState = STATE_INNER_TEXT; state = STATE_CHAR_REF; break; @@ -486,13 +456,11 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis stillOpenTag->innerText = stringCat(stillOpenTag->innerText, cpToChars(cp, ret)); break; case STATE_TAG: - if (cp == SOLIDUS) - { + if (cp == SOLIDUS) { state = STATE_END_TAG_NAME; break; } - if (cp == EXCLAMATION_MARK) - { + if (cp == EXCLAMATION_MARK) { state = STATE_COMMENT; break; } @@ -507,24 +475,20 
@@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis free(endTag); return tag; case STATE_BEGIN_TAG_NAME: - if (cp == GREATER_THAN_SIGN) - { + if (cp == GREATER_THAN_SIGN) { state = endOfBeginTag(tag, off); break; } - if (isASCIIWhitespace(cp)) - { + if (isASCIIWhitespace(cp)) { state = STATE_ATTR_NAME; break; } - if (isASCIIDigit(cp) || isASCIIAlpha(cp)) - { + if (isASCIIDigit(cp) || isASCIIAlpha(cp)) { tag->name = stringCat(tag->name, cpToChars(cp, ret)); } break; case STATE_END_TAG_NAME: - if (cp == GREATER_THAN_SIGN) - { + if (cp == GREATER_THAN_SIGN) { struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); if (closedTag != NULL) setInnerHtmlEndOffset(closedTag, text, off); @@ -538,26 +502,21 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis endTag = stringCat(endTag, cpToChars(cp, ret)); break; case STATE_ATTR_NAME: - if (cp == GREATER_THAN_SIGN) - { + if (cp == GREATER_THAN_SIGN) { state = endOfBeginTag(tag, off); break; } - if (isASCIIWhitespace(cp)) - { + if (isASCIIWhitespace(cp)) { if (attrNameCount == a+1) a++; break; } - if (cp == EQUALS_SIGN) - { + if (cp == EQUALS_SIGN) { state = STATE_ATTR_VALUE; break; } - if (isValidAttrName(cp)) - { - if (attrNameCount != a+1) - { + if (isValidAttrName(cp)) { + if (attrNameCount != a+1) { tag->attrs = realloc( tag->attrs, (a+1) * sizeof(struct attr) @@ -573,20 +532,15 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis } break; case STATE_ATTR_VALUE: - if (isASCIIWhitespace(cp)) - { - if (attrValueSyntax == AVS_UNQUOTED) - { + if (isASCIIWhitespace(cp)) { + if (attrValueSyntax == AVS_UNQUOTED) { attrValueSyntax = AVS_NO; state = STATE_ATTR_NAME; - } - else if (attrValueSyntax == AVS_QUOTATION_MARK || attrValueSyntax == AVS_APOSTROPHE) - { + } else if (attrValueSyntax == AVS_QUOTATION_MARK || attrValueSyntax == AVS_APOSTROPHE) { if ( strcmp("id", tag->attrs[a]->name) == 0 || strcmp("class", 
tag->attrs[a]->name) == 0 - ) - { + ) { char *tmpName = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char)); strcpy(tmpName, tag->attrs[a]->name); tag->attrs = realloc( @@ -599,9 +553,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis tag->attrs[a]->name = tmpName; tag->attrsLen++; attrNameCount = a + 1; - } - else - { + } else { tag->attrs[a]->value = stringCat( tag->attrs[a]->value, cpToChars(cp, ret) @@ -610,50 +562,40 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis } break; } - if (cp == QUOTATION_MARK) - { - if (attrValueSyntax == AVS_NO) - { + if (cp == QUOTATION_MARK) { + if (attrValueSyntax == AVS_NO) { attrValueSyntax = AVS_QUOTATION_MARK; break; } - if (attrValueSyntax == AVS_QUOTATION_MARK) - { + if (attrValueSyntax == AVS_QUOTATION_MARK) { attrValueSyntax = AVS_NO; state = STATE_ATTR_NAME; break; } } - if (cp == APOSTROPHE) - { - if (attrValueSyntax == AVS_NO) - { + if (cp == APOSTROPHE) { + if (attrValueSyntax == AVS_NO) { attrValueSyntax = AVS_APOSTROPHE; break; } - if (attrValueSyntax == AVS_APOSTROPHE) - { + if (attrValueSyntax == AVS_APOSTROPHE) { attrValueSyntax = AVS_NO; state = STATE_ATTR_NAME; break; } } - if (cp == GREATER_THAN_SIGN) - { + if (cp == GREATER_THAN_SIGN) { state = endOfBeginTag(tag, off); break; } if ( attrValueSyntax == AVS_NO && isValidUnquotedAttrValue(cp) - ) - { + ) { attrValueSyntax = AVS_UNQUOTED; } - if (attrValueSyntax > AVS_NO) - { - if (cp == AMPERSAND) - { + if (attrValueSyntax > AVS_NO) { + if (cp == AMPERSAND) { state = STATE_CHAR_REF; returnToState = STATE_ATTR_VALUE; break; @@ -665,8 +607,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis } break; case STATE_COMMENT: - if (cp == GREATER_THAN_SIGN && hyphenCount >= 2) - { + if (cp == GREATER_THAN_SIGN && hyphenCount >= 2) { state = STATE_INNER_TEXT; break; } @@ -676,8 +617,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis 
hyphenCount = 0; break; case STATE_STYLE: - if (cp == LESS_THAN_SIGN) - { + if (cp == LESS_THAN_SIGN) { state = STATE_STYLE_POSSIBLE_END_TAG; break; } @@ -689,8 +629,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis state = STATE_STYLE; break; case STATE_STYLE_END_TAG: - if (cp == GREATER_THAN_SIGN) - { + if (cp == GREATER_THAN_SIGN) { struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); if (closedTag != NULL) setInnerHtmlEndOffset(closedTag, text, off); @@ -704,8 +643,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis endTag = stringCat(endTag, cpToChars(cp, ret)); break; case STATE_SCRIPT: - if (cp == LESS_THAN_SIGN) - { + if (cp == LESS_THAN_SIGN) { state = STATE_SCRIPT_POSSIBLE_END_TAG; break; } @@ -717,8 +655,7 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis state = STATE_SCRIPT; break; case STATE_SCRIPT_END_TAG: - if (cp == GREATER_THAN_SIGN) - { + if (cp == GREATER_THAN_SIGN) { struct tag *closedTag = closeLastUnclosedTag(tagList, endTag, off+ret); if (closedTag != NULL) setInnerHtmlEndOffset(closedTag, text, off); @@ -732,21 +669,17 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis endTag = stringCat(endTag, cpToChars(cp, ret)); break; case STATE_CHAR_REF: - if (cp == NUMBER_SIGN) // hashtag - { + if (cp == NUMBER_SIGN) { /* hashtag */ state = STATE_CHAR_REF_NUMERIC; break; } char *namedCharRef = parseNamedCharRef(text, off, len, attrValueSyntax); off += strlen(namedCharRef)-1; char *encodedNamedCharRef = encodeNamedCharRef(namedCharRef); - if (returnToState == STATE_INNER_TEXT) - { + if (returnToState == STATE_INNER_TEXT) { stillOpenTag = getLastOpenTag(tagList); stillOpenTag->innerText = stringCat(stillOpenTag->innerText, encodedNamedCharRef); - } - else if (returnToState == STATE_ATTR_VALUE) - { + } else if (returnToState == STATE_ATTR_VALUE) { tag->attrs[a]->value = stringCat( 
tag->attrs[a]->value, encodedNamedCharRef @@ -756,18 +689,14 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis state = returnToState; break; case STATE_CHAR_REF_NUMERIC: - if (cp == SMALL_LETTER_X || cp == CAPITAL_LETTER_X) - { + if (cp == SMALL_LETTER_X || cp == CAPITAL_LETTER_X) { size_t newOffset; char *numericCharRef = parseNumericCharRef(text, off+1, 16, &newOffset); off += newOffset; - if (returnToState == STATE_INNER_TEXT) - { + if (returnToState == STATE_INNER_TEXT) { stillOpenTag = getLastOpenTag(tagList); stillOpenTag->innerText = stringCat(stillOpenTag->innerText, numericCharRef); - } - else if (returnToState == STATE_ATTR_VALUE) - { + } else if (returnToState == STATE_ATTR_VALUE) { tag->attrs[a]->value = stringCat( tag->attrs[a]->value, numericCharRef @@ -775,19 +704,14 @@ struct tag *parseTag(char *text, size_t offset, enum state state, struct tag_lis } state = returnToState; break; - } - else if (isASCIIDigit(cp)) - { + } else if (isASCIIDigit(cp)) { size_t newOffset; char *numericCharRef = parseNumericCharRef(text, off, 10, &newOffset); off += newOffset-1; - if (returnToState == STATE_INNER_TEXT) - { + if (returnToState == STATE_INNER_TEXT) { stillOpenTag = getLastOpenTag(tagList); stillOpenTag->innerText = stringCat(stillOpenTag->innerText, numericCharRef); - } - else if (returnToState == STATE_ATTR_VALUE) - { + } else if (returnToState == STATE_ATTR_VALUE) { tag->attrs[a]->value = stringCat( tag->attrs[a]->value, numericCharRef @@ -809,15 +733,13 @@ void freeTag(struct tag *t) { free(t->name); free(t->innerText); - for (int i=0; i<t->attrsLen; i++) - { + for (int i=0; i<t->attrsLen; i++) { free(t->attrs[i]->name); free(t->attrs[i]->value); free(t->attrs[i]); } free(t->attrs); - for (int i=0; i<t->childrenLen; i++) - { + for (int i=0; i<t->childrenLen; i++) { if (t->children[i] != NULL) freeTag(t->children[i]); } @@ -840,69 +762,50 @@ void findTag(struct tag *tag, struct find_opts *opt, struct tag_list *foundTags) 
bool matchesAttrValue = false; if (strcmp(tag->name, opt->tag) == 0) matchesTag = true; - for (int i=0; i<tag->attrsLen; i++) - { + for (int i=0; i<tag->attrsLen; i++) { if (strcmp(tag->attrs[i]->name, opt->key) == 0) matchesAttrKey = true; if (strcmp(tag->attrs[i]->value, opt->attr) == 0) matchesAttrValue = true; } - if (strlen(opt->tag) > 0 && strlen(opt->key) > 0 && strlen(opt->attr) > 0) - { - if (matchesTag && matchesAttrKey && matchesAttrValue) - { + if (strlen(opt->tag) > 0 && strlen(opt->key) > 0 && strlen(opt->attr) > 0) { + if (matchesTag && matchesAttrKey && matchesAttrValue) { foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag)); foundTags->tags[foundTags->len] = tag; foundTags->len++; } - } - else if (strlen(opt->tag) > 0 && strlen(opt->key) > 0) - { - if (matchesTag && matchesAttrKey) - { + } else if (strlen(opt->tag) > 0 && strlen(opt->key) > 0) { + if (matchesTag && matchesAttrKey) { foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag)); foundTags->tags[foundTags->len] = tag; foundTags->len++; } - } - else if (strlen(opt->tag) > 0) - { - if (matchesTag) - { + } else if (strlen(opt->tag) > 0) { + if (matchesTag) { foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag)); foundTags->tags[foundTags->len] = tag; foundTags->len++; } - } - else if (strlen(opt->key) > 0 && strlen(opt->attr) > 0) - { - if (matchesAttrKey && matchesAttrValue) - { + } else if (strlen(opt->key) > 0 && strlen(opt->attr) > 0) { + if (matchesAttrKey && matchesAttrValue) { foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag)); foundTags->tags[foundTags->len] = tag; foundTags->len++; } - } - else if (strlen(opt->key) > 0) - { - if (matchesAttrKey) - { + } else if (strlen(opt->key) > 0) { + if (matchesAttrKey) { foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag)); foundTags->tags[foundTags->len] = tag; foundTags->len++; } - } - else if 
(strlen(opt->attr) > 0) - { - if (matchesAttrValue) - { + } else if (strlen(opt->attr) > 0) { + if (matchesAttrValue) { foundTags->tags = realloc(foundTags->tags, (foundTags->len+1) * sizeof(struct tag)); foundTags->tags[foundTags->len] = tag; foundTags->len++; } } - for (int i=tag->childrenLen-1; i>-1; i--) - { + for (int i=tag->childrenLen-1; i>-1; i--) { findTag(tag->children[i], opt, foundTags); } } @@ -913,15 +816,11 @@ void printHtml(struct tag *t, int indent) putchar(' '); printf("%s", t->name); for (int i=0; i<t->attrsLen; i++) - { printf(" %s=%s", t->attrs[i]->name, t->attrs[i]->value); - } printf("\n"); indent++; for (int i=t->childrenLen-1; i>-1; i--) - { printHtml(t->children[i], indent); - } } void printResult @@ -932,14 +831,11 @@ void printResult struct tag_list *foundTags ) { - if (opts->isExcept) - { + if (opts->isExcept) { bool isMatch = false; - for (int i=0; i<strlen(text); i++) - { + for (int i=0; i<strlen(text); i++) { isMatch = false; - for (int k=0; k<foundTags->len; k++) - { + for (int k=0; k<foundTags->len; k++) { if ( foundTags->tags[k]->_outerHtmlBeginOffset <= i && foundTags->tags[k]->_outerHtmlEndOffset > i @@ -949,15 +845,11 @@ void printResult if (!isMatch) putchar(text[i]); } - } - else - { + } else { char *requestedText = NULL; char *trimmedText = NULL; - for (int i=0; i<foundTags->len; i++) - { - switch (opts->out) - { + for (int i=0; i<foundTags->len; i++) { + switch (opts->out) { case OUT_INNER_HTML: requestedText = getInnerHtml(text, foundTags->tags[i]); trimmedText = trim(requestedText); @@ -972,23 +864,18 @@ void printResult trimmedText = trim(foundTags->tags[i]->innerText); break; case OUT_ATTR_VALUE: - if (strlen(opts->key) > 0 && strlen(opts->tag) > 0) - { - for (int k=0; k<foundTags->tags[i]->attrsLen; k++) - { + if (strlen(opts->key) > 0 && strlen(opts->tag) > 0) { + for (int k=0; k<foundTags->tags[i]->attrsLen; k++) { if (strcmp(foundTags->tags[i]->attrs[k]->name, opts->key) == 0) printf("%s\n", 
foundTags->tags[i]->attrs[k]->value); } - } - else if (strlen(opts->tag) > 0) - { + } else if (strlen(opts->tag) > 0) { for (int k=0; k<foundTags->tags[i]->attrsLen; k++) printf("%s\n", foundTags->tags[i]->attrs[k]->value); } break; } - if (trimmedText) - { + if (trimmedText) { if (strlen(trimmedText) > 0) printf("%s\n", trimmedText); free(trimmedText); @@ -1013,22 +900,20 @@ void filterHtml(char *text, struct find_opts *opts) struct tag_list *tagList = initTagList(); struct tag_list *foundTags = initTagList(); size_t len = parseDoctype(text); - if (len == -1) - { + if (len == -1) { fprintf(stderr, "Error parsing <!DOCTYPE ....\n"); goto CLEAN; - } - else + } else { text += len; + } struct tag *rootTag = parseTag(text, 0, STATE_INNER_TEXT, tagList); - if (!existFindPattern(opts)) - { + if (!existFindPattern(opts)) { foundTags->tags = realloc(foundTags->tags, sizeof(struct tag)); foundTags->tags[0] = rootTag; foundTags->len = 1; - } - else + } else { findTag(rootTag, opts, foundTags); + } printResult(text, rootTag, opts, foundTags); // printHtml(rootTag, -1); freeTag(rootTag); diff --git a/html.h b/html.h @@ -1,21 +1,19 @@ -#define printError(msg) do { fprintf(stderr, "%s: %s\n", __func__, msg); } while (0) - -#define LESS_THAN_SIGN 0x3C -#define GREATER_THAN_SIGN 0x3E -#define EQUALS_SIGN 0x3D -#define TAB 0x09 -#define LF 0x0A -#define FF 0x0C -#define CR 0x0D -#define SPACE 0x20 -#define SOLIDUS 0x2F -#define EXCLAMATION_MARK 0x21 -#define QUOTATION_MARK 0x22 +#define LESS_THAN_SIGN 0x3C +#define GREATER_THAN_SIGN 0x3E +#define EQUALS_SIGN 0x3D +#define TAB 0x09 +#define LF 0x0A +#define FF 0x0C +#define CR 0x0D +#define SPACE 0x20 +#define SOLIDUS 0x2F +#define EXCLAMATION_MARK 0x21 +#define QUOTATION_MARK 0x22 #define NUMBER_SIGN 0x23 #define AMPERSAND 0x26 -#define APOSTROPHE 0x27 -#define GRAVE_ACCENT 0x60 -#define HYPHEN_MINUS 0x2D +#define APOSTROPHE 0x27 +#define GRAVE_ACCENT 0x60 +#define HYPHEN_MINUS 0x2D #define SEMICOLON 0x3B #define SMALL_LETTER_X 
0x78 #define CAPITAL_LETTER_X 0x58 @@ -29,16 +27,14 @@ static const char *voidElements[] = { "input", "link", "meta", "source", "track", "wbr" }; -enum output_type -{ +enum output_type { OUT_INNER_HTML, OUT_OUTER_HTML, OUT_INNER_TEXT, OUT_ATTR_VALUE }; -struct find_opts -{ +struct find_opts { char *tag; char *attr; char *key; @@ -47,14 +43,12 @@ struct find_opts int limit; }; -struct attr -{ +struct attr { char *name; char *value; // optional }; -struct tag -{ +struct tag { char *name; struct attr **attrs; struct tag **children; @@ -69,14 +63,12 @@ struct tag size_t _innerHtmlEndOffset; }; -struct tag_list -{ +struct tag_list { struct tag **tags; size_t len; }; -enum state -{ +enum state { STATE_INNER_TEXT, STATE_TAG, STATE_BEGIN_TAG_NAME, @@ -94,16 +86,14 @@ enum state STATE_CHAR_REF_NUMERIC }; -enum doctype_state -{ +enum doctype_state { DSTATE_TEXT, DSTATE_POSSIBLE_DTYPE, DSTATE_DTYPE_OR_COMMENT, DSTATE_DTYPE }; -enum attr_value_syntax -{ +enum attr_value_syntax { AVS_NO, AVS_QUOTATION_MARK, AVS_APOSTROPHE, diff --git a/misc.c b/misc.c @@ -32,8 +32,7 @@ char *trim(char *text) char *trimmedText = NULL; int begin = 0; int end = 0; - for (int i=0; i<strlen(text); i++) - { + for (int i=0; i<strlen(text); i++) { if ( text[i] == ' ' || text[i] == '\n' || @@ -44,8 +43,7 @@ char *trim(char *text) else break; } - for (int i=strlen(text)-1; i>=0; i--) - { + for (int i=strlen(text)-1; i>=0; i--) { if ( text[i] == ' '|| text[i] == '\n' || @@ -57,10 +55,8 @@ char *trim(char *text) break; } int k = 0; - for (int i=0; i<strlen(text); i++) - { - if (i >= begin && i < strlen(text) - end) - { + for (int i=0; i<strlen(text); i++) { + if (i >= begin && i < strlen(text) - end) { trimmedText = realloc(trimmedText, (k+1) * sizeof(char)); trimmedText[k] = text[i]; k++; @@ -76,12 +72,9 @@ bool startsWith(const char *string, const char *part) size_t partLen = strlen(part); if (partLen > strlen(string)) return false; - for (int i=0; i<partLen; i++) - { + for (int i=0; i<partLen; i++) { if 
(string[i] != part[i]) - { return false; - } } return true; } @@ -104,16 +97,14 @@ char *readFile(FILE *fp) char *text = NULL; int i = 0; char buf; - while (1) - { - if (tryRead(&buf, fp)) - { + while (1) { + if (tryRead(&buf, fp)) { text = realloc(text, (i+1) * sizeof(char)); text[i] = buf; i++; - } - else + } else { break; + } } text = realloc(text, (i+1) * sizeof(char)); text[i] = 0;