html.h (2468B)
/*
 * html.h - public declarations for the HTML document parser/query code:
 * character constants, the tag tree data structures, parser state enums,
 * find options, and the function prototypes operating on them.
 */
#ifndef HTML_H
#define HTML_H

#include <stdbool.h> /* bool */
#include <stddef.h>  /* size_t */

/* ASCII code points of the characters that are significant in HTML syntax. */
#define LESS_THAN_SIGN 0x3C    /* '<' */
#define GREATER_THAN_SIGN 0x3E /* '>' */
#define EQUALS_SIGN 0x3D       /* '=' */
#define TAB 0x09               /* horizontal tab */
#define LF 0x0A                /* line feed */
#define FF 0x0C                /* form feed */
#define CR 0x0D                /* carriage return */
#define SPACE 0x20             /* ' ' */
#define SOLIDUS 0x2F           /* '/' */
#define EXCLAMATION_MARK 0x21  /* '!' */
#define QUOTATION_MARK 0x22    /* '"' */
#define NUMBER_SIGN 0x23       /* '#' */
#define AMPERSAND 0x26         /* '&' */
#define APOSTROPHE 0x27        /* '\'' */
#define GRAVE_ACCENT 0x60      /* '`' */
#define HYPHEN_MINUS 0x2D      /* '-' */
#define SEMICOLON 0x3B         /* ';' */
#define SMALL_LETTER_X 0x78    /* 'x' */
#define CAPITAL_LETTER_X 0x58  /* 'X' */

/*
 * Limits used for character reference handling.
 * NOTE(review): 2231 and 32 match the size of, and the longest name in, the
 * WHATWG named character reference table, and 4 matches the maximum length
 * of a UTF-8 encoded code point - confirm against the implementation.
 */
#define NAMED_CHAR_REF_COUNT 2231
#define LONGEST_NAMED_CHAR_REF 32
#define MAX_CODEPOINT_SIZE 4

/* A single attribute of a tag. */
struct Attr {
	char *name;
	char *value; // optional
};

/*
 * One element of the document tree, with its attributes, child tags, text
 * and the offsets delimiting its outer/inner HTML.
 * NOTE(review): the offsets are presumably byte positions within
 * HTMLDocument.buffer - confirm against the parser.
 */
struct Tag {
	char *name;
	struct Attr **attrs;   /* attrs_len entries */
	struct Tag **children; /* children_len entries */
	char *inner_text;
	size_t attrs_len;
	size_t children_len;
	bool is_void_element; // means there is no closing tag
	bool is_closed;
	size_t outer_html_begin_offset;
	size_t outer_html_end_offset;
	size_t inner_html_begin_offset;
	size_t inner_html_end_offset;
};

/* An array of tag pointers together with its element count. */
struct TagList {
	struct Tag **tags;
	size_t len;
};

/*
 * A parsed document: the text buffer, a tag (presumably the tree root -
 * TODO confirm) and a tag list.
 */
struct HTMLDocument {
	char *buffer;
	struct Tag *tag;
	struct TagList *tag_list;
};

/*
 * Parser states.
 * NOTE(review): judging by the names these are the states of the main
 * parsing state machine - confirm in the implementation file.
 */
enum State {
	STATE_INNER_TEXT,
	STATE_TAG,
	STATE_BEGIN_TAG_NAME,
	STATE_END_TAG_NAME,
	STATE_ATTR_NAME,
	STATE_ATTR_VALUE,
	STATE_COMMENT,
	STATE_SCRIPT,
	STATE_SCRIPT_POSSIBLE_END_TAG,
	STATE_SCRIPT_END_TAG,
	STATE_STYLE,
	STATE_STYLE_POSSIBLE_END_TAG,
	STATE_STYLE_END_TAG,
	STATE_CHAR_REF,
	STATE_CHAR_REF_NUMERIC
};

/* States related to doctype detection (names suggest "<!DOCTYPE" vs. comment). */
enum DoctypeState {
	DSTATE_TEXT,
	DSTATE_POSSIBLE_DTYPE,
	DSTATE_DTYPE_OR_COMMENT,
	DSTATE_DTYPE
};

/* How an attribute value is written: none, "double-quoted", 'single-quoted' or unquoted. */
enum AttrValueSyntax {
	AVS_NO,
	AVS_QUOTATION_MARK,
	AVS_APOSTROPHE,
	AVS_UNQUOTED
};

/* Which part of a found tag is selected for output. */
enum OutType {
	OUT_INNER_HTML,
	OUT_OUTER_HTML,
	OUT_INNER_TEXT,
	OUT_ATTR_VALUE
};

/*
 * Options describing a find query.
 * NOTE(review): the exact meaning of tag/attr/key, is_except and limit is
 * defined by find_opts_parse() and html_document_find() - see their
 * implementation before relying on it.
 */
struct FindOpts {
	char *tag;
	char *attr;
	char *key;
	enum OutType out;
	bool is_except;
	int limit;
};

/* Build a FindOpts from a textual pattern. */
struct FindOpts *find_opts_parse(const char *pattern);
/* Release a FindOpts (presumably one obtained from find_opts_parse - confirm). */
void find_opts_free(struct FindOpts *opts);
/* Convert a textual output type name into an enum OutType. */
enum OutType output_type_parse(const char *type);

/* Parse the HTML text in buffer into an HTMLDocument. */
struct HTMLDocument *html_document_parse(char *buffer);
/* Search document according to opts and return the matching tags. */
struct TagList *html_document_find(struct HTMLDocument *document, struct FindOpts *opts);
/* Print the tags in found_tags as selected by opts. */
void html_document_print_find_result(struct HTMLDocument *document, struct TagList *found_tags, struct FindOpts *opts);
/* Release an HTMLDocument. */
void html_document_free(struct HTMLDocument *document);
/* Debug helper: print the document's tag tree. */
void html_document_debug_print_tree(struct HTMLDocument *document);

/* Release a TagList. */
void tag_list_free(struct TagList *tag_list);

#endif /* HTML_H */