htex

simple incorrect html parser
git clone git://git.relim.de/htex.git
Log | Files | Refs | README

html.h (2468B)


      1 #define LESS_THAN_SIGN		0x3C
      2 #define GREATER_THAN_SIGN	0x3E
      3 #define EQUALS_SIGN			0x3D
      4 #define TAB					0x09
      5 #define LF					0x0A
      6 #define FF					0x0C
      7 #define CR					0x0D
      8 #define SPACE				0x20
      9 #define SOLIDUS				0x2F
     10 #define EXCLAMATION_MARK	0x21
     11 #define QUOTATION_MARK		0x22
     12 #define NUMBER_SIGN         0x23
     13 #define AMPERSAND           0x26
     14 #define APOSTROPHE			0x27
     15 #define GRAVE_ACCENT		0x60
     16 #define HYPHEN_MINUS		0x2D
     17 #define SEMICOLON           0x3B
     18 #define SMALL_LETTER_X      0x78
     19 #define CAPITAL_LETTER_X    0x58
     20 
     21 #define NAMED_CHAR_REF_COUNT 2231
     22 #define LONGEST_NAMED_CHAR_REF 32
     23 #define MAX_CODEPOINT_SIZE 4
     24 
     25 struct Attr {
     26 	char *name;
     27 	char *value; // optional
     28 };
     29 
     30 struct Tag {
     31 	char *name;
     32 	struct Attr **attrs;
     33 	struct Tag **children;
     34 	char *inner_text;
     35 	size_t attrs_len;
     36 	size_t children_len;
     37 	bool is_void_element; // means there is no closing tag
     38 	bool is_closed;
     39 	size_t outer_html_begin_offset;
     40     size_t outer_html_end_offset;
     41     size_t inner_html_begin_offset;
     42     size_t inner_html_end_offset;
     43 };
     44 
     45 struct TagList {
     46 	struct Tag **tags;
     47 	size_t len;
     48 };
     49 
     50 struct HTMLDocument {
     51     char *buffer;
     52     struct Tag *tag;
     53     struct TagList *tag_list;
     54 };
     55 
     56 enum State {
     57 	STATE_INNER_TEXT,
     58 	STATE_TAG,
     59 	STATE_BEGIN_TAG_NAME,
     60 	STATE_END_TAG_NAME,
     61 	STATE_ATTR_NAME,
     62 	STATE_ATTR_VALUE,
     63 	STATE_COMMENT,
     64 	STATE_SCRIPT,
     65 	STATE_SCRIPT_POSSIBLE_END_TAG,
     66 	STATE_SCRIPT_END_TAG,
     67 	STATE_STYLE,
     68 	STATE_STYLE_POSSIBLE_END_TAG,
     69 	STATE_STYLE_END_TAG,
     70     STATE_CHAR_REF,
     71     STATE_CHAR_REF_NUMERIC
     72 };
     73 
     74 enum DoctypeState {
     75 	DSTATE_TEXT,
     76 	DSTATE_POSSIBLE_DTYPE,
     77 	DSTATE_DTYPE_OR_COMMENT,
     78 	DSTATE_DTYPE
     79 };
     80 
     81 enum AttrValueSyntax {
     82 	AVS_NO,
     83 	AVS_QUOTATION_MARK,
     84 	AVS_APOSTROPHE,
     85 	AVS_UNQUOTED
     86 };
     87 
     88 enum OutType {
     89 	OUT_INNER_HTML,
     90 	OUT_OUTER_HTML,
     91 	OUT_INNER_TEXT,
     92 	OUT_ATTR_VALUE
     93 };
     94 
     95 struct FindOpts {
     96 	char *tag;
     97 	char *attr;
     98 	char *key;
     99 	enum OutType out;
    100 	bool is_except;
    101 	int limit;
    102 };
    103 
    104 struct FindOpts *find_opts_parse(const char *pattern);
    105 void find_opts_free(struct FindOpts *opts);
    106 enum OutType output_type_parse(const char *type);
    107 
    108 struct HTMLDocument *html_document_parse(char *buffer);
    109 struct TagList *html_document_find(struct HTMLDocument *document, struct FindOpts *opts);
    110 void html_document_print_find_result(struct HTMLDocument *document, struct TagList *found_tags, struct FindOpts *opts);
    111 void html_document_free(struct HTMLDocument *document);
    112 void html_document_debug_print_tree(struct HTMLDocument *document);
    113 
    114 void tag_list_free(struct TagList *tag_list);