html.c (34967B)
1 #include <stdio.h> 2 #include <string.h> 3 #include <stdint.h> 4 #include <stdbool.h> 5 #include <stdlib.h> 6 #include <grapheme.h> 7 #include "html.h" 8 #include "entities.h" 9 #include "misc.h" 10 11 static const char *void_elements[] = { 12 "area", "base", "br", "col", "embed", "hr", "img", 13 "input", "link", "meta", "source", "track", "wbr" 14 }; 15 16 /* Only needed for debugging */ 17 /* static const char *state_to_string(enum State state) 18 { 19 switch(state) { 20 case STATE_INNER_TEXT: return "STATE_INNER_TEXT"; 21 case STATE_TAG: return "STATE_TAG"; 22 case STATE_BEGIN_TAG_NAME: return "STATE_BEGIN_TAG_NAME"; 23 case STATE_END_TAG_NAME: return "STATE_END_TAG_NAME"; 24 case STATE_ATTR_NAME: return "STATE_ATTR_NAME"; 25 case STATE_ATTR_VALUE: return "STATE_ATTR_VALUE"; 26 case STATE_COMMENT: return "STATE_COMMENT"; 27 case STATE_SCRIPT: return "STATE_SCRIPT"; 28 case STATE_SCRIPT_POSSIBLE_END_TAG: return "STATE_SCRIPT_POSSIBLE_END_TAG"; 29 case STATE_SCRIPT_END_TAG: return "STATE_SCRIPT_END_TAG"; 30 case STATE_STYLE: return "STATE_STYLE"; 31 case STATE_STYLE_POSSIBLE_END_TAG: return "STATE_STYLE_POSSIBLE_END_TAG"; 32 case STATE_STYLE_END_TAG: return "STATE_STYLE_END_TAG"; 33 case STATE_CHAR_REF: return "STATE_CHAR_REF"; 34 case STATE_CHAR_REF_NUMERIC: return "STATE_CHAR_REF_NUMERIC"; 35 } 36 return ""; 37 } */ 38 39 static inline bool tag_is_void_element(struct Tag *tag) 40 { 41 for (int i=0; i<13; i++) { 42 if (strcmp(tag->name, void_elements[i]) == 0) 43 return true; 44 } 45 return false; 46 } 47 48 static inline bool is_c0_control(uint_least32_t cp) 49 { 50 if (cp >= 0x00 && cp <= 0x1F) 51 return true; 52 return false; 53 } 54 55 static inline bool is_control(uint_least32_t cp) 56 { 57 if (is_c0_control(cp)) 58 return true; 59 if (cp >= 0x7F && cp <= 0x9F) 60 return true; 61 return false; 62 } 63 64 static inline bool is_non_char(uint_least32_t cp) 65 { 66 if (cp >= 0xFDD0 && cp <= 0xFDEF) 67 return true; 68 if ( 69 cp == 0xFFFE || cp == 0xFFFF || 70 cp == 0x1FFFE || cp == 0x1FFFF || 71 cp == 0x2FFFE || cp == 0x2FFFF || 72 cp == 0x3FFFE || cp == 0x3FFFF || 73 cp == 0x4FFFE || cp == 0x4FFFF || 74 cp == 0x5FFFE || cp == 0x5FFFF || 75 cp == 0x6FFFE || cp == 0x6FFFF || 76 cp == 0x7FFFE || cp == 0x7FFFF || 77 cp == 0x8FFFE || cp == 0x8FFFF || 78 cp == 0x9FFFE || cp == 0x9FFFF || 79 cp == 0xAFFFE || cp == 0xAFFFF || 80 cp == 0xBFFFE || cp == 0xBFFFF || 81 cp == 0xCFFFE || cp == 0xCFFFF || 82 cp == 0xDFFFE || cp == 0xDFFFF || 83 cp == 0xEFFFE || cp == 0xEFFFF || 84 cp == 0xFFFFE || cp == 0xFFFFF || 85 cp == 0x10FFFE || cp == 0x10FFFF 86 ) 87 return true; 88 return false; 89 } 90 91 static inline bool attr_name_char_is_valid(uint_least32_t cp) 92 { 93 if (is_control(cp)) 94 return false; 95 if (is_non_char(cp)) 96 return false; 97 if ( 98 cp == SPACE || 99 cp == QUOTATION_MARK || 100 cp == APOSTROPHE || 101 cp == GREATER_THAN_SIGN || 102 cp == SOLIDUS || 103 cp == EQUALS_SIGN 104 ) 105 return false; 106 return true; 107 } 108 109 static inline bool attr_value_unquoted_char_is_valid(uint_least32_t cp) 110 { 111 /* 112 Not mentioned invalid characters. 113 They are already handled before 114 function call. 115 */ 116 if ( 117 cp == EQUALS_SIGN || 118 cp == LESS_THAN_SIGN || 119 cp == GREATER_THAN_SIGN || 120 cp == GRAVE_ACCENT 121 ) 122 return false; 123 return true; 124 } 125 126 static inline bool ascii_is_digit(uint_least32_t cp) 127 { 128 if (cp >= 0x30 && cp <= 0x39) 129 return true; 130 return false; 131 } 132 133 static inline bool ascii_alpha_is_upper(uint_least32_t cp) 134 { 135 if (cp >= 0x41 && cp <= 0x5A) 136 return true; 137 return false; 138 } 139 140 static inline bool ascii_alpha_is_lower(uint_least32_t cp) 141 { 142 if (cp >= 0x61 && cp <= 0x7A) 143 return true; 144 return false; 145 } 146 147 static inline bool ascii_is_alpha(uint_least32_t cp) 148 { 149 if (ascii_alpha_is_lower(cp) || ascii_alpha_is_upper(cp)) 150 return true; 151 return false; 152 } 153 154 static inline bool ascii_is_whitespace(uint_least32_t cp) 155 { 156 if ( 157 cp == TAB || 158 cp == LF || 159 cp == FF || 160 cp == CR || 161 cp == SPACE 162 ) 163 return true; 164 return false; 165 } 166 167 static bool find_opts_exist(struct FindOpts *opts) 168 { 169 if (opts->tag && strlen(opts->tag) > 0) 170 return true; 171 if (opts->attr && strlen(opts->attr) > 0) 172 return true; 173 if (opts->key && strlen(opts->key) > 0) 174 return true; 175 return false; 176 } 177 178 struct FindOpts *find_opts_parse(const char *pattern) 179 { 180 struct FindOpts *opts = malloc(sizeof(struct FindOpts)); 181 opts->out = OUT_OUTER_HTML; 182 opts->tag = NULL; 183 opts->attr = NULL; 184 opts->key = NULL; 185 bool is_class_value = false; 186 bool is_id_value = false; 187 int i = 0; 188 bool is_attr_key = false; 189 bool is_attr_or_tag = true; 190 char *attr_or_tag = NULL; 191 int aot = 0; 192 int ak = 0; 193 int av = 0; 194 switch (pattern[0]) { 195 case '.': 196 is_class_value = true; 197 i = 1; 198 break; 199 case '#': 200 is_id_value = true; 201 i = 1; 202 break; 203 } 204 for (; i<strlen(pattern); i++) { 205 if (pattern[i] == ']') 206 break; 207 if ( 208 !is_attr_key && 209 !is_attr_or_tag && 210 pattern[i] != ']' && 211 pattern[i] != '"' 212 ) { 213 opts->attr = realloc(opts->attr, (av+1) * sizeof(char)); 214 opts->attr[av] = pattern[i]; 215 av++; 216 } 217 if (pattern[i] == '=') 218 is_attr_key = false; 219 if (is_attr_key && !is_attr_or_tag) { 220 opts->key = realloc(opts->key, (ak+1) * sizeof(char)); 221 opts->key[ak] = pattern[i]; 222 ak++; 223 } 224 if (pattern[i] == '[') { 225 is_attr_key = true; 226 is_attr_or_tag = false; 227 } 228 if (is_attr_or_tag) { 229 attr_or_tag = realloc(attr_or_tag, (aot+1) * sizeof(char)); 230 attr_or_tag[aot] = pattern[i]; 231 aot++; 232 } 233 } 234 attr_or_tag = realloc(attr_or_tag, (aot+1) * sizeof(char)); 235 attr_or_tag[aot] = 0; 236 if (is_id_value) { 237 free(opts->key); 238 opts->key = NULL; 239 free(opts->attr); 240 opts->attr = NULL; 241 opts->attr = attr_or_tag; 242 opts->key = realloc(opts->key, 3 * sizeof(char)); 243 opts->key[0] = 'i'; 244 opts->key[1] = 'd'; 245 opts->key[2] = 0; 246 } else if (is_class_value) { 247 free(opts->key); 248 opts->key = NULL; 249 free(opts->attr); 250 opts->attr = NULL; 251 opts->attr = attr_or_tag; 252 opts->key = realloc(opts->key, 6 * sizeof(char)); 253 opts->key[0] = 'c'; 254 opts->key[1] = 'l'; 255 opts->key[2] = 'a'; 256 opts->key[3] = 's'; 257 opts->key[4] = 's'; 258 opts->key[5] = 0; 259 } else { 260 free(opts->tag); 261 opts->tag = attr_or_tag; 262 if (av > 0) { 263 opts->attr = realloc(opts->attr, (av+1) * sizeof(char)); 264 opts->attr[av] = 0; 265 } 266 if (ak > 0) { 267 opts->key = realloc(opts->key, (ak+1) * sizeof(char)); 268 opts->key[ak] = 0; 269 } 270 } 271 return opts; 272 } 273 274 void find_opts_free(struct FindOpts *opts) 275 { 276 free(opts->tag); 277 free(opts->attr); 278 free(opts->key); 279 free(opts); 280 } 281 282 enum OutType output_type_parse(const char *type) 283 { 284 if (type == NULL) 285 return OUT_OUTER_HTML; 286 if (strcmp(type, "outerhtml") == 0) 287 return OUT_OUTER_HTML; 288 if (strcmp(type, "innerhtml") == 0) 289 return OUT_INNER_HTML; 290 if (strcmp(type, "innertext") == 0) 291 return OUT_INNER_TEXT; 292 if (strcmp(type, "attr_value") == 0) 293 return OUT_ATTR_VALUE; 294 return -1; 295 } 296 297 static struct Attr *attr_init(void) 298 { 299 struct Attr *attr = malloc(sizeof(struct Attr)); 300 attr->name = NULL; 301 attr->value = NULL; 302 return attr; 303 } 304 305 static char *charref_numeric_parse_and_encode(char *text, size_t offset, size_t *new_offset, int base) 306 { 307 size_t old_offset = offset; 308 char *character = malloc(MAX_CODEPOINT_SIZE * sizeof(char)); 309 char *numeric_charref = NULL; 310 size_t ret; 311 uint_least32_t cp; 312 do { 313 ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp); 314 numeric_charref = string_concat(numeric_charref, cp_to_string(cp, ret)); 315 offset += ret; 316 } while (cp != SEMICOLON); 317 *new_offset = offset - old_offset; 318 long i = strtol(numeric_charref, NULL, base); 319 ret = grapheme_encode_utf8((uint_least32_t)i, character, MAX_CODEPOINT_SIZE); 320 character[ret] = 0; 321 free(numeric_charref); 322 return character; 323 } 324 325 static char *charref_named_parse(char *text, size_t offset, size_t len, enum AttrValueSyntax avs) 326 { 327 uint_least32_t stop_at = 0; 328 switch(avs) { 329 case AVS_QUOTATION_MARK: 330 stop_at = QUOTATION_MARK; 331 break; 332 case AVS_APOSTROPHE: 333 stop_at = APOSTROPHE; 334 break; 335 case AVS_UNQUOTED: 336 stop_at = GREATER_THAN_SIGN; 337 break; 338 case AVS_NO: /* Just to silence the compilier warning */ 339 break; 340 } 341 char *named_charref = NULL; 342 size_t ret; 343 uint_least32_t cp; 344 int i = 0; 345 for (;;) { 346 ret = grapheme_decode_utf8(text+offset, strlen(text+offset), &cp); 347 if (cp == AMPERSAND || ascii_is_whitespace(cp)) 348 break; 349 if (avs > AVS_NO && cp == stop_at) 350 break; 351 named_charref = string_concat(named_charref, cp_to_string(cp, ret)); 352 if (cp == SEMICOLON || i>=LONGEST_NAMED_CHAR_REF) 353 break; 354 offset += ret; 355 i++; 356 } 357 return named_charref; 358 } 359 360 static void charref_named_concat_remaining_string(const char *parsed_string, const char *charref, char **buf) 361 { 362 const char *remaining = &parsed_string[strlen(charref)]; 363 size_t remaining_len = strlen(remaining); 364 size_t buf_len = strlen(*buf); 365 if (remaining_len > 0) { 366 if (remaining_len == 1 && remaining[0] == ';') 367 return; 368 *buf = realloc(*buf, buf_len+remaining_len+1); 369 strcat(*buf, remaining); 370 } 371 } 372 373 static char *charref_named_encode(const char *name) 374 { 375 char *buf = NULL; 376 size_t len; 377 int i; 378 if (name) { 379 for (i=0; i<2138; i++) { 380 if (string_starts_with(name, single_cp_entities[i].name)) { 381 buf = realloc(buf, MAX_CODEPOINT_SIZE+1); 382 len = grapheme_encode_utf8(single_cp_entities[i].cp, buf, MAX_CODEPOINT_SIZE); 383 buf[len] = 0; 384 charref_named_concat_remaining_string(name, single_cp_entities[i].name, &buf); 385 return buf; 386 } 387 } 388 for (i=0; i<93; i++) { 389 if (string_starts_with(name, double_cp_entities[i].name)) { 390 size_t buf_len = 0; 391 buf = realloc(buf, 2*MAX_CODEPOINT_SIZE+1); 392 len = grapheme_encode_utf8(double_cp_entities[i].cp[0], buf, MAX_CODEPOINT_SIZE); 393 buf_len += len; 394 buf += len; 395 len = grapheme_encode_utf8(double_cp_entities[i].cp[1], buf, MAX_CODEPOINT_SIZE); 396 buf_len += len; 397 buf[buf_len] = 0; 398 charref_named_concat_remaining_string(name, double_cp_entities[i].name, &buf); 399 return buf; 400 } 401 } 402 buf = realloc(buf, (strlen(name)+2) * sizeof(char)); 403 buf[0] = '&'; 404 buf[1] = 0; 405 strcat(buf, name); 406 return buf; 407 } else { 408 buf = realloc(buf, 2 * sizeof(char)); 409 buf[0] = '&'; 410 buf[1] = 0; 411 return buf; 412 } 413 } 414 415 static struct Tag *tag_init(void) 416 { 417 struct Tag *tag = malloc(sizeof(struct Tag)); 418 tag->name = NULL; 419 tag->inner_text = NULL; 420 tag->attrs = NULL; 421 tag->children = NULL; 422 tag->attrs_len = 0; 423 tag->children_len = 0; 424 tag->is_void_element = false; 425 tag->is_closed = false; 426 tag->outer_html_begin_offset = 0; 427 tag->outer_html_end_offset = 0; 428 tag->inner_html_begin_offset = 0; 429 tag->inner_html_end_offset = 0; 430 return tag; 431 } 432 433 static struct Tag *tag_close_last_unclosed(struct TagList *tag_list, const char *end_tag_name, size_t end_offset) 434 { 435 for (int i=tag_list->len-1; i>-1; i--) { 436 if (strcmp(tag_list->tags[i]->name, end_tag_name) == 0 && !tag_list->tags[i]->is_closed) { 437 tag_list->tags[i]->is_closed = true; 438 tag_list->tags[i]->outer_html_end_offset = end_offset; 439 return tag_list->tags[i]; 440 } 441 } 442 return NULL; 443 } 444 445 static struct Tag *tag_get_last_open(struct TagList *tag_list) 446 { 447 for (int i=tag_list->len-1; i>-1; i--) { 448 if (!tag_list->tags[i]->is_void_element && !tag_list->tags[i]->is_closed) { 449 return tag_list->tags[i]; 450 } 451 } 452 return tag_list->tags[0]; 453 } 454 455 static char *tag_get_outer_html(struct Tag *tag, char *text) 456 { 457 char *outer_html = NULL; 458 int o = 0; 459 for (int i=tag->outer_html_begin_offset; i<tag->outer_html_end_offset; i++) { 460 outer_html = realloc(outer_html, (o+1) * sizeof(char)); 461 outer_html[o] = text[i]; 462 o++; 463 } 464 outer_html = realloc(outer_html, (o+1) * sizeof(char)); 465 outer_html[o] = 0; 466 return outer_html; 467 } 468 469 static char *tag_get_inner_html(struct Tag *tag, char *text) 470 { 471 char *inner_html = NULL; 472 int o = 0; 473 for (int i=tag->inner_html_begin_offset; i<tag->inner_html_end_offset; i++) { 474 inner_html = realloc(inner_html, (o+1) * sizeof(char)); 475 inner_html[o] = text[i]; 476 o++; 477 } 478 inner_html = realloc(inner_html, (o+1) * sizeof(char)); 479 inner_html[o] = 0; 480 return inner_html; 481 } 482 483 static enum State tag_process_end_of_opening_tag(struct Tag *tag, size_t offset) 484 { 485 tag->inner_html_begin_offset = offset+1; 486 tag->is_void_element = tag_is_void_element(tag); 487 if (tag->is_void_element) 488 tag->outer_html_end_offset = offset+1; 489 if (strcmp(tag->name, "script") == 0) 490 return STATE_SCRIPT; 491 else if (strcmp(tag->name, "style") == 0) 492 return STATE_STYLE; 493 return STATE_INNER_TEXT; 494 } 495 496 static void tag_set_inner_html_end_offset(struct Tag *closed_tag, char *text, size_t offset) 497 { 498 int i = offset; 499 while (text[i] != '<') 500 i--; 501 closed_tag->inner_html_end_offset = i; 502 } 503 504 static void tag_free(struct Tag *tag) 505 { 506 free(tag->name); 507 free(tag->inner_text); 508 for (int i=0; i<tag->attrs_len; i++) { 509 free(tag->attrs[i]->name); 510 free(tag->attrs[i]->value); 511 free(tag->attrs[i]); 512 } 513 free(tag->attrs); 514 for (int i=0; i<tag->children_len; i++) { 515 if (tag->children[i] != NULL) 516 tag_free(tag->children[i]); 517 } 518 free(tag->children); 519 free(tag); 520 } 521 522 static void tag_find(struct Tag *tag, struct FindOpts *opts, struct TagList *found_tags) 523 { 524 if (opts->limit > 0 && found_tags->len == opts->limit) 525 return; 526 bool matches_tag = false; 527 bool matches_attr_key = false; 528 bool matches_attr_value = false; 529 if (!string_is_empty(opts->tag)) { 530 if (strcmp(tag->name, opts->tag) == 0) 531 matches_tag = true; 532 } 533 if (!string_is_empty(opts->key)) { 534 for (int i=0; i<tag->attrs_len; i++) { 535 if (strcmp(tag->attrs[i]->name, opts->key) == 0) 536 matches_attr_key = true; 537 } 538 } 539 if (!string_is_empty(opts->attr)) { 540 for (int i=0; i<tag->attrs_len; i++) { 541 if (tag->attrs[i]->value && strcmp(tag->attrs[i]->value, opts->attr) == 0) 542 matches_attr_value = true; 543 } 544 } 545 if (!string_is_empty(opts->tag) && !string_is_empty(opts->key) && !string_is_empty(opts->attr)) { 546 if (matches_tag && matches_attr_key && matches_attr_value) { 547 found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); 548 found_tags->tags[found_tags->len] = tag; 549 found_tags->len++; 550 } 551 } else if (!string_is_empty(opts->tag) && !string_is_empty(opts->key)) { 552 if (matches_tag && matches_attr_key) { 553 found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); 554 found_tags->tags[found_tags->len] = tag; 555 found_tags->len++; 556 } 557 } else if (!string_is_empty(opts->tag)) { 558 if (matches_tag) { 559 found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); 560 found_tags->tags[found_tags->len] = tag; 561 found_tags->len++; 562 } 563 } else if (!string_is_empty(opts->key) && !string_is_empty(opts->attr)) { 564 if (matches_attr_key && matches_attr_value) { 565 found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); 566 found_tags->tags[found_tags->len] = tag; 567 found_tags->len++; 568 } 569 } else if (!string_is_empty(opts->key)) { 570 if (matches_attr_key) { 571 found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); 572 found_tags->tags[found_tags->len] = tag; 573 found_tags->len++; 574 } 575 } else if (!string_is_empty(opts->attr)) { 576 if (matches_attr_value) { 577 found_tags->tags = realloc(found_tags->tags, (found_tags->len+1) * sizeof(struct Tag)); 578 found_tags->tags[found_tags->len] = tag; 579 found_tags->len++; 580 } 581 } 582 for (int i=tag->children_len-1; i>-1; i--) { 583 tag_find(tag->children[i], opts, found_tags); 584 } 585 } 586 587 static int tag_doctype_parse(const char *text) 588 { 589 size_t offset = 0; 590 enum DoctypeState state = DSTATE_TEXT; 591 char *doctype = NULL; 592 char *lower_doctype = NULL; 593 uint_least32_t cp; 594 size_t len = strlen(text); 595 size_t ret, off; 596 for (off = 0; off<len; off += ret) { 597 if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { 598 fprintf(stderr, "htex: parseDoctype.grapheme_decode_utf8 failed.\n"); 599 } else { 600 switch (state) { 601 case DSTATE_TEXT: 602 if (cp == LESS_THAN_SIGN) { 603 state = DSTATE_POSSIBLE_DTYPE; 604 break; 605 } 606 if (cp == GREATER_THAN_SIGN) { 607 offset = off; 608 goto CLEANUP; 609 } 610 break; 611 case DSTATE_POSSIBLE_DTYPE: 612 if (cp == EXCLAMATION_MARK) 613 state = DSTATE_DTYPE_OR_COMMENT; 614 else 615 goto CLEANUP; 616 break; 617 case DSTATE_DTYPE_OR_COMMENT: 618 if (cp == HYPHEN_MINUS) { 619 goto CLEANUP; 620 } else { 621 doctype = string_concat(doctype, cp_to_string(cp, ret)); 622 state = DSTATE_DTYPE; 623 break; 624 } 625 break; 626 case DSTATE_DTYPE: 627 if (ascii_is_whitespace(cp)) { 628 size_t dlen = strlen(doctype)+1; 629 lower_doctype = malloc(dlen * sizeof(char)); 630 grapheme_to_lowercase_utf8(doctype, dlen, lower_doctype, dlen); 631 if (strcmp(lower_doctype, "doctype") == 0) { 632 state = DSTATE_TEXT; 633 } else { 634 offset = -1; 635 goto CLEANUP; 636 } 637 break; 638 } 639 doctype = string_concat(doctype, cp_to_string(cp, ret)); 640 break; 641 } 642 } 643 } 644 CLEANUP: 645 free(doctype); 646 free(lower_doctype); 647 return offset; 648 } 649 650 static struct Tag *tag_parse(struct TagList *tag_list, char *text, size_t offset, enum State state) 651 { 652 struct Tag *tag = tag_init(); 653 tag->outer_html_begin_offset= offset-1; 654 tag_list->tags = realloc(tag_list->tags, (tag_list->len+1) * sizeof(struct Tag)); 655 tag_list->tags[tag_list->len] = tag; 656 tag_list->len++; 657 struct Tag *still_open_tag = tag; 658 char *end_tag = NULL; 659 enum State return_to_state = STATE_INNER_TEXT; 660 size_t a = 0; 661 size_t attr_name_count = 0; 662 enum AttrValueSyntax avs = AVS_NO; 663 size_t hyphen_count = 0; 664 uint_least32_t cp; 665 size_t len = strlen(text); 666 size_t ret, off; 667 for (off = offset; off<len; off += ret) { 668 if ((ret = grapheme_decode_utf8(text+off, len-off, &cp)) > len-off) { 669 fprintf(stderr, "htex: parseTag.grapheme_decode_utf8 failed.\n"); 670 } else { 671 /* char *the_codepoint = cp_to_string(cp, ret); 672 printf("cp: %02X, %s, %s\n", cp, the_codepoint, state_to_string(state)); 673 free(the_codepoint); */ 674 switch (state) { 675 case STATE_INNER_TEXT: 676 if (cp == LESS_THAN_SIGN) { 677 state = STATE_TAG; 678 break; 679 } 680 if (cp == AMPERSAND) { 681 return_to_state = STATE_INNER_TEXT; 682 state = STATE_CHAR_REF; 683 break; 684 } 685 still_open_tag = tag_get_last_open(tag_list); 686 still_open_tag->inner_text = string_concat(still_open_tag->inner_text, cp_to_string(cp, ret)); 687 break; 688 case STATE_TAG: 689 if (cp == SOLIDUS) { 690 state = STATE_END_TAG_NAME; 691 break; 692 } 693 if (cp == EXCLAMATION_MARK) { 694 state = STATE_COMMENT; 695 break; 696 } 697 still_open_tag = tag_get_last_open(tag_list); 698 struct Tag *one_tag = tag_parse(tag_list, text, off, STATE_BEGIN_TAG_NAME); 699 still_open_tag->children = realloc( 700 still_open_tag->children, 701 (still_open_tag->children_len+1) * sizeof(struct Tag) 702 ); 703 still_open_tag->children[still_open_tag->children_len] = one_tag; 704 still_open_tag->children_len++; 705 free(end_tag); 706 return tag; 707 case STATE_BEGIN_TAG_NAME: 708 if (cp == GREATER_THAN_SIGN) { 709 state = tag_process_end_of_opening_tag(tag, off); 710 break; 711 } 712 if (ascii_is_whitespace(cp)) { 713 state = STATE_ATTR_NAME; 714 break; 715 } 716 if (ascii_is_digit(cp) || ascii_is_alpha(cp)) { 717 tag->name = string_concat(tag->name, cp_to_string(cp, ret)); 718 } 719 break; 720 case STATE_END_TAG_NAME: 721 if (cp == GREATER_THAN_SIGN) { 722 struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); 723 if (closed_tag != NULL) 724 tag_set_inner_html_end_offset(closed_tag, text, off); 725 free(end_tag); 726 end_tag = NULL; 727 state = STATE_INNER_TEXT; 728 break; 729 } 730 if (!ascii_is_whitespace(cp)) 731 end_tag = string_concat(end_tag, cp_to_string(cp, ret)); 732 break; 733 case STATE_ATTR_NAME: 734 if (cp == GREATER_THAN_SIGN) { 735 state = tag_process_end_of_opening_tag(tag, off); 736 break; 737 } 738 if (ascii_is_whitespace(cp)) { 739 if (attr_name_count == a+1) 740 a++; 741 break; 742 } 743 if (cp == EQUALS_SIGN) { 744 state = STATE_ATTR_VALUE; 745 break; 746 } 747 if (attr_name_char_is_valid(cp)) { 748 if (attr_name_count != a+1) { 749 tag->attrs = realloc(tag->attrs, (a+1) * sizeof(struct Attr)); 750 tag->attrs[a] = attr_init(); 751 attr_name_count = a + 1; 752 tag->attrs_len = attr_name_count; 753 } 754 tag->attrs[a]->name = string_concat(tag->attrs[a]->name, cp_to_string(cp, ret)); 755 } 756 break; 757 case STATE_ATTR_VALUE: 758 if (ascii_is_whitespace(cp)) { 759 if (avs == AVS_UNQUOTED) { 760 avs = AVS_NO; 761 state = STATE_ATTR_NAME; 762 } else if (avs == AVS_QUOTATION_MARK || avs == AVS_APOSTROPHE) { 763 if ( 764 strcmp("id", tag->attrs[a]->name) == 0 || 765 strcmp("class", tag->attrs[a]->name) == 0 766 ) { 767 char *tmp_name = malloc((strlen(tag->attrs[a]->name)+1) * sizeof(char)); 768 strcpy(tmp_name, tag->attrs[a]->name); 769 tag->attrs = realloc( 770 tag->attrs, 771 (a+1) * sizeof(struct Attr) 772 ); 773 a++; 774 tag->attrs[a] = attr_init(); 775 free(tag->attrs[a]->name); 776 tag->attrs[a]->name = tmp_name; 777 tag->attrs_len++; 778 attr_name_count = a + 1; 779 } else { 780 tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret)); 781 } 782 } 783 break; 784 } 785 if (cp == QUOTATION_MARK) { 786 if (avs == AVS_NO) { 787 avs = AVS_QUOTATION_MARK; 788 break; 789 } 790 if (avs == AVS_QUOTATION_MARK) { 791 avs = AVS_NO; 792 state = STATE_ATTR_NAME; 793 break; 794 } 795 } 796 if (cp == APOSTROPHE) { 797 if (avs == AVS_NO) { 798 avs = AVS_APOSTROPHE; 799 break; 800 } 801 if (avs == AVS_APOSTROPHE) { 802 avs = AVS_NO; 803 state = STATE_ATTR_NAME; 804 break; 805 } 806 } 807 if (cp == GREATER_THAN_SIGN) { 808 state = tag_process_end_of_opening_tag(tag, off); 809 break; 810 } 811 if (avs == AVS_NO && attr_value_unquoted_char_is_valid(cp)) { 812 avs = AVS_UNQUOTED; 813 } 814 if (avs > AVS_NO) { 815 if (cp == AMPERSAND) { 816 state = STATE_CHAR_REF; 817 return_to_state = STATE_ATTR_VALUE; 818 break; 819 } 820 tag->attrs[a]->value = string_concat(tag->attrs[a]->value, cp_to_string(cp, ret)); 821 } 822 break; 823 case STATE_COMMENT: 824 if (cp == GREATER_THAN_SIGN && hyphen_count >= 2) { 825 state = STATE_INNER_TEXT; 826 break; 827 } 828 if (cp == HYPHEN_MINUS) 829 hyphen_count++; 830 else 831 hyphen_count = 0; 832 break; 833 case STATE_STYLE: 834 if (cp == LESS_THAN_SIGN) { 835 state = STATE_STYLE_POSSIBLE_END_TAG; 836 break; 837 } 838 break; 839 case STATE_STYLE_POSSIBLE_END_TAG: 840 if (cp == SOLIDUS) 841 state = STATE_STYLE_END_TAG; 842 else 843 state = STATE_STYLE; 844 break; 845 case STATE_STYLE_END_TAG: 846 if (cp == GREATER_THAN_SIGN) { 847 struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); 848 if (closed_tag != NULL) 849 tag_set_inner_html_end_offset(closed_tag, text, off); 850 free(end_tag); 851 end_tag = NULL; 852 state = STATE_INNER_TEXT; 853 break; 854 } 855 if (!ascii_is_whitespace(cp)) 856 end_tag = string_concat(end_tag, cp_to_string(cp, ret)); 857 break; 858 case STATE_SCRIPT: 859 if (cp == LESS_THAN_SIGN) { 860 state = STATE_SCRIPT_POSSIBLE_END_TAG; 861 break; 862 } 863 break; 864 case STATE_SCRIPT_POSSIBLE_END_TAG: 865 if (cp == SOLIDUS) 866 state = STATE_SCRIPT_END_TAG; 867 else 868 state = STATE_SCRIPT; 869 break; 870 case STATE_SCRIPT_END_TAG: 871 if (cp == GREATER_THAN_SIGN) { 872 struct Tag *closed_tag = tag_close_last_unclosed(tag_list, end_tag, off+ret); 873 if (closed_tag != NULL) 874 tag_set_inner_html_end_offset(closed_tag, text, off); 875 free(end_tag); 876 end_tag = NULL; 877 state = STATE_INNER_TEXT; 878 break; 879 } 880 if (!ascii_is_whitespace(cp)) 881 end_tag = string_concat(end_tag, cp_to_string(cp, ret)); 882 break; 883 case STATE_CHAR_REF: 884 if (cp == NUMBER_SIGN) { /* hashtag */ 885 state = STATE_CHAR_REF_NUMERIC; 886 break; 887 } 888 char *named_charref = charref_named_parse(text, off, len, avs); 889 if (named_charref) { 890 off += strlen(named_charref)-1; 891 } else { 892 off--; 893 } 894 char *encoded_named_charref = charref_named_encode(named_charref); 895 if (return_to_state == STATE_INNER_TEXT) { 896 still_open_tag = tag_get_last_open(tag_list); 897 still_open_tag->inner_text = string_concat(still_open_tag->inner_text, encoded_named_charref); 898 } else if (return_to_state == STATE_ATTR_VALUE) { 899 tag->attrs[a]->value = string_concat(tag->attrs[a]->value, encoded_named_charref); 900 } 901 free(named_charref); 902 state = return_to_state; 903 break; 904 case STATE_CHAR_REF_NUMERIC: 905 if (cp == SMALL_LETTER_X || cp == CAPITAL_LETTER_X) { 906 size_t new_offset; 907 char *numeric_charref = charref_numeric_parse_and_encode(text, off+1, &new_offset, 16); 908 off += new_offset; 909 if (return_to_state == STATE_INNER_TEXT) { 910 still_open_tag = tag_get_last_open(tag_list); 911 still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref); 912 } else if (return_to_state == STATE_ATTR_VALUE) { 913 tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref); 914 } 915 state = return_to_state; 916 break; 917 } else if (ascii_is_digit(cp)) { 918 size_t new_offset; 919 char *numeric_charref = charref_numeric_parse_and_encode(text, off, &new_offset, 10); 920 off += new_offset-1; 921 if (return_to_state == STATE_INNER_TEXT) { 922 still_open_tag = tag_get_last_open(tag_list); 923 still_open_tag->inner_text = string_concat(still_open_tag->inner_text, numeric_charref); 924 } else if (return_to_state == STATE_ATTR_VALUE) { 925 tag->attrs[a]->value = string_concat(tag->attrs[a]->value, numeric_charref); 926 } 927 state = return_to_state; 928 break; 929 } 930 state = return_to_state; 931 break; 932 } 933 } 934 } 935 free(end_tag); 936 return tag; 937 } 938 939 static void tag_debug_print(struct Tag *tag, int indent) 940 { 941 for (int i=0; i<indent; i++) 942 putchar(' '); 943 printf("%s", tag->name); 944 for (int i=0; i<tag->attrs_len; i++) 945 printf(" %s=%s", tag->attrs[i]->name, tag->attrs[i]->value); 946 printf("\n"); 947 indent++; 948 for (int i=tag->children_len-1; i>-1; i--) 949 tag_debug_print(tag->children[i], indent); 950 } 951 952 static struct TagList *tag_list_init(void) 953 { 954 struct TagList *tag_list = malloc(sizeof(struct TagList)); 955 tag_list->tags = NULL; 956 tag_list->len = 0; 957 return tag_list; 958 } 959 960 void tag_list_free(struct TagList *tag_list) 961 { 962 free(tag_list->tags); 963 free(tag_list); 964 } 965 966 static struct HTMLDocument *html_document_init(void) 967 { 968 struct HTMLDocument *document = malloc(sizeof(struct HTMLDocument)); 969 document->buffer = NULL; 970 document->tag = NULL; 971 document->tag_list = NULL; 972 return document; 973 } 974 975 void html_document_free(struct HTMLDocument *document) 976 { 977 // free(doc->buffer); 978 tag_free(document->tag); 979 tag_list_free(document->tag_list); 980 free(document); 981 } 982 983 struct HTMLDocument *html_document_parse(char *buffer) 984 { 985 struct HTMLDocument *document = html_document_init(); 986 document->buffer = buffer; 987 document->tag_list = tag_list_init(); 988 int len = tag_doctype_parse(document->buffer); 989 if (len == -1) { 990 fprintf(stderr, "htex: Error parsing <!DOCTYPE ....\n"); 991 html_document_free(document); 992 return NULL; 993 } else { 994 document->buffer += len; 995 } 996 document->tag = tag_parse(document->tag_list, document->buffer, 0, STATE_INNER_TEXT); 997 document->tag->name = malloc(sizeof(char)); 998 document->tag->name[0] = 0; 999 return document; 1000 } 1001 1002 struct TagList *html_document_find(struct HTMLDocument *document, struct FindOpts *opts) 1003 { 1004 struct TagList *found_tags = tag_list_init(); 1005 if (!find_opts_exist(opts)) { 1006 found_tags->tags = realloc(found_tags->tags, sizeof(struct Tag)); 1007 found_tags->tags[0] = document->tag->children[0]; 1008 found_tags->len = 1; 1009 } else { 1010 tag_find(document->tag, opts, found_tags); 1011 } 1012 return found_tags; 1013 } 1014 1015 void html_document_print_find_result(struct HTMLDocument *document, struct TagList *found_tags, struct FindOpts *opts) 1016 { 1017 if (opts->is_except) { 1018 bool is_match = false; 1019 for (int i=0; i<strlen(document->buffer); i++) { 1020 is_match = false; 1021 for (int k=0; k<found_tags->len; k++) { 1022 if ( 1023 found_tags->tags[k]->outer_html_begin_offset <= i && 1024 found_tags->tags[k]->outer_html_end_offset > i 1025 ) 1026 is_match = true; 1027 } 1028 if (!is_match) 1029 putchar(document->buffer[i]); 1030 } 1031 } else { 1032 char *requested_text = NULL; 1033 char *trimmed_text = NULL; 1034 for (int i=0; i<found_tags->len; i++) { 1035 switch (opts->out) { 1036 case OUT_INNER_HTML: 1037 requested_text = tag_get_inner_html(found_tags->tags[i], document->buffer); 1038 trimmed_text = string_trim(requested_text); 1039 free(requested_text); 1040 break; 1041 case OUT_OUTER_HTML: 1042 requested_text = tag_get_outer_html(found_tags->tags[i], document->buffer); 1043 trimmed_text = string_trim(requested_text); 1044 free(requested_text); 1045 break; 1046 case OUT_INNER_TEXT: 1047 trimmed_text = string_trim(found_tags->tags[i]->inner_text); 1048 break; 1049 case OUT_ATTR_VALUE: 1050 if (strlen(opts->key) > 0 && strlen(opts->tag) > 0) { 1051 for (int k=0; k<found_tags->tags[i]->attrs_len; k++) { 1052 if (strcmp(found_tags->tags[i]->attrs[k]->name, opts->key) == 0) 1053 printf("%s\n", found_tags->tags[i]->attrs[k]->value); 1054 } 1055 } else if (strlen(opts->tag) > 0) { 1056 for (int k=0; k<found_tags->tags[i]->attrs_len; k++) 1057 printf("%s\n", found_tags->tags[i]->attrs[k]->value); 1058 } 1059 break; 1060 } 1061 if (trimmed_text) { 1062 if (strlen(trimmed_text) > 0) 1063 printf("%s\n", trimmed_text); 1064 free(trimmed_text); 1065 } 1066 } 1067 } 1068 } 1069 1070 void html_document_debug_print_tree(struct HTMLDocument *document) 1071 { 1072 tag_debug_print(document->tag, -1); 1073 }