1 /* $Id: hparser.c,v 2.134 2007/01/12 10:54:06 gisle Exp $
3 * Copyright 1999-2007, Gisle Aas
4 * Copyright 1999-2000, Michael A. Chase
6 * This library is free software; you can redistribute it and/or
7 * modify it under the same terms as Perl itself.
14 #include "hctype.h" /* isH...() macros */
15 #include "tokenpos.h" /* dTOKEN; PUSH_TOKEN() */
36 ARG_SELF = 1, /* need to avoid '\0' in argspec string */
56 ARG_LITERAL, /* Always keep last */
58 /* extra flags always encoded first */
63 /* Must be in the same order as enum argcode */
64 "self", /* ARG_SELF */
65 "tokens", /* ARG_TOKENS */
66 "tokenpos", /* ARG_TOKENPOS */
67 "token0", /* ARG_TOKEN0 */
68 "tagname", /* ARG_TAGNAME */
70 "attr", /* ARG_ATTR */
71 "@attr", /* ARG_ATTRARR */
72 "attrseq", /* ARG_ATTRSEQ */
73 "text", /* ARG_TEXT */
74 "dtext", /* ARG_DTEXT */
75 "is_cdata", /* ARG_IS_CDATA */
76 "skipped_text", /* ARG_SKIPPED_TEXT */
77 "offset", /* ARG_OFFSET */
78 "offset_end", /* ARG_OFFSET_END */
79 "length", /* ARG_LENGTH */
80 "line", /* ARG_LINE */
81 "column", /* ARG_COLUMN */
82 "event", /* ARG_EVENT */
83 "undef", /* ARG_UNDEF */
84 /* ARG_LITERAL (not compared) */
85 /* ARG_FLAG_FLAT_ARRAY */
88 #define CASE_SENSITIVE(p_state) \
89 ((p_state)->xml_mode || (p_state)->case_sensitive)
90 #define STRICT_NAMES(p_state) \
91 ((p_state)->xml_mode || (p_state)->strict_names)
92 #define ALLOW_EMPTY_TAG(p_state) \
93 ((p_state)->xml_mode || (p_state)->empty_element_tags)
95 static void flush_pending_text(PSTATE* p_state, SV* self);
100 * parse() - top level entry point.
101 * deals with text and calls one of its
102 * subordinate parse_*() routines after
103 * looking at the first char after "<"
104 * parse_decl() - deals with declarations <!...>
105 * parse_comment() - deals with <!-- ... -->
106 * parse_marked_section - deals with <![ ... [ ... ]]>
107 * parse_end() - deals with end tags </...>
108 * parse_start() - deals with start tags <A...>
109 * parse_process() - deals with processing instructions <?...>
110 * parse_null() - deals with anything else <....>
112 * report_event() - called whenever any of the parse*() routines
113 * has recognized something.
117 report_event(PSTATE* p_state,
119 char *beg, char *end, U32 utf8,
120 token_pos_t *tokens, int num_tokens,
135 #ifdef UNICODE_HTML_PARSER
136 #define CHR_DIST(a,b) (utf8 ? utf8_distance((U8*)(a),(U8*)(b)) : (a) - (b))
138 #define CHR_DIST(a,b) ((a) - (b))
141 /* some events might still fire after a handler has signaled eof
142 * so suppress them here.
147 /* capture offsets */
148 offset = p_state->offset;
149 line = p_state->line;
150 column = p_state->column;
153 { /* used for debugging at some point */
157 /* print debug output */
159 case E_DECLARATION: printf("DECLARATION"); break;
160 case E_COMMENT: printf("COMMENT"); break;
161 case E_START: printf("START"); break;
162 case E_END: printf("END"); break;
163 case E_TEXT: printf("TEXT"); break;
164 case E_PROCESS: printf("PROCESS"); break;
165 case E_NONE: printf("NONE"); break;
166 default: printf("EVENT #%d", event); break;
172 putchar('\\'); putchar('n');
178 printf("] %d\n", end - beg);
179 for (i = 0; i < num_tokens; i++) {
180 printf(" token %d: %d %d\n",
183 tokens[i].end - tokens[i].beg);
188 if (p_state->pending_end_tag && event != E_TEXT && event != E_COMMENT) {
191 t.beg = p_state->pending_end_tag;
192 t.end = p_state->pending_end_tag + strlen(p_state->pending_end_tag);
193 p_state->pending_end_tag = 0;
194 report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self);
199 p_state->offset += CHR_DIST(end, beg);
211 p_state->column = CHR_DIST(end, nl) - 1;
213 p_state->column += CHR_DIST(end, beg);
219 #ifdef MARKED_SECTION
220 if (p_state->ms == MS_IGNORE)
225 if (p_state->ignore_tags || p_state->report_tags || p_state->ignore_elements) {
227 if (event == E_START || event == E_END) {
228 SV* tagname = p_state->tmp;
230 assert(num_tokens >= 1);
231 sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg);
236 if (!CASE_SENSITIVE(p_state))
237 sv_lower(aTHX_ tagname);
239 if (p_state->ignoring_element) {
240 if (sv_eq(p_state->ignoring_element, tagname)) {
241 if (event == E_START)
242 p_state->ignore_depth++;
243 else if (--p_state->ignore_depth == 0) {
244 SvREFCNT_dec(p_state->ignoring_element);
245 p_state->ignoring_element = 0;
251 if (p_state->ignore_elements &&
252 hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0))
254 if (event == E_START) {
255 p_state->ignoring_element = newSVsv(tagname);
256 p_state->ignore_depth = 1;
261 if (p_state->ignore_tags &&
262 hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0))
266 if (p_state->report_tags &&
267 !hv_fetch_ent(p_state->report_tags, tagname, 0, 0))
272 else if (p_state->ignoring_element) {
277 h = &p_state->handlers[event];
279 /* event = E_DEFAULT; */
280 h = &p_state->handlers[E_DEFAULT];
285 if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) {
286 /* FALSE scalar ('' or 0) means IGNORE this event */
290 if (p_state->unbroken_text && event == E_TEXT) {
291 /* should buffer text */
292 if (!p_state->pend_text)
293 p_state->pend_text = newSV(256);
294 if (SvOK(p_state->pend_text)) {
295 if (p_state->is_cdata != p_state->pend_text_is_cdata) {
296 flush_pending_text(p_state, self);
303 p_state->pend_text_offset = offset;
304 p_state->pend_text_line = line;
305 p_state->pend_text_column = column;
306 p_state->pend_text_is_cdata = p_state->is_cdata;
307 sv_setpvn(p_state->pend_text, "", 0);
309 SvUTF8_off(p_state->pend_text);
311 #ifdef UNICODE_HTML_PARSER
312 if (utf8 && !SvUTF8(p_state->pend_text))
313 sv_utf8_upgrade(p_state->pend_text);
314 if (utf8 || !SvUTF8(p_state->pend_text)) {
315 sv_catpvn(p_state->pend_text, beg, end - beg);
318 SV *tmp = newSVpvn(beg, end - beg);
319 sv_utf8_upgrade(tmp);
320 sv_catsv(p_state->pend_text, tmp);
324 sv_catpvn(p_state->pend_text, beg, end - beg);
328 else if (p_state->pend_text && SvOK(p_state->pend_text)) {
329 flush_pending_text(p_state, self);
333 /* At this point we have decided to generate an event callback */
335 argspec = h->argspec ? SvPV(h->argspec, my_na) : "";
337 if (SvTYPE(h->cb) == SVt_PVAV) {
339 if (*argspec == ARG_FLAG_FLAT_ARRAY) {
344 /* start sub-array for accumulator array */
350 if (*argspec == ARG_FLAG_FLAT_ARRAY)
353 /* start argument stack for callback */
359 for (s = argspec; *s; s++) {
362 enum argcode argcode = (enum argcode)*s;
367 arg = sv_mortalcopy(self);
371 if (num_tokens >= 1) {
373 SV* prev_token = &PL_sv_undef;
375 av_extend(av, num_tokens);
376 for (i = 0; i < num_tokens; i++) {
378 prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg);
380 SvUTF8_on(prev_token);
381 av_push(av, prev_token);
384 av_push(av, p_state->bool_attr_val
385 ? newSVsv(p_state->bool_attr_val)
386 : newSVsv(prev_token));
389 arg = sv_2mortal(newRV_noinc((SV*)av));
394 if (num_tokens >= 1 && tokens[0].beg >= beg) {
397 av_extend(av, num_tokens*2);
398 for (i = 0; i < num_tokens; i++) {
400 av_push(av, newSViv(CHR_DIST(tokens[i].beg, beg)));
401 av_push(av, newSViv(CHR_DIST(tokens[i].end, tokens[i].beg)));
403 else { /* boolean tag value */
404 av_push(av, newSViv(0));
405 av_push(av, newSViv(0));
408 arg = sv_2mortal(newRV_noinc((SV*)av));
417 if (num_tokens >= 1) {
418 arg = sv_2mortal(newSVpvn(tokens[0].beg,
419 tokens[0].end - tokens[0].beg));
422 if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0)
424 if (argcode == ARG_TAG && event != E_START) {
425 char *e_type = "!##/#?#";
426 sv_insert(arg, 0, 0, &e_type[event], 1);
433 if (event == E_START) {
436 if (argcode == ARG_ATTR) {
438 arg = sv_2mortal(newRV_noinc((SV*)hv));
442 /* gcc -Wall reports this variable as possibly used uninitialized */
445 push_arg = 0; /* deal with argument pushing here */
448 for (i = 1; i < num_tokens; i += 2) {
449 SV* attrname = newSVpvn(tokens[i].beg,
450 tokens[i].end-tokens[i].beg);
455 if (tokens[i+1].beg) {
456 char *beg = tokens[i+1].beg;
457 STRLEN len = tokens[i+1].end - beg;
458 if (*beg == '"' || *beg == '\'') {
459 assert(len >= 2 && *beg == beg[len-1]);
462 attrval = newSVpvn(beg, len);
465 if (!p_state->attr_encoded) {
466 #ifdef UNICODE_HTML_PARSER
467 if (p_state->utf8_mode)
468 sv_utf8_decode(attrval);
470 decode_entities(aTHX_ attrval, p_state->entity2char, 0);
471 if (p_state->utf8_mode)
476 if (p_state->bool_attr_val)
477 attrval = newSVsv(p_state->bool_attr_val);
479 attrval = newSVsv(attrname);
482 if (!CASE_SENSITIVE(p_state))
483 sv_lower(aTHX_ attrname);
485 if (argcode == ARG_ATTR) {
486 if (hv_exists_ent(hv, attrname, 0) ||
487 !hv_store_ent(hv, attrname, attrval, 0)) {
488 SvREFCNT_dec(attrval);
490 SvREFCNT_dec(attrname);
492 else { /* ARG_ATTRARR */
494 av_push(array, attrname);
495 av_push(array, attrval);
498 XPUSHs(sv_2mortal(attrname));
499 XPUSHs(sv_2mortal(attrval));
504 else if (argcode == ARG_ATTRARR) {
509 case ARG_ATTRSEQ: /* (v2 compatibility stuff) */
510 if (event == E_START) {
513 for (i = 1; i < num_tokens; i += 2) {
514 SV* attrname = newSVpvn(tokens[i].beg,
515 tokens[i].end-tokens[i].beg);
518 if (!CASE_SENSITIVE(p_state))
519 sv_lower(aTHX_ attrname);
520 av_push(av, attrname);
522 arg = sv_2mortal(newRV_noinc((SV*)av));
527 arg = sv_2mortal(newSVpvn(beg, end - beg));
533 if (event == E_TEXT) {
534 arg = sv_2mortal(newSVpvn(beg, end - beg));
537 if (!p_state->is_cdata) {
538 #ifdef UNICODE_HTML_PARSER
539 if (p_state->utf8_mode)
542 decode_entities(aTHX_ arg, p_state->entity2char, 1);
543 if (p_state->utf8_mode)
550 if (event == E_TEXT) {
551 arg = boolSV(p_state->is_cdata);
555 case ARG_SKIPPED_TEXT:
556 arg = sv_2mortal(p_state->skipped_text);
557 p_state->skipped_text = newSVpvn("", 0);
561 arg = sv_2mortal(newSViv(offset));
565 arg = sv_2mortal(newSViv(offset + CHR_DIST(end, beg)));
569 arg = sv_2mortal(newSViv(CHR_DIST(end, beg)));
573 arg = sv_2mortal(newSViv(line));
577 arg = sv_2mortal(newSViv(column));
581 assert(event >= 0 && event < EVENT_COUNT);
582 arg = sv_2mortal(newSVpv(event_id_str[event], 0));
587 int len = (unsigned char)s[1];
588 arg = sv_2mortal(newSVpvn(s+2, len));
589 if (SvUTF8(h->argspec))
596 arg = sv_mortalcopy(&PL_sv_undef);
600 arg = sv_2mortal(newSVpvf("Bad argspec %d", *s));
606 arg = sv_mortalcopy(&PL_sv_undef);
609 /* have to fix mortality here or add mortality to
610 * XPUSHs after removing it from the switch cases.
612 av_push(array, SvREFCNT_inc(arg));
621 if (array != (AV*)h->cb)
622 av_push((AV*)h->cb, newRV_noinc((SV*)array));
627 if ((enum argcode)*argspec == ARG_SELF && !SvROK(h->cb)) {
628 char *method = SvPV(h->cb, my_na);
629 perl_call_method(method, G_DISCARD | G_EVAL | G_VOID);
632 perl_call_sv(h->cb, G_DISCARD | G_EVAL | G_VOID);
642 if (p_state->skipped_text)
643 SvCUR_set(p_state->skipped_text, 0);
647 if (p_state->skipped_text) {
648 if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text))
649 flush_pending_text(p_state, self);
650 #ifdef UNICODE_HTML_PARSER
651 if (utf8 && !SvUTF8(p_state->skipped_text))
652 sv_utf8_upgrade(p_state->skipped_text);
653 if (utf8 || !SvUTF8(p_state->skipped_text)) {
655 sv_catpvn(p_state->skipped_text, beg, end - beg);
656 #ifdef UNICODE_HTML_PARSER
659 SV *tmp = newSVpvn(beg, end - beg);
660 sv_utf8_upgrade(tmp);
661 sv_catsv(p_state->pend_text, tmp);
672 argspec_compile(SV* src, PSTATE* p_state)
675 SV* argspec = newSVpvn("", 0);
677 char *s = SvPV(src, len);
687 /* try to deal with '@{ ... }' wrapping */
689 while (isHSPACE(*tmp))
692 char c = ARG_FLAG_FLAT_ARRAY;
693 sv_catpvn(argspec, &c, 1);
695 while (isHSPACE(*tmp))
701 if (isHNAME_FIRST(*s) || *s == '@') {
707 while (isHNAME_CHAR(*s))
710 /* check identifier */
711 for ( arg_name = argname; a < ARG_LITERAL ; ++a, ++arg_name ) {
712 if (strnEQ(*arg_name, name, s - name) &&
713 (*arg_name)[s - name] == '\0')
716 if (a < ARG_LITERAL) {
717 char c = (unsigned char) a;
718 sv_catpvn(argspec, &c, 1);
720 if (a == ARG_LINE || a == ARG_COLUMN) {
722 p_state->line = 1; /* enable tracing of line/column */
724 if (a == ARG_SKIPPED_TEXT) {
725 if (!p_state->skipped_text) {
726 p_state->skipped_text = newSVpvn("", 0);
729 if (a == ARG_ATTR || a == ARG_ATTRARR || a == ARG_DTEXT) {
730 p_state->argspec_entity_decode++;
734 croak("Unrecognized identifier %.*s in argspec", s - name, name);
737 else if (*s == '"' || *s == '\'') {
738 char *string_beg = s;
740 while (s < end && *s != *string_beg && *s != '\\')
742 if (*s == *string_beg) {
744 int len = s - string_beg - 1;
745 unsigned char buf[2];
747 croak("Literal string is longer than 255 chars in argspec");
748 buf[0] = ARG_LITERAL;
750 sv_catpvn(argspec, (char*)buf, 2);
751 sv_catpvn(argspec, string_beg+1, len);
754 else if (*s == '\\') {
755 croak("Backslash reserved for literal string in argspec");
758 croak("Unterminated literal string in argspec");
762 croak("Bad argspec (%s)", s);
768 if (*s == '}' && SvPVX(argspec)[0] == ARG_FLAG_FLAT_ARRAY) {
769 /* end of '@{ ... }' */
774 croak("Bad argspec: stuff after @{...} (%s)", s);
780 croak("Missing comma separator in argspec");
791 flush_pending_text(PSTATE* p_state, SV* self)
794 bool old_unbroken_text = p_state->unbroken_text;
795 SV* old_pend_text = p_state->pend_text;
796 bool old_is_cdata = p_state->is_cdata;
797 STRLEN old_offset = p_state->offset;
798 STRLEN old_line = p_state->line;
799 STRLEN old_column = p_state->column;
801 assert(p_state->pend_text && SvOK(p_state->pend_text));
803 p_state->unbroken_text = 0;
804 p_state->pend_text = 0;
805 p_state->is_cdata = p_state->pend_text_is_cdata;
806 p_state->offset = p_state->pend_text_offset;
807 p_state->line = p_state->pend_text_line;
808 p_state->column = p_state->pend_text_column;
810 report_event(p_state, E_TEXT,
811 SvPVX(old_pend_text), SvEND(old_pend_text),
812 SvUTF8(old_pend_text), 0, 0, self);
813 SvOK_off(old_pend_text);
815 p_state->unbroken_text = old_unbroken_text;
816 p_state->pend_text = old_pend_text;
817 p_state->is_cdata = old_is_cdata;
818 p_state->offset = old_offset;
819 p_state->line = old_line;
820 p_state->column = old_column;
824 skip_until_gt(char *beg, char *end)
826 /* tries to emulate quote skipping behaviour observed in MSIE */
831 if (!quote && *s == '>')
833 if (*s == '"' || *s == '\'') {
835 quote = '\0'; /* end of quoted string */
837 else if (!quote && (prev == ' ' || prev == '=')) {
847 parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
851 if (p_state->strict_comment) {
853 char *start_com = s; /* also used to signal inside/outside */
856 /* try to locate "--" */
858 /* printf("find_dash_dash: [%s]\n", s); */
859 while (s < end && *s != '-' && *s != '>')
872 /* we are done recognizing all comments, make callbacks */
873 report_event(p_state, E_COMMENT,
889 /* two dashes in a row seen */
893 PUSH_TOKEN(start_com, s-2);
902 else if (p_state->no_dash_dash_comment_end) {
905 /* a lone '>' signals end-of-comment */
906 while (s < end && *s != '>')
911 report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self);
918 else { /* non-strict comment */
921 /* try to locate /--\s*>/ which signals end-of-comment */
923 while (s < end && *s != '-')
935 report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self);
953 #ifdef MARKED_SECTION
956 marked_section_update(PSTATE* p_state)
959 /* we look at p_state->ms_stack to determine p_state->ms */
960 AV* ms_stack = p_state->ms_stack;
961 p_state->ms = MS_NONE;
964 int stack_len = av_len(ms_stack);
966 for (stack_idx = 0; stack_idx <= stack_len; stack_idx++) {
967 SV** svp = av_fetch(ms_stack, stack_idx, 0);
969 AV* tokens = (AV*)SvRV(*svp);
970 int tokens_len = av_len(tokens);
972 assert(SvTYPE(tokens) == SVt_PVAV);
973 for (i = 0; i <= tokens_len; i++) {
974 SV** svp = av_fetch(tokens, i, 0);
977 char *token_str = SvPV(*svp, len);
978 enum marked_section_t token;
979 if (strEQ(token_str, "include"))
981 else if (strEQ(token_str, "rcdata"))
983 else if (strEQ(token_str, "cdata"))
985 else if (strEQ(token_str, "ignore"))
989 if (p_state->ms < token)
996 /* printf("MS %d\n", p_state->ms); */
997 p_state->is_cdata = (p_state->ms == MS_CDATA);
1003 parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
1009 if (!p_state->marked_sections)
1012 assert(beg[0] == '<');
1013 assert(beg[1] == '!');
1014 assert(beg[2] == '[');
1018 while (isHSPACE(*s))
1020 while (isHNAME_FIRST(*s)) {
1021 char *name_start = s;
1025 while (isHNAME_CHAR(*s))
1028 while (isHSPACE(*s))
1035 name = newSVpvn(name_start, name_end - name_start);
1038 av_push(tokens, sv_lower(aTHX_ name));
1046 while (s < end && *s != '-')
1051 s++; /* skip first '-' */
1054 /* comment finished */
1069 av_push(tokens, newSVpvn("include", 7));
1072 if (!p_state->ms_stack)
1073 p_state->ms_stack = newAV();
1074 av_push(p_state->ms_stack, newRV_noinc((SV*)tokens));
1075 marked_section_update(p_state);
1076 report_event(p_state, E_NONE, beg, s, utf8, 0, 0, self);
1081 SvREFCNT_dec(tokens);
1082 return 0; /* not yet implemented */
1085 SvREFCNT_dec(tokens);
1092 parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
1105 goto DECL_FAIL; /* nope, illegal */
1107 /* yes, two dashes seen */
1110 tmp = parse_comment(p_state, s, end, utf8, self);
1111 return (tmp == s) ? beg : tmp;
1114 #ifdef MARKED_SECTION
1116 /* marked section */
1118 tmp = parse_marked_section(p_state, beg, end, utf8, self);
1126 /* make <!> into empty comment <SGML Handbook 36:32> */
1131 report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self);
1142 while (s < end && isHNAME_CHAR(*s))
1144 decl_id_len = s - decl_id;
1148 /* just hardcode a few names as the recognized declarations */
1149 if (!((decl_id_len == 7 &&
1150 strnEQx(decl_id, "DOCTYPE", 7, !CASE_SENSITIVE(p_state))) ||
1151 (decl_id_len == 6 &&
1152 strnEQx(decl_id, "ENTITY", 6, !CASE_SENSITIVE(p_state)))
1159 /* first word available */
1160 PUSH_TOKEN(decl_id, s);
1163 while (s < end && isHSPACE(*s))
1169 if (*s == '"' || *s == '\'') {
1172 while (s < end && *s != *str_beg)
1177 PUSH_TOKEN(str_beg, s);
1179 else if (*s == '-') {
1190 while (s < end && *s != '-')
1199 PUSH_TOKEN(com_beg, s);
1204 else if (*s != '>') {
1208 while (s < end && isHNOT_SPACE_GT(*s))
1212 PUSH_TOKEN(word_beg, s);
1223 report_event(p_state, E_DECLARATION, beg, s, utf8, tokens, num_tokens, self);
1239 if (p_state->strict_comment)
1242 /* consider everything up to the first '>' a comment */
1243 while (s < end && *s != '>')
1247 token.beg = beg + 2;
1250 report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self);
1260 parse_start(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
1266 hctype_t tag_name_first, tag_name_char;
1267 hctype_t attr_name_first, attr_name_char;
1269 if (STRICT_NAMES(p_state)) {
1270 tag_name_first = attr_name_first = HCTYPE_NAME_FIRST;
1271 tag_name_char = attr_name_char = HCTYPE_NAME_CHAR;
1274 tag_name_first = tag_name_char = HCTYPE_NOT_SPACE_GT;
1275 attr_name_first = HCTYPE_NOT_SPACE_GT;
1276 attr_name_char = HCTYPE_NOT_SPACE_EQ_GT;
1281 while (s < end && isHCTYPE(*s, tag_name_char)) {
1282 if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
1285 if (*(s + 1) == '>')
1290 PUSH_TOKEN(beg+1, s); /* tagname */
1292 while (isHSPACE(*s))
1297 while (isHCTYPE(*s, attr_name_first)) {
1299 char *attr_name_beg = s;
1300 char *attr_name_end;
1301 if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
1304 if (*(s + 1) == '>')
1308 while (s < end && isHCTYPE(*s, attr_name_char)) {
1309 if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
1312 if (*(s + 1) == '>')
1321 PUSH_TOKEN(attr_name_beg, attr_name_end); /* attr name */
1323 while (isHSPACE(*s))
1331 while (isHSPACE(*s))
1336 /* parse it similar to ="" */
1340 if (*s == '"' || *s == '\'') {
1343 while (s < end && *s != *str_beg)
1348 PUSH_TOKEN(str_beg, s);
1351 char *word_start = s;
1352 while (s < end && isHNOT_SPACE_GT(*s)) {
1353 if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
1356 if (*(s + 1) == '>')
1363 PUSH_TOKEN(word_start, s);
1365 while (isHSPACE(*s))
1371 PUSH_TOKEN(0, 0); /* boolean attr value */
1375 if (ALLOW_EMPTY_TAG(p_state) && *s == '/') {
1385 report_event(p_state, E_START, beg, s, utf8, tokens, num_tokens, self);
1387 report_event(p_state, E_END, s, s, utf8, tokens, 1, self);
1389 else if (!p_state->xml_mode) {
1390 /* find out if this start tag should put us into literal_mode
1393 int tag_len = tokens[0].end - tokens[0].beg;
1395 for (i = 0; literal_mode_elem[i].len; i++) {
1396 if (tag_len == literal_mode_elem[i].len) {
1397 /* try to match it */
1399 char *t = literal_mode_elem[i].str;
1402 if (toLOWER(*s) != *t)
1408 p_state->literal_mode = literal_mode_elem[i].str;
1409 p_state->is_cdata = literal_mode_elem[i].is_cdata;
1410 /* printf("Found %s\n", p_state->literal_mode); */
1411 goto END_OF_LITERAL_SEARCH;
1416 END_OF_LITERAL_SEARCH:
1434 parse_end(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
1437 hctype_t name_first, name_char;
1439 if (STRICT_NAMES(p_state)) {
1440 name_first = HCTYPE_NAME_FIRST;
1441 name_char = HCTYPE_NAME_CHAR;
1444 name_first = name_char = HCTYPE_NOT_SPACE_GT;
1447 if (isHCTYPE(*s, name_first)) {
1448 token_pos_t tagname;
1451 while (s < end && isHCTYPE(*s, name_char))
1455 if (p_state->strict_end) {
1456 while (isHSPACE(*s))
1460 s = skip_until_gt(s, end);
1465 /* a complete end tag has been recognized */
1466 report_event(p_state, E_END, beg, s, utf8, &tagname, 1, self);
1474 else if (!p_state->strict_comment) {
1475 s = skip_until_gt(s, end);
1478 token.beg = beg + 2;
1481 report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self);
1493 parse_process(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
1495 char *s = beg + 2; /* skip '<?' */
1496 /* processing instruction */
1497 token_pos_t token_pos;
1505 if (p_state->xml_mode || p_state->xml_pic) {
1506 /* XML processing instructions are ended by "?>" */
1507 if (s - beg < 4 || s[-2] != '?')
1509 token_pos.end = s - 2;
1512 /* a complete processing instruction seen */
1513 report_event(p_state, E_PROCESS, beg, s, utf8,
1514 &token_pos, 1, self);
1519 return beg; /* could not fix end */
1525 parse_null(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
1532 #include "pfunc.h" /* declares the parsefunc[] */
1533 #endif /* USE_PFUNC */
1536 parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
1542 while (!p_state->eof) {
1544 * At the start of this loop we will always be ready for eating text
1545 * or a new tag. We will never be inside some tag. The 't' points
1546 * to where we started and the 's' is advanced as we go.
1549 while (p_state->literal_mode) {
1550 char *l = p_state->literal_mode;
1551 bool skip_quoted_end = (strEQ(l, "script") || strEQ(l, "style"));
1552 char inside_quote = 0;
1553 bool escape_next = 0;
1557 if (*s == '<' && !inside_quote)
1559 if (skip_quoted_end) {
1566 else if (inside_quote && *s == inside_quote)
1568 else if (*s == '\r' || *s == '\n')
1570 else if (!inside_quote && (*s == '"' || *s == '\''))
1585 /* here we rely on '\0' termination of perl svpv buffers */
1588 while (*l && toLOWER(*s) == *l) {
1593 if (!*l && (strNE(p_state->literal_mode, "plaintext") || p_state->closing_plaintext)) {
1594 /* matched it all */
1595 token_pos_t end_token;
1596 end_token.beg = end_text + 2;
1599 while (isHSPACE(*s))
1604 report_event(p_state, E_TEXT, t, end_text, utf8,
1606 report_event(p_state, E_END, end_text, s, utf8,
1607 &end_token, 1, self);
1608 p_state->literal_mode = 0;
1609 p_state->is_cdata = 0;
1616 #ifdef MARKED_SECTION
1617 while (p_state->ms == MS_CDATA || p_state->ms == MS_RCDATA) {
1618 while (s < end && *s != ']')
1623 if (*s == ']' && *(s + 1) == '>') {
1625 /* marked section end */
1627 report_event(p_state, E_TEXT, t, end_text, utf8,
1629 report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self);
1631 SvREFCNT_dec(av_pop(p_state->ms_stack));
1632 marked_section_update(p_state);
1643 /* first we try to match as much text as possible */
1644 while (s < end && *s != '<') {
1645 #ifdef MARKED_SECTION
1646 if (p_state->ms && *s == ']') {
1653 report_event(p_state, E_TEXT, t, end_text, utf8,
1655 report_event(p_state, E_NONE, end_text, s, utf8,
1658 SvREFCNT_dec(av_pop(p_state->ms_stack));
1659 marked_section_update(p_state);
1669 report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self);
1675 /* wait with white space at end */
1676 while (s >= t && isHSPACE(*s))
1680 /* might be chopped-up entities/words */
1681 while (s >= t && !isHSPACE(*s))
1683 while (s >= t && isHSPACE(*s))
1688 report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self);
1696 /* next char is known to be '<' and pointed to by 't' as well as 's' */
1700 new_pos = parsefunc[(unsigned char)*s](p_state, t, end, utf8, self);
1702 if (isHNAME_FIRST(*s))
1703 new_pos = parse_start(p_state, t, end, utf8, self);
1705 new_pos = parse_end(p_state, t, end, utf8, self);
1707 new_pos = parse_decl(p_state, t, end, utf8, self);
1709 new_pos = parse_process(p_state, t, end, utf8, self);
1712 #endif /* USE_PFUNC */
1716 /* no progress, need more data to know what it is */
1723 /* if we get out here then this was not a conforming tag, so
1724 * treat it as plain text at the top of the loop again (we
1725 * have already skipped past the "<").
1740 char *s, *beg, *end;
1744 if (!p_state->start_document) {
1746 report_event(p_state, E_START_DOCUMENT, dummy, dummy, 0, 0, 0, self);
1747 p_state->start_document = 1;
1753 if (p_state->buf && SvOK(p_state->buf)) {
1755 s = SvPV(p_state->buf, len);
1757 utf8 = SvUTF8(p_state->buf);
1761 if (p_state->literal_mode) {
1762 if (strEQ(p_state->literal_mode, "plaintext") ||
1763 strEQ(p_state->literal_mode, "xmp") ||
1764 strEQ(p_state->literal_mode, "textarea"))
1766 /* rest is considered text */
1769 if (strEQ(p_state->literal_mode, "script") ||
1770 strEQ(p_state->literal_mode, "style"))
1772 /* effectively make it an empty element */
1775 t.beg = p_state->literal_mode;
1776 t.end = p_state->literal_mode + strlen(p_state->literal_mode);
1777 report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self);
1780 p_state->pending_end_tag = p_state->literal_mode;
1782 p_state->literal_mode = 0;
1783 s = parse_buf(aTHX_ p_state, s, end, utf8, self);
1787 if (!p_state->strict_comment && !p_state->no_dash_dash_comment_end && *s == '<') {
1788 p_state->no_dash_dash_comment_end = 1;
1789 s = parse_buf(aTHX_ p_state, s, end, utf8, self);
1793 if (!p_state->strict_comment && *s == '<') {
1795 if (s1 == end || isHNAME_FIRST(*s1) || *s1 == '/' || *s1 == '!' || *s1 == '?') {
1796 /* some kind of unterminated markup. Report rest as a comment */
1800 report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self);
1809 /* report rest as text */
1810 report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self);
1813 SvREFCNT_dec(p_state->buf);
1816 if (p_state->pend_text && SvOK(p_state->pend_text))
1817 flush_pending_text(p_state, self);
1819 if (p_state->ignoring_element) {
1820 /* document not balanced */
1821 SvREFCNT_dec(p_state->ignoring_element);
1822 p_state->ignoring_element = 0;
1824 report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self);
1827 p_state->offset = 0;
1830 p_state->column = 0;
1831 p_state->start_document = 0;
1832 p_state->literal_mode = 0;
1833 p_state->is_cdata = 0;
1837 #ifdef UNICODE_HTML_PARSER
1838 if (p_state->utf8_mode)
1839 sv_utf8_downgrade(chunk, 0);
1842 if (p_state->buf && SvOK(p_state->buf)) {
1843 sv_catsv(p_state->buf, chunk);
1844 beg = SvPV(p_state->buf, len);
1845 utf8 = SvUTF8(p_state->buf);
1848 beg = SvPV(chunk, len);
1849 utf8 = SvUTF8(chunk);
1850 if (p_state->offset == 0 && DOWARN) {
1851 /* Print warnings if we find unexpected Unicode BOM forms */
1852 #ifdef UNICODE_HTML_PARSER
1853 if (p_state->argspec_entity_decode &&
1854 !p_state->utf8_mode && (
1855 (!utf8 && len >= 3 && strnEQ(beg, "\xEF\xBB\xBF", 3)) ||
1856 (utf8 && len >= 6 && strnEQ(beg, "\xC3\xAF\xC2\xBB\xC2\xBF", 6)) ||
1857 (!utf8 && probably_utf8_chunk(aTHX_ beg, len))
1861 warn("Parsing of undecoded UTF-8 will give garbage when decoding entities");
1863 if (utf8 && len >= 2 && strnEQ(beg, "\xFF\xFE", 2)) {
1864 warn("Parsing string decoded with wrong endianess");
1867 if (!utf8 && len >= 4 &&
1868 (strnEQ(beg, "\x00\x00\xFE\xFF", 4) ||
1869 strnEQ(beg, "\xFE\xFF\x00\x00", 4))
1872 warn("Parsing of undecoded UTF-32");
1874 else if (!utf8 && len >= 2 &&
1875 (strnEQ(beg, "\xFE\xFF", 2) || strnEQ(beg, "\xFF\xFE", 2))
1878 warn("Parsing of undecoded UTF-16");
1884 return; /* nothing to do */
1887 s = parse_buf(aTHX_ p_state, beg, end, utf8, self);
1889 if (s == end || p_state->eof) {
1891 SvOK_off(p_state->buf);
1895 /* need to keep rest in buffer */
1897 /* chop off some chars at the beginning */
1898 if (SvOK(p_state->buf)) {
1899 sv_chop(p_state->buf, s);
1902 sv_setpvn(p_state->buf, s, end - s);
1904 SvUTF8_on(p_state->buf);
1906 SvUTF8_off(p_state->buf);
1910 p_state->buf = newSVpv(s, end - s);
1912 SvUTF8_on(p_state->buf);