X-Git-Url: http://git.maemo.org/git/?a=blobdiff_plain;ds=sidebyside;f=dev%2Fi386%2Flibhtml-parser-perl%2Flibhtml-parser-perl-3.56%2FParser.xs;fp=dev%2Fi386%2Flibhtml-parser-perl%2Flibhtml-parser-perl-3.56%2FParser.xs;h=a173eb65b0ec7b3645a4abd0a533b4ef8f4d472f;hb=8977e561d8a9eae6959218b0306c9df2056a38a9;hp=0000000000000000000000000000000000000000;hpb=df794b845212301ea0d267c919232538bfef356a;p=dh-make-perl diff --git a/dev/i386/libhtml-parser-perl/libhtml-parser-perl-3.56/Parser.xs b/dev/i386/libhtml-parser-perl/libhtml-parser-perl-3.56/Parser.xs new file mode 100644 index 0000000..a173eb6 --- /dev/null +++ b/dev/i386/libhtml-parser-perl/libhtml-parser-perl-3.56/Parser.xs @@ -0,0 +1,672 @@ +/* $Id: Parser.xs,v 2.137 2007/01/12 10:18:39 gisle Exp $ + * + * Copyright 1999-2005, Gisle Aas. + * Copyright 1999-2000, Michael A. Chase. + * + * This library is free software; you can redistribute it and/or + * modify it under the same terms as Perl itself. + */ + + +/* + * Standard XS greeting. + */ +#ifdef __cplusplus +extern "C" { +#endif +#define PERL_NO_GET_CONTEXT /* we want efficiency */ +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" +#ifdef __cplusplus +} +#endif + + + +/* + * Some perl version compatibility gruff. + */ +#include "patchlevel.h" +#if PATCHLEVEL <= 4 /* perl5.004_XX */ + +#ifndef PL_sv_undef + #define PL_sv_undef sv_undef + #define PL_sv_yes sv_yes +#endif + +#ifndef PL_hexdigit + #define PL_hexdigit hexdigit +#endif + +#ifndef ERRSV + #define ERRSV GvSV(errgv) +#endif + +#if (PATCHLEVEL == 4 && SUBVERSION <= 4) +/* The newSVpvn function was introduced in perl5.004_05 */ +static SV * +newSVpvn(char *s, STRLEN len) +{ + register SV *sv = newSV(0); + sv_setpvn(sv,s,len); + return sv; +} +#endif /* not perl5.004_05 */ +#endif /* perl5.004_XX */ + +#ifndef dNOOP + #define dNOOP extern int errno +#endif +#ifndef dTHX + #define dTHX dNOOP + #define pTHX_ + #define aTHX_ +#endif + +#ifndef MEMBER_TO_FPTR + #define MEMBER_TO_FPTR(x) (x) +#endif + +#ifndef INT2PTR + #define INT2PTR(any,d) (any)(d) + #define PTR2IV(p) (IV)(p) +#endif + + +#if PATCHLEVEL > 6 || (PATCHLEVEL == 6 && SUBVERSION > 0) + #define RETHROW croak(Nullch) +#else + #define RETHROW { STRLEN my_na; croak("%s", SvPV(ERRSV, my_na)); } +#endif + +#if PATCHLEVEL < 8 + /* No useable Unicode support */ + /* Make these harmless if present */ + #undef SvUTF8 + #undef SvUTF8_on + #undef SvUTF8_off + #define SvUTF8(sv) 0 + #define SvUTF8_on(sv) 0 + #define SvUTF8_off(sv) 0 +#else + #define UNICODE_HTML_PARSER +#endif + +#ifdef G_WARN_ON + #define DOWARN (PL_dowarn & G_WARN_ON) +#else + #define DOWARN PL_dowarn +#endif + +/* + * Include stuff. We include .c files instead of linking them, + * so that they don't have to pollute the external dll name space. + */ + +#ifdef EXTERN + #undef EXTERN +#endif + +#define EXTERN static /* Don't pollute */ + +#include "hparser.h" +#include "util.c" +#include "hparser.c" + + +/* + * Support functions for the XS glue + */ + +static SV* +check_handler(pTHX_ SV* h) +{ + if (SvROK(h)) { + SV* myref = SvRV(h); + if (SvTYPE(myref) == SVt_PVCV) + return newSVsv(h); + if (SvTYPE(myref) == SVt_PVAV) + return SvREFCNT_inc(myref); + croak("Only code or array references allowed as handler"); + } + return SvOK(h) ? newSVsv(h) : 0; +} + + +static PSTATE* +get_pstate_iv(pTHX_ SV* sv) +{ + PSTATE *p; +#if PATCHLEVEL < 8 + p = INT2PTR(PSTATE*, SvIV(sv)); +#else + MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, '~') : NULL; + + if (!mg) + croak("Lost parser state magic"); + p = (PSTATE *)mg->mg_ptr; + if (!p) + croak("Lost parser state magic"); +#endif + if (p->signature != P_SIGNATURE) + croak("Bad signature in parser state object at %p", p); + return p; +} + + +static PSTATE* +get_pstate_hv(pTHX_ SV* sv) /* used by XS typemap */ +{ + HV* hv; + SV** svp; + + sv = SvRV(sv); + if (!sv || SvTYPE(sv) != SVt_PVHV) + croak("Not a reference to a hash"); + hv = (HV*)sv; + svp = hv_fetch(hv, "_hparser_xs_state", 17, 0); + if (svp) { + if (SvROK(*svp)) + return get_pstate_iv(aTHX_ SvRV(*svp)); + else + croak("_hparser_xs_state element is not a reference"); + } + croak("Can't find '_hparser_xs_state' element in HTML::Parser hash"); + return 0; +} + + +static void +free_pstate(pTHX_ PSTATE* pstate) +{ + int i; + SvREFCNT_dec(pstate->buf); + SvREFCNT_dec(pstate->pend_text); + SvREFCNT_dec(pstate->skipped_text); +#ifdef MARKED_SECTION + SvREFCNT_dec(pstate->ms_stack); +#endif + SvREFCNT_dec(pstate->bool_attr_val); + for (i = 0; i < EVENT_COUNT; i++) { + SvREFCNT_dec(pstate->handlers[i].cb); + SvREFCNT_dec(pstate->handlers[i].argspec); + } + + SvREFCNT_dec(pstate->report_tags); + SvREFCNT_dec(pstate->ignore_tags); + SvREFCNT_dec(pstate->ignore_elements); + SvREFCNT_dec(pstate->ignoring_element); + + SvREFCNT_dec(pstate->tmp); + + pstate->signature = 0; + Safefree(pstate); +} + +static int +magic_free_pstate(pTHX_ SV *sv, MAGIC *mg) +{ +#if PATCHLEVEL < 8 + free_pstate(aTHX_ get_pstate_iv(aTHX_ sv)); +#else + free_pstate(aTHX_ (PSTATE *)mg->mg_ptr); +#endif + return 0; +} + +#if defined(USE_ITHREADS) && PATCHLEVEL >= 8 + +static PSTATE * +dup_pstate(pTHX_ PSTATE *pstate, CLONE_PARAMS *params) +{ + PSTATE *pstate2; + int i; + + Newz(56, pstate2, 1, PSTATE); + pstate2->signature = pstate->signature; + + pstate2->buf = SvREFCNT_inc(sv_dup(pstate->buf, params)); + pstate2->offset = pstate->offset; + pstate2->line = pstate->line; + pstate2->column = pstate->column; + pstate2->start_document = pstate->start_document; + pstate2->parsing = pstate->parsing; + pstate2->eof = pstate->eof; + + pstate2->literal_mode = pstate->literal_mode; + pstate2->is_cdata = pstate->is_cdata; + pstate2->no_dash_dash_comment_end = pstate->no_dash_dash_comment_end; + pstate2->pending_end_tag = pstate->pending_end_tag; + + pstate2->pend_text = SvREFCNT_inc(sv_dup(pstate->pend_text, params)); + pstate2->pend_text_is_cdata = pstate->pend_text_is_cdata; + pstate2->pend_text_offset = pstate->pend_text_offset; + pstate2->pend_text_line = pstate->pend_text_offset; + pstate2->pend_text_column = pstate->pend_text_column; + + pstate2->skipped_text = SvREFCNT_inc(sv_dup(pstate->skipped_text, params)); + +#ifdef MARKED_SECTION + pstate2->ms = pstate->ms; + pstate2->ms_stack = + (AV *)SvREFCNT_inc(sv_dup((SV *)pstate->ms_stack, params)); + pstate2->marked_sections = pstate->marked_sections; +#endif + + pstate2->strict_comment = pstate->strict_comment; + pstate2->strict_names = pstate->strict_names; + pstate2->strict_end = pstate->strict_end; + pstate2->xml_mode = pstate->xml_mode; + pstate2->unbroken_text = pstate->unbroken_text; + pstate2->attr_encoded = pstate->attr_encoded; + pstate2->case_sensitive = pstate->case_sensitive; + pstate2->closing_plaintext = pstate->closing_plaintext; + pstate2->utf8_mode = pstate->utf8_mode; + pstate2->empty_element_tags = pstate->empty_element_tags; + pstate2->xml_pic = pstate->xml_pic; + + pstate2->bool_attr_val = + SvREFCNT_inc(sv_dup(pstate->bool_attr_val, params)); + for (i = 0; i < EVENT_COUNT; i++) { + pstate2->handlers[i].cb = + SvREFCNT_inc(sv_dup(pstate->handlers[i].cb, params)); + pstate2->handlers[i].argspec = + SvREFCNT_inc(sv_dup(pstate->handlers[i].argspec, params)); + } + pstate2->argspec_entity_decode = pstate->argspec_entity_decode; + + pstate2->report_tags = + (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->report_tags, params)); + pstate2->ignore_tags = + (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_tags, params)); + pstate2->ignore_elements = + (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_elements, params)); + + pstate2->ignoring_element = + SvREFCNT_inc(sv_dup(pstate->ignoring_element, params)); + pstate2->ignore_depth = pstate->ignore_depth; + + if (params->flags & CLONEf_JOIN_IN) { + pstate2->entity2char = + perl_get_hv("HTML::Entities::entity2char", TRUE); + } else { + pstate2->entity2char = (HV *)sv_dup((SV *)pstate->entity2char, params); + } + pstate2->tmp = SvREFCNT_inc(sv_dup(pstate->tmp, params)); + + return pstate2; +} + +static int +magic_dup_pstate(pTHX_ MAGIC *mg, CLONE_PARAMS *params) +{ + mg->mg_ptr = (char *)dup_pstate(aTHX_ (PSTATE *)mg->mg_ptr, params); + return 0; +} + +#endif + +MGVTBL vtbl_pstate = +{ + 0, + 0, + 0, + 0, + MEMBER_TO_FPTR(magic_free_pstate), +#if defined(USE_ITHREADS) && PATCHLEVEL >= 8 + 0, + MEMBER_TO_FPTR(magic_dup_pstate), +#endif +}; + + +/* + * XS interface definition. + */ + +MODULE = HTML::Parser PACKAGE = HTML::Parser + +PROTOTYPES: DISABLE + +void +_alloc_pstate(self) + SV* self; + PREINIT: + PSTATE* pstate; + SV* sv; + HV* hv; + MAGIC* mg; + + CODE: + sv = SvRV(self); + if (!sv || SvTYPE(sv) != SVt_PVHV) + croak("Not a reference to a hash"); + hv = (HV*)sv; + + Newz(56, pstate, 1, PSTATE); + pstate->signature = P_SIGNATURE; + pstate->entity2char = perl_get_hv("HTML::Entities::entity2char", TRUE); + pstate->tmp = NEWSV(0, 20); + + sv = newSViv(PTR2IV(pstate)); +#if PATCHLEVEL < 8 + sv_magic(sv, 0, '~', 0, 0); +#else + sv_magic(sv, 0, '~', (char *)pstate, 0); +#endif + mg = mg_find(sv, '~'); + assert(mg); + mg->mg_virtual = &vtbl_pstate; +#if defined(USE_ITHREADS) && PATCHLEVEL >= 8 + mg->mg_flags |= MGf_DUP; +#endif + SvREADONLY_on(sv); + + hv_store(hv, "_hparser_xs_state", 17, newRV_noinc(sv), 0); + +void +parse(self, chunk) + SV* self; + SV* chunk + PREINIT: + PSTATE* p_state = get_pstate_hv(aTHX_ self); + PPCODE: + if (p_state->parsing) + croak("Parse loop not allowed"); + p_state->parsing = 1; + if (SvROK(chunk) && SvTYPE(SvRV(chunk)) == SVt_PVCV) { + SV* generator = chunk; + STRLEN len; + do { + int count; + PUSHMARK(SP); + count = perl_call_sv(generator, G_SCALAR|G_EVAL); + SPAGAIN; + chunk = count ? POPs : 0; + PUTBACK; + + if (SvTRUE(ERRSV)) { + p_state->parsing = 0; + p_state->eof = 0; + RETHROW; + } + + if (chunk && SvOK(chunk)) { + (void)SvPV(chunk, len); /* get length */ + } + else { + len = 0; + } + parse(aTHX_ p_state, len ? chunk : 0, self); + SPAGAIN; + + } while (len && !p_state->eof); + } + else { + parse(aTHX_ p_state, chunk, self); + SPAGAIN; + } + p_state->parsing = 0; + if (p_state->eof) { + p_state->eof = 0; + PUSHs(sv_newmortal()); + } + else { + PUSHs(self); + } + +void +eof(self) + SV* self; + PREINIT: + PSTATE* p_state = get_pstate_hv(aTHX_ self); + PPCODE: + if (p_state->parsing) + p_state->eof = 1; + else { + p_state->parsing = 1; + parse(aTHX_ p_state, 0, self); /* flush */ + p_state->parsing = 0; + } + PUSHs(self); + +SV* +strict_comment(pstate,...) + PSTATE* pstate + ALIAS: + HTML::Parser::strict_comment = 1 + HTML::Parser::strict_names = 2 + HTML::Parser::xml_mode = 3 + HTML::Parser::unbroken_text = 4 + HTML::Parser::marked_sections = 5 + HTML::Parser::attr_encoded = 6 + HTML::Parser::case_sensitive = 7 + HTML::Parser::strict_end = 8 + HTML::Parser::closing_plaintext = 9 + HTML::Parser::utf8_mode = 10 + HTML::Parser::empty_element_tags = 11 + HTML::Parser::xml_pic = 12 + PREINIT: + bool *attr; + CODE: + switch (ix) { + case 1: attr = &pstate->strict_comment; break; + case 2: attr = &pstate->strict_names; break; + case 3: attr = &pstate->xml_mode; break; + case 4: attr = &pstate->unbroken_text; break; + case 5: +#ifdef MARKED_SECTION + attr = &pstate->marked_sections; break; +#else + croak("marked sections not supported"); break; +#endif + case 6: attr = &pstate->attr_encoded; break; + case 7: attr = &pstate->case_sensitive; break; + case 8: attr = &pstate->strict_end; break; + case 9: attr = &pstate->closing_plaintext; break; +#ifdef UNICODE_HTML_PARSER + case 10: attr = &pstate->utf8_mode; break; +#else + case 10: croak("The utf8_mode does not work with this perl; perl-5.8 or better required"); +#endif + case 11: attr = &pstate->empty_element_tags; break; + case 12: attr = &pstate->xml_pic; break; + default: + croak("Unknown boolean attribute (%d)", ix); + } + RETVAL = boolSV(*attr); + if (items > 1) + *attr = SvTRUE(ST(1)); + OUTPUT: + RETVAL + +SV* +boolean_attribute_value(pstate,...) + PSTATE* pstate + CODE: + RETVAL = pstate->bool_attr_val ? newSVsv(pstate->bool_attr_val) + : &PL_sv_undef; + if (items > 1) { + SvREFCNT_dec(pstate->bool_attr_val); + pstate->bool_attr_val = newSVsv(ST(1)); + } + OUTPUT: + RETVAL + +void +ignore_tags(pstate,...) + PSTATE* pstate + ALIAS: + HTML::Parser::report_tags = 1 + HTML::Parser::ignore_tags = 2 + HTML::Parser::ignore_elements = 3 + PREINIT: + HV** attr; + int i; + CODE: + switch (ix) { + case 1: attr = &pstate->report_tags; break; + case 2: attr = &pstate->ignore_tags; break; + case 3: attr = &pstate->ignore_elements; break; + default: + croak("Unknown tag-list attribute (%d)", ix); + } + if (GIMME_V != G_VOID) + croak("Can't report tag lists yet"); + + items--; /* pstate */ + if (items) { + if (*attr) + hv_clear(*attr); + else + *attr = newHV(); + + for (i = 0; i < items; i++) { + SV* sv = ST(i+1); + if (SvROK(sv)) { + sv = SvRV(sv); + if (SvTYPE(sv) == SVt_PVAV) { + AV* av = (AV*)sv; + STRLEN j; + STRLEN len = av_len(av) + 1; + for (j = 0; j < len; j++) { + SV**svp = av_fetch(av, j, 0); + if (svp) { + hv_store_ent(*attr, *svp, newSViv(0), 0); + } + } + } + else + croak("Tag list must be plain scalars and arrays"); + } + else { + hv_store_ent(*attr, sv, newSViv(0), 0); + } + } + } + else if (*attr) { + SvREFCNT_dec(*attr); + *attr = 0; + } + +void +handler(pstate, eventname,...) + PSTATE* pstate + SV* eventname + PREINIT: + STRLEN name_len; + char *name = SvPV(eventname, name_len); + int event = -1; + int i; + struct p_handler *h; + PPCODE: + /* map event name string to event_id */ + for (i = 0; i < EVENT_COUNT; i++) { + if (strEQ(name, event_id_str[i])) { + event = i; + break; + } + } + if (event < 0) + croak("No handler for %s events", name); + + h = &pstate->handlers[event]; + + /* set up return value */ + if (h->cb) { + PUSHs((SvTYPE(h->cb) == SVt_PVAV) + ? sv_2mortal(newRV_inc(h->cb)) + : sv_2mortal(newSVsv(h->cb))); + } + else { + PUSHs(&PL_sv_undef); + } + + /* update */ + if (items > 3) { + SvREFCNT_dec(h->argspec); + h->argspec = 0; + h->argspec = argspec_compile(ST(3), pstate); + } + if (items > 2) { + SvREFCNT_dec(h->cb); + h->cb = 0; + h->cb = check_handler(aTHX_ ST(2)); + } + + +MODULE = HTML::Parser PACKAGE = HTML::Entities + +void +decode_entities(...) + PREINIT: + int i; + HV *entity2char = perl_get_hv("HTML::Entities::entity2char", FALSE); + PPCODE: + if (GIMME_V == G_SCALAR && items > 1) + items = 1; + for (i = 0; i < items; i++) { + if (GIMME_V != G_VOID) + ST(i) = sv_2mortal(newSVsv(ST(i))); + else if (SvREADONLY(ST(i))) + croak("Can't inline decode readonly string"); + decode_entities(aTHX_ ST(i), entity2char, 0); + } + SP += items; + +void +_decode_entities(string, entities, ...) + SV* string + SV* entities + PREINIT: + HV* entities_hv; + bool expand_prefix = (items > 2) ? SvTRUE(ST(2)) : 0; + CODE: + if (SvOK(entities)) { + if (SvROK(entities) && SvTYPE(SvRV(entities)) == SVt_PVHV) { + entities_hv = (HV*)SvRV(entities); + } + else { + croak("2nd argument must be hash reference"); + } + } + else { + entities_hv = 0; + } + if (SvREADONLY(string)) + croak("Can't inline decode readonly string"); + decode_entities(aTHX_ string, entities_hv, expand_prefix); + +bool +_probably_utf8_chunk(string) + SV* string + PREINIT: + STRLEN len; + char *s; + CODE: +#ifdef UNICODE_HTML_PARSER + sv_utf8_downgrade(string, 0); + s = SvPV(string, len); + RETVAL = probably_utf8_chunk(aTHX_ s, len); +#else + RETVAL = 0; /* avoid never initialized complains from compiler */ + croak("_probably_utf8_chunk() only works for Unicode enabled perls"); +#endif + OUTPUT: + RETVAL + +int +UNICODE_SUPPORT() + PROTOTYPE: + CODE: +#ifdef UNICODE_HTML_PARSER + RETVAL = 1; +#else + RETVAL = 0; +#endif + OUTPUT: + RETVAL + + +MODULE = HTML::Parser PACKAGE = HTML::Parser