X-Git-Url: http://git.maemo.org/git/?p=routino;a=blobdiff_plain;f=src%2Fxmlparse.l;fp=src%2Fxmlparse.l;h=20ca05dd5a40cb1bdbb6d55dcaa5641a1191f737;hp=c64fdf0a620c085357b179d3617d8f698baff957;hb=9dffc9de96014e24d1fd1030a79317ba34c504e8;hpb=42c9226fc71c19af4d755c6900120bfa07f7e99c diff --git a/src/xmlparse.l b/src/xmlparse.l index c64fdf0..20ca05d 100644 --- a/src/xmlparse.l +++ b/src/xmlparse.l @@ -1,6 +1,6 @@ %{ /*************************************** - $Header: /home/amb/routino/src/RCS/xmlparse.l,v 1.17 2010/05/25 18:24:20 amb Exp $ + $Header: /home/amb/routino/src/RCS/xmlparse.l,v 1.20 2010/10/09 11:05:28 amb Exp $ A simple generic XML parser where the structure comes from the function parameters. Not intended to be fully conforming to XML staandard or a validating parser but @@ -116,10 +116,28 @@ static int xmlparse_options; %option nounput - /* Grammar based on http://www.w3.org/TR/2004/REC-xml-20040204/ but for ASCII not Unicode. */ + /* Grammar based on http://www.w3.org/TR/2004/REC-xml-20040204/ but for ASCII tags not Unicode. */ S [ \t\r\n] +U1 [\x09\x0A\x0D\x20-\x7F] +U2 [\xC2-\xDF][\x80-\xBF] +U3a \xE0[\xA0-\xBF][\x80-\xBF] +U3b [\xE1-\xEC][\x80-\xBF][\x80-\xBF] +U3c \xED[\x80-\x9F][\x80-\xBF] +U3d [\xEE-\xEF][\x80-\xBF][\x80-\xBF] +U3 {U3a}|{U3b}|{U3c}|{U3d} +U4a \xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF] +U4b [\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF] +U4c \xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF] +U4 {U4a}|{U4b}|{U4c} + +U ({U1}|{U2}|{U3}|{U4}) +UquotedS ([\x09\x0A\x0D\x20-\x25\x28-\x3B\x3D\x3F-\x7F]|{U2}|{U3}|{U4}) +UquotedD ([\x09\x0A\x0D\x20-\x21\x23-\x25\x27-\x3B\x3D\x3F-\x7F]|{U2}|{U3}|{U4}) + +N (\n|\r\n) + letter [a-zA-Z] digit [0-9] xdigit [a-fA-F0-9] @@ -181,28 +199,28 @@ charref &#({digit}+|x{xdigit}+); /* XML Declaration start */ -{name} { BEGIN(XML_DECL); yylval=yytext; return(LEX_XML_DECL_BEGIN); } -.|\n { return(LEX_ERROR_XML_DECL_START); } +xml { BEGIN(XML_DECL); yylval=yytext; return(LEX_XML_DECL_BEGIN); } +.|{N} { return(LEX_ERROR_XML_DECL_START); } /* Tag middle */ "?>" { BEGIN(INITIAL); return(LEX_XML_DECL_FINISH); } {S}+ { } {name} { after_attr=XML_DECL; BEGIN(ATTR_KEY); yylval=yytext; return(LEX_ATTR_KEY); } -.|\n { return(LEX_ERROR_XML_DECL); } +.|{N} { return(LEX_ERROR_XML_DECL); } /* Any tag start */ {name} { BEGIN(TAG); yylval=yytext; return(LEX_TAG_BEGIN); } -.|\n { return(LEX_ERROR_TAG_START); } +.|{N} { return(LEX_ERROR_TAG_START); } /* End-tag start */ {name} { BEGIN(END_TAG2); yylval=yytext; return(LEX_TAG_POP); } -.|\n { return(LEX_ERROR_END_TAG); } +.|{N} { return(LEX_ERROR_END_TAG); } ">" { BEGIN(INITIAL); } -.|\n { return(LEX_ERROR_END_TAG); } +.|{N} { return(LEX_ERROR_END_TAG); } /* Any tag middle */ @@ -210,16 +228,16 @@ charref &#({digit}+|x{xdigit}+); ">" { BEGIN(INITIAL); return(LEX_TAG_PUSH); } {S}+ { } {name} { after_attr=TAG; BEGIN(ATTR_KEY); yylval=yytext; return(LEX_ATTR_KEY); } -.|\n { return(LEX_ERROR_TAG); } +.|{N} { return(LEX_ERROR_TAG); } /* Attributes */ = { BEGIN(ATTR_VAL); } -.|\n { return(LEX_ERROR_ATTR); } +.|{N} { return(LEX_ERROR_ATTR); } \" { BEGIN(DQUOTED); reset_string; } \' { BEGIN(SQUOTED); reset_string; } -.|\n { return(LEX_ERROR_ATTR); } +.|{N} { return(LEX_ERROR_ATTR); } /* Quoted strings */ @@ -228,8 +246,9 @@ charref &#({digit}+|x{xdigit}+); else { const char *str=ParseXML_Decode_Entity_Ref(yytext); if(str) {append_string(str);} else {yylval=yytext; return(LEX_ERROR_ENTITY_REF);} } } {charref} { if(xmlparse_options&XMLPARSE_RETURN_ATTR_ENCODED) {append_string(yytext);} else { const char *str=ParseXML_Decode_Char_Ref(yytext); if(str) {append_string(str);} else {yylval=yytext; return(LEX_ERROR_CHAR_REF);} } } -[<>&] { yylval=yytext; return(LEX_ERROR_ATTR_VAL); } -[^<>&\"]+ { append_string(yytext); } +[<>&\"] { yylval=yytext; return(LEX_ERROR_ATTR_VAL); } +{UquotedD}+ { append_string(yytext); } +. { yylval=yytext; return(LEX_ERROR_ATTR_VAL); } \' { BEGIN(after_attr); yylval=string; return(LEX_ATTR_VAL); } {entityref} { if(xmlparse_options&XMLPARSE_RETURN_ATTR_ENCODED) {append_string(yytext);} @@ -237,7 +256,8 @@ charref &#({digit}+|x{xdigit}+); {charref} { if(xmlparse_options&XMLPARSE_RETURN_ATTR_ENCODED) {append_string(yytext);} else { const char *str=ParseXML_Decode_Char_Ref(yytext); if(str) {append_string(str);} else {yylval=yytext; return(LEX_ERROR_CHAR_REF);} } } [<>&] { yylval=yytext; return(LEX_ERROR_ATTR_VAL); } -[^<>&\']+ { append_string(yytext); } +{UquotedS}+ { append_string(yytext); } +. { yylval=yytext; return(LEX_ERROR_ATTR_VAL); } /* End of file */ @@ -635,7 +655,7 @@ char *ParseXML_Encode_Safe_XML(const char *string) char *result; for(i=0;string[i];i++) - if(string[i]=='<' || string[i]=='>' || string[i]=='&' || string[i]=='\'' || string[i]=='"' || string[i]<32 || string[i]>126) + if(string[i]=='<' || string[i]=='>' || string[i]=='&' || string[i]=='\'' || string[i]=='"' || string[i]<32 || (unsigned char)string[i]>127) break; if(!string[i]) @@ -689,17 +709,59 @@ char *ParseXML_Encode_Safe_XML(const char *string) result[j++]='t'; result[j++]=';'; } - else if(string[i]<32 || string[i]>126) + else if(string[i]>=32 && (unsigned char)string[i]<=127) + result[j++]=string[i]; + else { + unsigned int unicode; + + /* Decode the UTF-8 */ + + if((string[i]&0xE0)==0xC0 && (string[i]&0x1F)>=2 && (string[i+1]&0xC0)==0x80) + { + /* 0000 0080-0000 07FF 110xxxxx 10xxxxxx */ + unicode =(string[i++]&0x1F)<<6; + unicode|= string[i ]&0x3F; + } + else if((string[i]&0xF0)==0xE0 && (string[i+1]&0xC0)==0x80 && (string[i+2]&0xC0)==0x80) + { + /* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx */ + unicode =(string[i++]&0x0F)<<12; + unicode|=(string[i++]&0x3F)<<6; + unicode|= string[i ]&0x3F; + } + else if((string[i]&0xF8)==0xF0 && (string[i+1]&0xC0)==0x80 && (string[i+2]&0xC0)==0x80 && (string[i+3]&0xC0)==0x80) + { + /* 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + unicode =(string[i++]&0x07)<<18; + unicode|=(string[i++]&0x3F)<<12; + unicode|=(string[i++]&0x3F)<<6; + unicode|= string[i ]&0x3F; + } + else + unicode=0xFFFD; + + /* Output the character entity */ + result[j++]='&'; result[j++]='#'; result[j++]='x'; - result[j++]=hexstring[(string[i]&0xf0)>>4]; - result[j++]=hexstring[ string[i]&0x0f ]; + + if(unicode&0x00FF0000) + { + result[j++]=hexstring[((unicode>>16)&0xf0)>>4]; + result[j++]=hexstring[((unicode>>16)&0x0f) ]; + } + if(unicode&0x00FFFF00) + { + result[j++]=hexstring[((unicode>>8)&0xf0)>>4]; + result[j++]=hexstring[((unicode>>8)&0x0f) ]; + } + result[j++]=hexstring[(unicode&0xf0)>>4]; + result[j++]=hexstring[(unicode&0x0f) ]; + result[j++]=';'; } - else - result[j++]=string[i]; if(string[i]) /* Not finished */ {