From e05212dce6511bf6c9334dbd6763fcc1fcbc7ba7 Mon Sep 17 00:00:00 2001 From: Yves Marcoz Date: Thu, 18 Aug 2011 21:24:08 -0700 Subject: [PATCH] Upgrade feedparser to 5.0.1 --- src/feedparser.py | 2915 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 1986 insertions(+), 929 deletions(-) diff --git a/src/feedparser.py b/src/feedparser.py index e04d253..0e80a9a 100644 --- a/src/feedparser.py +++ b/src/feedparser.py @@ -6,13 +6,12 @@ Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds Visit http://feedparser.org/ for the latest version Visit http://feedparser.org/docs/ for the latest documentation -Required: Python 2.1 or later -Recommended: Python 2.3 or later +Required: Python 2.4 or later Recommended: CJKCodecs and iconv_codec """ -__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs" -__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. +__version__ = "5.0.1" +__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -39,8 +38,11 @@ __contributors__ = ["Jason Diamond ", "John Beimler ", "Fazal Majid ", "Aaron Swartz ", - "Kevin Marks "] -_debug = 0 + "Kevin Marks ", + "Sam Ruby ", + "Ade Oshineye ", + "Martin Pool ", + "Kurt McKee "] # HTTP "User-Agent" header to send to servers when downloading feeds. # If you are embedding feedparser in a larger application, you should @@ -65,23 +67,106 @@ TIDY_MARKUP = 0 # if TIDY_MARKUP = 1 PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] +# If you want feedparser to automatically resolve all relative URIs, set this +# to 1. +RESOLVE_RELATIVE_URIS = 1 + +# If you want feedparser to automatically sanitize all potentially unsafe +# HTML content, set this to 1. +SANITIZE_HTML = 1 + +# ---------- Python 3 modules (make it work if possible) ---------- +try: + import rfc822 +except ImportError: + from email import _parseaddr as rfc822 + +try: + # Python 3.1 introduces bytes.maketrans and simultaneously + # deprecates string.maketrans; use bytes.maketrans if possible + _maketrans = bytes.maketrans +except (NameError, AttributeError): + import string + _maketrans = string.maketrans + +# base64 support for Atom feeds that contain embedded binary data +try: + import base64, binascii +except ImportError: + base64 = binascii = None +else: + # Python 3.1 deprecates decodestring in favor of decodebytes + _base64decode = getattr(base64, 'decodebytes', base64.decodestring) + +def _s2bytes(s): + # Convert a UTF-8 str to bytes if the interpreter is Python 3 + try: + return bytes(s, 'utf8') + except (NameError, TypeError): + # In Python 2.5 and below, bytes doesn't exist (NameError) + # In Python 2.6 and above, bytes and str are the same (TypeError) + return s + +def _l2bytes(l): + # Convert a list of ints to bytes if the interpreter is Python 3 + try: + if bytes is not str: + # In Python 2.6 and above, this call won't raise an exception + # but it will return bytes([65]) as '[65]' instead of 'A' + return bytes(l) + raise NameError + except NameError: + return ''.join(map(chr, l)) + +# If you want feedparser to allow all URL schemes, set this to () +# List culled from Python's urlparse documentation at: +# http://docs.python.org/library/urlparse.html +# as well as from "URI scheme" at Wikipedia: +# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme +# Many more will likely need to be added! +ACCEPTABLE_URI_SCHEMES = ( + 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto', + 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp', + 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais', + # Additional common-but-unofficial schemes + 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs', + 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg', +) +#ACCEPTABLE_URI_SCHEMES = () + # ---------- required modules (should come with any Python distribution) ---------- -import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2 +import cgi +import copy +import datetime +import re +import struct +import sys +import time +import types +import urllib +import urllib2 +import urlparse + +from htmlentitydefs import name2codepoint, codepoint2name, entitydefs + try: - from cStringIO import StringIO as _StringIO -except: - from StringIO import StringIO as _StringIO + from io import BytesIO as _StringIO +except ImportError: + try: + from cStringIO import StringIO as _StringIO + except ImportError: + from StringIO import StringIO as _StringIO # ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- # gzip is included with most Python distributions, but may not be available if you compiled your own try: import gzip -except: +except ImportError: gzip = None try: import zlib -except: +except ImportError: zlib = None # If a real XML parser is available, feedparser will attempt to use it. feedparser has @@ -90,44 +175,112 @@ except: # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing. try: import xml.sax - xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers from xml.sax.saxutils import escape as _xmlescape - _XML_AVAILABLE = 1 -except: +except ImportError: _XML_AVAILABLE = 0 - def _xmlescape(data): + def _xmlescape(data,entities={}): data = data.replace('&', '&') data = data.replace('>', '>') data = data.replace('<', '<') + for char, entity in entities: + data = data.replace(char, entity) return data +else: + try: + xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers + except xml.sax.SAXReaderNotAvailable: + _XML_AVAILABLE = 0 + else: + _XML_AVAILABLE = 1 -# base64 support for Atom feeds that contain embedded binary data +# sgmllib is not available by default in Python 3; if the end user doesn't have +# it available then we'll lose illformed XML parsing, content santizing, and +# microformat support (at least while feedparser depends on BeautifulSoup). try: - import base64, binascii -except: - base64 = binascii = None + import sgmllib +except ImportError: + # This is probably Python 3, which doesn't include sgmllib anymore + _SGML_AVAILABLE = 0 + + # Mock sgmllib enough to allow subclassing later on + class sgmllib(object): + class SGMLParser(object): + def goahead(self, i): + pass + def parse_starttag(self, i): + pass +else: + _SGML_AVAILABLE = 1 + + # sgmllib defines a number of module-level regular expressions that are + # insufficient for the XML parsing feedparser needs. Rather than modify + # the variables directly in sgmllib, they're defined here using the same + # names, and the compiled code objects of several sgmllib.SGMLParser + # methods are copied into _BaseHTMLProcessor so that they execute in + # feedparser's scope instead of sgmllib's scope. + charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);') + tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') + attrfind = re.compile( + r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*' + r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?' + ) + + # Unfortunately, these must be copied over to prevent NameError exceptions + entityref = sgmllib.entityref + incomplete = sgmllib.incomplete + interesting = sgmllib.interesting + shorttag = sgmllib.shorttag + shorttagopen = sgmllib.shorttagopen + starttagopen = sgmllib.starttagopen + + class _EndBracketRegEx: + def __init__(self): + # Overriding the built-in sgmllib.endbracket regex allows the + # parser to find angle brackets embedded in element attributes. + self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''') + def search(self, target, index=0): + match = self.endbracket.match(target, index) + if match is not None: + # Returning a new object in the calling thread's context + # resolves a thread-safety. + return EndBracketMatch(match) + return None + class EndBracketMatch: + def __init__(self, match): + self.match = match + def start(self, n): + return self.match.end(n) + endbracket = _EndBracketRegEx() + # cjkcodecs and iconv_codec provide support for more character encodings. # Both are available from http://cjkpython.i18n.org/ try: import cjkcodecs.aliases -except: +except ImportError: pass try: import iconv_codec -except: +except ImportError: pass # chardet library auto-detects character encodings # Download from http://chardet.feedparser.org/ try: import chardet - if _debug: - import chardet.constants - chardet.constants._debug = 1 -except: +except ImportError: chardet = None +# BeautifulSoup parser used for parsing microformats from embedded HTML content +# http://www.crummy.com/software/BeautifulSoup/ +# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the +# older 2.x series. If it doesn't, and you can figure out why, I'll accept a +# patch and modify the compatibility statement accordingly. +try: + import BeautifulSoup +except ImportError: + BeautifulSoup = None + # ---------- don't touch these ---------- class ThingsNobodyCaresAboutButMe(Exception): pass class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass @@ -135,47 +288,31 @@ class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass class UndeclaredNamespace(Exception): pass -sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') -sgmllib.special = re.compile('' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0) + self.contentparams['type'] = u'application/xhtml+xml' + if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml': + if tag.find(':') <> -1: + prefix, tag = tag.split(':', 1) + namespace = self.namespacesInUse.get(prefix, '') + if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML': + attrs.append(('xmlns',namespace)) + if tag=='svg' and namespace=='http://www.w3.org/2000/svg': + attrs.append(('xmlns',namespace)) + if tag == 'svg': + self.svgOK += 1 + return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0) # match namespaces if tag.find(':') <> -1: @@ -449,17 +638,24 @@ class _FeedParserMixin: self.intextinput = 0 if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'): self.inimage = 0 - + # call special handler (if defined) or default handler methodname = '_start_' + prefix + suffix try: method = getattr(self, methodname) return method(attrsD) except AttributeError: - return self.push(prefix + suffix, 1) + # Since there's no handler or something has gone wrong we explicitly add the element and its attributes + unknown_tag = prefix + suffix + if len(attrsD) == 0: + # No attributes so merge it into the encosing dictionary + return self.push(unknown_tag, 1) + else: + # Has attributes so create it in its own dictionary + context = self._getContext() + context[unknown_tag] = attrsD def unknown_endtag(self, tag): - if _debug: sys.stderr.write('end %s\n' % tag) # match namespaces if tag.find(':') <> -1: prefix, suffix = tag.split(':', 1) @@ -468,20 +664,26 @@ class _FeedParserMixin: prefix = self.namespacemap.get(prefix, prefix) if prefix: prefix = prefix + '_' + if suffix == 'svg' and self.svgOK: + self.svgOK -= 1 # call special handler (if defined) or default handler methodname = '_end_' + prefix + suffix try: + if self.svgOK: + raise AttributeError() method = getattr(self, methodname) method() except AttributeError: self.pop(prefix + suffix) # track inline content - if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): + if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'): # element declared itself as escaped markup, but it isn't really - self.contentparams['type'] = 'application/xhtml+xml' - if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': + if tag in ['xhtml:div', 'div']: + return # typepad does this 10/2007 + self.contentparams['type'] = u'application/xhtml+xml' + if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml': tag = tag.split(':')[-1] self.handle_data('' % tag, escape=0) @@ -497,7 +699,8 @@ class _FeedParserMixin: def handle_charref(self, ref): # called for each character reference, e.g. for ' ', ref will be '160' - if not self.elementstack: return + if not self.elementstack: + return ref = ref.lower() if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'): text = '&#%s;' % ref @@ -511,30 +714,29 @@ class _FeedParserMixin: def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' - if not self.elementstack: return - if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref) + if not self.elementstack: + return if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): text = '&%s;' % ref + elif ref in self.entities.keys(): + text = self.entities[ref] + if text.startswith('&#') and text.endswith(';'): + return self.handle_entityref(text) else: - # entity resolution graciously donated by Aaron Swartz - def name2cp(k): - import htmlentitydefs - if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3 - return htmlentitydefs.name2codepoint[k] - k = htmlentitydefs.entitydefs[k] - if k.startswith('&#') and k.endswith(';'): - return int(k[2:-1]) # not in latin-1 - return ord(k) - try: name2cp(ref) - except KeyError: text = '&%s;' % ref - else: text = unichr(name2cp(ref)).encode('utf-8') + try: + name2codepoint[ref] + except KeyError: + text = '&%s;' % ref + else: + text = unichr(name2codepoint[ref]).encode('utf-8') self.elementstack[-1][2].append(text) def handle_data(self, text, escape=1): # called for each block of plain text, i.e. outside of any tag and # not containing any character or entity references - if not self.elementstack: return - if escape and self.contentparams.get('type') == 'application/xhtml+xml': + if not self.elementstack: + return + if escape and self.contentparams.get('type') == u'application/xhtml+xml': text = _xmlescape(text) self.elementstack[-1][2].append(text) @@ -551,37 +753,43 @@ class _FeedParserMixin: def parse_declaration(self, i): # override internal declaration handler to handle CDATA blocks - if _debug: sys.stderr.write('entering parse_declaration\n') if self.rawdata[i:i+9] == '', i) - if k == -1: k = len(self.rawdata) + if k == -1: + # CDATA block began but didn't finish + k = len(self.rawdata) + return k self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0) return k+3 else: k = self.rawdata.find('>', i) - return k+1 + if k >= 0: + return k+1 + else: + # We have an incomplete CDATA block. + return k def mapContentType(self, contentType): contentType = contentType.lower() - if contentType == 'text': - contentType = 'text/plain' + if contentType == 'text' or contentType == 'plain': + contentType = u'text/plain' elif contentType == 'html': - contentType = 'text/html' + contentType = u'text/html' elif contentType == 'xhtml': - contentType = 'application/xhtml+xml' + contentType = u'application/xhtml+xml' return contentType - + def trackNamespace(self, prefix, uri): loweruri = uri.lower() if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version: - self.version = 'rss090' + self.version = u'rss090' if loweruri == 'http://purl.org/rss/1.0/' and not self.version: - self.version = 'rss10' + self.version = u'rss10' if loweruri == 'http://www.w3.org/2005/atom' and not self.version: - self.version = 'atom10' - if loweruri.find('backend.userland.com/rss') <> -1: + self.version = u'atom10' + if loweruri.find(u'backend.userland.com/rss') <> -1: # match any backend.userland.com namespace - uri = 'http://backend.userland.com/rss' + uri = u'http://backend.userland.com/rss' loweruri = uri if self._matchnamespaces.has_key(loweruri): self.namespacemap[prefix] = self._matchnamespaces[loweruri] @@ -590,41 +798,84 @@ class _FeedParserMixin: self.namespacesInUse[prefix or ''] = uri def resolveURI(self, uri): - return _urljoin(self.baseuri or '', uri) - + return _urljoin(self.baseuri or u'', uri) + def decodeEntities(self, element, data): return data + def strattrs(self, attrs): + return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'"'})) for t in attrs]) + def push(self, element, expectingText): self.elementstack.append([element, expectingText, []]) def pop(self, element, stripWhitespace=1): - if not self.elementstack: return - if self.elementstack[-1][0] != element: return - + if not self.elementstack: + return + if self.elementstack[-1][0] != element: + return + element, expectingText, pieces = self.elementstack.pop() - output = ''.join(pieces) + + if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml': + # remove enclosing child element, but only if it is a
and + # only if all the remaining content is nested underneath it. + # This means that the divs would be retained in the following: + #
foo
bar
+ while pieces and len(pieces)>1 and not pieces[-1].strip(): + del pieces[-1] + while pieces and len(pieces)>1 and not pieces[0].strip(): + del pieces[0] + if pieces and (pieces[0] == '
' or pieces[0].startswith('
': + depth = 0 + for piece in pieces[:-1]: + if piece.startswith(''): + depth += 1 + else: + pieces = pieces[1:-1] + + # Ensure each piece is a str for Python 3 + for (i, v) in enumerate(pieces): + if not isinstance(v, unicode): + pieces[i] = v.decode('utf-8') + + output = u''.join(pieces) if stripWhitespace: output = output.strip() - if not expectingText: return output + if not expectingText: + return output # decode base64 content if base64 and self.contentparams.get('base64', 0): try: - output = base64.decodestring(output) + output = _base64decode(output) except binascii.Error: pass except binascii.Incomplete: pass - + except TypeError: + # In Python 3, base64 takes and outputs bytes, not str + # This may not be the most correct way to accomplish this + output = _base64decode(output.encode('utf-8')).decode('utf-8') + # resolve relative URIs if (element in self.can_be_relative_uri) and output: output = self.resolveURI(output) - + # decode entities within embedded markup if not self.contentparams.get('base64', 0): output = self.decodeEntities(element, output) + # some feed formats require consumers to guess + # whether the content is html or plain text + if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain': + if self.lookslikehtml(output): + self.contentparams['type'] = u'text/html' + # remove temporary cruft from contentparams try: del self.contentparams['mode'] @@ -635,26 +886,55 @@ class _FeedParserMixin: except KeyError: pass + is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types # resolve relative URIs within embedded markup - if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types: + if is_htmlish and RESOLVE_RELATIVE_URIS: if element in self.can_contain_relative_uris: - output = _resolveRelativeURIs(output, self.baseuri, self.encoding) - + output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html')) + + # parse microformats + # (must do this before sanitizing because some microformats + # rely on elements that we sanitize) + if is_htmlish and element in ['content', 'description', 'summary']: + mfresults = _parseMicroformats(output, self.baseuri, self.encoding) + if mfresults: + for tag in mfresults.get('tags', []): + self._addTag(tag['term'], tag['scheme'], tag['label']) + for enclosure in mfresults.get('enclosures', []): + self._start_enclosure(enclosure) + for xfn in mfresults.get('xfn', []): + self._addXFN(xfn['relationships'], xfn['href'], xfn['name']) + vcard = mfresults.get('vcard') + if vcard: + self._getContext()['vcard'] = vcard + # sanitize embedded markup - if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types: + if is_htmlish and SANITIZE_HTML: if element in self.can_contain_dangerous_markup: - output = _sanitizeHTML(output, self.encoding) + output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html')) + + if self.encoding and not isinstance(output, unicode): + output = output.decode(self.encoding, 'ignore') - if self.encoding and type(output) != type(u''): + # address common error where people take data that is already + # utf-8, presume that it is iso-8859-1, and re-encode it. + if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode): try: - output = unicode(output, self.encoding) - except: + output = output.encode('iso-8859-1').decode('utf-8') + except (UnicodeEncodeError, UnicodeDecodeError): pass + # map win-1252 extensions to the proper code points + if isinstance(output, unicode): + output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output]) + # categories/tags/keywords/whatever are handled in _end_category if element == 'category': return output - + + if element == 'title' and self.hasTitle: + return output + # store output in appropriate place(s) if self.inentry and not self.insource: if element == 'content': @@ -663,9 +943,14 @@ class _FeedParserMixin: contentparams['value'] = output self.entries[-1][element].append(contentparams) elif element == 'link': - self.entries[-1][element] = output - if output: - self.entries[-1]['links'][-1]['href'] = output + if not self.inimage: + # query variables in urls in link elements are improperly + # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're + # unhandled character references. fix this special case. + output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output) + self.entries[-1][element] = output + if output: + self.entries[-1]['links'][-1]['href'] = output else: if element == 'description': element = 'summary' @@ -674,12 +959,15 @@ class _FeedParserMixin: contentparams = copy.deepcopy(self.contentparams) contentparams['value'] = output self.entries[-1][element + '_detail'] = contentparams - elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage): + elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage): context = self._getContext() if element == 'description': element = 'subtitle' context[element] = output if element == 'link': + # fix query variables; see above for the explanation + output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output) + context[element] = output context['links'][-1]['href'] = output elif self.incontent: contentparams = copy.deepcopy(self.contentparams) @@ -689,6 +977,8 @@ class _FeedParserMixin: def pushContent(self, tag, attrsD, defaultContentType, expectingText): self.incontent += 1 + if self.lang: + self.lang=self.lang.replace('_','-') self.contentparams = FeedParserDict({ 'type': self.mapContentType(attrsD.get('type', defaultContentType)), 'language': self.lang, @@ -701,7 +991,28 @@ class _FeedParserMixin: self.incontent -= 1 self.contentparams.clear() return value - + + # a number of elements in a number of RSS variants are nominally plain + # text, but this is routinely ignored. This is an attempt to detect + # the most common cases. As false positives often result in silent + # data loss, this function errs on the conservative side. + @staticmethod + def lookslikehtml(s): + # must have a close tag or a entity reference to qualify + if not (re.search(r'',s) or re.search("&#?\w+;",s)): + return + + # all tags must be in a restricted subset of valid HTML tags + if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements, + re.findall(r' -1: @@ -710,18 +1021,18 @@ class _FeedParserMixin: prefix = self.namespacemap.get(prefix, prefix) name = prefix + ':' + suffix return name - + def _getAttribute(self, attrsD, name): return attrsD.get(self._mapToStandardPrefix(name)) def _isBase64(self, attrsD, contentparams): if attrsD.get('mode', '') == 'base64': return 1 - if self.contentparams['type'].startswith('text/'): + if self.contentparams['type'].startswith(u'text/'): return 0 - if self.contentparams['type'].endswith('+xml'): + if self.contentparams['type'].endswith(u'+xml'): return 0 - if self.contentparams['type'].endswith('/xml'): + if self.contentparams['type'].endswith(u'/xml'): return 0 return 1 @@ -738,33 +1049,35 @@ class _FeedParserMixin: pass attrsD['href'] = href return attrsD - - def _save(self, key, value): + + def _save(self, key, value, overwrite=False): context = self._getContext() - context.setdefault(key, value) + if overwrite: + context[key] = value + else: + context.setdefault(key, value) def _start_rss(self, attrsD): - versionmap = {'0.91': 'rss091u', - '0.92': 'rss092', - '0.93': 'rss093', - '0.94': 'rss094'} - if not self.version: + versionmap = {'0.91': u'rss091u', + '0.92': u'rss092', + '0.93': u'rss093', + '0.94': u'rss094'} + #If we're here then this is an RSS feed. + #If we don't have a version or have a version that starts with something + #other than RSS then there's been a mistake. Correct it. + if not self.version or not self.version.startswith(u'rss'): attr_version = attrsD.get('version', '') version = versionmap.get(attr_version) if version: self.version = version elif attr_version.startswith('2.'): - self.version = 'rss20' + self.version = u'rss20' else: - self.version = 'rss' - - def _start_dlhottitles(self, attrsD): - self.version = 'hotrss' + self.version = u'rss' def _start_channel(self, attrsD): self.infeed = 1 self._cdf_common(attrsD) - _start_feedinfo = _start_channel def _cdf_common(self, attrsD): if attrsD.has_key('lastmod'): @@ -775,41 +1088,44 @@ class _FeedParserMixin: self._start_link({}) self.elementstack[-1][-1] = attrsD['href'] self._end_link() - + def _start_feed(self, attrsD): self.infeed = 1 - versionmap = {'0.1': 'atom01', - '0.2': 'atom02', - '0.3': 'atom03'} + versionmap = {'0.1': u'atom01', + '0.2': u'atom02', + '0.3': u'atom03'} if not self.version: attr_version = attrsD.get('version') version = versionmap.get(attr_version) if version: self.version = version else: - self.version = 'atom' + self.version = u'atom' def _end_channel(self): self.infeed = 0 _end_feed = _end_channel - + def _start_image(self, attrsD): + context = self._getContext() + if not self.inentry: + context.setdefault('image', FeedParserDict()) self.inimage = 1 + self.hasTitle = 0 self.push('image', 0) - context = self._getContext() - context.setdefault('image', FeedParserDict()) - + def _end_image(self): self.pop('image') self.inimage = 0 def _start_textinput(self, attrsD): - self.intextinput = 1 - self.push('textinput', 0) context = self._getContext() context.setdefault('textinput', FeedParserDict()) + self.intextinput = 1 + self.hasTitle = 0 + self.push('textinput', 0) _start_textInput = _start_textinput - + def _end_textinput(self): self.pop('textinput') self.intextinput = 0 @@ -818,6 +1134,10 @@ class _FeedParserMixin: def _start_author(self, attrsD): self.inauthor = 1 self.push('author', 1) + # Append a new FeedParserDict when expecting an author + context = self._getContext() + context.setdefault('authors', []) + context['authors'].append(FeedParserDict()) _start_managingeditor = _start_author _start_dc_author = _start_author _start_dc_creator = _start_author @@ -877,7 +1197,7 @@ class _FeedParserMixin: self._save_contributor('name', value) elif self.intextinput: context = self._getContext() - context['textinput']['name'] = value + context['name'] = value _end_itunes_name = _end_name def _start_width(self, attrsD): @@ -887,11 +1207,11 @@ class _FeedParserMixin: value = self.pop('width') try: value = int(value) - except: + except ValueError: value = 0 if self.inimage: context = self._getContext() - context['image']['width'] = value + context['width'] = value def _start_height(self, attrsD): self.push('height', 0) @@ -900,11 +1220,11 @@ class _FeedParserMixin: value = self.pop('height') try: value = int(value) - except: + except ValueError: value = 0 if self.inimage: context = self._getContext() - context['image']['height'] = value + context['height'] = value def _start_url(self, attrsD): self.push('href', 1) @@ -917,12 +1237,6 @@ class _FeedParserMixin: self._save_author('href', value) elif self.incontributor: self._save_contributor('href', value) - elif self.inimage: - context = self._getContext() - context['image']['href'] = value - elif self.intextinput: - context = self._getContext() - context['textinput']['link'] = value _end_homepage = _end_url _end_uri = _end_url @@ -943,6 +1257,10 @@ class _FeedParserMixin: def _getContext(self): if self.insource: context = self.sourcedata + elif self.inimage and self.feeddata.has_key('image'): + context = self.feeddata['image'] + elif self.intextinput: + context = self.feeddata['textinput'] elif self.inentry: context = self.entries[-1] else: @@ -954,6 +1272,8 @@ class _FeedParserMixin: context.setdefault(prefix + '_detail', FeedParserDict()) context[prefix + '_detail'][key] = value self._sync_author_detail() + context.setdefault('authors', [FeedParserDict()]) + context['authors'][-1][key] = value def _save_contributor(self, key, value): context = self._getContext() @@ -967,32 +1287,38 @@ class _FeedParserMixin: name = detail.get('name') email = detail.get('email') if name and email: - context[key] = '%s (%s)' % (name, email) + context[key] = u'%s (%s)' % (name, email) elif name: context[key] = name elif email: context[key] = email else: - author = context.get(key) - if not author: return - emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author) - if not emailmatch: return - email = emailmatch.group(0) - # probably a better way to do the following, but it passes all the tests - author = author.replace(email, '') - author = author.replace('()', '') - author = author.strip() - if author and (author[0] == '('): - author = author[1:] - if author and (author[-1] == ')'): - author = author[:-1] - author = author.strip() - context.setdefault('%s_detail' % key, FeedParserDict()) - context['%s_detail' % key]['name'] = author - context['%s_detail' % key]['email'] = email + author, email = context.get(key), None + if not author: + return + emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author) + if emailmatch: + email = emailmatch.group(0) + # probably a better way to do the following, but it passes all the tests + author = author.replace(email, u'') + author = author.replace(u'()', u'') + author = author.replace(u'<>', u'') + author = author.replace(u'<>', u'') + author = author.strip() + if author and (author[0] == u'('): + author = author[1:] + if author and (author[-1] == u')'): + author = author[:-1] + author = author.strip() + if author or email: + context.setdefault('%s_detail' % key, FeedParserDict()) + if author: + context['%s_detail' % key]['name'] = author + if email: + context['%s_detail' % key]['email'] = email def _start_subtitle(self, attrsD): - self.pushContent('subtitle', attrsD, 'text/plain', 1) + self.pushContent('subtitle', attrsD, u'text/plain', 1) _start_tagline = _start_subtitle _start_itunes_subtitle = _start_subtitle @@ -1000,9 +1326,9 @@ class _FeedParserMixin: self.popContent('subtitle') _end_tagline = _end_subtitle _end_itunes_subtitle = _end_subtitle - + def _start_rights(self, attrsD): - self.pushContent('rights', attrsD, 'text/plain', 1) + self.pushContent('rights', attrsD, u'text/plain', 1) _start_dc_rights = _start_rights _start_copyright = _start_rights @@ -1016,13 +1342,13 @@ class _FeedParserMixin: self.push('item', 0) self.inentry = 1 self.guidislink = 0 + self.hasTitle = 0 id = self._getAttribute(attrsD, 'rdf:about') if id: context = self._getContext() context['id'] = id self._cdf_common(attrsD) _start_entry = _start_item - _start_product = _start_item def _end_item(self): self.pop('item') @@ -1053,7 +1379,7 @@ class _FeedParserMixin: def _end_published(self): value = self.pop('published') - self._save('published_parsed', _parse_date(value)) + self._save('published_parsed', _parse_date(value), overwrite=True) _end_dcterms_issued = _end_published _end_issued = _end_published @@ -1063,15 +1389,17 @@ class _FeedParserMixin: _start_dcterms_modified = _start_updated _start_pubdate = _start_updated _start_dc_date = _start_updated + _start_lastbuilddate = _start_updated def _end_updated(self): value = self.pop('updated') parsed_value = _parse_date(value) - self._save('updated_parsed', parsed_value) + self._save('updated_parsed', parsed_value, overwrite=True) _end_modified = _end_updated _end_dcterms_modified = _end_updated _end_pubdate = _end_updated _end_dc_date = _end_updated + _end_lastbuilddate = _end_updated def _start_created(self, attrsD): self.push('created', 1) @@ -1079,38 +1407,56 @@ class _FeedParserMixin: def _end_created(self): value = self.pop('created') - self._save('created_parsed', _parse_date(value)) + self._save('created_parsed', _parse_date(value), overwrite=True) _end_dcterms_created = _end_created def _start_expirationdate(self, attrsD): self.push('expired', 1) def _end_expirationdate(self): - self._save('expired_parsed', _parse_date(self.pop('expired'))) + self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True) def _start_cc_license(self, attrsD): - self.push('license', 1) + context = self._getContext() value = self._getAttribute(attrsD, 'rdf:resource') + attrsD = FeedParserDict() + attrsD['rel'] = u'license' if value: - self.elementstack[-1][2].append(value) - self.pop('license') - + attrsD['href']=value + context.setdefault('links', []).append(attrsD) + def _start_creativecommons_license(self, attrsD): self.push('license', 1) + _start_creativeCommons_license = _start_creativecommons_license def _end_creativecommons_license(self): - self.pop('license') + value = self.pop('license') + context = self._getContext() + attrsD = FeedParserDict() + attrsD['rel'] = u'license' + if value: + attrsD['href'] = value + context.setdefault('links', []).append(attrsD) + del context['license'] + _end_creativeCommons_license = _end_creativecommons_license + + def _addXFN(self, relationships, href, name): + context = self._getContext() + xfn = context.setdefault('xfn', []) + value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name}) + if value not in xfn: + xfn.append(value) def _addTag(self, term, scheme, label): context = self._getContext() tags = context.setdefault('tags', []) - if (not term) and (not scheme) and (not label): return + if (not term) and (not scheme) and (not label): + return value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) if value not in tags: - tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label})) + tags.append(value) def _start_category(self, attrsD): - if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) term = attrsD.get('term') scheme = attrsD.get('scheme', attrsD.get('domain')) label = attrsD.get('label') @@ -1118,18 +1464,23 @@ class _FeedParserMixin: self.push('category', 1) _start_dc_subject = _start_category _start_keywords = _start_category - + + def _start_media_category(self, attrsD): + attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema') + self._start_category(attrsD) + def _end_itunes_keywords(self): for term in self.pop('itunes_keywords').split(): - self._addTag(term, 'http://www.itunes.com/', None) - + self._addTag(term, u'http://www.itunes.com/', None) + def _start_itunes_category(self, attrsD): - self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) + self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None) self.push('category', 1) - + def _end_category(self): value = self.pop('category') - if not value: return + if not value: + return context = self._getContext() tags = context['tags'] if value and len(tags) and not tags[-1]['term']: @@ -1139,38 +1490,35 @@ class _FeedParserMixin: _end_dc_subject = _end_category _end_keywords = _end_category _end_itunes_category = _end_category + _end_media_category = _end_category def _start_cloud(self, attrsD): self._getContext()['cloud'] = FeedParserDict(attrsD) - + def _start_link(self, attrsD): - attrsD.setdefault('rel', 'alternate') - attrsD.setdefault('type', 'text/html') + attrsD.setdefault('rel', u'alternate') + if attrsD['rel'] == u'self': + attrsD.setdefault('type', u'application/atom+xml') + else: + attrsD.setdefault('type', u'text/html') + context = self._getContext() attrsD = self._itsAnHrefDamnIt(attrsD) if attrsD.has_key('href'): attrsD['href'] = self.resolveURI(attrsD['href']) expectingText = self.infeed or self.inentry or self.insource - context = self._getContext() context.setdefault('links', []) - context['links'].append(FeedParserDict(attrsD)) - if attrsD['rel'] == 'enclosure': - self._start_enclosure(attrsD) + if not (self.inentry and self.inimage): + context['links'].append(FeedParserDict(attrsD)) if attrsD.has_key('href'): expectingText = 0 - if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): + if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): context['link'] = attrsD['href'] else: self.push('link', expectingText) - _start_producturl = _start_link def _end_link(self): value = self.pop('link') context = self._getContext() - if self.intextinput: - context['textinput']['link'] = value - if self.inimage: - context['image']['link'] = value - _end_producturl = _end_link def _start_guid(self, attrsD): self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') @@ -1185,19 +1533,26 @@ class _FeedParserMixin: self._save('link', value) def _start_title(self, attrsD): - self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) + if self.svgOK: + return self.unknown_starttag('title', attrsD.items()) + self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource) _start_dc_title = _start_title _start_media_title = _start_title def _end_title(self): + if self.svgOK: + return value = self.popContent('title') + if not value: + return context = self._getContext() - if self.intextinput: - context['textinput']['title'] = value - elif self.inimage: - context['image']['title'] = value + self.hasTitle = 1 _end_dc_title = _end_title - _end_media_title = _end_title + + def _end_media_title(self): + hasTitle = self.hasTitle + self._end_title() + self.hasTitle = hasTitle def _start_description(self, attrsD): context = self._getContext() @@ -1205,26 +1560,23 @@ class _FeedParserMixin: self._summaryKey = 'content' self._start_content(attrsD) else: - self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) + self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource) + _start_dc_description = _start_description def _start_abstract(self, attrsD): - self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) + self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource) def _end_description(self): if self._summaryKey == 'content': self._end_content() else: value = self.popContent('description') - context = self._getContext() - if self.intextinput: - context['textinput']['description'] = value - elif self.inimage: - context['image']['description'] = value self._summaryKey = None _end_abstract = _end_description + _end_dc_description = _end_description def _start_info(self, attrsD): - self.pushContent('info', attrsD, 'text/plain', 1) + self.pushContent('info', attrsD, u'text/plain', 1) _start_feedburner_browserfriendly = _start_info def _end_info(self): @@ -1244,7 +1596,7 @@ class _FeedParserMixin: context = self._getContext() if context.has_key('generator_detail'): context['generator_detail']['name'] = value - + def _start_admin_generatoragent(self, attrsD): self.push('generator', 1) value = self._getAttribute(attrsD, 'rdf:resource') @@ -1259,7 +1611,7 @@ class _FeedParserMixin: if value: self.elementstack[-1][2].append(value) self.pop('errorreportsto') - + def _start_summary(self, attrsD): context = self._getContext() if context.has_key('summary'): @@ -1267,7 +1619,7 @@ class _FeedParserMixin: self._start_content(attrsD) else: self._summaryKey = 'summary' - self.pushContent(self._summaryKey, attrsD, 'text/plain', 1) + self.pushContent(self._summaryKey, attrsD, u'text/plain', 1) _start_itunes_summary = _start_summary def _end_summary(self): @@ -1277,84 +1629,134 @@ class _FeedParserMixin: self.popContent(self._summaryKey or 'summary') self._summaryKey = None _end_itunes_summary = _end_summary - + def _start_enclosure(self, attrsD): attrsD = self._itsAnHrefDamnIt(attrsD) - self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD)) - href = attrsD.get('href') - if href: - context = self._getContext() - if not context.get('id'): - context['id'] = href - + context = self._getContext() + attrsD['rel'] = u'enclosure' + context.setdefault('links', []).append(FeedParserDict(attrsD)) + def _start_source(self, attrsD): + if 'url' in attrsD: + # This means that we're processing a source element from an RSS 2.0 feed + self.sourcedata['href'] = attrsD[u'url'] + self.push('source', 1) self.insource = 1 + self.hasTitle = 0 def _end_source(self): self.insource = 0 + value = self.pop('source') + if value: + self.sourcedata['title'] = value self._getContext()['source'] = copy.deepcopy(self.sourcedata) self.sourcedata.clear() def _start_content(self, attrsD): - self.pushContent('content', attrsD, 'text/plain', 1) + self.pushContent('content', attrsD, u'text/plain', 1) src = attrsD.get('src') if src: self.contentparams['src'] = src self.push('content', 1) - def _start_prodlink(self, attrsD): - self.pushContent('content', attrsD, 'text/html', 1) - def _start_body(self, attrsD): - self.pushContent('content', attrsD, 'application/xhtml+xml', 1) + self.pushContent('content', attrsD, u'application/xhtml+xml', 1) _start_xhtml_body = _start_body def _start_content_encoded(self, attrsD): - self.pushContent('content', attrsD, 'text/html', 1) + self.pushContent('content', attrsD, u'text/html', 1) _start_fullitem = _start_content_encoded def _end_content(self): - copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) + copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types) value = self.popContent('content') - if copyToDescription: - self._save('description', value) + if copyToSummary: + self._save('summary', value) + _end_body = _end_content _end_xhtml_body = _end_content _end_content_encoded = _end_content _end_fullitem = _end_content - _end_prodlink = _end_content def _start_itunes_image(self, attrsD): self.push('itunes_image', 0) - self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) + if attrsD.get('href'): + self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) _start_itunes_link = _start_itunes_image - + def _end_itunes_block(self): value = self.pop('itunes_block', 0) self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 def _end_itunes_explicit(self): value = self.pop('itunes_explicit', 0) - self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 + # Convert 'yes' -> True, 'clean' to False, and any other value to None + # False and None both evaluate as False, so the difference can be ignored + # by applications that only need to know if the content is explicit. + self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0] + + def _start_media_content(self, attrsD): + context = self._getContext() + context.setdefault('media_content', []) + context['media_content'].append(attrsD) + + def _start_media_thumbnail(self, attrsD): + context = self._getContext() + context.setdefault('media_thumbnail', []) + self.push('url', 1) # new + context['media_thumbnail'].append(attrsD) + + def _end_media_thumbnail(self): + url = self.pop('url') + context = self._getContext() + if url != None and len(url.strip()) != 0: + if not context['media_thumbnail'][-1].has_key('url'): + context['media_thumbnail'][-1]['url'] = url + + def _start_media_player(self, attrsD): + self.push('media_player', 0) + self._getContext()['media_player'] = FeedParserDict(attrsD) + + def _end_media_player(self): + value = self.pop('media_player') + context = self._getContext() + context['media_player']['content'] = value + + def _start_newlocation(self, attrsD): + self.push('newlocation', 1) + + def _end_newlocation(self): + url = self.pop('newlocation') + context = self._getContext() + # don't set newlocation if the context isn't right + if context is not self.feeddata: + return + context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip()) if _XML_AVAILABLE: class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): def __init__(self, baseuri, baselang, encoding): - if _debug: sys.stderr.write('trying StrictFeedParser\n') xml.sax.handler.ContentHandler.__init__(self) _FeedParserMixin.__init__(self, baseuri, baselang, encoding) self.bozo = 0 self.exc = None - + self.decls = {} + def startPrefixMapping(self, prefix, uri): + if not uri: + return + # Jython uses '' instead of None; standardize on None + prefix = prefix or None self.trackNamespace(prefix, uri) - + if prefix and uri == 'http://www.w3.org/1999/xlink': + self.decls['xmlns:' + prefix] = uri + def startElementNS(self, name, qname, attrs): namespace, localname = name lowernamespace = str(namespace or '').lower() - if lowernamespace.find('backend.userland.com/rss') <> -1: + if lowernamespace.find(u'backend.userland.com/rss') <> -1: # match any backend.userland.com namespace - namespace = 'http://backend.userland.com/rss' + namespace = u'http://backend.userland.com/rss' lowernamespace = namespace if qname and qname.find(':') > 0: givenprefix = qname.split(':')[0] @@ -1363,10 +1765,7 @@ if _XML_AVAILABLE: prefix = self._matchnamespaces.get(lowernamespace, givenprefix) if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix - if prefix: - localname = prefix + ':' + localname localname = str(localname).lower() - if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) # qname implementation is horribly broken in Python 2.1 (it # doesn't report any), and slightly broken in Python 2.2 (it @@ -1375,8 +1774,21 @@ if _XML_AVAILABLE: # the qnames the SAX parser gives us (if indeed it gives us any # at all). Thanks to MatejC for helping me test this and # tirelessly telling me that it didn't work yet. - attrsD = {} - for (namespace, attrlocalname), attrvalue in attrs._attrs.items(): + attrsD, self.decls = self.decls, {} + if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML': + attrsD['xmlns']=namespace + if localname=='svg' and namespace=='http://www.w3.org/2000/svg': + attrsD['xmlns']=namespace + + if prefix: + localname = prefix.lower() + ':' + localname + elif namespace and not qname: #Expat + for name,value in self.namespacesInUse.items(): + if name and value == namespace: + localname = name + ':' + localname + break + + for (namespace, attrlocalname), attrvalue in attrs.items(): lowernamespace = (namespace or '').lower() prefix = self._matchnamespaces.get(lowernamespace, '') if prefix: @@ -1399,26 +1811,39 @@ if _XML_AVAILABLE: prefix = self._matchnamespaces.get(lowernamespace, givenprefix) if prefix: localname = prefix + ':' + localname + elif namespace and not qname: #Expat + for name,value in self.namespacesInUse.items(): + if name and value == namespace: + localname = name + ':' + localname + break localname = str(localname).lower() self.unknown_endtag(localname) def error(self, exc): self.bozo = 1 self.exc = exc - + + # drv_libxml2 calls warning() in some cases + warning = error + def fatalError(self, exc): self.error(exc) raise exc class _BaseHTMLProcessor(sgmllib.SGMLParser): - elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', - 'img', 'input', 'isindex', 'link', 'meta', 'param'] - - def __init__(self, encoding): + special = re.compile('''[<>'"]''') + bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") + elements_no_end_tag = [ + 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', + 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', + 'source', 'track', 'wbr' + ] + + def __init__(self, encoding, _type): self.encoding = encoding - if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) + self._type = _type sgmllib.SGMLParser.__init__(self) - + def reset(self): self.pieces = [] sgmllib.SGMLParser.reset(self) @@ -1429,35 +1854,77 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): return '<' + tag + ' />' else: return '<' + tag + '>' - + + # By declaring these methods and overriding their compiled code + # with the code from sgmllib, the original code will execute in + # feedparser's scope instead of sgmllib's. This means that the + # `tagfind` and `charref` regular expressions will be found as + # they're declared above, not as they're declared in sgmllib. + def goahead(self, i): + pass + goahead.func_code = sgmllib.SGMLParser.goahead.func_code + + def __parse_starttag(self, i): + pass + __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code + + def parse_starttag(self,i): + j = self.__parse_starttag(i) + if self._type == 'application/xhtml+xml': + if j>2 and self.rawdata[j-2:j]=='/>': + self.unknown_endtag(self.lasttag) + return j + def feed(self, data): data = re.compile(r'', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace - data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) + data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data) data = data.replace(''', "'") data = data.replace('"', '"') - if self.encoding and type(data) == type(u''): - data = data.encode(self.encoding) + try: + bytes + if bytes is str: + raise NameError + self.encoding = self.encoding + u'_INVALID_PYTHON_3' + except NameError: + if self.encoding and isinstance(data, unicode): + data = data.encode(self.encoding) sgmllib.SGMLParser.feed(self, data) + sgmllib.SGMLParser.close(self) def normalize_attrs(self, attrs): + if not attrs: + return attrs # utility method to be called by descendants - attrs = [(k.lower(), v) for k, v in attrs] + attrs = dict([(k.lower(), v) for k, v in attrs]).items() attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] + attrs.sort() return attrs def unknown_starttag(self, tag, attrs): # called for each start tag # attrs is a list of (attr, value) tuples # e.g. for
, tag='pre', attrs=[('class', 'screen')]
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
         uattrs = []
-        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
-        for key, value in attrs:
-            if type(value) != type(u''):
-                value = unicode(value, self.encoding)
-            uattrs.append((unicode(key, self.encoding), value))
-        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
+        strattrs=''
+        if attrs:
+            for key, value in attrs:
+                value=value.replace('>','>').replace('<','<').replace('"','"')
+                value = self.bare_ampersand.sub("&", value)
+                # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
+                if not isinstance(value, unicode):
+                    value = value.decode(self.encoding, 'ignore')
+                try:
+                    # Currently, in Python 3 the key is already a str, and cannot be decoded again
+                    uattrs.append((unicode(key, self.encoding), value))
+                except TypeError:
+                    uattrs.append((key, value))
+            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
+            if self.encoding:
+                try:
+                    strattrs = strattrs.encode(self.encoding)
+                except (UnicodeEncodeError, LookupError):
+                    pass
         if tag in self.elements_no_end_tag:
             self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
         else:
@@ -1472,25 +1939,35 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
     def handle_charref(self, ref):
         # called for each character reference, e.g. for ' ', ref will be '160'
         # Reconstruct the original character reference.
-        self.pieces.append('&#%(ref)s;' % locals())
-        
+        if ref.startswith('x'):
+            value = unichr(int(ref[1:],16))
+        else:
+            value = unichr(int(ref))
+
+        if value in _cp1252.keys():
+            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
+        else:
+            self.pieces.append('&#%(ref)s;' % locals())
+
     def handle_entityref(self, ref):
         # called for each entity reference, e.g. for '©', ref will be 'copy'
         # Reconstruct the original entity reference.
-        self.pieces.append('&%(ref)s;' % locals())
+        if name2codepoint.has_key(ref):
+            self.pieces.append('&%(ref)s;' % locals())
+        else:
+            self.pieces.append('&%(ref)s' % locals())
 
     def handle_data(self, text):
         # called for each block of plain text, i.e. outside of any tag and
         # not containing any character or entity references
         # Store the original text verbatim.
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
         self.pieces.append(text)
-        
+
     def handle_comment(self, text):
         # called for each HTML comment, e.g. 
         # Reconstruct the original comment.
         self.pieces.append('' % locals())
-        
+
     def handle_pi(self, text):
         # called for each processing instruction, e.g. 
         # Reconstruct original processing instruction.
@@ -1502,7 +1979,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
         #     "http://www.w3.org/TR/html4/loose.dtd">
         # Reconstruct original DOCTYPE
         self.pieces.append('' % locals())
-        
+
     _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
     def _scan_name(self, i, declstartpos):
         rawdata = self.rawdata
@@ -1521,34 +1998,495 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
 #            self.updatepos(declstartpos, i)
             return None, -1
 
+    def convert_charref(self, name):
+        return '&#%s;' % name
+
+    def convert_entityref(self, name):
+        return '&%s;' % name
+
     def output(self):
         '''Return processed HTML as a single string'''
         return ''.join([str(p) for p in self.pieces])
 
+    def parse_declaration(self, i):
+        try:
+            return sgmllib.SGMLParser.parse_declaration(self, i)
+        except sgmllib.SGMLParseError:
+            # escape the doctype declaration and continue parsing
+            self.handle_data('<')
+            return i+1
+
 class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
-    def __init__(self, baseuri, baselang, encoding):
+    def __init__(self, baseuri, baselang, encoding, entities):
         sgmllib.SGMLParser.__init__(self)
         _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
+        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
+        self.entities=entities
 
     def decodeEntities(self, element, data):
         data = data.replace('<', '<')
         data = data.replace('<', '<')
+        data = data.replace('<', '<')
         data = data.replace('>', '>')
         data = data.replace('>', '>')
+        data = data.replace('>', '>')
         data = data.replace('&', '&')
         data = data.replace('&', '&')
         data = data.replace('"', '"')
         data = data.replace('"', '"')
         data = data.replace(''', ''')
         data = data.replace(''', ''')
-        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+        if self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
             data = data.replace('<', '<')
             data = data.replace('>', '>')
             data = data.replace('&', '&')
             data = data.replace('"', '"')
             data = data.replace(''', "'")
         return data
-        
+
+    def strattrs(self, attrs):
+        return ''.join([' %s="%s"' % (n,v.replace('"','"')) for n,v in attrs])
+
+class _MicroformatsParser:
+    STRING = 1
+    DATE = 2
+    URI = 3
+    NODE = 4
+    EMAIL = 5
+
+    known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']
+    known_binary_extensions =  ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']
+
+    def __init__(self, data, baseuri, encoding):
+        self.document = BeautifulSoup.BeautifulSoup(data)
+        self.baseuri = baseuri
+        self.encoding = encoding
+        if isinstance(data, unicode):
+            data = data.encode(encoding)
+        self.tags = []
+        self.enclosures = []
+        self.xfn = []
+        self.vcard = None
+
+    def vcardEscape(self, s):
+        if isinstance(s, basestring):
+            s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
+        return s
+
+    def vcardFold(self, s):
+        s = re.sub(';+$', '', s)
+        sFolded = ''
+        iMax = 75
+        sPrefix = ''
+        while len(s) > iMax:
+            sFolded += sPrefix + s[:iMax] + '\n'
+            s = s[iMax:]
+            sPrefix = ' '
+            iMax = 74
+        sFolded += sPrefix + s
+        return sFolded
+
+    def normalize(self, s):
+        return re.sub(r'\s+', ' ', s).strip()
+
+    def unique(self, aList):
+        results = []
+        for element in aList:
+            if element not in results:
+                results.append(element)
+        return results
+
+    def toISO8601(self, dt):
+        return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)
+
+    def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
+        all = lambda x: 1
+        sProperty = sProperty.lower()
+        bFound = 0
+        bNormalize = 1
+        propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
+        if bAllowMultiple and (iPropertyType != self.NODE):
+            snapResults = []
+            containers = elmRoot(['ul', 'ol'], propertyMatch)
+            for container in containers:
+                snapResults.extend(container('li'))
+            bFound = (len(snapResults) != 0)
+        if not bFound:
+            snapResults = elmRoot(all, propertyMatch)
+            bFound = (len(snapResults) != 0)
+        if (not bFound) and (sProperty == 'value'):
+            snapResults = elmRoot('pre')
+            bFound = (len(snapResults) != 0)
+            bNormalize = not bFound
+            if not bFound:
+                snapResults = [elmRoot]
+                bFound = (len(snapResults) != 0)
+        arFilter = []
+        if sProperty == 'vcard':
+            snapFilter = elmRoot(all, propertyMatch)
+            for node in snapFilter:
+                if node.findParent(all, propertyMatch):
+                    arFilter.append(node)
+        arResults = []
+        for node in snapResults:
+            if node not in arFilter:
+                arResults.append(node)
+        bFound = (len(arResults) != 0)
+        if not bFound:
+            if bAllowMultiple:
+                return []
+            elif iPropertyType == self.STRING:
+                return ''
+            elif iPropertyType == self.DATE:
+                return None
+            elif iPropertyType == self.URI:
+                return ''
+            elif iPropertyType == self.NODE:
+                return None
+            else:
+                return None
+        arValues = []
+        for elmResult in arResults:
+            sValue = None
+            if iPropertyType == self.NODE:
+                if bAllowMultiple:
+                    arValues.append(elmResult)
+                    continue
+                else:
+                    return elmResult
+            sNodeName = elmResult.name.lower()
+            if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
+                sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
+            if sValue:
+                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+            if (not sValue) and (sNodeName == 'abbr'):
+                sValue = elmResult.get('title')
+            if sValue:
+                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+            if (not sValue) and (iPropertyType == self.URI):
+                if sNodeName == 'a':
+                    sValue = elmResult.get('href')
+                elif sNodeName == 'img':
+                    sValue = elmResult.get('src')
+                elif sNodeName == 'object':
+                    sValue = elmResult.get('data')
+            if sValue:
+                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+            if (not sValue) and (sNodeName == 'img'):
+                sValue = elmResult.get('alt')
+            if sValue:
+                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+            if not sValue:
+                sValue = elmResult.renderContents()
+                sValue = re.sub(r'<\S[^>]*>', '', sValue)
+                sValue = sValue.replace('\r\n', '\n')
+                sValue = sValue.replace('\r', '\n')
+            if sValue:
+                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+            if not sValue:
+                continue
+            if iPropertyType == self.DATE:
+                sValue = _parse_date_iso8601(sValue)
+            if bAllowMultiple:
+                arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
+            else:
+                return bAutoEscape and self.vcardEscape(sValue) or sValue
+        return arValues
+
+    def findVCards(self, elmRoot, bAgentParsing=0):
+        sVCards = ''
+
+        if not bAgentParsing:
+            arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
+        else:
+            arCards = [elmRoot]
+
+        for elmCard in arCards:
+            arLines = []
+
+            def processSingleString(sProperty):
+                sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding)
+                if sValue:
+                    arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
+                return sValue or u''
+
+            def processSingleURI(sProperty):
+                sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
+                if sValue:
+                    sContentType = ''
+                    sEncoding = ''
+                    sValueKey = ''
+                    if sValue.startswith('data:'):
+                        sEncoding = ';ENCODING=b'
+                        sContentType = sValue.split(';')[0].split('/').pop()
+                        sValue = sValue.split(',', 1).pop()
+                    else:
+                        elmValue = self.getPropertyValue(elmCard, sProperty)
+                        if elmValue:
+                            if sProperty != 'url':
+                                sValueKey = ';VALUE=uri'
+                            sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
+                    sContentType = sContentType.upper()
+                    if sContentType == 'OCTET-STREAM':
+                        sContentType = ''
+                    if sContentType:
+                        sContentType = ';TYPE=' + sContentType.upper()
+                    arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))
+
+            def processTypeValue(sProperty, arDefaultType, arForceType=None):
+                arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
+                for elmResult in arResults:
+                    arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
+                    if arForceType:
+                        arType = self.unique(arForceType + arType)
+                    if not arType:
+                        arType = arDefaultType
+                    sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
+                    if sValue:
+                        arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))
+
+            # AGENT
+            # must do this before all other properties because it is destructive
+            # (removes nested class="vcard" nodes so they don't interfere with
+            # this vcard's other properties)
+            arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
+            for elmAgent in arAgent:
+                if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
+                    sAgentValue = self.findVCards(elmAgent, 1) + '\n'
+                    sAgentValue = sAgentValue.replace('\n', '\\n')
+                    sAgentValue = sAgentValue.replace(';', '\\;')
+                    if sAgentValue:
+                        arLines.append(self.vcardFold('AGENT:' + sAgentValue))
+                    # Completely remove the agent element from the parse tree
+                    elmAgent.extract()
+                else:
+                    sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1);
+                    if sAgentValue:
+                        arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))
+
+            # FN (full name)
+            sFN = processSingleString('fn')
+
+            # N (name)
+            elmName = self.getPropertyValue(elmCard, 'n')
+            if elmName:
+                sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1)
+                sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1)
+                arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1)
+                arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1)
+                arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1)
+                arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
+                                         sGivenName + ';' +
+                                         ','.join(arAdditionalNames) + ';' +
+                                         ','.join(arHonorificPrefixes) + ';' +
+                                         ','.join(arHonorificSuffixes)))
+            elif sFN:
+                # implied "N" optimization
+                # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
+                arNames = self.normalize(sFN).split()
+                if len(arNames) == 2:
+                    bFamilyNameFirst = (arNames[0].endswith(',') or
+                                        len(arNames[1]) == 1 or
+                                        ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
+                    if bFamilyNameFirst:
+                        arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
+                    else:
+                        arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))
+
+            # SORT-STRING
+            sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
+            if sSortString:
+                arLines.append(self.vcardFold('SORT-STRING:' + sSortString))
+
+            # NICKNAME
+            arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
+            if arNickname:
+                arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))
+
+            # PHOTO
+            processSingleURI('photo')
+
+            # BDAY
+            dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
+            if dtBday:
+                arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))
+
+            # ADR (address)
+            arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
+            for elmAdr in arAdr:
+                arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
+                if not arType:
+                    arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
+                sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
+                sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
+                sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
+                sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
+                sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
+                sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
+                sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
+                arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
+                                         sPostOfficeBox + ';' +
+                                         sExtendedAddress + ';' +
+                                         sStreetAddress + ';' +
+                                         sLocality + ';' +
+                                         sRegion + ';' +
+                                         sPostalCode + ';' +
+                                         sCountryName))
+
+            # LABEL
+            processTypeValue('label', ['intl','postal','parcel','work'])
+
+            # TEL (phone number)
+            processTypeValue('tel', ['voice'])
+
+            # EMAIL
+            processTypeValue('email', ['internet'], ['internet'])
+
+            # MAILER
+            processSingleString('mailer')
+
+            # TZ (timezone)
+            processSingleString('tz')
+
+            # GEO (geographical information)
+            elmGeo = self.getPropertyValue(elmCard, 'geo')
+            if elmGeo:
+                sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
+                sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
+                arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))
+
+            # TITLE
+            processSingleString('title')
+
+            # ROLE
+            processSingleString('role')
+
+            # LOGO
+            processSingleURI('logo')
+
+            # ORG (organization)
+            elmOrg = self.getPropertyValue(elmCard, 'org')
+            if elmOrg:
+                sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
+                if not sOrganizationName:
+                    # implied "organization-name" optimization
+                    # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
+                    sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
+                    if sOrganizationName:
+                        arLines.append(self.vcardFold('ORG:' + sOrganizationName))
+                else:
+                    arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
+                    arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))
+
+            # CATEGORY
+            arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
+            if arCategory:
+                arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))
+
+            # NOTE
+            processSingleString('note')
+
+            # REV
+            processSingleString('rev')
+
+            # SOUND
+            processSingleURI('sound')
+
+            # UID
+            processSingleString('uid')
+
+            # URL
+            processSingleURI('url')
+
+            # CLASS
+            processSingleString('class')
+
+            # KEY
+            processSingleURI('key')
+
+            if arLines:
+                arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard']
+                # XXX - this is super ugly; properly fix this with issue 148
+                for i, s in enumerate(arLines):
+                    if not isinstance(s, unicode):
+                        arLines[i] = s.decode('utf-8', 'ignore')
+                sVCards += u'\n'.join(arLines) + u'\n'
+
+        return sVCards.strip()
+
+    def isProbablyDownloadable(self, elm):
+        attrsD = elm.attrMap
+        if not attrsD.has_key('href'):
+            return 0
+        linktype = attrsD.get('type', '').strip()
+        if linktype.startswith('audio/') or \
+           linktype.startswith('video/') or \
+           (linktype.startswith('application/') and not linktype.endswith('xml')):
+            return 1
+        path = urlparse.urlparse(attrsD['href'])[2]
+        if path.find('.') == -1:
+            return 0
+        fileext = path.split('.').pop().lower()
+        return fileext in self.known_binary_extensions
+
+    def findTags(self):
+        all = lambda x: 1
+        for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
+            href = elm.get('href')
+            if not href:
+                continue
+            urlscheme, domain, path, params, query, fragment = \
+                       urlparse.urlparse(_urljoin(self.baseuri, href))
+            segments = path.split('/')
+            tag = segments.pop()
+            if not tag:
+                if segments:
+                    tag = segments.pop()
+                else:
+                    # there are no tags
+                    continue
+            tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
+            if not tagscheme.endswith('/'):
+                tagscheme += '/'
+            self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))
+
+    def findEnclosures(self):
+        all = lambda x: 1
+        enclosure_match = re.compile(r'\benclosure\b')
+        for elm in self.document(all, {'href': re.compile(r'.+')}):
+            if not enclosure_match.search(elm.get('rel', u'')) and not self.isProbablyDownloadable(elm):
+                continue
+            if elm.attrMap not in self.enclosures:
+                self.enclosures.append(elm.attrMap)
+                if elm.string and not elm.get('title'):
+                    self.enclosures[-1]['title'] = elm.string
+
+    def findXFN(self):
+        all = lambda x: 1
+        for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
+            rels = elm.get('rel', u'').split()
+            xfn_rels = []
+            for rel in rels:
+                if rel in self.known_xfn_relationships:
+                    xfn_rels.append(rel)
+            if xfn_rels:
+                self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
+
+def _parseMicroformats(htmlSource, baseURI, encoding):
+    if not BeautifulSoup:
+        return
+    try:
+        p = _MicroformatsParser(htmlSource, baseURI, encoding)
+    except UnicodeEncodeError:
+        # sgmllib throws this exception when performing lookups of tags
+        # with non-ASCII characters in them.
+        return
+    p.vcard = p.findVCards(p.document)
+    p.findTags()
+    p.findEnclosures()
+    p.findXFN()
+    return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
+
 class _RelativeURIResolver(_BaseHTMLProcessor):
     relative_uris = [('a', 'href'),
                      ('applet', 'codebase'),
@@ -1576,65 +2514,247 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
                      ('q', 'cite'),
                      ('script', 'src')]
 
-    def __init__(self, baseuri, encoding):
-        _BaseHTMLProcessor.__init__(self, encoding)
+    def __init__(self, baseuri, encoding, _type):
+        _BaseHTMLProcessor.__init__(self, encoding, _type)
         self.baseuri = baseuri
 
     def resolveURI(self, uri):
-        return _urljoin(self.baseuri, uri)
-    
+        return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))
+
     def unknown_starttag(self, tag, attrs):
         attrs = self.normalize_attrs(attrs)
         attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-        
-def _resolveRelativeURIs(htmlSource, baseURI, encoding):
-    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
-    p = _RelativeURIResolver(baseURI, encoding)
+
+def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
+    if not _SGML_AVAILABLE:
+        return htmlSource
+
+    p = _RelativeURIResolver(baseURI, encoding, _type)
     p.feed(htmlSource)
     return p.output()
 
+def _makeSafeAbsoluteURI(base, rel=None):
+    # bail if ACCEPTABLE_URI_SCHEMES is empty
+    if not ACCEPTABLE_URI_SCHEMES:
+        return _urljoin(base, rel or u'')
+    if not base:
+        return rel or u''
+    if not rel:
+        scheme = urlparse.urlparse(base)[0]
+        if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
+            return base
+        return u''
+    uri = _urljoin(base, rel)
+    if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
+        return u''
+    return uri
+
 class _HTMLSanitizer(_BaseHTMLProcessor):
-    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
-      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
-      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
-      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
-      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
-      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
-      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
-      'thead', 'tr', 'tt', 'u', 'ul', 'var']
+    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
+        'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
+        'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
+        'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
+        'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
+        'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
+        'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
+        'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
+        'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
+        'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
+        'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
+        'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
+        'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']
 
     acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
-      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
-      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
-      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
-      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
-      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
-      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
-      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
-      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
-      'usemap', 'valign', 'value', 'vspace', 'width']
-
-    unacceptable_elements_with_end_tag = ['script', 'applet']
+      'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
+      'background', 'balance', 'bgcolor', 'bgproperties', 'border',
+      'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
+      'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
+      'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
+      'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
+      'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
+      'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
+      'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
+      'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
+      'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
+      'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
+      'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
+      'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
+      'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
+      'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
+      'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
+      'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
+      'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
+      'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
+      'xml:lang']
+
+    unacceptable_elements_with_end_tag = ['script', 'applet', 'style']
+
+    acceptable_css_properties = ['azimuth', 'background-color',
+      'border-bottom-color', 'border-collapse', 'border-color',
+      'border-left-color', 'border-right-color', 'border-top-color', 'clear',
+      'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
+      'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
+      'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
+      'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
+      'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
+      'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
+      'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
+      'white-space', 'width']
+
+    # survey of common keywords found in feeds
+    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
+      'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
+      'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
+      'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
+      'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
+      'transparent', 'underline', 'white', 'yellow']
+
+    valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
+      '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
+
+    mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
+      'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
+      'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
+      'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
+      'munderover', 'none', 'semantics']
+
+    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
+      'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
+      'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
+      'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
+      'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
+      'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
+      'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
+      'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
+      'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']
+
+    # svgtiny - foreignObject + linearGradient + radialGradient + stop
+    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
+      'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
+      'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
+      'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
+      'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
+      'svg', 'switch', 'text', 'title', 'tspan', 'use']
+
+    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
+    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
+       'arabic-form', 'ascent', 'attributeName', 'attributeType',
+       'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
+       'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
+       'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
+       'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
+       'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
+       'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
+       'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
+       'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
+       'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
+       'min', 'name', 'offset', 'opacity', 'orient', 'origin',
+       'overline-position', 'overline-thickness', 'panose-1', 'path',
+       'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
+       'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
+       'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
+       'stop-color', 'stop-opacity', 'strikethrough-position',
+       'strikethrough-thickness', 'stroke', 'stroke-dasharray',
+       'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
+       'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
+       'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
+       'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
+       'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
+       'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
+       'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
+       'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
+       'y2', 'zoomAndPan']
+
+    svg_attr_map = None
+    svg_elem_map = None
+
+    acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
+      'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
+      'stroke-opacity']
 
     def reset(self):
         _BaseHTMLProcessor.reset(self)
         self.unacceptablestack = 0
-        
+        self.mathmlOK = 0
+        self.svgOK = 0
+
     def unknown_starttag(self, tag, attrs):
-        if not tag in self.acceptable_elements:
+        acceptable_attributes = self.acceptable_attributes
+        keymap = {}
+        if not tag in self.acceptable_elements or self.svgOK:
             if tag in self.unacceptable_elements_with_end_tag:
                 self.unacceptablestack += 1
-            return
-        attrs = self.normalize_attrs(attrs)
-        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
-        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-        
+
+            # add implicit namespaces to html5 inline svg/mathml
+            if self._type.endswith('html'):
+                if not dict(attrs).get('xmlns'):
+                    if tag=='svg':
+                        attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
+                    if tag=='math':
+                        attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
+
+            # not otherwise acceptable, perhaps it is MathML or SVG?
+            if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
+                self.mathmlOK += 1
+            if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
+                self.svgOK += 1
+
+            # chose acceptable attributes based on tag class, else bail
+            if  self.mathmlOK and tag in self.mathml_elements:
+                acceptable_attributes = self.mathml_attributes
+            elif self.svgOK and tag in self.svg_elements:
+                # for most vocabularies, lowercasing is a good idea.  Many
+                # svg elements, however, are camel case
+                if not self.svg_attr_map:
+                    lower=[attr.lower() for attr in self.svg_attributes]
+                    mix=[a for a in self.svg_attributes if a not in lower]
+                    self.svg_attributes = lower
+                    self.svg_attr_map = dict([(a.lower(),a) for a in mix])
+
+                    lower=[attr.lower() for attr in self.svg_elements]
+                    mix=[a for a in self.svg_elements if a not in lower]
+                    self.svg_elements = lower
+                    self.svg_elem_map = dict([(a.lower(),a) for a in mix])
+                acceptable_attributes = self.svg_attributes
+                tag = self.svg_elem_map.get(tag,tag)
+                keymap = self.svg_attr_map
+            elif not tag in self.acceptable_elements:
+                return
+
+        # declare xlink namespace, if needed
+        if self.mathmlOK or self.svgOK:
+            if filter(lambda (n,v): n.startswith('xlink:'),attrs):
+                if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
+                    attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
+
+        clean_attrs = []
+        for key, value in self.normalize_attrs(attrs):
+            if key in acceptable_attributes:
+                key=keymap.get(key,key)
+                # make sure the uri uses an acceptable uri scheme
+                if key == u'href':
+                    value = _makeSafeAbsoluteURI(value)
+                clean_attrs.append((key,value))
+            elif key=='style':
+                clean_value = self.sanitize_style(value)
+                if clean_value:
+                    clean_attrs.append((key,clean_value))
+        _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
+
     def unknown_endtag(self, tag):
         if not tag in self.acceptable_elements:
             if tag in self.unacceptable_elements_with_end_tag:
                 self.unacceptablestack -= 1
-            return
+            if self.mathmlOK and tag in self.mathml_elements:
+                if tag == 'math' and self.mathmlOK:
+                    self.mathmlOK -= 1
+            elif self.svgOK and tag in self.svg_elements:
+                tag = self.svg_elem_map.get(tag,tag)
+                if tag == 'svg' and self.svgOK:
+                    self.svgOK -= 1
+            else:
+                return
         _BaseHTMLProcessor.unknown_endtag(self, tag)
 
     def handle_pi(self, text):
@@ -1647,8 +2767,53 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
         if not self.unacceptablestack:
             _BaseHTMLProcessor.handle_data(self, text)
 
-def _sanitizeHTML(htmlSource, encoding):
-    p = _HTMLSanitizer(encoding)
+    def sanitize_style(self, style):
+        # disallow urls
+        style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
+
+        # gauntlet
+        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
+            return ''
+        # This replaced a regexp that used re.match and was prone to pathological back-tracking.
+        if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
+            return ''
+
+        clean = []
+        for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
+            if not value:
+                continue
+            if prop.lower() in self.acceptable_css_properties:
+                clean.append(prop + ': ' + value + ';')
+            elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
+                for keyword in value.split():
+                    if not keyword in self.acceptable_css_keywords and \
+                        not self.valid_css_values.match(keyword):
+                        break
+                else:
+                    clean.append(prop + ': ' + value + ';')
+            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
+                clean.append(prop + ': ' + value + ';')
+
+        return ' '.join(clean)
+
+    def parse_comment(self, i, report=1):
+        ret = _BaseHTMLProcessor.parse_comment(self, i, report)
+        if ret >= 0:
+            return ret
+        # if ret == -1, this may be a malicious attempt to circumvent
+        # sanitization, or a page-destroying unclosed comment
+        match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
+        if match:
+            return match.end()
+        # unclosed comment; deliberately fail to handle_data()
+        return len(self.rawdata)
+
+
+def _sanitizeHTML(htmlSource, encoding, _type):
+    if not _SGML_AVAILABLE:
+        return htmlSource
+    p = _HTMLSanitizer(encoding, _type)
+    htmlSource = htmlSource.replace('= '2.3.3'
-            assert base64 != None
-            user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
-            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
-            self.add_password(realm, host, user, passw)
-            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
-            self.reset_retry_count()
-            return retry
-        except:
+        if base64 is None or 'Authorization' not in req.headers \
+                          or 'WWW-Authenticate' not in headers:
             return self.http_error_default(req, fp, code, msg, headers)
-
-def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
+        auth = _base64decode(req.headers['Authorization'].split(' ')[1])
+        user, passw = auth.split(':')
+        realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
+        self.add_password(realm, host, user, passw)
+        retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
+        self.reset_retry_count()
+        return retry
+
+def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
     """URL, filename, or string --> stream
 
     This function lets you define parsers that take any input source
@@ -1752,10 +2906,12 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
     If the etag argument is supplied, it will be used as the value of an
     If-None-Match request header.
 
-    If the modified argument is supplied, it must be a tuple of 9 integers
-    as returned by gmtime() in the standard Python time module. This MUST
-    be in GMT (Greenwich Mean Time). The formatted date/time will be used
-    as the value of an If-Modified-Since request header.
+    If the modified argument is supplied, it can be a tuple of 9 integers
+    (as returned by gmtime() in the standard Python time module) or a date
+    string in any format supported by feedparser. Regardless, it MUST
+    be in GMT (Greenwich Mean Time). It will be reformatted into an
+    RFC 1123-compliant date and used as the value of an If-Modified-Since
+    request header.
 
     If the agent argument is supplied, it will be used as the value of a
     User-Agent request header.
@@ -1765,6 +2921,9 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
 
     If handlers is supplied, it is a list of handlers used to build a
     urllib2 opener.
+
+    if request_headers is supplied it is a dictionary of HTTP request headers
+    that will override the values generated by FeedParser.
     """
 
     if hasattr(url_file_stream_or_string, 'read'):
@@ -1773,7 +2932,13 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
     if url_file_stream_or_string == '-':
         return sys.stdin
 
-    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
+    if isinstance(url_file_stream_or_string, basestring) \
+       and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
+        # Deal with the feed URI scheme
+        if url_file_stream_or_string.startswith('feed:http'):
+            url_file_stream_or_string = url_file_stream_or_string[5:]
+        elif url_file_stream_or_string.startswith('feed:'):
+            url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
         if not agent:
             agent = USER_AGENT
         # test for inline user:password for basic auth
@@ -1785,56 +2950,99 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
                 user_passwd, realhost = urllib.splituser(realhost)
                 if user_passwd:
                     url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
-                    auth = base64.encodestring(user_passwd).strip()
+                    auth = base64.standard_b64encode(user_passwd).strip()
+
+        # iri support
+        if isinstance(url_file_stream_or_string, unicode):
+            url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)
+
         # try to open with urllib2 (to use optional headers)
-        request = urllib2.Request(url_file_stream_or_string)
-        request.add_header('User-Agent', agent)
-        if etag:
-            request.add_header('If-None-Match', etag)
-        if modified:
-            # format into an RFC 1123-compliant timestamp. We can't use
-            # time.strftime() since the %a and %b directives can be affected
-            # by the current locale, but RFC 2616 states that dates must be
-            # in English.
-            short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
-            months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
-            request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
-        if referrer:
-            request.add_header('Referer', referrer)
-        if gzip and zlib:
-            request.add_header('Accept-encoding', 'gzip, deflate')
-        elif gzip:
-            request.add_header('Accept-encoding', 'gzip')
-        elif zlib:
-            request.add_header('Accept-encoding', 'deflate')
-        else:
-            request.add_header('Accept-encoding', '')
-        if auth:
-            request.add_header('Authorization', 'Basic %s' % auth)
-        if ACCEPT_HEADER:
-            request.add_header('Accept', ACCEPT_HEADER)
-        request.add_header('A-IM', 'feed') # RFC 3229 support
-        opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
+        request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
+        opener = apply(urllib2.build_opener, tuple(handlers + [_FeedURLHandler()]))
         opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
         try:
             return opener.open(request)
         finally:
             opener.close() # JohnD
-    
+
     # try to open with native open function (if url_file_stream_or_string is a filename)
     try:
-        return open(url_file_stream_or_string)
-    except:
+        return open(url_file_stream_or_string, 'rb')
+    except IOError:
         pass
 
     # treat url_file_stream_or_string as string
-    return _StringIO(str(url_file_stream_or_string))
+    if isinstance(url_file_stream_or_string, unicode):
+        return _StringIO(url_file_stream_or_string.encode('utf-8'))
+    return _StringIO(url_file_stream_or_string)
+
+def _convert_to_idn(url):
+    """Convert a URL to IDN notation"""
+    # this function should only be called with a unicode string
+    # strategy: if the host cannot be encoded in ascii, then
+    # it'll be necessary to encode it in idn form
+    parts = list(urlparse.urlsplit(url))
+    try:
+        parts[1].encode('ascii')
+    except UnicodeEncodeError:
+        # the url needs to be converted to idn notation
+        host = parts[1].rsplit(':', 1)
+        newhost = []
+        port = u''
+        if len(host) == 2:
+            port = host.pop()
+        for h in host[0].split('.'):
+            newhost.append(h.encode('idna').decode('utf-8'))
+        parts[1] = '.'.join(newhost)
+        if port:
+            parts[1] += ':' + port
+        return urlparse.urlunsplit(parts)
+    else:
+        return url
+
+def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
+    request = urllib2.Request(url)
+    request.add_header('User-Agent', agent)
+    if etag:
+        request.add_header('If-None-Match', etag)
+    if isinstance(modified, basestring):
+        modified = _parse_date(modified)
+    elif isinstance(modified, datetime.datetime):
+        modified = modified.utctimetuple()
+    if modified:
+        # format into an RFC 1123-compliant timestamp. We can't use
+        # time.strftime() since the %a and %b directives can be affected
+        # by the current locale, but RFC 2616 states that dates must be
+        # in English.
+        short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
+        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
+        request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
+    if referrer:
+        request.add_header('Referer', referrer)
+    if gzip and zlib:
+        request.add_header('Accept-encoding', 'gzip, deflate')
+    elif gzip:
+        request.add_header('Accept-encoding', 'gzip')
+    elif zlib:
+        request.add_header('Accept-encoding', 'deflate')
+    else:
+        request.add_header('Accept-encoding', '')
+    if auth:
+        request.add_header('Authorization', 'Basic %s' % auth)
+    if ACCEPT_HEADER:
+        request.add_header('Accept', ACCEPT_HEADER)
+    # use this for whatever -- cookies, special headers, etc
+    # [('Cookie','Something'),('x-special-header','Another Value')]
+    for header_name, header_value in request_headers.items():
+        request.add_header(header_name, header_value)
+    request.add_header('A-IM', 'feed') # RFC 3229 support
+    return request
 
 _date_handlers = []
 def registerDateHandler(func):
     '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
     _date_handlers.insert(0, func)
-    
+
 # ISO-8601 date parsing routines written by Fazal Majid.
 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
 # parser is beyond the scope of feedparser and would be a worthwhile addition
@@ -1844,8 +3052,8 @@ def registerDateHandler(func):
 # 0301-04-01), so we use templates instead.
 # Please note the order in templates is significant because we need a
 # greedy match.
-_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
-                'YY-?MM-?DD', 'YY-?OOO', 'YYYY', 
+_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
+                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                 '-YY-?MM', '-OOO', '-YY',
                 '--MM-?DD', '--MM',
                 '---DD',
@@ -1860,19 +3068,29 @@ _iso8601_re = [
     'CC', r'(?P\d\d$)')
     + r'(T?(?P\d{2}):(?P\d{2})'
     + r'(:(?P\d{2}))?'
+    + r'(\.(?P\d+))?'
     + r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?'
     for tmpl in _iso8601_tmpl]
-del tmpl
+try:
+    del tmpl
+except NameError:
+    pass
 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
-del regex
+try:
+    del regex
+except NameError:
+    pass
 def _parse_date_iso8601(dateString):
     '''Parse a variety of ISO-8601-compatible formats like 20040105'''
     m = None
     for _iso8601_match in _iso8601_matches:
         m = _iso8601_match(dateString)
-        if m: break
-    if not m: return
-    if m.span() == (0, 0): return
+        if m:
+            break
+    if not m:
+        return
+    if m.span() == (0, 0):
+        return
     params = m.groupdict()
     ordinal = params.get('ordinal', 0)
     if ordinal:
@@ -1918,14 +3136,10 @@ def _parse_date_iso8601(dateString):
             params[field] = 0
     hour = int(params.get('hour', 0))
     minute = int(params.get('minute', 0))
-    second = int(params.get('second', 0))
+    second = int(float(params.get('second', 0)))
     # weekday is normalized by mktime(), we can ignore it
     weekday = 0
-    # daylight savings is complex, but not needed for feedparser's purposes
-    # as time zones, if specified, include mention of whether it is active
-    # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
-    # and most implementations have DST bugs
-    daylight_savings_flag = 0
+    daylight_savings_flag = -1
     tm = [year, month, day, hour, minute, second, weekday,
           ordinal, daylight_savings_flag]
     # ISO 8601 time zone adjustments
@@ -1942,9 +3156,9 @@ def _parse_date_iso8601(dateString):
     # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
     # which is guaranteed to normalize d/m/y/h/m/s.
     # Many implementations have bugs, but we'll pretend they don't.
-    return time.localtime(time.mktime(tm))
+    return time.localtime(time.mktime(tuple(tm)))
 registerDateHandler(_parse_date_iso8601)
-    
+
 # 8-bit date handling routines written by ytrewq1.
 _korean_year  = u'\ub144' # b3e2 in euc-kr
 _korean_month = u'\uc6d4' # bff9 in euc-kr
@@ -1961,19 +3175,20 @@ _korean_nate_date_re = \
 def _parse_date_onblog(dateString):
     '''Parse a string according to the OnBlog 8-bit date format'''
     m = _korean_onblog_date_re.match(dateString)
-    if not m: return
+    if not m:
+        return
     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                  'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                  'zonediff': '+09:00'}
-    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
     return _parse_date_w3dtf(w3dtfdate)
 registerDateHandler(_parse_date_onblog)
 
 def _parse_date_nate(dateString):
     '''Parse a string according to the Nate 8-bit date format'''
     m = _korean_nate_date_re.match(dateString)
-    if not m: return
+    if not m:
+        return
     hour = int(m.group(5))
     ampm = m.group(4)
     if (ampm == _korean_pm):
@@ -1985,7 +3200,6 @@ def _parse_date_nate(dateString):
                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                  'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': '+09:00'}
-    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
     return _parse_date_w3dtf(w3dtfdate)
 registerDateHandler(_parse_date_nate)
 
@@ -1994,12 +3208,12 @@ _mssql_date_re = \
 def _parse_date_mssql(dateString):
     '''Parse a string according to the MS SQL date format'''
     m = _mssql_date_re.match(dateString)
-    if not m: return
+    if not m:
+        return
     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                  'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                  'zonediff': '+09:00'}
-    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
     return _parse_date_w3dtf(w3dtfdate)
 registerDateHandler(_parse_date_mssql)
 
@@ -2035,7 +3249,7 @@ _greek_wdays = \
    u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
    u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
    u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
-   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7   
+   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
   }
 
 _greek_date_format_re = \
@@ -2044,17 +3258,14 @@ _greek_date_format_re = \
 def _parse_date_greek(dateString):
     '''Parse a string according to a Greek 8-bit date format.'''
     m = _greek_date_format_re.match(dateString)
-    if not m: return
-    try:
-        wday = _greek_wdays[m.group(1)]
-        month = _greek_months[m.group(3)]
-    except:
+    if not m:
         return
+    wday = _greek_wdays[m.group(1)]
+    month = _greek_months[m.group(3)]
     rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                  {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                   'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                   'zonediff': m.group(8)}
-    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
     return _parse_date_rfc822(rfc822date)
 registerDateHandler(_parse_date_greek)
 
@@ -2081,22 +3292,19 @@ _hungarian_date_format_re = \
 def _parse_date_hungarian(dateString):
     '''Parse a string according to a Hungarian 8-bit date format.'''
     m = _hungarian_date_format_re.match(dateString)
-    if not m: return
-    try:
-        month = _hungarian_months[m.group(2)]
-        day = m.group(3)
-        if len(day) == 1:
-            day = '0' + day
-        hour = m.group(4)
-        if len(hour) == 1:
-            hour = '0' + hour
-    except:
-        return
+    if not m or m.group(2) not in _hungarian_months:
+        return None
+    month = _hungarian_months[m.group(2)]
+    day = m.group(3)
+    if len(day) == 1:
+        day = '0' + day
+    hour = m.group(4)
+    if len(hour) == 1:
+        hour = '0' + hour
     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                 {'year': m.group(1), 'month': month, 'day': day,\
                  'hour': hour, 'minute': m.group(5),\
                  'zonediff': m.group(6)}
-    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
     return _parse_date_w3dtf(w3dtfdate)
 registerDateHandler(_parse_date_hungarian)
 
@@ -2183,25 +3391,29 @@ def _parse_date_w3dtf(dateString):
 
     __date_re = ('(?P\d\d\d\d)'
                  '(?:(?P-|)'
-                 '(?:(?P\d\d\d)'
-                 '|(?P\d\d)(?:(?P=dsep)(?P\d\d))?))?')
+                 '(?:(?P\d\d)(?:(?P=dsep)(?P\d\d))?'
+                 '|(?P\d\d\d)))?')
     __tzd_re = '(?P[-+](?P\d\d)(?::?(?P\d\d))|Z)'
     __tzd_rx = re.compile(__tzd_re)
     __time_re = ('(?P\d\d)(?P:|)(?P\d\d)'
-                 '(?:(?P=tsep)(?P\d\d(?:[.,]\d+)?))?'
+                 '(?:(?P=tsep)(?P\d\d)(?:[.,]\d+)?)?'
                  + __tzd_re)
     __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
     __datetime_rx = re.compile(__datetime_re)
     m = __datetime_rx.match(dateString)
-    if (m is None) or (m.group() != dateString): return
+    if (m is None) or (m.group() != dateString):
+        return
     gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
-    if gmt[0] == 0: return
+    if gmt[0] == 0:
+        return
     return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
 registerDateHandler(_parse_date_w3dtf)
 
 def _parse_date_rfc822(dateString):
     '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
     data = dateString.split()
+    if not data:
+        return None
     if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
         del data[0]
     if len(data) == 4:
@@ -2212,31 +3424,57 @@ def _parse_date_rfc822(dateString):
         else:
             data.append('')
         dateString = " ".join(data)
+    # Account for the Etc/GMT timezone by stripping 'Etc/'
+    elif len(data) == 5 and data[4].lower().startswith('etc/'):
+        data[4] = data[4][4:]
+        dateString = " ".join(data)
     if len(data) < 5:
         dateString += ' 00:00:00 GMT'
     tm = rfc822.parsedate_tz(dateString)
     if tm:
+        # Jython doesn't adjust for 2-digit years like CPython does,
+        # so account for it by shifting the year so that it's in the
+        # range 1970-2069 (1970 being the year of the Unix epoch).
+        if tm[0] < 100:
+            tm = (tm[0] + (1900, 2000)[tm[0] < 70],) + tm[1:]
         return time.gmtime(rfc822.mktime_tz(tm))
 # rfc822.py defines several time zones, but we define some extra ones.
 # 'ET' is equivalent to 'EST', etc.
 _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
 rfc822._timezones.update(_additional_timezones)
-registerDateHandler(_parse_date_rfc822)    
+registerDateHandler(_parse_date_rfc822)
+
+def _parse_date_perforce(aDateString):
+    """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
+    # Fri, 2006/09/15 08:19:53 EDT
+    _my_date_pattern = re.compile( \
+        r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
+
+    m = _my_date_pattern.search(aDateString)
+    if m is None:
+        return None
+    dow, year, month, day, hour, minute, second, tz = m.groups()
+    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
+    dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
+    tm = rfc822.parsedate_tz(dateString)
+    if tm:
+        return time.gmtime(rfc822.mktime_tz(tm))
+registerDateHandler(_parse_date_perforce)
 
 def _parse_date(dateString):
     '''Parses a variety of date formats into a 9-tuple in GMT'''
+    if not dateString:
+        return None
     for handler in _date_handlers:
         try:
             date9tuple = handler(dateString)
-            if not date9tuple: continue
-            if len(date9tuple) != 9:
-                if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
-                raise ValueError
-            map(int, date9tuple)
-            return date9tuple
-        except Exception, e:
-            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
-            pass
+        except (KeyError, OverflowError, ValueError):
+            continue
+        if not date9tuple:
+            continue
+        if len(date9tuple) != 9:
+            continue
+        return date9tuple
     return None
 
 def _getCharacterEncoding(http_headers, xml_data):
@@ -2244,7 +3482,7 @@ def _getCharacterEncoding(http_headers, xml_data):
 
     http_headers is a dictionary
     xml_data is a raw string (not Unicode)
-    
+
     This is so much trickier than it sounds, it's not even funny.
     According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
     is application/xml, application/*+xml,
@@ -2263,12 +3501,12 @@ def _getCharacterEncoding(http_headers, xml_data):
     served with a Content-Type of text/* and no charset parameter
     must be treated as us-ascii.  (We now do this.)  And also that it
     must always be flagged as non-well-formed.  (We now do this too.)
-    
+
     If Content-Type is unspecified (input was local file or non-HTTP source)
     or unrecognized (server just got it totally wrong), then go by the
     encoding given in the XML prefix of the document and default to
     'iso-8859-1' as per the HTTP specification (RFC 2616).
-    
+
     Then, assuming we didn't find a character encoding in the HTTP headers
     (and the HTTP Content-type allowed us to look in the body), we need
     to sniff the first few bytes of the XML data and try to determine
@@ -2296,130 +3534,115 @@ def _getCharacterEncoding(http_headers, xml_data):
         '''
         content_type = content_type or ''
         content_type, params = cgi.parse_header(content_type)
-        return content_type, params.get('charset', '').replace("'", '')
-
-    sniffed_xml_encoding = ''
-    xml_encoding = ''
-    true_encoding = ''
-    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
+        charset = params.get('charset', '').replace("'", "")
+        if not isinstance(charset, unicode):
+            charset = charset.decode('utf-8', 'ignore')
+        return content_type, charset
+
+    sniffed_xml_encoding = u''
+    xml_encoding = u''
+    true_encoding = u''
+    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type')))
     # Must sniff for non-ASCII-compatible character encodings before
     # searching for XML declaration.  This heuristic is defined in
     # section F of the XML specification:
     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
     try:
-        if xml_data[:4] == '\x4c\x6f\xa7\x94':
+        if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
             # EBCDIC
             xml_data = _ebcdic_to_ascii(xml_data)
-        elif xml_data[:4] == '\x00\x3c\x00\x3f':
+        elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
             # UTF-16BE
-            sniffed_xml_encoding = 'utf-16be'
+            sniffed_xml_encoding = u'utf-16be'
             xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
+        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
             # UTF-16BE with BOM
-            sniffed_xml_encoding = 'utf-16be'
+            sniffed_xml_encoding = u'utf-16be'
             xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-        elif xml_data[:4] == '\x3c\x00\x3f\x00':
+        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
             # UTF-16LE
-            sniffed_xml_encoding = 'utf-16le'
+            sniffed_xml_encoding = u'utf-16le'
             xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
+        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
             # UTF-16LE with BOM
-            sniffed_xml_encoding = 'utf-16le'
+            sniffed_xml_encoding = u'utf-16le'
             xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-        elif xml_data[:4] == '\x00\x00\x00\x3c':
+        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
             # UTF-32BE
-            sniffed_xml_encoding = 'utf-32be'
+            sniffed_xml_encoding = u'utf-32be'
             xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-        elif xml_data[:4] == '\x3c\x00\x00\x00':
+        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
             # UTF-32LE
-            sniffed_xml_encoding = 'utf-32le'
+            sniffed_xml_encoding = u'utf-32le'
             xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-        elif xml_data[:4] == '\x00\x00\xfe\xff':
+        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
             # UTF-32BE with BOM
-            sniffed_xml_encoding = 'utf-32be'
+            sniffed_xml_encoding = u'utf-32be'
             xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-        elif xml_data[:4] == '\xff\xfe\x00\x00':
+        elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
             # UTF-32LE with BOM
-            sniffed_xml_encoding = 'utf-32le'
+            sniffed_xml_encoding = u'utf-32le'
             xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-        elif xml_data[:3] == '\xef\xbb\xbf':
+        elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
             # UTF-8 with BOM
-            sniffed_xml_encoding = 'utf-8'
+            sniffed_xml_encoding = u'utf-8'
             xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
         else:
             # ASCII-compatible
             pass
-        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
-    except:
+        xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
+    except UnicodeDecodeError:
         xml_encoding_match = None
     if xml_encoding_match:
-        xml_encoding = xml_encoding_match.groups()[0].lower()
-        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
+        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
+        if sniffed_xml_encoding and (xml_encoding in (u'iso-10646-ucs-2', u'ucs-2', u'csunicode', u'iso-10646-ucs-4', u'ucs-4', u'csucs4', u'utf-16', u'utf-32', u'utf_16', u'utf_32', u'utf16', u'u16')):
             xml_encoding = sniffed_xml_encoding
     acceptable_content_type = 0
-    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
-    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
+    application_content_types = (u'application/xml', u'application/xml-dtd', u'application/xml-external-parsed-entity')
+    text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
     if (http_content_type in application_content_types) or \
-       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
+       (http_content_type.startswith(u'application/') and http_content_type.endswith(u'+xml')):
         acceptable_content_type = 1
-        true_encoding = http_encoding or xml_encoding or 'utf-8'
+        true_encoding = http_encoding or xml_encoding or u'utf-8'
     elif (http_content_type in text_content_types) or \
-         (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
+         (http_content_type.startswith(u'text/')) and http_content_type.endswith(u'+xml'):
         acceptable_content_type = 1
-        true_encoding = http_encoding or 'us-ascii'
-    elif http_content_type.startswith('text/'):
-        true_encoding = http_encoding or 'us-ascii'
-    elif http_headers and (not http_headers.has_key('content-type')):
-        true_encoding = xml_encoding or 'iso-8859-1'
+        true_encoding = http_encoding or u'us-ascii'
+    elif http_content_type.startswith(u'text/'):
+        true_encoding = http_encoding or u'us-ascii'
+    elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))):
+        true_encoding = xml_encoding or u'iso-8859-1'
     else:
-        true_encoding = xml_encoding or 'utf-8'
+        true_encoding = xml_encoding or u'utf-8'
+    # some feeds claim to be gb2312 but are actually gb18030.
+    # apparently MSIE and Firefox both do the following switch:
+    if true_encoding.lower() == u'gb2312':
+        true_encoding = u'gb18030'
     return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
-    
+
 def _toUTF8(data, encoding):
     '''Changes an XML data stream on the fly to specify a new encoding
 
     data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
     encoding is a string recognized by encodings.aliases
     '''
-    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
     # strip Byte Order Mark (if present)
-    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-16be':
-                sys.stderr.write('trying utf-16be instead\n')
+    if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
         encoding = 'utf-16be'
         data = data[2:]
-    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-16le':
-                sys.stderr.write('trying utf-16le instead\n')
+    elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
         encoding = 'utf-16le'
         data = data[2:]
-    elif data[:3] == '\xef\xbb\xbf':
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-8':
-                sys.stderr.write('trying utf-8 instead\n')
+    elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
         encoding = 'utf-8'
         data = data[3:]
-    elif data[:4] == '\x00\x00\xfe\xff':
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-32be':
-                sys.stderr.write('trying utf-32be instead\n')
+    elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
         encoding = 'utf-32be'
         data = data[4:]
-    elif data[:4] == '\xff\xfe\x00\x00':
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-32le':
-                sys.stderr.write('trying utf-32le instead\n')
+    elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
         encoding = 'utf-32le'
         data = data[4:]
     newdata = unicode(data, encoding)
-    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
     declmatch = re.compile('^<\?xml[^>]*?>')
     newdecl = ''''''
     if declmatch.search(newdata):
@@ -2434,77 +3657,118 @@ def _stripDoctype(data):
     rss_version may be 'rss091n' or None
     stripped_data is the same XML document, minus the DOCTYPE
     '''
-    entity_pattern = re.compile(r']*?)>', re.MULTILINE)
-    data = entity_pattern.sub('', data)
-    doctype_pattern = re.compile(r']*?)>', re.MULTILINE)
-    doctype_results = doctype_pattern.findall(data)
-    doctype = doctype_results and doctype_results[0] or ''
-    if doctype.lower().count('netscape'):
-        version = 'rss091n'
+    start = re.search(_s2bytes('<\w'), data)
+    start = start and start.start() or -1
+    head,data = data[:start+1], data[start+1:]
+
+    entity_pattern = re.compile(_s2bytes(r'^\s*]*?)>'), re.MULTILINE)
+    entity_results=entity_pattern.findall(head)
+    head = entity_pattern.sub(_s2bytes(''), head)
+    doctype_pattern = re.compile(_s2bytes(r'^\s*]*?)>'), re.MULTILINE)
+    doctype_results = doctype_pattern.findall(head)
+    doctype = doctype_results and doctype_results[0] or _s2bytes('')
+    if doctype.lower().count(_s2bytes('netscape')):
+        version = u'rss091n'
     else:
         version = None
-    data = doctype_pattern.sub('', data)
-    return version, data
-    
-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
-    '''Parse a feed from a URL, file, stream, or string'''
+
+    # only allow in 'safe' inline entity definitions
+    replacement=_s2bytes('')
+    if len(doctype_results)==1 and entity_results:
+       safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
+       safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
+       if safe_entities:
+           replacement=_s2bytes('\n  \n]>')
+    data = doctype_pattern.sub(replacement, head) + data
+
+    return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
+
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
+    '''Parse a feed from a URL, file, stream, or string.
+
+    request_headers, if given, is a dict from http header name to value to add
+    to the request; this overrides internally generated values.
+    '''
+
+    if handlers is None:
+        handlers = []
+    if request_headers is None:
+        request_headers = {}
+    if response_headers is None:
+        response_headers = {}
+
     result = FeedParserDict()
     result['feed'] = FeedParserDict()
     result['entries'] = []
-    if _XML_AVAILABLE:
-        result['bozo'] = 0
-    if type(handlers) == types.InstanceType:
+    result['bozo'] = 0
+    if not isinstance(handlers, list):
         handlers = [handlers]
     try:
-        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
+        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
         data = f.read()
     except Exception, e:
         result['bozo'] = 1
         result['bozo_exception'] = e
-        data = ''
+        data = None
         f = None
 
+    if hasattr(f, 'headers'):
+        result['headers'] = dict(f.headers)
+    # overwrite existing headers using response_headers
+    if 'headers' in result:
+        result['headers'].update(response_headers)
+    elif response_headers:
+        result['headers'] = copy.deepcopy(response_headers)
+
     # if feed is gzip-compressed, decompress it
-    if f and data and hasattr(f, 'headers'):
-        if gzip and f.headers.get('content-encoding', '') == 'gzip':
+    if f and data and 'headers' in result:
+        if gzip and 'gzip' in (result['headers'].get('content-encoding'), result['headers'].get('Content-Encoding')):
             try:
                 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
-            except Exception, e:
+            except (IOError, struct.error), e:
+                # IOError can occur if the gzip header is bad
+                # struct.error can occur if the data is damaged
                 # Some feeds claim to be gzipped but they're not, so
                 # we get garbage.  Ideally, we should re-request the
                 # feed without the 'Accept-encoding: gzip' header,
                 # but we don't.
                 result['bozo'] = 1
                 result['bozo_exception'] = e
-                data = ''
-        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
+                data = None
+        elif zlib and 'deflate' in (result['headers'].get('content-encoding'), result['headers'].get('Content-Encoding')):
             try:
-                data = zlib.decompress(data, -zlib.MAX_WBITS)
-            except Exception, e:
+                data = zlib.decompress(data)
+            except zlib.error, e:
                 result['bozo'] = 1
                 result['bozo_exception'] = e
-                data = ''
-
-    ## Fix for media:title bug
-    data = data.replace("media:title", "mediatitle")
+                data = None
 
     # save HTTP headers
-    if hasattr(f, 'info'):
-        info = f.info()
-        result['etag'] = info.getheader('ETag')
-        last_modified = info.getheader('Last-Modified')
-        if last_modified:
-            result['modified'] = _parse_date(last_modified)
+    if 'headers' in result:
+        if 'etag' in result['headers'] or 'ETag' in result['headers']:
+            etag = result['headers'].get('etag', result['headers'].get('ETag', u''))
+            if not isinstance(etag, unicode):
+                etag = etag.decode('utf-8', 'ignore')
+            if etag:
+                result['etag'] = etag
+        if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']:
+            modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified'))
+            if modified:
+                result['modified'] = _parse_date(modified)
     if hasattr(f, 'url'):
-        result['href'] = f.url
+        if not isinstance(f.url, unicode):
+            result['href'] = f.url.decode('utf-8', 'ignore')
+        else:
+            result['href'] = f.url
         result['status'] = 200
     if hasattr(f, 'status'):
         result['status'] = f.status
-    if hasattr(f, 'headers'):
-        result['headers'] = f.headers.dict
     if hasattr(f, 'close'):
         f.close()
 
+    if data is None:
+        return result
+
     # there are four encodings to keep track of:
     # - http_encoding is the encoding declared in the Content-Type HTTP header
     # - xml_encoding is the encoding declared in the ; changed
-#  project name
-#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
-#  removed unnecessary urllib code -- urllib2 should always be available anyway;
-#  return actual url, status, and full HTTP headers (as result['url'],
-#  result['status'], and result['headers']) if parsing a remote feed over HTTP --
-#  this should pass all the HTTP tests at ;
-#  added the latest namespace-of-the-week for RSS 2.0
-#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
-#  User-Agent (otherwise urllib2 sends two, which confuses some servers)
-#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
-#  inline  and  as used in some RSS 2.0 feeds
-#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
-#  textInput, and also to return the character encoding (if specified)
-#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
-#  nested divs within content (JohnD); fixed missing sys import (JohanS);
-#  fixed regular expression to capture XML character encoding (Andrei);
-#  added support for Atom 0.3-style links; fixed bug with textInput tracking;
-#  added support for cloud (MartijnP); added support for multiple
-#  category/dc:subject (MartijnP); normalize content model: 'description' gets
-#  description (which can come from description, summary, or full content if no
-#  description), 'content' gets dict of base/language/type/value (which can come
-#  from content:encoded, xhtml:body, content, or fullitem);
-#  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
-#  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
-#   element is not in default namespace (like Pocketsoap feed);
-#  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
-#  wfw:commentRSS; resolve relative URLs within embedded HTML markup in
-#  description, xhtml:body, content, content:encoded, title, subtitle,
-#  summary, info, tagline, and copyright; added support for pingback and
-#  trackback namespaces
-#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
-#  namespaces, as opposed to 2.6 when I said I did but didn't really;
-#  sanitize HTML markup within some elements; added mxTidy support (if
-#  installed) to tidy HTML markup within some elements; fixed indentation
-#  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
-#  (FazalM); universal date parsing and normalization (FazalM): 'created', modified',
-#  'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
-#  'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
-#  and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
-#2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '.  fixed memory
-#  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
-#  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
-#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed 
tags in -# encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL); -# fixed relative URI processing for guid (skadz); added ICBM support; added -# base64 support -#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many -# blogspot.com sites); added _debug variable -#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing -#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available); -# added several new supported namespaces; fixed bug tracking naked markup in -# description; added support for enclosure; added support for source; re-added -# support for cloud which got dropped somehow; added support for expirationDate -#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking -# xml:base URI, one for documents that don't define one explicitly and one for -# documents that define an outer and an inner xml:base that goes out of scope -# before the end of the document -#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level -#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version'] -# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized; -# added support for creativeCommons:license and cc:license; added support for -# full Atom content model in title, tagline, info, copyright, summary; fixed bug -# with gzip encoding (not always telling server we support it when we do) -#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail -# (dictionary of 'name', 'url', 'email'); map author to author_detail if author -# contains name + email address -#3.0b8 - 1/28/2004 - MAP - added support for contributor -#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added -# support for summary -#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from -# xml.util.iso8601 -#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain -# dangerous markup; fiddled with decodeEntities (not right); liberalized -# date parsing even further -#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right); -# added support to Atom 0.2 subtitle; added support for Atom content model -# in copyright; better sanitizing of dangerous HTML elements with end tags -# (script, frameset) -#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img, -# etc.) in embedded markup, in either HTML or XHTML form (
,
,
) -#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under -# Python 2.1 -#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS; -# fixed bug capturing author and contributor URL; fixed bug resolving relative -# links in author and contributor URL; fixed bug resolvin relative links in -# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's -# namespace tests, and included them permanently in the test suite with his -# permission; fixed namespace handling under Python 2.1 -#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15) -#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023 -#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei); -# use libxml2 (if available) -#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author -# name was in parentheses; removed ultra-problematic mxTidy support; patch to -# workaround crash in PyXML/expat when encountering invalid entities -# (MarkMoraes); support for textinput/textInput -#3.0b20 - 4/7/2004 - MAP - added CDF support -#3.0b21 - 4/14/2004 - MAP - added Hot RSS support -#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in -# results dict; changed results dict to allow getting values with results.key -# as well as results[key]; work around embedded illformed HTML with half -# a DOCTYPE; work around malformed Content-Type header; if character encoding -# is wrong, try several common ones before falling back to regexes (if this -# works, bozo_exception is set to CharacterEncodingOverride); fixed character -# encoding issues in BaseHTMLProcessor by tracking encoding and converting -# from Unicode to raw strings before feeding data to sgmllib.SGMLParser; -# convert each value in results to Unicode (if possible), even if using -# regex-based parsing -#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain -# high-bit characters in attributes in embedded HTML in description (thanks -# Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in -# FeedParserDict; tweaked FeedParserDict.has_key to return True if asking -# about a mapped key -#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and -# results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could -# cause the same encoding to be tried twice (even if it failed the first time); -# fixed DOCTYPE stripping when DOCTYPE contained entity declarations; -# better textinput and image tracking in illformed RSS 1.0 feeds -#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed -# my blink tag tests -#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that -# failed to parse utf-16 encoded feeds; made source into a FeedParserDict; -# duplicate admin:generatorAgent/@rdf:resource in generator_detail.url; -# added support for image; refactored parse() fallback logic to try other -# encodings if SAX parsing fails (previously it would only try other encodings -# if re-encoding failed); remove unichr madness in normalize_attrs now that -# we're properly tracking encoding in and out of BaseHTMLProcessor; set -# feed.language from root-level xml:lang; set entry.id from rdf:about; -# send Accept header -#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between -# iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are -# windows-1252); fixed regression that could cause the same encoding to be -# tried twice (even if it failed the first time) -#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types; -# recover from malformed content-type header parameter with no equals sign -# ('text/xml; charset:iso-8859-1') -#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities -# to Unicode equivalents in illformed feeds (aaronsw); added and -# passed tests for converting character entities to Unicode equivalents -# in illformed feeds (aaronsw); test for valid parsers when setting -# XML_AVAILABLE; make version and encoding available when server returns -# a 304; add handlers parameter to pass arbitrary urllib2 handlers (like -# digest auth or proxy support); add code to parse username/password -# out of url and send as basic authentication; expose downloading-related -# exceptions in bozo_exception (aaronsw); added __contains__ method to -# FeedParserDict (aaronsw); added publisher_detail (aaronsw) -#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always -# convert feed to UTF-8 before passing to XML parser; completely revamped -# logic for determining character encoding and attempting XML parsing -# (much faster); increased default timeout to 20 seconds; test for presence -# of Location header on redirects; added tests for many alternate character -# encodings; support various EBCDIC encodings; support UTF-16BE and -# UTF16-LE with or without a BOM; support UTF-8 with a BOM; support -# UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no -# XML parsers are available; added support for 'Content-encoding: deflate'; -# send blank 'Accept-encoding: ' header if neither gzip nor zlib modules -# are available -#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure -# problem tracking xml:base and xml:lang if element declares it, child -# doesn't, first grandchild redeclares it, and second grandchild doesn't; -# refactored date parsing; defined public registerDateHandler so callers -# can add support for additional date formats at runtime; added support -# for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added -# zopeCompatibilityHack() which turns FeedParserDict into a regular -# dictionary, required for Zope compatibility, and also makes command- -# line debugging easier because pprint module formats real dictionaries -# better than dictionary-like objects; added NonXMLContentType exception, -# which is stored in bozo_exception when a feed is served with a non-XML -# media type such as 'text/plain'; respect Content-Language as default -# language if not xml:lang is present; cloud dict is now FeedParserDict; -# generator dict is now FeedParserDict; better tracking of xml:lang, -# including support for xml:lang='' to unset the current language; -# recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default -# namespace; don't overwrite final status on redirects (scenarios: -# redirecting to a URL that returns 304, redirecting to a URL that -# redirects to another URL with a different type of redirect); add -# support for HTTP 303 redirects -#4.0 - MAP - support for relative URIs in xml:base attribute; fixed -# encoding issue with mxTidy (phopkins); preliminary support for RFC 3229; -# support for Atom 1.0; support for iTunes extensions; new 'tags' for -# categories/keywords/etc. as array of dict -# {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0 -# terminology; parse RFC 822-style dates with no time; lots of other -# bug fixes -#4.1 - MAP - removed socket timeout; added support for chardet library -- 1.7.9.5