Speeding up login through cookies
[theonering] / src / gvoice / browser_emu.py
1 """
2 @author:          Laszlo Nagy
3 @copyright:   (c) 2005 by Szoftver Messias Bt.
4 @licence:        BSD style
5
6 Objects of the MozillaEmulator class can emulate a browser that is capable of:
7
8         - cookie management
9         - configurable user agent string
10         - GET and POST
11         - multipart POST (send files)
12         - receive content into file
13
14 I have seen many requests on the python mailing list about how to emulate a browser. I'm using this class for years now, without any problems. This is how you can use it:
15
16         1. Use firefox
17         2. Install and open the livehttpheaders plugin
18         3. Use the website manually with firefox
19         4. Check the GET and POST requests in the livehttpheaders capture window
20         5. Create an instance of the above class and send the same GET and POST requests to the server.
21
22 Optional steps:
23
24         - You can change user agent string in the build_opened method
25         - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
26 """

import urllib2
import cookielib
import logging

import socket


_moduleLogger = logging.getLogger(__name__)
socket.setdefaulttimeout(45)


class MozillaEmulator(object):

        def __init__(self, trycount = 1):
                """Create a new MozillaEmulator object.

                @param trycount: The download() method will retry the operation if it
                fails.  Specify -1 for infinite retrying, 0 for no retrying, 1 for a
                single retry, and so on."""
                self.debug = False
                self.trycount = trycount
                self._cookies = cookielib.LWPCookieJar()
                self._loadedFromCookies = False
                self._usingCookies = False

        def load_cookies(self, path):
                assert not self._loadedFromCookies, "Load cookies only once"
                if not path:
                        return False

                self._cookies.filename = path
                try:
                        self._cookies.load()
                except cookielib.LoadError:
                        _moduleLogger.exception("Bad cookie file")
                except IOError:
                        _moduleLogger.exception("No cookie file")
                except Exception:
                        _moduleLogger.exception("Unknown error with cookies")
                else:
                        self._loadedFromCookies = True

                self._usingCookies = True
                return self._loadedFromCookies

        def save_cookies(self):
                if self._usingCookies:
                        self._cookies.save()

        def clear_cookies(self):
                if self._usingCookies:
                        self._cookies.clear()

        def download(self, url,
                        postdata = None, extraheaders = None, forbidRedirect = False,
                        trycount = None, only_head = False,
                ):
                """Download a URL with the GET or POST method.

                @param postdata: A string that will be POST-ed to the URL.
                        When None is given, the request will be a GET instead.
                @param extraheaders: You can add/modify HTTP headers with a dict here.
                @param forbidRedirect: Set this flag if you do not want to handle
                        HTTP 301 and 302 redirects.
                @param trycount: Specify the maximum number of retries here.
                        0 means no retry on error. Using -1 means infinite retrying.
                        None means the default value (that is self.trycount).
                @param only_head: Open the URL and return the response object without
                        reading the body. In other words, this will not retrieve any
                        content except HTTP headers.

                @return: The raw HTML page data
                """
                _moduleLogger.debug("Performing download of %s" % url)

                if extraheaders is None:
                        extraheaders = {}
                if trycount is None:
                        trycount = self.trycount
                cnt = 0

                while True:
                        try:
                                req, u = self._build_opener(url, postdata, extraheaders, forbidRedirect)
                                openerdirector = u.open(req)
                                if self.debug:
                                        _moduleLogger.info("%r - %r" % (req.get_method(), url))
                                        _moduleLogger.info("%r - %r" % (openerdirector.code, openerdirector.msg))
                                        _moduleLogger.info("%r" % (openerdirector.headers))
                                self._cookies.extract_cookies(openerdirector, req)
                                if only_head:
                                        return openerdirector

                                return self._read(openerdirector, trycount)
                        except urllib2.URLError, e:
                                _moduleLogger.debug("%s: %s" % (e, url))
                                cnt += 1
                                if (-1 < trycount) and (trycount < cnt):
                                        raise

                        # Retry :-)
                        _moduleLogger.debug("MozillaEmulator: urllib2.URLError, retrying %d" % cnt)

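        # Example (illustrative only; the URL and header value are made up):
        # fetch just the response headers for a page, sending one extra header:
        #
        #       emulator = MozillaEmulator()
        #       response = emulator.download("http://www.example.com/inbox",
        #               extraheaders = {"Referer": "http://www.example.com/"},
        #               only_head = True)
        #       print response.info()
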
        def _build_opener(self, url, postdata = None, extraheaders = None, forbidRedirect = False):
                if extraheaders is None:
                        extraheaders = {}

                txheaders = {
                        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
                        'Accept-Language': 'en,en-us;q=0.5',
                        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
                }
                for key, value in extraheaders.iteritems():
                        txheaders[key] = value
                req = urllib2.Request(url, postdata, txheaders)
                self._cookies.add_cookie_header(req)
                if forbidRedirect:
                        redirector = HTTPNoRedirector()
                        #_moduleLogger.info("Redirection disabled")
                else:
                        redirector = urllib2.HTTPRedirectHandler()
                        #_moduleLogger.info("Redirection enabled")

                http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
                https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)

                u = urllib2.build_opener(
                        http_handler,
                        https_handler,
                        urllib2.HTTPCookieProcessor(self._cookies),
                        redirector
                )
                u.addheaders = [(
                        'User-Agent',
                        'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.4) Gecko/20091016 Firefox/3.5.4 (.NET CLR 3.5.30729)'
                )]
                if postdata is not None:
                        req.add_data(postdata)
                return (req, u)

        def _read(self, openerdirector, trycount):
                chunks = []

                chunk = openerdirector.read()
                chunks.append(chunk)
                #while chunk and cnt < trycount:
                #       time.sleep(1)
                #       cnt += 1
                #       chunk = openerdirector.read()
                #       chunks.append(chunk)

                data = "".join(chunks)

                if "Content-Length" in openerdirector.info():
                        assert len(data) == int(openerdirector.info()["Content-Length"]), "The response headers promised %s bytes of data but only %s bytes were read" % (
                                openerdirector.info()["Content-Length"],
                                len(data),
                        )

                return data


class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
        """This is a custom HTTP redirect handler that FORBIDS redirection."""

        def http_error_302(self, req, fp, code, msg, headers):
                e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
                if e.code in (301, 302):
                        if 'location' in headers:
                                e.newurl = headers.getheaders('location')[0]
                        elif 'uri' in headers:
                                e.newurl = headers.getheaders('uri')[0]
                # Report the redirect target (when the server sent one) but refuse to follow it
                if hasattr(e, 'newurl'):
                        _moduleLogger.info("New url: %s" % e.newurl)
                raise e
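
The cookie methods above are what allow a later run to skip a fresh login: cookies from an authenticated session are written to disk with save_cookies() and read back with load_cookies(). A minimal sketch of that round trip (the cookie path and URL are made up for illustration):

        browser = MozillaEmulator(trycount = 1)
        browser.load_cookies("/home/user/.theonering/cookies.txt")
        page = browser.download("https://www.google.com/voice")
        browser.save_cookies()

If the saved cookies are still valid, the download already returns the logged-in page; otherwise the application performs its normal login and then calls save_cookies() so that the next start is faster.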