git.maemo.org Git - theonering/blob - src/browser_emu.py

   1 """
   2 @author:          Laszlo Nagy
   3 @copyright:   (c) 2005 by Szoftver Messias Bt.
   4 @licence:        BSD style
   5
   6 Objects of the MozillaEmulator class can emulate a browser that is capable of:
   7
   8         - cookie management
   9         - configurable user agent string
  10         - GET and POST
  11         - multipart POST (send files)
  12         - receive content into file
  13
  14 I have seen many requests on the python mailing list about how to emulate a browser. I have been using this class for years without any problems. This is how you can use it:
  15
  16         1. Use firefox
  17         2. Install and open the livehttpheaders plugin
  18         3. Use the website manually with firefox
  19         4. Check the GET and POST requests in the livehttpheaders capture window
  20         5. Create an instance of the above class and send the same GET and POST requests to the server.
  21
  22 Optional steps:
  23
  24         - You can change the user agent string in the build_opener method
  25         - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
  26 """
  27
  28 import urllib2
  29 import cookielib
  30 import logging
  31
  32 import socket
  33
  34
  35 _moduleLogger = logging.getLogger("browser_emu")
  36 socket.setdefaulttimeout(10)
  37
  38
  39 class MozillaEmulator(object):
  40
  41         def __init__(self, trycount = 1):
  42                 """Create a new MozillaEmulator object.
  43
  44                 @param trycount: The download() method will retry the operation if it fails. You can specify -1 for infinite retrying.
  45                          A value of 0 means no retrying. A value of 1 means one retry. etc."""
  46                 self.cookies = cookielib.LWPCookieJar()
  47                 self.debug = False
  48                 self.trycount = trycount
  49
  50         def build_opener(self, url, postdata = None, extraheaders = None, forbid_redirect = False):
  51                 if extraheaders is None:
  52                         extraheaders = {}
  53
  54                 txheaders = {
  55                         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
  56                         'Accept-Language': 'en,en-us;q=0.5',
  57                         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  58                 }
  59                 for key, value in extraheaders.iteritems():
  60                         txheaders[key] = value
  61                 req = urllib2.Request(url, postdata, txheaders)
  62                 self.cookies.add_cookie_header(req)
  63                 if forbid_redirect:
  64                         redirector = HTTPNoRedirector()
  65                 else:
  66                         redirector = urllib2.HTTPRedirectHandler()
  67
  68                 http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
  69                 https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)
  70
  71                 u = urllib2.build_opener(
  72                         http_handler,
  73                         https_handler,
  74                         urllib2.HTTPCookieProcessor(self.cookies),
  75                         redirector
  76                 )
  77                 u.addheaders = [(
  78                         'User-Agent',
  79                         'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.8) Gecko/20050511 Firefox/1.0.4'
  80                 )]
  81                 if not postdata is None:
  82                         req.add_data(postdata)
  83                 return (req, u)
  84
  85         def download(self, url,
  86                         postdata = None, extraheaders = None, forbid_redirect = False,
  87                         trycount = None, only_head = False,
  88                 ):
  89                 """Download an URL with GET or POST methods.
  90
  91                 @param postdata: It can be a string that will be POST-ed to the URL.
  92                         When None is given, the method will be GET instead.
  93                 @param extraheaders: You can add/modify HTTP headers with a dict here.
  94                 @param forbid_redirect: Set this flag if you do not want to handle
  95                         HTTP 301 and 302 redirects.
  96                 @param trycount: Specify the maximum number of retries here.
  97                         0 means no retry on error. Using -1 means infinite retring.
  98                         None means the default value (that is self.trycount).
  99                 @param only_head: Create the openerdirector and return it. In other
 100                         words, this will not retrieve any content except HTTP headers.
 101
 102                 @return: The raw HTML page data
 103                 """
 104                 _moduleLogger.warning("Performing download of %s" % url)
 105
 106                 if extraheaders is None:
 107                         extraheaders = {}
 108                 if trycount is None:
 109                         trycount = self.trycount
 110                 cnt = 0
 111
 112                 while True:
 113                         try:
 114                                 req, u = self.build_opener(url, postdata, extraheaders, forbid_redirect)
 115                                 openerdirector = u.open(req)
 116                                 if self.debug:
 117                                         _moduleLogger.info("%r - %r" % (req.get_method(), url))
 118                                         _moduleLogger.info("%r - %r" % (openerdirector.code, openerdirector.msg))
 119                                         _moduleLogger.info("%r" % (openerdirector.headers))
 120                                 self.cookies.extract_cookies(openerdirector, req)
 121                                 if only_head:
 122                                         return openerdirector
 123
 124                                 return self._read(openerdirector, trycount)
 125                         except urllib2.URLError:
 126                                 cnt += 1
 127                                 if (-1 < trycount) and (trycount < cnt):
 128                                         raise
 129
 130                         # Retry :-)
 131                         _moduleLogger.info("MozillaEmulator: urllib2.URLError, retryting %d" % cnt)
 132
 133         def _read(self, openerdirector, trycount):
 134                 chunks = []
 135
 136                 chunk = openerdirector.read()
 137                 chunks.append(chunk)
 138                 #while chunk and cnt < trycount:
 139                 #       time.sleep(1)
 140                 #       cnt += 1
 141                 #       chunk = openerdirector.read()
 142                 #       chunks.append(chunk)
 143
 144                 data = "".join(chunks)
 145
 146                 if "Content-Length" in openerdirector.info():
 147                         assert len(data) == int(openerdirector.info()["Content-Length"]), "The packet header promised %s of data but only was able to read %s of data" % (
 148                                 openerdirector.info()["Content-Length"],
 149                                 len(data),
 150                         )
 151
 152                 return data
 153
 154
 155 class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
 156         """This is a custom http redirect handler that FORBIDS redirection."""
 157
 158         def http_error_302(self, req, fp, code, msg, headers):
 159                 e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
 160                 if e.code in (301, 302):
 161                         if 'location' in headers:
 162                                 newurl = headers.getheaders('location')[0]
 163                         elif 'uri' in headers:
 164                                 newurl = headers.getheaders('uri')[0]
 165                         e.newurl = newurl
 166                 raise e