git.maemo.org Git - gc-dialer/blob - src/backends/browser_emu.py

   1 """
   2 @author:          Laszlo Nagy
   3 @copyright:   (c) 2005 by Szoftver Messias Bt.
   4 @licence:        BSD style
   5
   6 Objects of the MozillaEmulator class can emulate a browser that is capable of:
   7
   8         - cookie management
   9         - configurable user agent string
  10         - GET and POST
  11         - multipart POST (send files)
  12         - receive content into file
  13
I have seen many requests on the Python mailing list about how to emulate a browser. I have been using this class for years now without any problems. This is how you can use it:
  15
  16         1. Use firefox
  17         2. Install and open the livehttpheaders plugin
  18         3. Use the website manually with firefox
  19         4. Check the GET and POST requests in the livehttpheaders capture window
        5. Create an instance of the MozillaEmulator class and send the same GET and POST requests to the server.
  21
  22 Optional steps:
  23
        - You can change the user agent string in the _build_opener method
  25         - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
  26 """
  27
  28 import urllib2
  29 import cookielib
  30 import logging
  31
  32 import socket
  33
  34
# Logger shared by everything in this module; the logger name is the fixed
# string "browser_emu" rather than being derived from __name__.
_moduleLogger = logging.getLogger("browser_emu")
# NOTE: this is an import-time side effect. It sets the process-wide default
# timeout (20 seconds) for every socket created afterwards without an explicit
# timeout, not only for the connections opened by this module.
socket.setdefaulttimeout(20)
  37
  38
  39 class MozillaEmulator(object):
  40
  41         def __init__(self, trycount = 1):
  42                 """Create a new MozillaEmulator object.
  43
  44                 @param trycount: The download() method will retry the operation if it
  45                 fails. You can specify -1 for infinite retrying.  A value of 0 means no
  46                 retrying. A value of 1 means one retry. etc."""
  47                 self.debug = False
  48                 self.trycount = trycount
  49                 self._cookies = cookielib.LWPCookieJar()
  50                 self._loadedFromCookies = False
  51
  52         def load_cookies(self, path):
  53                 assert not self._loadedFromCookies, "Load cookies only once"
  54                 if path is None:
  55                         return
  56
  57                 self._cookies.filename = path
  58                 try:
  59                         self._cookies.load()
  60                 except cookielib.LoadError:
  61                         _moduleLogger.exception("Bad cookie file")
  62                 except IOError:
  63                         _moduleLogger.exception("No cookie file")
  64                 except Exception, e:
  65                         _moduleLogger.exception("Unknown error with cookies")
  66                 else:
  67                         self._loadedFromCookies = True
  68
  69                 return self._loadedFromCookies
  70
  71         def save_cookies(self):
  72                 if self._loadedFromCookies:
  73                         self._cookies.save()
  74
  75         def clear_cookies(self):
  76                 if self._loadedFromCookies:
  77                         self._cookies.clear()
  78
  79         def download(self, url,
  80                         postdata = None, extraheaders = None, forbidRedirect = False,
  81                         trycount = None, only_head = False,
  82                 ):
  83                 """Download an URL with GET or POST methods.
  84
  85                 @param postdata: It can be a string that will be POST-ed to the URL.
  86                         When None is given, the method will be GET instead.
  87                 @param extraheaders: You can add/modify HTTP headers with a dict here.
  88                 @param forbidRedirect: Set this flag if you do not want to handle
  89                         HTTP 301 and 302 redirects.
  90                 @param trycount: Specify the maximum number of retries here.
  91                         0 means no retry on error. Using -1 means infinite retring.
  92                         None means the default value (that is self.trycount).
  93                 @param only_head: Create the openerdirector and return it. In other
  94                         words, this will not retrieve any content except HTTP headers.
  95
  96                 @return: The raw HTML page data
  97                 """
  98                 _moduleLogger.debug("Performing download of %s" % url)
  99
 100                 if extraheaders is None:
 101                         extraheaders = {}
 102                 if trycount is None:
 103                         trycount = self.trycount
 104                 cnt = 0
 105
 106                 while True:
 107                         try:
 108                                 req, u = self._build_opener(url, postdata, extraheaders, forbidRedirect)
 109                                 openerdirector = u.open(req)
 110                                 if self.debug:
 111                                         _moduleLogger.info("%r - %r" % (req.get_method(), url))
 112                                         _moduleLogger.info("%r - %r" % (openerdirector.code, openerdirector.msg))
 113                                         _moduleLogger.info("%r" % (openerdirector.headers))
 114                                 self._cookies.extract_cookies(openerdirector, req)
 115                                 if only_head:
 116                                         return openerdirector
 117
 118                                 return self._read(openerdirector, trycount)
 119                         except urllib2.URLError, e:
 120                                 _moduleLogger.debug("%s: %s" % (e, url))
 121                                 cnt += 1
 122                                 if (-1 < trycount) and (trycount < cnt):
 123                                         raise
 124
 125                         # Retry :-)
 126                         _moduleLogger.debug("MozillaEmulator: urllib2.URLError, retrying %d" % cnt)
 127
 128         def _build_opener(self, url, postdata = None, extraheaders = None, forbidRedirect = False):
 129                 if extraheaders is None:
 130                         extraheaders = {}
 131
 132                 txheaders = {
 133                         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
 134                         'Accept-Language': 'en,en-us;q=0.5',
 135                         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
 136                 }
 137                 for key, value in extraheaders.iteritems():
 138                         txheaders[key] = value
 139                 req = urllib2.Request(url, postdata, txheaders)
 140                 self._cookies.add_cookie_header(req)
 141                 if forbidRedirect:
 142                         redirector = HTTPNoRedirector()
 143                         #_moduleLogger.info("Redirection disabled")
 144                 else:
 145                         redirector = urllib2.HTTPRedirectHandler()
 146                         #_moduleLogger.info("Redirection enabled")
 147
 148                 http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
 149                 https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)
 150
 151                 u = urllib2.build_opener(
 152                         http_handler,
 153                         https_handler,
 154                         urllib2.HTTPCookieProcessor(self._cookies),
 155                         redirector
 156                 )
 157                 u.addheaders = [(
 158                         'User-Agent',
 159                         'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.4) Gecko/20091016 Firefox/3.5.4 (.NET CLR 3.5.30729)'
 160                 )]
 161                 if not postdata is None:
 162                         req.add_data(postdata)
 163                 return (req, u)
 164
 165         def _read(self, openerdirector, trycount):
 166                 chunks = []
 167
 168                 chunk = openerdirector.read()
 169                 chunks.append(chunk)
 170                 #while chunk and cnt < trycount:
 171                 #       time.sleep(1)
 172                 #       cnt += 1
 173                 #       chunk = openerdirector.read()
 174                 #       chunks.append(chunk)
 175
 176                 data = "".join(chunks)
 177
 178                 if "Content-Length" in openerdirector.info():
 179                         assert len(data) == int(openerdirector.info()["Content-Length"]), "The packet header promised %s of data but only was able to read %s of data" % (
 180                                 openerdirector.info()["Content-Length"],
 181                                 len(data),
 182                         )
 183
 184                 return data
 185
 186
 187 class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
 188         """This is a custom http redirect handler that FORBIDS redirection."""
 189
 190         def http_error_302(self, req, fp, code, msg, headers):
 191                 e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
 192                 if e.code in (301, 302):
 193                         if 'location' in headers:
 194                                 newurl = headers.getheaders('location')[0]
 195                         elif 'uri' in headers:
 196                                 newurl = headers.getheaders('uri')[0]
 197                         e.newurl = newurl
 198                 _moduleLogger.info("New url: %s" % e.newurl)
 199                 raise e