git.maemo.org Git - gc-dialer/blob - gc_dialer/browser_emu.py

   1 """
   2 @author:          Laszlo Nagy
   3 @copyright:   (c) 2005 by Szoftver Messias Bt.
   4 @licence:        BSD style
   5
   6 Objects of the MozillaEmulator class can emulate a browser that is capable of:
   7
   8         - cookie management
   9         - caching
  10         - configurable user agent string
  11         - GET and POST
  12         - multipart POST (send files)
  13         - receive content into file
  14         - progress indicator
  15
  16 I have seen many requests on the python mailing list about how to emulate a browser. I have been using this class for years now, without any problems. This is how you can use it:
  17
  18         1. Use firefox
  19         2. Install and open the livehttpheaders plugin
  20         3. Use the website manually with firefox
  21         4. Check the GET and POST requests in the livehttpheaders capture window
  22         5. Create an instance of the above class and send the same GET and POST requests to the server.
  23
  24 Optional steps:
  25
  26         - For testing, use a MozillaCacher instance - this will cache all pages and make testing quicker
  27         - You can change user agent string in the build_opener method
  28         - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
  29
  30 TODO:
  31
  32 - should have a method to save/load cookies
  33 """
  34
  35 #from __future__ import with_statement
  36
  37 import os
  38 #import md5
  39 import urllib
  40 import urllib2
  41 #import mimetypes
  42 import cookielib
  43
  44
  45 class MozillaEmulator(object):
  46
  47         def __init__(self,cacher={},trycount=0):
  48                 """Create a new MozillaEmulator object.
  49
  50                 @param cacher: A dictionary like object, that can cache search results on a storage device.
  51                         You can use a simple dictionary here, but it is not recommended.
  52                         You can also put None here to disable caching completely.
  53                 @param trycount: The download() method will retry the operation if it fails. You can specify -1 for infinite retrying.
  54                          A value of 0 means no retrying. A value of 1 means one retry. etc."""
  55                 self.cacher = cacher
  56                 self.cookies = cookielib.LWPCookieJar()
  57                 self.debug = False
  58                 self.trycount = trycount
  59
  60         def build_opener(self,url,postdata=None,extraheaders={},forbid_redirect=False):
  61                 txheaders = {
  62                         'Accept':'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
  63                         'Accept-Language':'en,en-us;q=0.5',
  64 #                       'Accept-Encoding': 'gzip, deflate',
  65                         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  66 #                       'Keep-Alive': '300',
  67 #                       'Connection': 'keep-alive',
  68 #                       'Cache-Control': 'max-age=0',
  69                 }
  70                 for key,value in extraheaders.iteritems():
  71                         txheaders[key] = value
  72                 req = urllib2.Request(url, postdata, txheaders)
  73                 self.cookies.add_cookie_header(req)
  74                 if forbid_redirect:
  75                         redirector = HTTPNoRedirector()
  76                 else:
  77                         redirector = urllib2.HTTPRedirectHandler()
  78
  79                 http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
  80                 https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)
  81
  82                 u = urllib2.build_opener(http_handler,https_handler,urllib2.HTTPCookieProcessor(self.cookies),redirector)
  83                 u.addheaders = [('User-Agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.8) Gecko/20050511 Firefox/1.0.4')]
  84                 if not postdata is None:
  85                         req.add_data(postdata)
  86                 return (req,u)
  87
  88         def download(self,url,postdata=None,extraheaders={},forbid_redirect=False,
  89                         trycount=None,fd=None,onprogress=None,only_head=False):
  90                 """Download an URL with GET or POST methods.
  91
  92                 @param postdata: It can be a string that will be POST-ed to the URL.
  93                         When None is given, the method will be GET instead.
  94                 @param extraheaders: You can add/modify HTTP headers with a dict here.
  95                 @param forbid_redirect: Set this flag if you do not want to handle
  96                         HTTP 301 and 302 redirects.
  97                 @param trycount: Specify the maximum number of retries here.
  98                         0 means no retry on error. Using -1 means infinite retring.
  99                         None means the default value (that is self.trycount).
 100                 @param fd: You can pass a file descriptor here. In this case,
 101                         the data will be written into the file. Please note that
 102                         when you save the raw data into a file then it won't be cached.
 103                 @param onprogress: A function that has two parameters:
 104                         the size of the resource and the downloaded size. This will be
 105                         called for each 1KB chunk. (If the HTTP header does not contain
 106                         the content-length field, then the size parameter will be zero!)
 107                 @param only_head: Create the openerdirector and return it. In other
 108                         words, this will not retrieve any content except HTTP headers.
 109
 110                 @return: The raw HTML page data, unless fd was specified. When fd
 111                         was given, the return value is undefined.
 112                 """
 113                 print "Performing download of %s" % url
 114                 if trycount is None:
 115                         trycount = self.trycount
 116                 cnt = 0
 117                 while True:
 118                         try:
 119                                 req,u = self.build_opener(url,postdata,extraheaders,forbid_redirect)
 120                                 openerdirector = u.open(req)
 121                                 if self.debug:
 122                                         print req.get_method(),url
 123                                         print openerdirector.code,openerdirector.msg
 124                                         print openerdirector.headers
 125                                 self.cookies.extract_cookies(openerdirector,req)
 126                                 if only_head:
 127                                         return openerdirector
 128                                 #if openerdirector.headers.has_key('content-length'):
 129                                 #       length = long(openerdirector.headers['content-length'])
 130                                 #else:
 131                                 #       length = 0
 132                                 #dlength = 0
 133                                 #if fd:
 134                                 #       while True:
 135                                 #               data = openerdirector.read(1024)
 136                                 #               dlength += len(data)
 137                                 #               fd.write(data)
 138                                 #               if onprogress:
 139                                 #                       onprogress(length,dlength)
 140                                 #               if not data:
 141                                 #                       break
 142                                 #else:
 143                                 #       data = ''
 144                                 #       while True:
 145                                 #               newdata = openerdirector.read(1024)
 146                                 #               dlength += len(newdata)
 147                                 #               data += newdata
 148                                 #               if onprogress:
 149                                 #                       onprogress(length,dlength)
 150                                 #               if not newdata:
 151                                 #                       break
 152                                 #               #data = openerdirector.read()
 153                                 #       if not (self.cacher is None):
 154                                 #               self.cacher[key] = data
 155                                 #try:
 156                                 #       d2= GzipFile(fileobj=cStringIO.StringIO(data)).read()
 157                                 #       data = d2
 158                                 #except IOError:
 159                                 #       pass
 160                                 return openerdirector.read()
 161                         except urllib2.URLError:
 162                                 cnt += 1
 163                                 if (trycount > -1) and (trycount < cnt):
 164                                         raise
 165                                 # Retry :-)
 166                                 if self.debug:
 167                                         print "MozillaEmulator: urllib2.URLError, retryting ",cnt
 168
 169 #       def post_multipart(self,url,fields, files, forbid_redirect=True):
 170 #               """Post fields and files to an http host as multipart/form-data.
 171 #               fields is a sequence of (name, value) elements for regular form fields.
 172 #               files is a sequence of (name, filename, value) elements for data to be uploaded as files
 173 #               Return the server's response page.
 174 #               """
 175 #               content_type, post_data = encode_multipart_formdata(fields, files)
 176 #               result = self.download(url,post_data, {
 177 #                               'Content-Type': content_type,
 178 #                               'Content-Length': str(len(post_data))
 179 #                       },
 180 #                       forbid_redirect=forbid_redirect
 181 #               )
 182 #               return result
 183
 184
 185 class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
 186         """This is a custom http redirect handler that FORBIDS redirection."""
 187
 188         def http_error_302(self, req, fp, code, msg, headers):
 189                 e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
 190                 if e.code in (301,302):
 191                         if 'location' in headers:
 192                                 newurl = headers.getheaders('location')[0]
 193                         elif 'uri' in headers:
 194                                 newurl = headers.getheaders('uri')[0]
 195                         e.newurl = newurl
 196                 raise e
 197
 198
 199 #def encode_multipart_formdata(fields, files):
 200 #       """
 201 #       fields is a sequence of (name, value) elements for regular form fields.
 202 #       files is a sequence of (name, filename, value) elements for data to be uploaded as files
 203 #       Return (content_type, body) ready for httplib.HTTP instance
 204 #       """
 205 #       BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
 206 #       CRLF = '\r\n'
 207 #       L = []
 208 #       for (key, value) in fields:
 209 #               L.append('--' + BOUNDARY)
 210 #               L.append('Content-Disposition: form-data; name="%s"' % key)
 211 #               L.append('')
 212 #               L.append(value)
 213 #       for (key, filename, value) in files:
 214 #               L.append('--' + BOUNDARY)
 215 #               L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename))
 216 #               L.append('Content-Type: %s' % get_content_type(filename))
 217 #               L.append('')
 218 #               L.append(value)
 219 #       L.append('--' + BOUNDARY + '--')
 220 #       L.append('')
 221 #       body = CRLF.join(L)
 222 #       content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
 223 #       return content_type, body
 224 #
 225 #
 226 #def get_content_type(filename):
 227 #       return mimetypes.guess_type(filename)[0] or 'application/octet-stream'
 228 #