3 @copyright: (c) 2005 by Szoftver Messias Bt.
6 Objects of the MozillaEmulator class can emulate a browser that is capable of:
10 - configurable user agent string
12 - multipart POST (send files)
13 - receive content into file
16 I have seen many requests on the python mailing list about how to emulate a browser. I have been using this class for years now, without any problems. This is how you can use it:
19 2. Install and open the livehttpheaders plugin
20 3. Use the website manually with firefox
21 4. Check the GET and POST requests in the livehttpheaders capture window
22 5. Create an instance of the above class and send the same GET and POST requests to the server.
26 - For testing, use a MozillaCacher instance - this will cache all pages and make testing quicker
27 - You can change the user agent string in the build_opener method
28 - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
32 - should have a method to save/load cookies
35 #from __future__ import with_statement
45 class MozillaEmulator(object):
47 def __init__(self,cacher={},trycount=0):
48 """Create a new MozillaEmulator object.
50 @param cacher: A dictionary-like object that can cache search results on a storage device.
51 You can use a simple dictionary here, but it is not recommended.
52 You can also put None here to disable caching completely.
53 @param trycount: The download() method will retry the operation if it fails. You can specify -1 for infinite retrying.
54 A value of 0 means no retrying. A value of 1 means one retry. etc."""
56 self.cookies = cookielib.LWPCookieJar()
58 self.trycount = trycount
60 def build_opener(self,url,postdata=None,extraheaders={},forbid_redirect=False):
62 'Accept':'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
63 'Accept-Language':'en,en-us;q=0.5',
64 # 'Accept-Encoding': 'gzip, deflate',
65 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
66 # 'Keep-Alive': '300',
67 # 'Connection': 'keep-alive',
68 # 'Cache-Control': 'max-age=0',
70 for key,value in extraheaders.iteritems():
71 txheaders[key] = value
72 req = urllib2.Request(url, postdata, txheaders)
73 self.cookies.add_cookie_header(req)
75 redirector = HTTPNoRedirector()
77 redirector = urllib2.HTTPRedirectHandler()
79 http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
80 https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)
82 u = urllib2.build_opener(http_handler,https_handler,urllib2.HTTPCookieProcessor(self.cookies),redirector)
83 u.addheaders = [('User-Agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.8) Gecko/20050511 Firefox/1.0.4')]
84 if not postdata is None:
85 req.add_data(postdata)
88 def download(self,url,postdata=None,extraheaders={},forbid_redirect=False,
89 trycount=None,fd=None,onprogress=None,only_head=False):
90 """Download a URL with GET or POST methods.
92 @param postdata: It can be a string that will be POST-ed to the URL.
93 When None is given, the method will be GET instead.
94 @param extraheaders: You can add/modify HTTP headers with a dict here.
95 @param forbid_redirect: Set this flag if you do not want to handle
96 HTTP 301 and 302 redirects.
97 @param trycount: Specify the maximum number of retries here.
98 0 means no retry on error. Using -1 means infinite retrying.
99 None means the default value (that is self.trycount).
100 @param fd: You can pass a file descriptor here. In this case,
101 the data will be written into the file. Please note that
102 when you save the raw data into a file then it won't be cached.
103 @param onprogress: A function that has two parameters:
104 the size of the resource and the downloaded size. This will be
105 called for each 1KB chunk. (If the HTTP header does not contain
106 the content-length field, then the size parameter will be zero!)
107 @param only_head: Create the openerdirector and return it. In other
108 words, this will not retrieve any content except HTTP headers.
110 @return: The raw HTML page data, unless fd was specified. When fd
111 was given, the return value is undefined.
113 print "Performing download of %s" % url
115 trycount = self.trycount
119 req,u = self.build_opener(url,postdata,extraheaders,forbid_redirect)
120 openerdirector = u.open(req)
122 print req.get_method(),url
123 print openerdirector.code,openerdirector.msg
124 print openerdirector.headers
125 self.cookies.extract_cookies(openerdirector,req)
127 return openerdirector
128 #if openerdirector.headers.has_key('content-length'):
129 # length = long(openerdirector.headers['content-length'])
135 # data = openerdirector.read(1024)
136 # dlength += len(data)
139 # onprogress(length,dlength)
145 # newdata = openerdirector.read(1024)
146 # dlength += len(newdata)
149 # onprogress(length,dlength)
152 # #data = openerdirector.read()
153 # if not (self.cacher is None):
154 # self.cacher[key] = data
156 # d2= GzipFile(fileobj=cStringIO.StringIO(data)).read()
160 return openerdirector.read()
161 except urllib2.URLError:
163 if (trycount > -1) and (trycount < cnt):
167 print "MozillaEmulator: urllib2.URLError, retryting ",cnt
169 # def post_multipart(self,url,fields, files, forbid_redirect=True):
170 # """Post fields and files to an http host as multipart/form-data.
171 # fields is a sequence of (name, value) elements for regular form fields.
172 # files is a sequence of (name, filename, value) elements for data to be uploaded as files
173 # Return the server's response page.
175 # content_type, post_data = encode_multipart_formdata(fields, files)
176 # result = self.download(url,post_data, {
177 # 'Content-Type': content_type,
178 # 'Content-Length': str(len(post_data))
180 # forbid_redirect=forbid_redirect
185 class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
186 """This is a custom http redirect handler that FORBIDS redirection."""
188 def http_error_302(self, req, fp, code, msg, headers):
189 e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
190 if e.code in (301,302):
191 if 'location' in headers:
192 newurl = headers.getheaders('location')[0]
193 elif 'uri' in headers:
194 newurl = headers.getheaders('uri')[0]
199 #def encode_multipart_formdata(fields, files):
201 # fields is a sequence of (name, value) elements for regular form fields.
202 # files is a sequence of (name, filename, value) elements for data to be uploaded as files
203 # Return (content_type, body) ready for httplib.HTTP instance
205 # BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
208 # for (key, value) in fields:
209 # L.append('--' + BOUNDARY)
210 # L.append('Content-Disposition: form-data; name="%s"' % key)
213 # for (key, filename, value) in files:
214 # L.append('--' + BOUNDARY)
215 # L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename))
216 # L.append('Content-Type: %s' % get_content_type(filename))
219 # L.append('--' + BOUNDARY + '--')
221 # body = CRLF.join(L)
222 # content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
223 # return content_type, body
226 #def get_content_type(filename):
227 # return mimetypes.guess_type(filename)[0] or 'application/octet-stream'