3 @copyright: (c) 2005 by Szoftver Messias Bt.
6 Objects of the MozillaEmulator class can emulate a browser that is capable of:
9 - configurable user agent string
11 - multipart POST (send files)
12 - receive content into file
14 I have seen many requests on the Python mailing list about how to emulate a browser. I have been using this class for years now, without any problems. This is how you can use it:
17 2. Install and open the livehttpheaders plugin
18 3. Use the website manually with firefox
19 4. Check the GET and POST requests in the livehttpheaders capture window
20 5. Create an instance of the MozillaEmulator class and send the same GET and POST requests to the server.
24 - You can change the user agent string in the _build_opener method
25 - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
35 _moduleLogger = logging.getLogger(__name__)
36 socket.setdefaulttimeout(45)
39 class MozillaEmulator(object):
41 def __init__(self, trycount = 1):
42 """Create a new MozillaEmulator object.
44 @param trycount: The download() method will retry the operation if it
45 fails. You can specify -1 for infinite retrying. A value of 0 means no
46 retrying. A value of 1 means one retry. etc."""
48 self.trycount = trycount
49 self._cookies = cookielib.LWPCookieJar()
50 self._loadedFromCookies = False
51 self._usingCookies = False
53 def load_cookies(self, path):
54 assert not self._loadedFromCookies, "Load cookies only once"
58 self._cookies.filename = path
61 except cookielib.LoadError:
62 _moduleLogger.exception("Bad cookie file")
64 _moduleLogger.exception("No cookie file")
66 _moduleLogger.exception("Unknown error with cookies")
68 self._loadedFromCookies = True
70 self._usingCookies = True
71 return self._loadedFromCookies
73 def save_cookies(self):
74 if self._usingCookies:
77 def clear_cookies(self):
78 if self._usingCookies:
81 def download(self, url,
82 postdata = None, extraheaders = None, forbidRedirect = False,
83 trycount = None, only_head = False,
85 """Download a URL with the GET or POST method.
87 @param postdata: It can be a string that will be POST-ed to the URL.
88 When None is given, the method will be GET instead.
89 @param extraheaders: You can add/modify HTTP headers with a dict here.
90 @param forbidRedirect: Set this flag if you do not want to handle
91 HTTP 301 and 302 redirects.
92 @param trycount: Specify the maximum number of retries here.
93 0 means no retry on error. Using -1 means infinite retrying.
94 None means the default value (that is self.trycount).
95 @param only_head: Create the openerdirector and return it. In other
96 words, this will not retrieve any content except HTTP headers.
98 @return: The raw HTML page data
100 _moduleLogger.debug("Performing download of %s" % url)
102 if extraheaders is None:
105 trycount = self.trycount
110 req, u = self._build_opener(url, postdata, extraheaders, forbidRedirect)
111 openerdirector = u.open(req)
113 _moduleLogger.info("%r - %r" % (req.get_method(), url))
114 _moduleLogger.info("%r - %r" % (openerdirector.code, openerdirector.msg))
115 _moduleLogger.info("%r" % (openerdirector.headers))
116 self._cookies.extract_cookies(openerdirector, req)
118 return openerdirector
120 return self._read(openerdirector, trycount)
121 except urllib2.URLError, e:
122 _moduleLogger.debug("%s: %s" % (e, url))
124 if (-1 < trycount) and (trycount < cnt):
128 _moduleLogger.debug("MozillaEmulator: urllib2.URLError, retrying %d" % cnt)
130 def _build_opener(self, url, postdata = None, extraheaders = None, forbidRedirect = False):
131 if extraheaders is None:
135 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
136 'Accept-Language': 'en,en-us;q=0.5',
137 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
139 for key, value in extraheaders.iteritems():
140 txheaders[key] = value
141 req = urllib2.Request(url, postdata, txheaders)
142 self._cookies.add_cookie_header(req)
144 redirector = HTTPNoRedirector()
145 #_moduleLogger.info("Redirection disabled")
147 redirector = urllib2.HTTPRedirectHandler()
148 #_moduleLogger.info("Redirection enabled")
150 http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
151 https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)
153 u = urllib2.build_opener(
156 urllib2.HTTPCookieProcessor(self._cookies),
161 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.4) Gecko/20091016 Firefox/3.5.4 (.NET CLR 3.5.30729)'
163 if not postdata is None:
164 req.add_data(postdata)
167 def _read(self, openerdirector, trycount):
170 chunk = openerdirector.read()
172 #while chunk and cnt < trycount:
175 # chunk = openerdirector.read()
176 # chunks.append(chunk)
178 data = "".join(chunks)
180 if "Content-Length" in openerdirector.info():
181 assert len(data) == int(openerdirector.info()["Content-Length"]), "The packet header promised %s of data but only was able to read %s of data" % (
182 openerdirector.info()["Content-Length"],
189 class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
190 """This is a custom http redirect handler that FORBIDS redirection."""
192 def http_error_302(self, req, fp, code, msg, headers):
193 e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
194 if e.code in (301, 302):
195 if 'location' in headers:
196 newurl = headers.getheaders('location')[0]
197 elif 'uri' in headers:
198 newurl = headers.getheaders('uri')[0]
200 _moduleLogger.info("New url: %s" % e.newurl)