3 @copyright: (c) 2005 by Szoftver Messias Bt.
6 Objects of the MozillaEmulator class can emulate a browser that is capable of:
9 - configurable user agent string
11 - multipart POST (send files)
12 - receive content into file
14 I have seen many requests on the Python mailing list about how to emulate a browser. I have been using this class for years now without any problems. This is how you can use it:
17 2. Install and open the livehttpheaders plugin
18 3. Use the website manually with firefox
19 4. Check the GET and POST requests in the livehttpheaders capture window
20 5. Create an instance of the above class and send the same GET and POST requests to the server.
24 - You can change the user agent string through the USER_AGENT class attribute, which the _build_opener method sends as the User-Agent header
25 - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
35 _moduleLogger = logging.getLogger(__name__)
36 socket.setdefaulttimeout(45)
39 def add_proxy(protocol, url, port):
40 proxyInfo = "%s:%s" % (url, port)
41 proxy = urllib2.ProxyHandler(
44 opener = urllib2.build_opener(proxy)
45 urllib2.install_opener(opener)
48 class MozillaEmulator(object):
50 USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.4) Gecko/20091016 Firefox/3.5.4 (.NET CLR 3.5.30729)'
51 #USER_AGENT = "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16"
53 def __init__(self, trycount = 1):
54 """Create a new MozillaEmulator object.
56 @param trycount: The download() method will retry the operation if it
57 fails. You can specify -1 for infinite retrying. A value of 0 means no
58 retrying. A value of 1 means one retry. etc."""
60 self.trycount = trycount
61 self._cookies = cookielib.LWPCookieJar()
62 self._loadedFromCookies = False
63 self._storeCookies = False
65 def load_cookies(self, path):
66 assert not self._loadedFromCookies, "Load cookies only once"
70 self._cookies.filename = path
73 except cookielib.LoadError:
74 _moduleLogger.exception("Bad cookie file")
76 _moduleLogger.exception("No cookie file")
78 _moduleLogger.exception("Unknown error with cookies")
80 self._loadedFromCookies = True
81 self._storeCookies = True
83 return self._loadedFromCookies
85 def save_cookies(self):
86 if self._storeCookies:
89 def clear_cookies(self):
90 if self._storeCookies:
93 def download(self, url,
94 postdata = None, extraheaders = None, forbidRedirect = False,
95 trycount = None, only_head = False,
97 """Download an URL with GET or POST methods.
99 @param postdata: It can be a string that will be POST-ed to the URL.
100 When None is given, the method will be GET instead.
101 @param extraheaders: You can add/modify HTTP headers with a dict here.
102 @param forbidRedirect: Set this flag if you do not want to handle
103 HTTP 301 and 302 redirects.
104 @param trycount: Specify the maximum number of retries here.
105 0 means no retry on error. Using -1 means infinite retrying.
106 None means the default value (that is self.trycount).
107 @param only_head: Create the openerdirector and return it. In other
108 words, this will not retrieve any content except HTTP headers.
110 @return: The raw HTML page data
112 _moduleLogger.debug("Performing download of %s" % url)
114 if extraheaders is None:
117 trycount = self.trycount
122 req, u = self._build_opener(url, postdata, extraheaders, forbidRedirect)
123 openerdirector = u.open(req)
125 _moduleLogger.info("%r - %r" % (req.get_method(), url))
126 _moduleLogger.info("%r - %r" % (openerdirector.code, openerdirector.msg))
127 _moduleLogger.info("%r" % (openerdirector.headers))
128 self._cookies.extract_cookies(openerdirector, req)
130 return openerdirector
132 return self._read(openerdirector, trycount)
133 except urllib2.URLError, e:
134 _moduleLogger.debug("%s: %s" % (e, url))
136 if (-1 < trycount) and (trycount < cnt):
140 _moduleLogger.debug("MozillaEmulator: urllib2.URLError, retrying %d" % cnt)
142 def _build_opener(self, url, postdata = None, extraheaders = None, forbidRedirect = False):
143 if extraheaders is None:
147 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
148 'Accept-Language': 'en,en-us;q=0.5',
149 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
150 'User-Agent': self.USER_AGENT,
152 for key, value in extraheaders.iteritems():
153 txheaders[key] = value
154 req = urllib2.Request(url, postdata, txheaders)
155 self._cookies.add_cookie_header(req)
157 redirector = HTTPNoRedirector()
158 #_moduleLogger.info("Redirection disabled")
160 redirector = urllib2.HTTPRedirectHandler()
161 #_moduleLogger.info("Redirection enabled")
163 http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
164 https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)
166 u = urllib2.build_opener(
169 urllib2.HTTPCookieProcessor(self._cookies),
172 if not postdata is None:
173 req.add_data(postdata)
176 def _read(self, openerdirector, trycount):
179 chunk = openerdirector.read()
181 #while chunk and cnt < trycount:
184 # chunk = openerdirector.read()
185 # chunks.append(chunk)
187 data = "".join(chunks)
189 if "Content-Length" in openerdirector.info():
190 assert len(data) == int(openerdirector.info()["Content-Length"]), "The packet header promised %s of data but only was able to read %s of data" % (
191 openerdirector.info()["Content-Length"],
198 class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
199 """This is a custom http redirect handler that FORBIDS redirection."""
201 def http_error_302(self, req, fp, code, msg, headers):
202 e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
203 if e.code in (301, 302):
204 if 'location' in headers:
205 newurl = headers.getheaders('location')[0]
206 elif 'uri' in headers:
207 newurl = headers.getheaders('uri')[0]
209 _moduleLogger.info("New url: %s" % e.newurl)