Misc cleanup
[theonering] / src / gvoice / browser_emu.py
1 """
2 @author:          Laszlo Nagy
3 @copyright:   (c) 2005 by Szoftver Messias Bt.
4 @licence:        BSD style
5
6 Objects of the MozillaEmulator class can emulate a browser that is capable of:
7
8         - cookie management
9         - configurable user agent string
10         - GET and POST
11         - multipart POST (send files)
12         - receive content into file
13
14 I have seen many requests on the python mailing list about how to emulate a browser. I have been using this class for years now without any problems. This is how you can use it:
15
16         1. Use firefox
17         2. Install and open the livehttpheaders plugin
18         3. Use the website manually with firefox
19         4. Check the GET and POST requests in the livehttpheaders capture window
20         5. Create an instance of the above class and send the same GET and POST requests to the server.
21
22 Optional steps:
23
24         - You can change the user agent string used by the _build_opener method
25         - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
26 """
27
28 import urllib2
29 import cookielib
30 import logging
31
32 import socket
33
34
# Logger shared by everything in this module.
_moduleLogger = logging.getLogger("gvoice.browser_emu")
# NOTE: this is a process-wide setting, applied as a side effect of importing
# this module: every socket created afterwards without an explicit timeout
# gets a 10 second timeout.
socket.setdefaulttimeout(10)
37
38
class MozillaEmulator(object):
    """Small browser emulator on top of urllib2 with a persistent cookie jar.

    Every request is sent with a fixed set of browser-like headers, the
    cookies collected so far and the user agent in USER_AGENT.
    """

    # User agent sent with every request; override on a subclass or an
    # instance to impersonate a different browser.
    USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.4) Gecko/20091016 Firefox/3.5.4 (.NET CLR 3.5.30729)'

    def __init__(self, trycount = 1):
        """Create a new MozillaEmulator object.

        @param trycount: The download() method will retry the operation if it
            fails. You can specify -1 for infinite retrying. A value of 0
            means no retrying. A value of 1 means one retry. etc.
        """
        # When true, download() logs request/response details and the
        # urllib2 HTTP(S) handlers are created with debug output enabled.
        self.debug = False
        self.trycount = trycount
        self._cookies = cookielib.LWPCookieJar()
        # Only set once a cookie file has been loaded successfully.
        self._loadedFromCookies = False

    def load_cookies(self, path):
        """Load the cookie jar from the LWP cookie file at path.

        Load failures are logged and swallowed (best effort).

        @param path: Cookie file to load. None leaves the jar untouched.

        @return: None when path is None, otherwise True if the file was
            loaded and False if loading failed.
        @raise AssertionError: If cookies were already loaded successfully.
        """
        # Explicit raise instead of ``assert`` so the guard survives ``-O``.
        if self._loadedFromCookies:
            raise AssertionError("Load cookies only once")
        if path is None:
            return

        self._cookies.filename = path
        try:
            self._cookies.load()
        except cookielib.LoadError:
            _moduleLogger.exception("Bad cookie file")
        except IOError:
            _moduleLogger.exception("No cookie file")
        except Exception:
            _moduleLogger.exception("Unknown error with cookies")
        else:
            self._loadedFromCookies = True

        return self._loadedFromCookies

    def save_cookies(self):
        """Write the cookie jar back to the file it was loaded from.

        Does nothing unless load_cookies() succeeded earlier.
        """
        # NOTE(review): because of this guard a cookie file that does not
        # exist yet is never created -- confirm that this is intended.
        if self._loadedFromCookies:
            self._cookies.save()

    def clear_cookies(self):
        """Drop all cookies from the jar.

        Does nothing unless load_cookies() succeeded earlier.
        """
        if self._loadedFromCookies:
            self._cookies.clear()

    def download(self, url,
            postdata = None, extraheaders = None, forbidRedirect = False,
            trycount = None, only_head = False,
        ):
        """Download an URL with GET or POST methods.

        @param postdata: It can be a string that will be POST-ed to the URL.
            When None is given, the method will be GET instead.
        @param extraheaders: You can add/modify HTTP headers with a dict here.
        @param forbidRedirect: Set this flag if you do not want to handle
            HTTP 301 and 302 redirects.
        @param trycount: Specify the maximum number of retries here.
            0 means no retry on error. Using -1 means infinite retrying.
            None means the default value (that is self.trycount).
        @param only_head: Return the opened response object instead of
            reading it. In other words, this will not retrieve any content
            except HTTP headers. Closing it is up to the caller.

        @return: The raw HTML page data (or the response object when
            only_head is set).
        @raise urllib2.URLError: When the request still fails after the
            allowed number of retries, or immediately when a redirect was
            refused because of forbidRedirect.
        """
        _moduleLogger.info("Performing download of %s", url)

        if extraheaders is None:
            extraheaders = {}
        if trycount is None:
            trycount = self.trycount
        cnt = 0

        while True:
            try:
                req, opener = self._build_opener(url, postdata, extraheaders, forbidRedirect)
                response = opener.open(req)
                if self.debug:
                    _moduleLogger.info("%r - %r", req.get_method(), url)
                    _moduleLogger.info("%r - %r", response.code, response.msg)
                    _moduleLogger.info("%r", response.headers)
                self._cookies.extract_cookies(response, req)
                if only_head:
                    return response

                return self._read(response, trycount)
            except urllib2.URLError as e:
                _moduleLogger.info("%s: %s", e, url)
                # A refused redirect is the outcome the caller asked for, not
                # a transient failure; retrying would only send the very same
                # request (possibly a POST) a second time.
                if forbidRedirect and isinstance(e, urllib2.HTTPError) and e.code in (301, 302):
                    raise
                cnt += 1
                if -1 < trycount < cnt:
                    raise

            # Retry :-)
            _moduleLogger.info("MozillaEmulator: urllib2.URLError, retrying %d", cnt)

    def _build_opener(self, url, postdata = None, extraheaders = None, forbidRedirect = False):
        """Build the request and the opener used for a single download.

        @param url: Address to request.
        @param postdata: Request body; None results in a GET request.
        @param extraheaders: Dict of headers added to / overriding the
            default ones.
        @param forbidRedirect: Use HTTPNoRedirector instead of the standard
            redirect handler.

        @return: Tuple of (urllib2.Request, urllib2.OpenerDirector).
        """
        txheaders = {
            'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
            'Accept-Language': 'en,en-us;q=0.5',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        }
        if extraheaders:
            txheaders.update(extraheaders)

        # The request body is handed to the constructor, so no separate
        # add_data() call is needed.
        req = urllib2.Request(url, postdata, txheaders)
        self._cookies.add_cookie_header(req)

        if forbidRedirect:
            redirector = HTTPNoRedirector()
        else:
            redirector = urllib2.HTTPRedirectHandler()

        opener = urllib2.build_opener(
            urllib2.HTTPHandler(debuglevel=self.debug),
            urllib2.HTTPSHandler(debuglevel=self.debug),
            urllib2.HTTPCookieProcessor(self._cookies),
            redirector,
        )
        opener.addheaders = [('User-Agent', self.USER_AGENT)]
        return (req, opener)

    def _read(self, openerdirector, trycount):
        """Read the whole body of an opened response and close it.

        @param openerdirector: The response object returned by open()
            (despite the parameter name).
        @param trycount: Unused; kept so existing callers keep working.

        @return: The response body.
        @raise AssertionError: If a Content-Length header is present and
            does not match the amount of data read.
        """
        try:
            data = openerdirector.read()
            headers = openerdirector.info()
        finally:
            # Release the connection even when reading fails.
            openerdirector.close()

        if "Content-Length" in headers:
            promised = headers["Content-Length"]
            # Explicit raise instead of ``assert`` so that truncated
            # downloads are still detected under ``python -O``.
            if len(data) != int(promised):
                raise AssertionError(
                    "The packet header promised %s of data but only was able to read %s of data" % (
                        promised,
                        len(data),
                    )
                )

        return data
184
185
class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
    """This is a custom http redirect handler that FORBIDS redirection.

    Instead of following an HTTP 301 or 302 response, the handler raises
    the matching urllib2.HTTPError with the redirect target attached.
    """

    def http_error_302(self, req, fp, code, msg, headers):
        """Refuse the redirect by raising instead of following it.

        @raise urllib2.HTTPError: Always. Its ``newurl`` attribute holds the
            redirect target taken from the ``location`` (or ``uri``) header,
            or None when the response carries neither header.
        """
        e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
        # Default to None so the attribute always exists, even for a
        # redirect response that names no target.
        newurl = None
        if e.code in (301, 302):
            if 'location' in headers:
                newurl = headers.getheaders('location')[0]
            elif 'uri' in headers:
                newurl = headers.getheaders('uri')[0]
        e.newurl = newurl
        _moduleLogger.info("New url: %s", newurl)
        raise e

    # The base class binds http_error_301 to its own http_error_302, so the
    # override above has to be aliased explicitly; otherwise 301 responses
    # would still be followed.
    http_error_301 = http_error_302