Speeding up login through cookies
[theonering] / src / gvoice / browser_emu.py
1 """
2 @author:          Laszlo Nagy
3 @copyright:   (c) 2005 by Szoftver Messias Bt.
4 @licence:        BSD style
5
6 Objects of the MozillaEmulator class can emulate a browser that is capable of:
7
8         - cookie management
9         - configurable user agent string
10         - GET and POST
11         - multipart POST (send files)
12         - receive content into file
13
14 I have seen many requests on the python mailing list about how to emulate a browser. I'm using this class for years now, without any problems. This is how you can use it:
15
16         1. Use firefox
17         2. Install and open the livehttpheaders plugin
18         3. Use the website manually with firefox
19         4. Check the GET and POST requests in the livehttpheaders capture window
20         5. Create an instance of the above class and send the same GET and POST requests to the server.
21
22 Optional steps:
23
24         - You can change user agent string in the build_opened method
25         - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
26 """

import urllib2
import cookielib
import logging

import socket


_moduleLogger = logging.getLogger(__name__)
socket.setdefaulttimeout(45)


class MozillaEmulator(object):

        def __init__(self, trycount = 1):
                """Create a new MozillaEmulator object.

                @param trycount: The download() method will retry the operation if it
                fails.  Specify -1 for infinite retrying, 0 for no retrying, 1 for a
                single retry, and so on."""
                self.debug = False
                self.trycount = trycount
                self._cookies = cookielib.LWPCookieJar()
                self._loadedFromCookies = False
                self._usingCookies = False

        def load_cookies(self, path):
                assert not self._loadedFromCookies, "Load cookies only once"
                if not path:
                        return False

                self._cookies.filename = path
                try:
                        self._cookies.load()
                except cookielib.LoadError:
                        _moduleLogger.exception("Bad cookie file")
                except IOError:
                        _moduleLogger.exception("No cookie file")
                except Exception:
                        _moduleLogger.exception("Unknown error with cookies")
                else:
                        self._loadedFromCookies = True

                self._usingCookies = True
                return self._loadedFromCookies

        def save_cookies(self):
                if self._usingCookies:
                        self._cookies.save()

        def clear_cookies(self):
                if self._usingCookies:
                        self._cookies.clear()

        def download(self, url,
                        postdata = None, extraheaders = None, forbidRedirect = False,
                        trycount = None, only_head = False,
                ):
                """Download a URL with the GET or POST method.

                @param postdata: A string that will be POST-ed to the URL.
                        When None is given, the request will be a GET instead.
                @param extraheaders: You can add/modify HTTP headers with a dict here.
                @param forbidRedirect: Set this flag if you do not want to handle
                        HTTP 301 and 302 redirects.
                @param trycount: Specify the maximum number of retries here.
                        0 means no retry on error. Using -1 means infinite retrying.
                        None means the default value (that is self.trycount).
                @param only_head: Open the URL and return the response object without
                        reading the body. In other words, this will not retrieve any
                        content except HTTP headers.

                @return: The raw HTML page data
                """
                _moduleLogger.debug("Performing download of %s" % url)

                if extraheaders is None:
                        extraheaders = {}
                if trycount is None:
                        trycount = self.trycount
                cnt = 0

                while True:
                        try:
                                req, u = self._build_opener(url, postdata, extraheaders, forbidRedirect)
                                openerdirector = u.open(req)
                                if self.debug:
                                        _moduleLogger.info("%r - %r" % (req.get_method(), url))
                                        _moduleLogger.info("%r - %r" % (openerdirector.code, openerdirector.msg))
                                        _moduleLogger.info("%r" % (openerdirector.headers))
                                self._cookies.extract_cookies(openerdirector, req)
                                if only_head:
                                        return openerdirector

                                return self._read(openerdirector, trycount)
                        except urllib2.URLError, e:
                                _moduleLogger.debug("%s: %s" % (e, url))
                                cnt += 1
                                if (-1 < trycount) and (trycount < cnt):
                                        raise

                        # Retry :-)
                        _moduleLogger.debug("MozillaEmulator: urllib2.URLError, retrying %d" % cnt)

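        # Example (illustrative only; the URL and header value are made up):
        # fetch just the response headers for a page, sending one extra header:
        #
        #       emulator = MozillaEmulator()
        #       response = emulator.download("http://www.example.com/inbox",
        #               extraheaders = {"Referer": "http://www.example.com/"},
        #               only_head = True)
        #       print response.info()
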
        def _build_opener(self, url, postdata = None, extraheaders = None, forbidRedirect = False):
                if extraheaders is None:
                        extraheaders = {}

                txheaders = {
                        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
                        'Accept-Language': 'en,en-us;q=0.5',
                        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
                }
                for key, value in extraheaders.iteritems():
                        txheaders[key] = value
                req = urllib2.Request(url, postdata, txheaders)
                self._cookies.add_cookie_header(req)
                if forbidRedirect:
                        redirector = HTTPNoRedirector()
                        #_moduleLogger.info("Redirection disabled")
                else:
                        redirector = urllib2.HTTPRedirectHandler()
                        #_moduleLogger.info("Redirection enabled")

                http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
                https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)

                u = urllib2.build_opener(
                        http_handler,
                        https_handler,
                        urllib2.HTTPCookieProcessor(self._cookies),
                        redirector
                )
                u.addheaders = [(
                        'User-Agent',
                        'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.4) Gecko/20091016 Firefox/3.5.4 (.NET CLR 3.5.30729)'
                )]
                if postdata is not None:
                        req.add_data(postdata)
                return (req, u)

        def _read(self, openerdirector, trycount):
                chunks = []

                chunk = openerdirector.read()
                chunks.append(chunk)
                #while chunk and cnt < trycount:
                #       time.sleep(1)
                #       cnt += 1
                #       chunk = openerdirector.read()
                #       chunks.append(chunk)

                data = "".join(chunks)

                if "Content-Length" in openerdirector.info():
                        assert len(data) == int(openerdirector.info()["Content-Length"]), "The response headers promised %s bytes of data but only %s bytes were read" % (
                                openerdirector.info()["Content-Length"],
                                len(data),
                        )

                return data


class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
        """This is a custom HTTP redirect handler that FORBIDS redirection."""

        def http_error_302(self, req, fp, code, msg, headers):
                e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
                if e.code in (301, 302):
                        if 'location' in headers:
                                e.newurl = headers.getheaders('location')[0]
                        elif 'uri' in headers:
                                e.newurl = headers.getheaders('uri')[0]
                # Report the redirect target (when the server sent one) but refuse to follow it
                if hasattr(e, 'newurl'):
                        _moduleLogger.info("New url: %s" % e.newurl)
                raise e
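
The cookie methods above are what allow a later run to skip a fresh login: cookies from an authenticated session are written to disk with save_cookies() and read back with load_cookies(). A minimal sketch of that round trip (the cookie path and URL are made up for illustration):

        browser = MozillaEmulator(trycount = 1)
        browser.load_cookies("/home/user/.theonering/cookies.txt")
        page = browser.download("https://www.google.com/voice")
        browser.save_cookies()

If the saved cookies are still valid, the download already returns the logged-in page; otherwise the application performs its normal login and then calls save_cookies() so that the next start is faster.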