Starting work on collapsing messages (it'll be annoying due to HTML); removing the...
[gc-dialer] / src / browser_emu.py
1 """
2 @author:          Laszlo Nagy
3 @copyright:   (c) 2005 by Szoftver Messias Bt.
4 @licence:        BSD style
5
6 Objects of the MozillaEmulator class can emulate a browser that is capable of:
7
8         - cookie management
9         - configurable user agent string
10         - GET and POST
11         - multipart POST (send files)
12         - receive content into file
13
14 I have seen many requests on the python mailing list about how to emulate a browser. I have been using this class for years now without any problems. This is how you can use it:
15
16         1. Use firefox
17         2. Install and open the livehttpheaders plugin
18         3. Use the website manually with firefox
19         4. Check the GET and POST requests in the livehttpheaders capture window
20         5. Create an instance of the above class and send the same GET and POST requests to the server.
21
22 Optional steps:
23
24         - You can change the user agent string in the _build_opener method
25         - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
26 """
27
28 import urllib2
29 import cookielib
30 import logging
31
32 import socket
33
34
35 _moduleLogger = logging.getLogger("browser_emu")  # module-wide logger used by the classes below
36 socket.setdefaulttimeout(10)  # process-wide side effect: sockets created from now on default to a 10 second timeout
37
38
class MozillaEmulator(object):
    """Small browser emulator built on urllib2 with a persistent cookie jar.

    Every request made through download() shares self.cookies, so a session
    established by one request (e.g. a login POST) is reused by later ones.
    """

    # Sent as the User-Agent header of every request. Override it on the class
    # or on a single instance to impersonate a different browser.
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.8) Gecko/20050511 Firefox/1.0.4'

    def __init__(self, trycount = 1):
        """Create a new MozillaEmulator object.

        @param trycount: The download() method will retry the operation if it
            fails. You can specify -1 for infinite retrying. A value of 0
            means no retrying. A value of 1 means one retry. etc.
        """
        self.cookies = cookielib.LWPCookieJar()
        self.debug = False
        self.trycount = trycount

    def download(self, url,
            postdata = None, extraheaders = None, forbidRedirect = False,
            trycount = None, only_head = False,
        ):
        """Download an URL with GET or POST methods.

        @param postdata: It can be a string that will be POST-ed to the URL.
            When None is given, the method will be GET instead.
        @param extraheaders: You can add/modify HTTP headers with a dict here.
        @param forbidRedirect: Set this flag if you do not want to handle
            HTTP 301 and 302 redirects.
        @param trycount: Specify the maximum number of retries here.
            0 means no retry on error. Using -1 means infinite retrying.
            None means the default value (that is self.trycount).
        @param only_head: Return the opened response object without reading
            its body. The request is still a normal GET/POST; only the
            content download is skipped.

        @return: The raw HTML page data, or the response object when
            only_head is set.
        @raise urllib2.URLError: when the request still fails after the
            allowed number of retries.
        """
        _moduleLogger.warning("Performing download of %s", url)

        if extraheaders is None:
            extraheaders = {}
        if trycount is None:
            trycount = self.trycount
        failures = 0

        while True:
            try:
                req, opener = self._build_opener(url, postdata, extraheaders, forbidRedirect)
                response = opener.open(req)
                if self.debug:
                    _moduleLogger.info("%r - %r", req.get_method(), url)
                    _moduleLogger.info("%r - %r", response.code, response.msg)
                    _moduleLogger.info("%r", response.headers)
                self.cookies.extract_cookies(response, req)
                if only_head:
                    return response

                return self._read(response, trycount)
            except urllib2.URLError:
                # NOTE(review): urllib2.HTTPError subclasses URLError, so HTTP
                # error statuses (and the HTTPError raised for a forbidden
                # redirect) are retried as well before reaching the caller.
                failures += 1
                if -1 < trycount < failures:
                    raise

            # Retry :-)
            _moduleLogger.info("MozillaEmulator: urllib2.URLError, retryting %d", failures)

    def _build_opener(self, url, postdata = None, extraheaders = None, forbidRedirect = False):
        """Build the request and the opener that will send it.

        @param url: Address to request.
        @param postdata: Request body; None produces a GET request.
        @param extraheaders: Headers that are added to, or replace, the
            default Accept* headers.
        @param forbidRedirect: Install HTTPNoRedirector instead of the stock
            redirect handler.

        @return: A (urllib2.Request, urllib2.OpenerDirector) tuple.
        """
        txheaders = {
            'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
            'Accept-Language': 'en,en-us;q=0.5',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        }
        if extraheaders:
            txheaders.update(extraheaders)

        # The Request constructor already stores postdata, so no separate
        # add_data() call is needed.
        req = urllib2.Request(url, postdata, txheaders)
        self.cookies.add_cookie_header(req)

        if forbidRedirect:
            redirector = HTTPNoRedirector()
        else:
            redirector = urllib2.HTTPRedirectHandler()

        opener = urllib2.build_opener(
            urllib2.HTTPHandler(debuglevel=self.debug),
            urllib2.HTTPSHandler(debuglevel=self.debug),
            urllib2.HTTPCookieProcessor(self.cookies),
            redirector,
        )
        opener.addheaders = [('User-Agent', self.user_agent)]
        return (req, opener)

    def _read(self, openerdirector, trycount):
        """Read the whole response body and check it against Content-Length.

        @param openerdirector: The response object returned by the opener's
            open() call (the name is historical).
        @param trycount: Unused; kept so the signature stays unchanged.

        @return: The response body.
        @raise AssertionError: when a Content-Length header is present and
            does not match the number of bytes read.
        """
        headers = openerdirector.info()
        try:
            data = openerdirector.read()
        finally:
            # The response is never handed to the caller, so release the
            # underlying connection here.
            openerdirector.close()

        if "Content-Length" in headers:
            promised = headers["Content-Length"]
            # Raised explicitly rather than via `assert` so the check
            # survives running Python with -O.
            if len(data) != int(promised):
                raise AssertionError(
                    "The packet header promised %s of data but only was able to read %s of data" % (
                        promised,
                        len(data),
                    )
                )

        return data
155
156
class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
    """This is a custom http redirect handler that FORBIDS 301/302 redirection.

    Instead of following the redirect it raises a urllib2.HTTPError whose
    extra `newurl` attribute holds the redirect target (or None when the
    server sent neither a Location nor a URI header).

    303 and 307 responses are not overridden here and are still handled by
    urllib2.HTTPRedirectHandler.
    """

    @staticmethod
    def _redirect_target(headers):
        """Return the first Location (preferred) or URI header value, else None."""
        for name in ('location', 'uri'):
            if name in headers:
                return headers.getheaders(name)[0]
        return None

    def http_error_302(self, req, fp, code, msg, headers):
        """Refuse the redirect by raising HTTPError annotated with `newurl`.

        @raise urllib2.HTTPError: always.
        """
        e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
        e.newurl = self._redirect_target(headers)
        _moduleLogger.info("New url: %s" % e.newurl)
        raise e

    # urllib2.HTTPRedirectHandler binds http_error_301 to its *own*
    # http_error_302 function, so overriding only http_error_302 would leave
    # 301 redirects being followed. Rebind it to the override above.
    http_error_301 = http_error_302