API wrapper
authorEd Page <eopage@byu.net>
Sat, 1 May 2010 18:38:07 +0000 (13:38 -0500)
committerEd Page <eopage@byu.net>
Mon, 3 May 2010 23:57:16 +0000 (18:57 -0500)
src/backend.py [new file with mode: 0755]
src/browser_emu.py [new file with mode: 0644]

diff --git a/src/backend.py b/src/backend.py
new file mode 100755 (executable)
index 0000000..826cef2
--- /dev/null
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+
+import urllib
+from xml.etree import ElementTree
+import logging
+
+import browser_emu
+
+
+# Logger named after this module
+_moduleLogger = logging.getLogger(__name__)
+
+
+class Backend(object):
+
+       def __init__(self):
+               self._browser = browser_emu.MozillaEmulator()
+
+       def get_languages(self):
+               tree = self._get_page_with_validation(
+                       action="lds.radio.languages.query",
+               )
+               languages = tree.find("languages")
+               return self._process_list(languages, ["name"])
+
+       def get_radio_channels(self):
+               tree = self._get_page_with_validation(
+                       action="lds.radio.radiochannels.query",
+               )
+               channels = tree.find("channels")
+               return self._process_list(channels, ["description", "url", "port"])
+
+       def get_radio_channel_programming(self, chanId):
+               tree = self._get_page_with_validation(
+                       action="lds.radio.radiochannels.programming.query",
+                       channelID=chanId,
+               )
+               programs = tree.find("programs")
+               return self._process_list(programs, ["date", "time", "title", "shortdescription", "artist"])
+
+       def get_conferences(self, langId):
+               tree = self._get_page_with_validation(
+                       action="lds.radio.conferences.query",
+                       languageID=langId,
+               )
+               conferences = tree.find("conferences")
+               return self._process_list(conferences, ["title", "full_title", "month", "year"])
+
+       def get_conference_sessions(self, confId):
+               tree = self._get_page_with_validation(
+                       action="lds.radio.conferences.sessions.query",
+                       conferenceID=confId,
+               )
+               items = tree.find("sessions")
+               return self._process_list(items, ["title", "short_title", "order"])
+
+       def get_conference_talks(self, sessionId):
+               tree = self._get_page_with_validation(
+                       action="lds.radio.conferences.sessions.talks.query",
+                       sessionID=sessionId,
+               )
+               items = tree.find("talks")
+               return self._process_list(items, ["title", "order", "url", "speaker"])
+
+       def get_magazines(self, langId):
+               tree = self._get_page_with_validation(
+                       action="lds.radio.magazines.query",
+                       languageID=langId,
+               )
+               magazines = tree.find("magazines")
+               return self._process_list(magazines, ["title"])
+
+       def get_magazine_issues(self, magId):
+               tree = self._get_page_with_validation(
+                       action="lds.radio.magazines.issues.query",
+                       magazineID=magId,
+               )
+               items = tree.find("issues")
+               return self._process_list(items, ["title", "year", "month", "pictureURL"])
+
+       def get_magazine_articles(self, issueId):
+               tree = self._get_page_with_validation(
+                       action="lds.radio.magazines.issues.articles.query",
+                       issueID=issueId,
+               )
+               items = tree.find("articles")
+               return self._process_list(items, ["title", "author", "url"])
+
+       def get_scriptures(self, langId):
+               tree = self._get_page_with_validation(
+                       action="lds.radio.scriptures.query",
+                       languageID=langId,
+               )
+               scriptures = tree.find("scriptures")
+               return self._process_list(scriptures, ["title"])
+
+       def get_scripture_books(self, scriptId):
+               tree = self._get_page_with_validation(
+                       action="lds.radio.scriptures.books.query",
+                       scriptureID=scriptId,
+               )
+               items = tree.find("books")
+               return self._process_list(items, ["title"])
+
+       def get_scripture_chapters(self, bookId):
+               tree = self._get_page_with_validation(
+                       action="lds.radio.scriptures.books.chapters.query",
+                       bookID=bookId,
+               )
+               items = tree.find("chapters")
+               return self._process_list(items, ["title", "url"])
+
+       def _get_page_with_validation(self, **params):
+               encodedParams = urllib.urlencode(params)
+               page = self._browser.download("http://tech.lds.org/radio?%s" % encodedParams)
+               if not page:
+                       raise RuntimeError("Blank page")
+               tree = ElementTree.fromstring(page)
+
+               if tree.tag == "apiresults":
+                       desc = tree.find("ErrorDescription")
+                       raise RuntimeError(desc.text)
+               else:
+                       results = tree.find("apiresults")
+                       if not results.attrib["success"]:
+                               raise RuntimeError("Could not determine radio languages")
+
+               return tree
+
+       def _process_list(self, tree, elements):
+               for item in tree.getchildren():
+                       data = {"id": item.attrib["ID"]}
+                       for element in elements:
+                               data[element] = item.find(element).text
+                       yield data
+
+
+if __name__ == "__main__":
+       b = Backend()
+
+       print list(b.get_languages())
+
+       if False:
+               channels = list(b.get_radio_channels())
+               print channels
+               for chanData in channels:
+                       programs = list(b.get_radio_channel_programming(chanData["id"]))
+                       print programs
+
+       if False:
+               confs = list(b.get_conferences(1))
+               print confs
+               for confData in confs:
+                       sessions = list(b.get_conference_sessions(confData["id"]))
+                       for sessionData in sessions:
+                               talks = list(b.get_conference_talks(sessionData["id"]))
+                               print talks
+
+       if False:
+               mags = list(b.get_magazines(1))
+               print mags
+               for magData in mags:
+                       issues = list(b.get_magazine_issues(magData["id"]))
+                       issues
+                       for issueData in issues:
+                               articles = list(b.get_magazine_articles(issueData["id"]))
+                               print articles
+
+       if False:
+               mags = list(b.get_scriptures(1))
+               print mags
+               for magData in mags:
+                       books = list(b.get_scripture_books(magData["id"]))
+                       print books
+                       for bookData in books:
+                               chapters = list(b.get_scripture_chapters(bookData["id"]))
+                               print chapters
diff --git a/src/browser_emu.py b/src/browser_emu.py
new file mode 100644 (file)
index 0000000..12d5e61
--- /dev/null
@@ -0,0 +1,199 @@
+"""
+@author:         Laszlo Nagy
+@copyright:   (c) 2005 by Szoftver Messias Bt.
+@licence:       BSD style
+
+Objects of the MozillaEmulator class can emulate a browser that is capable of:
+
+       - cookie management
+       - configurable user agent string
+       - GET and POST
+       - multipart POST (send files)
+       - receive content into file
+
+I have seen many requests on the Python mailing list about how to emulate a browser. I have been using this class for years now without any problems. This is how you can use it:
+
+       1. Use firefox
+       2. Install and open the livehttpheaders plugin
+       3. Use the website manually with firefox
+       4. Check the GET and POST requests in the livehttpheaders capture window
+       5. Create an instance of the MozillaEmulator class and send the same GET and POST requests to the server.
+
+Optional steps:
+
+       - You can change the user agent string in the _build_opener method
+       - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
+"""
+
+import urllib2
+import cookielib
+import logging
+
+import socket
+
+
+_moduleLogger = logging.getLogger("browser_emu")
+# Import-time side effect: sets the default timeout (20 seconds) for every
+# socket created afterwards, not only the ones opened by this module
+socket.setdefaulttimeout(20)
+
+
+class MozillaEmulator(object):
+
+       def __init__(self, trycount = 1):
+               """Create a new MozillaEmulator object.
+
+               @param trycount: The download() method will retry the operation if it
+               fails. You can specify -1 for infinite retrying.  A value of 0 means no
+               retrying. A value of 1 means one retry. etc."""
+               self.debug = False
+               self.trycount = trycount
+               self._cookies = cookielib.LWPCookieJar()
+               self._loadedFromCookies = False
+
+       def load_cookies(self, path):
+               assert not self._loadedFromCookies, "Load cookies only once"
+               if path is None:
+                       return
+
+               self._cookies.filename = path
+               try:
+                       self._cookies.load()
+               except cookielib.LoadError:
+                       _moduleLogger.exception("Bad cookie file")
+               except IOError:
+                       _moduleLogger.exception("No cookie file")
+               except Exception, e:
+                       _moduleLogger.exception("Unknown error with cookies")
+               else:
+                       self._loadedFromCookies = True
+
+               return self._loadedFromCookies
+
+       def save_cookies(self):
+               if self._loadedFromCookies:
+                       self._cookies.save()
+
+       def clear_cookies(self):
+               if self._loadedFromCookies:
+                       self._cookies.clear()
+
+       def download(self, url,
+                       postdata = None, extraheaders = None, forbidRedirect = False,
+                       trycount = None, only_head = False,
+               ):
+               """Download an URL with GET or POST methods.
+
+               @param postdata: It can be a string that will be POST-ed to the URL.
+                       When None is given, the method will be GET instead.
+               @param extraheaders: You can add/modify HTTP headers with a dict here.
+               @param forbidRedirect: Set this flag if you do not want to handle
+                       HTTP 301 and 302 redirects.
+               @param trycount: Specify the maximum number of retries here.
+                       0 means no retry on error. Using -1 means infinite retring.
+                       None means the default value (that is self.trycount).
+               @param only_head: Create the openerdirector and return it. In other
+                       words, this will not retrieve any content except HTTP headers.
+
+               @return: The raw HTML page data
+               """
+               _moduleLogger.debug("Performing download of %s" % url)
+
+               if extraheaders is None:
+                       extraheaders = {}
+               if trycount is None:
+                       trycount = self.trycount
+               cnt = 0
+
+               while True:
+                       try:
+                               req, u = self._build_opener(url, postdata, extraheaders, forbidRedirect)
+                               openerdirector = u.open(req)
+                               if self.debug:
+                                       _moduleLogger.info("%r - %r" % (req.get_method(), url))
+                                       _moduleLogger.info("%r - %r" % (openerdirector.code, openerdirector.msg))
+                                       _moduleLogger.info("%r" % (openerdirector.headers))
+                               self._cookies.extract_cookies(openerdirector, req)
+                               if only_head:
+                                       return openerdirector
+
+                               return self._read(openerdirector, trycount)
+                       except urllib2.URLError, e:
+                               _moduleLogger.debug("%s: %s" % (e, url))
+                               cnt += 1
+                               if (-1 < trycount) and (trycount < cnt):
+                                       raise
+
+                       # Retry :-)
+                       _moduleLogger.debug("MozillaEmulator: urllib2.URLError, retrying %d" % cnt)
+
+       def _build_opener(self, url, postdata = None, extraheaders = None, forbidRedirect = False):
+               if extraheaders is None:
+                       extraheaders = {}
+
+               txheaders = {
+                       'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
+                       'Accept-Language': 'en,en-us;q=0.5',
+                       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
+               }
+               for key, value in extraheaders.iteritems():
+                       txheaders[key] = value
+               req = urllib2.Request(url, postdata, txheaders)
+               self._cookies.add_cookie_header(req)
+               if forbidRedirect:
+                       redirector = HTTPNoRedirector()
+                       #_moduleLogger.info("Redirection disabled")
+               else:
+                       redirector = urllib2.HTTPRedirectHandler()
+                       #_moduleLogger.info("Redirection enabled")
+
+               http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
+               https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)
+
+               u = urllib2.build_opener(
+                       http_handler,
+                       https_handler,
+                       urllib2.HTTPCookieProcessor(self._cookies),
+                       redirector
+               )
+               u.addheaders = [(
+                       'User-Agent',
+                       'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.4) Gecko/20091016 Firefox/3.5.4 (.NET CLR 3.5.30729)'
+               )]
+               if not postdata is None:
+                       req.add_data(postdata)
+               return (req, u)
+
+       def _read(self, openerdirector, trycount):
+               chunks = []
+
+               chunk = openerdirector.read()
+               chunks.append(chunk)
+               #while chunk and cnt < trycount:
+               #       time.sleep(1)
+               #       cnt += 1
+               #       chunk = openerdirector.read()
+               #       chunks.append(chunk)
+
+               data = "".join(chunks)
+
+               if "Content-Length" in openerdirector.info():
+                       assert len(data) == int(openerdirector.info()["Content-Length"]), "The packet header promised %s of data but only was able to read %s of data" % (
+                               openerdirector.info()["Content-Length"],
+                               len(data),
+                       )
+
+               return data
+
+
+class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
+       """This is a custom http redirect handler that FORBIDS redirection."""
+
+       def http_error_302(self, req, fp, code, msg, headers):
+               e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
+               if e.code in (301, 302):
+                       if 'location' in headers:
+                               newurl = headers.getheaders('location')[0]
+                       elif 'uri' in headers:
+                               newurl = headers.getheaders('uri')[0]
+                       e.newurl = newurl
+               _moduleLogger.info("New url: %s" % e.newurl)
+               raise e