X-Git-Url: http://git.maemo.org/git/?a=blobdiff_plain;f=src%2Frss.py;h=a105f3df80d9a5f781449676079b056694dad09b;hb=c9baa9f145dc2c041befe1b88e9a35d5075e4a3f;hp=a680ddc92ac64b6c8906ee0e36e2856a81e6bd08;hpb=73fa888eb92a3e17521b18b21f27696601005d3d;p=feedingit diff --git a/src/rss.py b/src/rss.py index a680ddc..a105f3d 100644 --- a/src/rss.py +++ b/src/rss.py @@ -23,80 +23,45 @@ # Description : Simple RSS Reader # ============================================================================ -from os.path import isfile -from os.path import isdir +from os.path import isfile, isdir from shutil import rmtree -from os import mkdir, remove +from os import mkdir, remove, utime import pickle import md5 import feedparser import time import urllib2 from BeautifulSoup import BeautifulSoup -from urlparse import urlparse +from urlparse import urljoin #CONFIGDIR="/home/user/.feedingit/" def getId(string): return md5.new(string).hexdigest() -def getProxy(): - import gconf - if gconf.client_get_default().get_bool('/system/http_proxy/use_http_proxy'): - port = gconf.client_get_default().get_int('/system/http_proxy/port') - http = gconf.client_get_default().get_string('/system/http_proxy/host') - proxy = proxy = urllib2.ProxyHandler( {"http":"http://%s:%s/"% (http,port)} ) - return (True, proxy) - return (False, None) +#def getProxy(): +# import gconf +# if gconf.client_get_default().get_bool('/system/http_proxy/use_http_proxy'): +# port = gconf.client_get_default().get_int('/system/http_proxy/port') +# http = gconf.client_get_default().get_string('/system/http_proxy/host') +# proxy = proxy = urllib2.ProxyHandler( {"http":"http://%s:%s/"% (http,port)} ) +# return (True, proxy) +# return (False, None) # Enable proxy support for images and ArchivedArticles -(proxy_support, proxy) = getProxy() -if proxy_support: - opener = urllib2.build_opener(proxy) - urllib2.install_opener(opener) +#(proxy_support, proxy) = getProxy() +#if proxy_support: +# opener = urllib2.build_opener(proxy) +# urllib2.install_opener(opener) # Entry = {"title":XXX, "content":XXX, "date":XXX, "link":XXX, images = [] } class ImageHandler: def __init__(self, configdir): - self.configdir = configdir - self.images = {} - - def addImage(self, key, baseurl, url): - filename = self.configdir+key+".d/"+getId(url) - if not isfile(filename): - try: - if url.startswith("http"): - f = urllib2.urlopen(url) - else: - f = urllib2.urlopen(baseurl+"/"+url) - outf = open(filename, "w") - outf.write(f.read()) - f.close() - outf.close() - except: - print "Could not download" + url - if filename in self.images: - self.images[filename] += 1 - else: - self.images[filename] = 1 - return filename - - def removeImage(self, key, filename): - #filename = self.configdir+key+".d/"+getId(url) - try: - self.images[filename] -= 1 - except: - self.images[filename] = 0 #Delete image - try: - if self.images[filename] == 0: - remove(filename) #os.remove - del self.images[filename] - except: - print "Could not remove image %s" % filename + pass class Feed: - def __init__(self, uniqueId, name, url, imageHandler): + def __init__(self, uniqueId, name, url): self.titles = [] self.entries = {} self.ids = [] @@ -106,7 +71,29 @@ class Feed: self.countUnread = 0 self.updateTime = "Never" self.uniqueId = uniqueId - self.imageHandler = imageHandler + self.etag = None + self.modified = None + + def addImage(self, configdir, key, baseurl, url): + filename = configdir+key+".d/"+getId(url) + if not isfile(filename): + try: + #if url.startswith("http"): + # f = urllib2.urlopen(url) + #else: + f = 
urllib2.urlopen(urljoin(baseurl,url)) + outf = open(filename, "w") + outf.write(f.read()) + f.close() + outf.close() + except: + print "Could not download " + url + else: + #open(filename,"a").close() # "Touch" the file + file = open(filename,"a") + utime(filename, None) + file.close() + return filename def editFeed(self, url): self.url = url @@ -142,21 +129,55 @@ class Feed: def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False): # Expiry time is in hours if proxy == None: - tmp=feedparser.parse(self.url) + tmp=feedparser.parse(self.url, etag = self.etag, modified = self.modified) else: - tmp=feedparser.parse(self.url, handlers = [proxy]) + tmp=feedparser.parse(self.url, etag = self.etag, modified = self.modified, handlers = [proxy]) expiry = float(expiryTime) * 3600. + # Check if the parse was succesful (number of entries > 0, else do nothing) if len(tmp["entries"])>0: - #reversedEntries = self.getEntries() - #reversedEntries.reverse() + # The etag and modified value should only be updated if the content was not null + try: + self.etag = tmp["etag"] + except KeyError: + self.etag = None + try: + self.modified = tmp["modified"] + except KeyError: + self.modified = None + #if len(tmp["entries"])>0: if not isdir(configdir+self.uniqueId+".d"): mkdir(configdir+self.uniqueId+".d") + try: + f = urllib2.urlopen(urljoin(tmp["feed"]["link"],"/favicon.ico")) + data = f.read() + f.close() + outf = open(configdir+self.uniqueId+".d/favicon.ico", "w") + outf.write(data) + outf.close() + del data + except: + #import traceback + #traceback.print_exc() + pass + + + #reversedEntries = self.getEntries() + #reversedEntries.reverse() + currentTime = time.time() tmpEntries = {} tmpIds = [] for entry in tmp["entries"]: (dateTuple, date) = self.extractDate(entry) + try: + entry["title"] + except: + entry["title"] = "No Title" + try: + entry["link"] + except: + entry["link"] = "" tmpEntry = {"title":entry["title"], "content":self.extractContent(entry), "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[] } id = self.generateUniqueId(tmpEntry) @@ -165,15 +186,15 @@ class Feed: if not id in self.ids: soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"]) images = soup('img') - baseurl = ''.join(urlparse(tmpEntry["link"])[:-1]) + baseurl = tmpEntry["link"] if imageCache: for img in images: try: - filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src']) + filename = self.addImage(configdir, self.uniqueId, baseurl, img['src']) img['src']=filename tmpEntry["images"].append(filename) except: - print "Error downloading image %s" %img + print "Error downloading image %s" % img tmpEntry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html" file = open(tmpEntry["contentLink"], "w") file.write(soup.prettify()) @@ -183,8 +204,19 @@ class Feed: if id not in self.readItems: self.readItems[id] = False else: - tmpEntries[id] = self.entries[id] - tmpIds.append(id) + try: + filename = configdir+self.uniqueId+".d/"+id+".html" + file = open(filename,"a") + utime(filename, None) + file.close() + for image in self.entries[id]["images"]: + file = open(image,"a") + utime(image, None) + file.close() + except: + pass + tmpEntries[id] = self.entries[id] + tmpIds.append(id) oldIds = self.ids[:] for entryId in oldIds: @@ -215,10 +247,39 @@ class Feed: self.readItems[id] = False if self.readItems[id]==False: tmpUnread = tmpUnread + 1 + keys = self.readItems.keys() + for id in keys: + if not id in self.ids: + del self.readItems[id] del tmp self.countUnread = tmpUnread 
self.updateTime = time.asctime() self.saveFeed(configdir) + from glob import glob + from os import stat + for file in glob(configdir+self.uniqueId+".d/*"): + # + stats = stat(file) + # + # put the two dates into matching format + # + lastmodDate = stats[8] + # + expDate = time.time()-expiry*3 + # check if image-last-modified-date is outdated + # + if expDate > lastmodDate: + # + try: + # + #print 'Removing', file + # + remove(file) # commented out for testing + # + except OSError: + # + print 'Could not remove', file + def extractContent(self, entry): content = "" @@ -255,7 +316,9 @@ class Feed: self.readItems[id] = False def isEntryRead(self, id): - return self.readItems[id] + # Check if an entry is read; return False if the read + # status of an entry is unknown (id not in readItems) + return self.readItems.get(id, False) def getTitle(self, id): return self.entries[id]["title"] @@ -319,9 +382,6 @@ class Feed: #try: if self.entries.has_key(id): entry = self.entries[id] - if entry.has_key("images"): - for img in entry["images"]: - self.imageHandler.removeImage(self.uniqueId, img) if entry.has_key("contentLink"): try: @@ -397,26 +457,10 @@ class ArchivedArticles(Feed): f.close() soup = BeautifulSoup(html) images = soup('img') - baseurl = ''.join(urlparse(entry["link"])[:-1]) + baseurl = entry["link"] for img in images: - filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src']) - #filename = configdir+self.uniqueId+".d/"+getId(img['src']) - #if not isfile(filename): - # try: - # if img['src'].startswith("http"): - # f = urllib2.urlopen(img['src']) - # else: - # f = urllib2.urlopen(baseurl+"/"+img['src']) - # #print baseurl+"/"+img['src'] - # print filename - # outf = open(filename, "w") - # outf.write(f.read()) - # f.close() - # outf.close() - # except: - # print "Could not download" + img['src'] + filename = self.addImage(configdir, self.uniqueId, baseurl, img['src']) img['src']=filename - entry["images"].append(filename) entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html" file = open(entry["contentLink"], "w") file.write(soup.prettify()) @@ -427,16 +471,26 @@ class ArchivedArticles(Feed): self.setEntryUnread(id) #except: # pass - currentTime = time.time() - expiry = float(expiryTime) * 3600 - if currentTime - entry["time"] > expiry: - if self.isEntryRead(id): - self.removeEntry(id) - else: - if currentTime - entry["time"] > 2*expiry: - self.removeEntry(id) + #currentTime = time.time() + #expiry = float(expiryTime) * 3600 + #if currentTime - entry["time"] > expiry: + # if self.isEntryRead(id): + # self.removeEntry(id) + # else: + # if currentTime - entry["time"] > 2*expiry: + # self.removeEntry(id) self.updateTime = time.asctime() self.saveFeed(configdir) + + def purgeReadArticles(self): + ids = self.getIds() + for id in ids: + entry = self.entries[id] + if self.isEntryRead(id): + self.removeEntry(id) + + def removeArticle(self, id): + self.removeEntry(id) def getArticle(self, index): self.setEntryRead(index) @@ -454,13 +508,7 @@ class Listing: self.listOfFeeds = pickle.load(file) file.close() else: - self.listOfFeeds = {getId("Slashdot"):{"title":"Slashdot", "url":"http://rss.slashdot.org/Slashdot/slashdot", "unread":0, "updateTime":"Never"}, } - if isfile(self.configdir+"images.pickle"): - file = open(self.configdir+"images.pickle") - self.imageHandler = pickle.load(file) - file.close() - else: - self.imageHandler = ImageHandler(self.configdir) + self.listOfFeeds = {getId("Maemo News"):{"title":"Maemo News", "url":"http://maemo.org/news/items.xml", "unread":0, 
"updateTime":"Never"}, } if self.listOfFeeds.has_key("font"): del self.listOfFeeds["font"] if self.listOfFeeds.has_key("feedingit-order"): @@ -471,7 +519,7 @@ class Listing: self.sortedKeys.remove("font") self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj)) list = self.sortedKeys[:] - self.closeCurrentlyDisplayedFeed() + #self.closeCurrentlyDisplayedFeed() def addArchivedArticle(self, key, index): feed = self.getFeed(key) @@ -494,19 +542,29 @@ class Listing: file.close() try: feed.uniqueId - feed.imageHandler except AttributeError: feed.uniqueId = getId(feed.name) - feed.imageHandler = self.imageHandler + try: + del feed.imageHandler + except: + pass + try: + feed.etag + except AttributeError: + feed.etag = None + try: + feed.modified + except AttributeError: + feed.modified = None #feed.reloadUnread(self.configdir) else: #print key title = self.listOfFeeds[key]["title"] url = self.listOfFeeds[key]["url"] if key == "ArchivedArticles": - feed = ArchivedArticles("ArchivedArticles", title, url, self.imageHandler) + feed = ArchivedArticles("ArchivedArticles", title, url) else: - feed = Feed(getId(title), title, url, self.imageHandler) + feed = Feed(getId(title), title, url) return feed def updateFeeds(self, expiryTime=24, proxy=None, imageCache=False): @@ -529,8 +587,23 @@ class Listing: feed.editFeed(url) def getFeed(self, key): - feed = self.loadFeed(key) - feed.reloadUnread(self.configdir) + try: + feed = self.loadFeed(key) + feed.reloadUnread(self.configdir) + except: + # If the feed file gets corrupted, we need to reset the feed. + import traceback + traceback.print_exc() + import dbus + bus = dbus.SessionBus() + remote_object = bus.get_object("org.freedesktop.Notifications", # Connection name + "/org/freedesktop/Notifications" # Object's path + ) + iface = dbus.Interface(remote_object, 'org.freedesktop.Notifications') + iface.SystemNoteInfoprint("Error opening feed %s, it has been reset." % self.getFeedTitle(key)) + if isdir(self.configdir+key+".d/"): + rmtree(self.configdir+key+".d/") + feed = self.loadFeed(key) return feed def getFeedUpdateTime(self, key): @@ -556,6 +629,13 @@ class Listing: def getListOfFeeds(self): return self.sortedKeys + def getFavicon(self, key): + filename = self.configdir+key+".d/favicon.ico" + if isfile(filename): + return filename + else: + return False + def addFeed(self, title, url): if not self.listOfFeeds.has_key(getId(title)): self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"} @@ -579,9 +659,6 @@ class Listing: file = open(self.configdir+"feeds.pickle", "w") pickle.dump(self.listOfFeeds, file) file.close() - file = open(self.configdir+"images.pickle", "w") - pickle.dump(self.imageHandler, file) - file.close() def moveUp(self, key): index = self.sortedKeys.index(key) @@ -593,13 +670,6 @@ class Listing: index2 = (index+1)%len(self.sortedKeys) self.sortedKeys[index] = self.sortedKeys[index2] self.sortedKeys[index2] = key - - def setCurrentlyDisplayedFeed(self, key): - self.currentlyDisplayedFeed = key - def closeCurrentlyDisplayedFeed(self): - self.currentlyDisplayedFeed = False - def getCurrentlyDisplayedFeed(self): - return self.currentlyDisplayedFeed if __name__ == "__main__": listing = Listing('/home/user/.feedingit/') @@ -607,4 +677,4 @@ if __name__ == "__main__": #list.reverse() for key in list: if key.startswith('d8'): - print listing.getFeedUpdateTime(key) \ No newline at end of file + print listing.getFeedUpdateTime(key)