Fixed etag being reset to None when a feed did not contain updates, plus parsing for gzipped...
[feedingit] src/rss.py
index 59fa485..a105f3d 100644
@@ -25,7 +25,7 @@
 
 from os.path import isfile, isdir
 from shutil import rmtree
-from os import mkdir, remove
+from os import mkdir, remove, utime
 import pickle
 import md5
 import feedparser
@@ -58,11 +58,24 @@ def getId(string):
 
 class ImageHandler:
     def __init__(self, configdir):
-        self.configdir = configdir
-        self.images = {}
-        
-    def addImage(self, key, baseurl, url):
-        filename = self.configdir+key+".d/"+getId(url)
+        pass
+
+class Feed:
+    def __init__(self, uniqueId, name, url):
+        self.titles = []
+        self.entries = {}
+        self.ids = []
+        self.readItems = {}
+        self.name = name
+        self.url = url
+        self.countUnread = 0
+        self.updateTime = "Never"
+        self.uniqueId = uniqueId
+        self.etag = None
+        self.modified = None
+
+    def addImage(self, configdir, key, baseurl, url):
+        filename = configdir+key+".d/"+getId(url)
         if not isfile(filename):
             try:
                 #if url.startswith("http"):
@@ -76,38 +89,11 @@ class ImageHandler:
             except:
                 print "Could not download " + url
         else:
-            open(filename,"a").close()  # "Touch" the file
-        if filename in self.images:
-            self.images[filename] += 1
-        else:
-            self.images[filename] = 1
+            # "Touch" the file: refresh its timestamps so the
+            # expiry sweep in updateFeed keeps it
+            file = open(filename,"a")
+            utime(filename, None)
+            file.close()
         return filename
-        
-    def removeImage(self, key, filename):
-        #filename = self.configdir+key+".d/"+getId(url)
-        try:
-            self.images[filename] -= 1
-        except:
-            self.images[filename] = 0 #Delete image
-        try:
-            if self.images[filename] == 0:
-                remove(filename) #os.remove
-                del self.images[filename]
-        except:
-            print "Could not remove image %s" % filename
-
-class Feed:
-    def __init__(self, uniqueId, name, url, imageHandler):
-        self.titles = []
-        self.entries = {}
-        self.ids = []
-        self.readItems = {}
-        self.name = name
-        self.url = url
-        self.countUnread = 0
-        self.updateTime = "Never"
-        self.uniqueId = uniqueId
-        self.imageHandler = imageHandler
 
     def editFeed(self, url):
         self.url = url
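The "touch" rewrite above replaces the old one-liner because opening a file in append mode only creates it if missing; it does not reliably refresh the modification time that the expiry sweep later checks. os.utime(path, None) resets both atime and mtime to the current time. A minimal sketch of the idiom in isolation (the helper name is illustrative, not part of the module):

from os import utime

def touch(path):
    # Create the file if it is missing, then reset its
    # access/modification times to "now" so age-based
    # expiry treats it as freshly used.
    open(path, "a").close()
    utime(path, None)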
@@ -143,16 +129,42 @@ class Feed:
     def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
         # Expiry time is in hours
         if proxy == None:
-            tmp=feedparser.parse(self.url)
+            tmp=feedparser.parse(self.url, etag = self.etag, modified = self.modified)
         else:
-            tmp=feedparser.parse(self.url, handlers = [proxy])
+            tmp=feedparser.parse(self.url, etag = self.etag, modified = self.modified, handlers = [proxy])
         expiry = float(expiryTime) * 3600.
+
         # Check if the parse was successful (number of entries > 0, else do nothing)
         if len(tmp["entries"])>0:
-           #reversedEntries = self.getEntries()
-           #reversedEntries.reverse()
+           # Only update the etag and modified values when the feed returned content
+           try:
+               self.etag = tmp["etag"]
+           except KeyError:
+               self.etag = None
+           try:
+               self.modified = tmp["modified"]
+           except KeyError:
+               self.modified = None
            if not isdir(configdir+self.uniqueId+".d"):
                mkdir(configdir+self.uniqueId+".d")
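+           # Cache the site's favicon next to the feed's entries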
+           try:
+               f = urllib2.urlopen(urljoin(tmp["feed"]["link"],"/favicon.ico"))
+               data = f.read()
+               f.close()
+               outf = open(configdir+self.uniqueId+".d/favicon.ico", "w")
+               outf.write(data)
+               outf.close()
+               del data
+           except:
+               #import traceback
+               #traceback.print_exc()
+               pass
+
+           #reversedEntries = self.getEntries()
+           #reversedEntries.reverse()
+
            currentTime = time.time()
            tmpEntries = {}
            tmpIds = []
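Passing etag and modified to feedparser.parse turns the fetch into a conditional GET: the cached ETag goes out as If-None-Match and the modified value as If-Modified-Since, so an unchanged feed answers 304 Not Modified and feedparser hands back zero entries, which the len(tmp["entries"])>0 guard then skips without clobbering the stored validators. A minimal sketch of the round trip, assuming a server that honors conditional requests (the URL is just the new default feed):

import feedparser

d = feedparser.parse("http://maemo.org/news/items.xml")
etag = d.get("etag")          # absent if the server sent no ETag
modified = d.get("modified")  # absent if no Last-Modified header

# Poll again with the cached validators.
d = feedparser.parse("http://maemo.org/news/items.xml",
                     etag=etag, modified=modified)
if d.get("status") == 304:
    print "Feed unchanged, nothing to parse"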
@@ -178,7 +190,7 @@ class Feed:
                    if imageCache:
                       for img in images:
                           try:
-                            filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
+                            filename = self.addImage(configdir, self.uniqueId, baseurl, img['src'])
                             img['src']=filename
                             tmpEntry["images"].append(filename)
                           except:
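The hunk above is part of the image-caching path: BeautifulSoup finds every <img> in the entry, addImage downloads it into the feed's .d directory under a hashed name, and the tag's src is rewritten to the local copy. A standalone sketch of that pattern, with md5 naming standing in for getId (the function name and paths are illustrative):

import md5, urllib2
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup

def cache_images(html, baseurl, cachedir):
    # Rewrite every <img> to point at a locally cached copy,
    # keyed by a hash of its absolute URL.
    soup = BeautifulSoup(html)
    for img in soup('img'):
        url = urljoin(baseurl, img['src'])
        filename = cachedir + md5.new(url).hexdigest()
        data = urllib2.urlopen(url).read()
        open(filename, "wb").write(data)
        img['src'] = filename
    return soup.prettify()

As the hunk shows, updateFeed wraps each download in its own try/except so one broken image cannot abort the whole update.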
@@ -192,8 +204,19 @@ class Feed:
                    if id not in self.readItems:
                        self.readItems[id] = False
                else:
-                    tmpEntries[id] = self.entries[id]
-                    tmpIds.append(id)
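+                   # Entry unchanged: refresh timestamps on its cached
+                   # HTML and images so the expiry sweep keeps them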
+                   try:
+                       filename = configdir+self.uniqueId+".d/"+id+".html"
+                       file = open(filename,"a")
+                       utime(filename, None)
+                       file.close()
+                       for image in self.entries[id]["images"]:
+                            file = open(image,"a")
+                            utime(image, None)
+                            file.close()
+                   except:
+                       pass
+                   tmpEntries[id] = self.entries[id]
+                   tmpIds.append(id)
             
            oldIds = self.ids[:]
            for entryId in oldIds:
@@ -232,6 +255,31 @@ class Feed:
            self.countUnread = tmpUnread
            self.updateTime = time.asctime()
            self.saveFeed(configdir)
+           from glob import glob
+           from os import stat
+           for file in glob(configdir+self.uniqueId+".d/*"):
+                stats = stat(file)
+                # stats[8] is st_mtime, the file's last-modification time
+                lastmodDate = stats[8]
+                expDate = time.time()-expiry*3
+                # Delete cached files that have not been touched within
+                # three expiry periods
+                if expDate > lastmodDate:
+                    try:
+                        remove(file)
+                    except OSError:
+                        print 'Could not remove', file
 
     def extractContent(self, entry):
         content = ""
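This cleanup pass sweeps the feed's cache directory after every successful update: stat index 8 is st_mtime, and anything whose modification time predates three expiry periods, i.e. that neither the update nor the "touch" calls refreshed, gets deleted. The same sweep as a standalone function (name and arguments are illustrative):

import time
from glob import glob
from os import remove, stat

def expire_cache(cachedir, expiry_seconds):
    cutoff = time.time() - expiry_seconds * 3
    for path in glob(cachedir + "/*"):
        # st_mtime was refreshed by the "touch" calls for every
        # entry and image that is still referenced.
        if stat(path).st_mtime < cutoff:
            try:
                remove(path)
            except OSError:
                print 'Could not remove', path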
@@ -268,7 +316,9 @@ class Feed:
             self.readItems[id] = False
     
     def isEntryRead(self, id):
-        return self.readItems[id]
+        # Check if an entry is read; return False if the read
+        # status of an entry is unknown (id not in readItems)
+        return self.readItems.get(id, False)
     
     def getTitle(self, id):
         return self.entries[id]["title"]
@@ -332,9 +382,6 @@ class Feed:
         #try:
         if self.entries.has_key(id):
             entry = self.entries[id]
-            if entry.has_key("images"):
-                for img in entry["images"]:
-                    self.imageHandler.removeImage(self.uniqueId, img)
             
             if entry.has_key("contentLink"):
                 try:
@@ -412,9 +459,8 @@ class ArchivedArticles(Feed):
                     images = soup('img')
                     baseurl = entry["link"]
                     for img in images:
-                        filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
+                        filename = self.addImage(configdir, self.uniqueId, baseurl, img['src'])
                         img['src']=filename
-                        entry["images"].append(filename)
                     entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                     file = open(entry["contentLink"], "w")
                     file.write(soup.prettify())
@@ -462,13 +508,7 @@ class Listing:
             self.listOfFeeds = pickle.load(file)
             file.close()
         else:
-            self.listOfFeeds = {getId("Slashdot"):{"title":"Slashdot", "url":"http://rss.slashdot.org/Slashdot/slashdot", "unread":0, "updateTime":"Never"}, }
-        try:
-            file = open(self.configdir+"images.pickle")
-            self.imageHandler = pickle.load(file)
-            file.close()
-        except:
-            self.imageHandler = ImageHandler(self.configdir)
+            self.listOfFeeds = {getId("Maemo News"):{"title":"Maemo News", "url":"http://maemo.org/news/items.xml", "unread":0, "updateTime":"Never"}, }
         if self.listOfFeeds.has_key("font"):
             del self.listOfFeeds["font"]
         if self.listOfFeeds.has_key("feedingit-order"):
@@ -502,19 +542,29 @@ class Listing:
                 file.close()
                 try:
                     feed.uniqueId
-                    feed.imageHandler
                 except AttributeError:
                     feed.uniqueId = getId(feed.name)
-                    feed.imageHandler = self.imageHandler
+                try:
+                    del feed.imageHandler
+                except:
+                    pass
+                try:
+                    feed.etag
+                except AttributeError:
+                    feed.etag = None
+                try:
+                    feed.modified
+                except AttributeError:
+                    feed.modified = None
                 #feed.reloadUnread(self.configdir)
             else:
                 #print key
                 title = self.listOfFeeds[key]["title"]
                 url = self.listOfFeeds[key]["url"]
                 if key == "ArchivedArticles":
-                    feed = ArchivedArticles("ArchivedArticles", title, url, self.imageHandler)
+                    feed = ArchivedArticles("ArchivedArticles", title, url)
                 else:
-                    feed = Feed(getId(title), title, url, self.imageHandler)
+                    feed = Feed(getId(title), title, url)
             return feed
         
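Feeds are persisted with pickle, so instances written by an older version come back without the new etag and modified attributes; the try/except AttributeError blocks above backfill them once after loading, and the del drops the now-unused imageHandler reference. The same migration in a compact, equivalent form (the helper is a sketch, not part of the module):

import pickle

def load_feed(path):
    f = open(path)
    feed = pickle.load(f)
    f.close()
    # Backfill attributes introduced after this pickle was written.
    for attr in ("etag", "modified"):
        if not hasattr(feed, attr):
            setattr(feed, attr, None)
    # Drop state the new code no longer keeps on the feed.
    if hasattr(feed, "imageHandler"):
        del feed.imageHandler
    return feed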
     def updateFeeds(self, expiryTime=24, proxy=None, imageCache=False):
@@ -542,6 +592,8 @@ class Listing:
             feed.reloadUnread(self.configdir)
         except:
             # If the feed file gets corrupted, we need to reset the feed.
+            import traceback
+            traceback.print_exc()
             import dbus
             bus = dbus.SessionBus()
             remote_object = bus.get_object("org.freedesktop.Notifications", # Connection name
@@ -577,6 +629,13 @@ class Listing:
     def getListOfFeeds(self):
         return self.sortedKeys
     
+    def getFavicon(self, key):
+        filename = self.configdir+key+".d/favicon.ico"
+        if isfile(filename):
+            return filename
+        else:
+            return False
+    
     def addFeed(self, title, url):
         if not self.listOfFeeds.has_key(getId(title)):
             self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"}
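getFavicon only reports what updateFeed already cached: the icon is fetched once per update by resolving the absolute path "/favicon.ico" against the feed's link, so it targets the site root, and is stored as favicon.ico in the feed's .d directory. The download half in isolation (the function name and the None return are illustrative; the Listing method itself returns False when no icon is cached):

import urllib2
from urlparse import urljoin

def fetch_favicon(feed_link, feed_dir):
    try:
        f = urllib2.urlopen(urljoin(feed_link, "/favicon.ico"))
        data = f.read()
        f.close()
    except:
        return None   # no icon, or the fetch failed
    path = feed_dir + "/favicon.ico"
    open(path, "wb").write(data)
    return path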
@@ -600,9 +659,6 @@ class Listing:
         file = open(self.configdir+"feeds.pickle", "w")
         pickle.dump(self.listOfFeeds, file)
         file.close()
-        file = open(self.configdir+"images.pickle", "w")
-        pickle.dump(self.imageHandler, file)
-        file.close()
         
     def moveUp(self, key):
         index = self.sortedKeys.index(key)