0.6.1-7, fix for broken feeds
[feedingit] / src / rss.py
index 4dc3b78..59fa485 100644 (file)
 # ============================================================================
 # Name        : FeedingIt.py
 # Author      : Yves Marcoz
-# Version     : 0.5.0
+# Version     : 0.5.4
 # Description : Simple RSS Reader
 # ============================================================================
 
-from os.path import isfile
-from os.path import isdir
+from os.path import isfile, isdir
 from shutil import rmtree
-from os import mkdir
+from os import mkdir, remove
 import pickle
 import md5
 import feedparser
 import time
 import urllib2
 from BeautifulSoup import BeautifulSoup
-from urlparse import urlparse
+from urlparse import urljoin
 
 #CONFIGDIR="/home/user/.feedingit/"
 
 def getId(string):
     return md5.new(string).hexdigest()
 
+#def getProxy():
+#    import gconf
+#    if gconf.client_get_default().get_bool('/system/http_proxy/use_http_proxy'):
+#        port = gconf.client_get_default().get_int('/system/http_proxy/port')
+#        http = gconf.client_get_default().get_string('/system/http_proxy/host')
+#        proxy = urllib2.ProxyHandler( {"http":"http://%s:%s/"% (http,port)} )
+#        return (True, proxy)
+#    return (False, None)
+
+# Enable proxy support for images and ArchivedArticles
+#(proxy_support, proxy) = getProxy()
+#if proxy_support:
+#    opener = urllib2.build_opener(proxy)
+#    urllib2.install_opener(opener)
+
 # Entry = {"title":XXX, "content":XXX, "date":XXX, "link":XXX, images = [] }
 
 class ImageHandler:
@@ -51,55 +65,36 @@ class ImageHandler:
         filename = self.configdir+key+".d/"+getId(url)
         if not isfile(filename):
             try:
-                if url.startswith("http"):
-                    f = urllib2.urlopen(url)
-                else:
-                    f = urllib2.urlopen(baseurl+"/"+url)
+                #if url.startswith("http"):
+                #    f = urllib2.urlopen(url)
+                #else:
+                f = urllib2.urlopen(urljoin(baseurl,url))
                 outf = open(filename, "w")
                 outf.write(f.read())
                 f.close()
                 outf.close()
             except:
-                print "Could not download" + url
-        if url in self.images:
-            self.images[url] += 1
+                print "Could not download " + url
         else:
-            self.images[url] = 1
-        return "file://" + filename
-        
-    def removeImage(self, key, url):
-        filename = self.configdir+key+".d/"+getId(url)
-        self.images[url] -= 1
-        if self.images[url] == 0:
-            os.remove(filename)
-            del self.images[url]
-
-class UnreadTracker:
-    def __init__(self):
-        self.readItems = {}
-        self.countUnread
-        
-    def setEntryUnread(self, id):
-        if self.readItems.has_key(id):
-            if self.readItems[id]==True:
-                self.countUnread = self.countUnread + 1
-                self.readItems[id] = False
+            open(filename,"a").close()  # "Touch" the file
+        if filename in self.images:
+            self.images[filename] += 1
         else:
-            self.readItems[id] = False
-            self.countUnread = self.countUnread + 1
-    
-    def setEntryRead(self, id):
-        if self.readItems[id]==False:
-            self.countUnread = self.countUnread - 1
-            self.readItems[id] = True
-
-    def isRead(self, id):
-        return self.readItems[id]
-    
-    def removeEntry(self, id):
-        if self.readItems[id]==False:
-            self.countUnread = self.countUnread - 1
-        del self.readItems[id]
+            self.images[filename] = 1
+        return filename
+        
+    def removeImage(self, key, filename):
+        #filename = self.configdir+key+".d/"+getId(url)
+        try:
+            self.images[filename] -= 1
+        except:
+            self.images[filename] = 0 #Delete image
+        try:
+            if self.images[filename] == 0:
+                remove(filename) #os.remove
+                del self.images[filename]
+        except:
+            print "Could not remove image %s" % filename
 
 class Feed:
     def __init__(self, uniqueId, name, url, imageHandler):
@@ -145,49 +140,96 @@ class Feed:
             pass
         return self.countUnread
 
-    def updateFeed(self, configdir, expiryTime=24):
+    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
         # Expiry time is in hours
-        tmp=feedparser.parse(self.url)
+        if proxy == None:
+            tmp=feedparser.parse(self.url)
+        else:
+            tmp=feedparser.parse(self.url, handlers = [proxy])
+        expiry = float(expiryTime) * 3600.
         # Check if the parse was successful (number of entries > 0, else do nothing)
         if len(tmp["entries"])>0:
            #reversedEntries = self.getEntries()
            #reversedEntries.reverse()
+           if not isdir(configdir+self.uniqueId+".d"):
+               mkdir(configdir+self.uniqueId+".d")
+           currentTime = time.time()
            tmpEntries = {}
            tmpIds = []
            for entry in tmp["entries"]:
                (dateTuple, date) = self.extractDate(entry)
+               try:
+                   entry["title"]
+               except:
+                   entry["title"] = "No Title"
+               try:
+                   entry["link"]
+               except:
+                   entry["link"] = ""
                tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                             "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[] }
                id = self.generateUniqueId(tmpEntry)
-               tmpEntries[id] = tmpEntry
-               tmpIds.append(id)               
-           for entryId in self.getIds():
-               currentTime = time.time()
-               expiry = float(expiryTime) * 3600.
-               articleTime = time.mktime(self.entries[entryId]["dateTuple"])
-               if currentTime - articleTime < expiry:
-                   if not entryId in tmpIds:
-                       tmpEntries[entryId] = self.entries[entryId]
-                       tmpIds.append(entryId)
+               
+               #articleTime = time.mktime(self.entries[id]["dateTuple"])
+               if not id in self.ids:
+                   soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
+                   images = soup('img')
+                   baseurl = tmpEntry["link"]
+                   if imageCache:
+                      for img in images:
+                          try:
+                            filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
+                            img['src']=filename
+                            tmpEntry["images"].append(filename)
+                          except:
+                              print "Error downloading image %s" % img
+                   tmpEntry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
+                   file = open(tmpEntry["contentLink"], "w")
+                   file.write(soup.prettify())
+                   file.close()
+                   tmpEntries[id] = tmpEntry
+                   tmpIds.append(id)
+                   if id not in self.readItems:
+                       self.readItems[id] = False
                else:
-                    if (not self.isEntryRead(entryId)) and (currentTime - articleTime < 2*expiry):
+                    tmpEntries[id] = self.entries[id]
+                    tmpIds.append(id)
+            
+           oldIds = self.ids[:]
+           for entryId in oldIds:
+                if not entryId in tmpIds:
+                    try:
+                        articleTime = time.mktime(self.entries[entryId]["dateTuple"])
+                        if (currentTime - articleTime > 2*expiry):
+                            self.removeEntry(entryId)
+                            continue
+                        if (currentTime - articleTime > expiry) and (self.isEntryRead(entryId)):
+                            # Entry is over 24 hours, and already read
+                            self.removeEntry(entryId)
+                            continue
                         tmpEntries[entryId] = self.entries[entryId]
                         tmpIds.append(entryId)
-                   
+                    except:
+                        print "Error purging old articles %s" % entryId
+                        self.removeEntry(entryId)
+
            self.entries = tmpEntries
            self.ids = tmpIds
-           self.countUnread = 0
-           # Initialize the new articles to unread
-           tmpReadItems = self.readItems
-           self.readItems = {}
-           for id in self.getIds():
-               if not tmpReadItems.has_key(id):
+           tmpUnread = 0
+           
+
+           ids = self.ids[:]
+           for id in ids:
+               if not self.readItems.has_key(id):
                    self.readItems[id] = False
-               else:
-                   self.readItems[id] = tmpReadItems[id]
                if self.readItems[id]==False:
-                  self.countUnread = self.countUnread + 1
+                  tmpUnread = tmpUnread + 1
+           keys = self.readItems.keys()
+           for id in keys:
+               if not id in self.ids:
+                   del self.readItems[id]
            del tmp
+           self.countUnread = tmpUnread
            self.updateTime = time.asctime()
            self.saveFeed(configdir)
 
@@ -231,11 +273,14 @@ class Feed:
     def getTitle(self, id):
         return self.entries[id]["title"]
     
-    def getLink(self, id):
+    def getContentLink(self, id):
         if self.entries[id].has_key("contentLink"):
             return self.entries[id]["contentLink"]
         return self.entries[id]["link"]
     
+    def getExternalLink(self, id):
+        return self.entries[id]["link"]
+    
     def getDate(self, id):
         return self.entries[id]["date"]
 
@@ -284,20 +329,37 @@ class Feed:
         return self.entries[id]["content"]
     
     def removeEntry(self, id):
-        entry = self.entries[id]
-        for img in entry["images"]:
-            self.imageHandler.removeImage(self.uniqueId, img)
-        if entry.has_key["contentLink"]:
-            os.remove(entry["contentLink"])
-        self.entries.remove(id)
-        self.ids.remove(id)
-        if self.readItems[id]==False:
-            self.countUnread = self.countUnread - 1
-        self.readItems.remove(id)
+        #try:
+        if self.entries.has_key(id):
+            entry = self.entries[id]
+            if entry.has_key("images"):
+                for img in entry["images"]:
+                    self.imageHandler.removeImage(self.uniqueId, img)
+            
+            if entry.has_key("contentLink"):
+                try:
+                    remove(entry["contentLink"])  #os.remove
+                except:
+                    print "File not found for deletion: %s" % entry["contentLink"]
+            del self.entries[id]
+        else:
+            print "Entries has no %s key" % id
+        if id in self.ids:
+            self.ids.remove(id)
+        else:
+            print "Ids has no %s key" % id
+        if self.readItems.has_key(id):
+            if self.readItems[id]==False:
+                self.countUnread = self.countUnread - 1
+            del self.readItems[id]
+        else:
+            print "ReadItems has no %s key" % id
+        #except:
+        #    print "Error removing entry %s" %id
     
-    def getArticle(self, id):
-        self.setEntryRead(id)
-        entry = self.entries[id]
+    def getArticle(self, entry):
+        #self.setEntryRead(id)
+        #entry = self.entries[id]
         title = entry['title']
         #content = entry.get('content', entry.get('summary_detail', {}))
         content = entry["content"]
@@ -337,7 +399,7 @@ class ArchivedArticles(Feed):
         self.saveFeed(configdir)
         self.saveUnread(configdir)
         
-    def updateFeed(self, configdir, expiryTime=24):
+    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
         for id in self.getIds():
             entry = self.entries[id]
             if not entry["downloaded"]:
@@ -347,25 +409,10 @@ class ArchivedArticles(Feed):
                     html = f.read()
                     f.close()
                     soup = BeautifulSoup(html)
-                    images = soup.body('img')
-                    baseurl = ''.join(urlparse(entry["link"])[:-1])
+                    images = soup('img')
+                    baseurl = entry["link"]
                     for img in images:
                         filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
-                        #filename = configdir+self.uniqueId+".d/"+getId(img['src'])
-                        #if not isfile(filename):
-                        #    try:
-                        #        if img['src'].startswith("http"):
-                        #            f = urllib2.urlopen(img['src'])
-                        #        else:
-                        #            f = urllib2.urlopen(baseurl+"/"+img['src'])
-                        #            #print baseurl+"/"+img['src']
-                        #        print filename
-                        #        outf = open(filename, "w")
-                        #        outf.write(f.read())
-                        #        f.close()
-                        #        outf.close()
-                        #    except:
-                        #        print "Could not download" + img['src']
                         img['src']=filename
                         entry["images"].append(filename)
                     entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
@@ -378,16 +425,26 @@ class ArchivedArticles(Feed):
                         self.setEntryUnread(id)
                 #except:
                 #    pass
-            currentTime = time.time()
-            expiry = float(expiryTime) * 3600
-            if currentTime - entry["time"] > expiry:
-                if self.isEntryRead(id):
-                    self.removeEntry(id)
-                else:
-                    if currentTime - entry["time"] > 2*expiry:
-                        self.removeEntry(id)
+            #currentTime = time.time()
+            #expiry = float(expiryTime) * 3600
+            #if currentTime - entry["time"] > expiry:
+            #    if self.isEntryRead(id):
+            #        self.removeEntry(id)
+            #    else:
+            #        if currentTime - entry["time"] > 2*expiry:
+            #            self.removeEntry(id)
         self.updateTime = time.asctime()
         self.saveFeed(configdir)
+        
+    def purgeReadArticles(self):
+        ids = self.getIds()
+        for id in ids:
+            entry = self.entries[id]
+            if self.isEntryRead(id):
+                self.removeEntry(id)
+                
+    def removeArticle(self, id):
+        self.removeEntry(id)
 
     def getArticle(self, index):
         self.setEntryRead(index)
@@ -406,11 +463,11 @@ class Listing:
             file.close()
         else:
             self.listOfFeeds = {getId("Slashdot"):{"title":"Slashdot", "url":"http://rss.slashdot.org/Slashdot/slashdot", "unread":0, "updateTime":"Never"}, }
-        if isfile(self.configdir+"images.pickle"):
+        try:
             file = open(self.configdir+"images.pickle")
             self.imageHandler = pickle.load(file)
             file.close()
-        else:
+        except:
             self.imageHandler = ImageHandler(self.configdir)
         if self.listOfFeeds.has_key("font"):
             del self.listOfFeeds["font"]
@@ -422,25 +479,12 @@ class Listing:
                 self.sortedKeys.remove("font")
             self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj))
         list = self.sortedKeys[:]
-        #for key in list:
-        #    try:
-        #        self.loadFeed(key)
-        #    except:
-                #import traceback
-                #if key.startswith('d8'):
-                #traceback.print_exc()
-        #        self.sortedKeys.remove(key)
-            #print key
-                #print key in self.sortedKeys
-        #print "d8eb3f07572892a7b5ed9c81c5bb21a2" in self.sortedKeys
-        #print self.listOfFeeds["d8eb3f07572892a7b5ed9c81c5bb21a2"]
-        self.closeCurrentlyDisplayedFeed()
-        #self.saveConfig()
+        #self.closeCurrentlyDisplayedFeed()
 
     def addArchivedArticle(self, key, index):
         feed = self.getFeed(key)
         title = feed.getTitle(index)
-        link = feed.getLink(index)
+        link = feed.getExternalLink(index)
         date = feed.getDateTuple(index)
         if not self.listOfFeeds.has_key("ArchivedArticles"):
             self.listOfFeeds["ArchivedArticles"] = {"title":"Archived Articles", "url":"", "unread":0, "updateTime":"Never"}
@@ -473,16 +517,16 @@ class Listing:
                     feed = Feed(getId(title), title, url, self.imageHandler)
             return feed
         
-    def updateFeeds(self, expiryTime=24):
+    def updateFeeds(self, expiryTime=24, proxy=None, imageCache=False):
         for key in self.getListOfFeeds():
             feed = self.loadFeed(key)
-            feed.updateFeed(self.configdir, expiryTime)
+            feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
             self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
             self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()
             
-    def updateFeed(self, key, expiryTime=24):
+    def updateFeed(self, key, expiryTime=24, proxy=None, imageCache=False):
         feed = self.getFeed(key)
-        feed.updateFeed(self.configdir, expiryTime)
+        feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
         self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
         self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()
         
@@ -493,8 +537,21 @@ class Listing:
         feed.editFeed(url)
 
     def getFeed(self, key):
-        feed = self.loadFeed(key)
-        feed.reloadUnread(self.configdir)
+        try:
+            feed = self.loadFeed(key)
+            feed.reloadUnread(self.configdir)
+        except:
+            # If the feed file gets corrupted, we need to reset the feed.
+            import dbus
+            bus = dbus.SessionBus()
+            remote_object = bus.get_object("org.freedesktop.Notifications", # Connection name
+                               "/org/freedesktop/Notifications" # Object's path
+                              )
+            iface = dbus.Interface(remote_object, 'org.freedesktop.Notifications')
+            iface.SystemNoteInfoprint("Error opening feed %s, it has been reset." % self.getFeedTitle(key))
+            if isdir(self.configdir+key+".d/"):
+                rmtree(self.configdir+key+".d/")
+            feed = self.loadFeed(key)
         return feed
     
     def getFeedUpdateTime(self, key):
@@ -520,12 +577,6 @@ class Listing:
     def getListOfFeeds(self):
         return self.sortedKeys
     
-    #def getNumberOfUnreadItems(self, key):
-    #    if self.listOfFeeds.has_key("unread"):
-    #       return self.listOfFeeds[key]["unread"]
-    #    else:
-    #       return 0
-    
     def addFeed(self, title, url):
         if not self.listOfFeeds.has_key(getId(title)):
             self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"}
@@ -563,13 +614,6 @@ class Listing:
         index2 = (index+1)%len(self.sortedKeys)
         self.sortedKeys[index] = self.sortedKeys[index2]
         self.sortedKeys[index2] = key
-        
-    def setCurrentlyDisplayedFeed(self, key):
-        self.currentlyDisplayedFeed = key
-    def closeCurrentlyDisplayedFeed(self):
-        self.currentlyDisplayedFeed = False
-    def getCurrentlyDisplayedFeed(self):
-        return self.currentlyDisplayedFeed
     
 if __name__ == "__main__":
     listing = Listing('/home/user/.feedingit/')
@@ -577,4 +621,4 @@ if __name__ == "__main__":
         #list.reverse()
     for key in list:
         if key.startswith('d8'):
-            print listing.getFeedUpdateTime(key)
\ No newline at end of file
+            print listing.getFeedUpdateTime(key)