0.6.1-7, fix for broken feeds
[feedingit] / src / rss.py
#!/usr/bin/env python2.5

# Copyright (c) 2007-2008 INdT.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# ============================================================================
# Name        : rss.py
# Author      : Yves Marcoz
# Version     : 0.6.1
# Description : Simple RSS Reader
# ============================================================================

from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove
import pickle
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin

#CONFIGDIR="/home/user/.feedingit/"

def getId(string):
    # md5 cannot digest unicode objects; encode to UTF-8 first so feeds
    # with non-ASCII titles do not raise UnicodeEncodeError.
    if isinstance(string, unicode):
        string = string.encode("utf-8")
    return md5.new(string).hexdigest()
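
# getId() digests are used as feed keys, entry ids and cached image
# filenames; e.g. getId("Slashdot") is the 32-character hex key that
# Listing seeds for the default feed.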

#def getProxy():
#    import gconf
#    if gconf.client_get_default().get_bool('/system/http_proxy/use_http_proxy'):
#        port = gconf.client_get_default().get_int('/system/http_proxy/port')
#        http = gconf.client_get_default().get_string('/system/http_proxy/host')
#        proxy = urllib2.ProxyHandler( {"http":"http://%s:%s/"% (http,port)} )
#        return (True, proxy)
#    return (False, None)

# Enable proxy support for images and ArchivedArticles
#(proxy_support, proxy) = getProxy()
#if proxy_support:
#    opener = urllib2.build_opener(proxy)
#    urllib2.install_opener(opener)
# Entry = {"title":XXX, "content":XXX, "date":XXX, "link":XXX, "images":[] }

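# ImageHandler caches article images on disk, one file per unique image
# url (named by the url's md5), and keeps a per-file reference count so an
# image shared by several entries is only deleted when the last entry
# referencing it is removed.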
class ImageHandler:
    def __init__(self, configdir):
        self.configdir = configdir
        self.images = {}

    def addImage(self, key, baseurl, url):
        filename = self.configdir+key+".d/"+getId(url)
        if not isfile(filename):
            try:
                #if url.startswith("http"):
                #    f = urllib2.urlopen(url)
                #else:
                f = urllib2.urlopen(urljoin(baseurl, url))
                outf = open(filename, "wb")  # images are binary, write in binary mode
                outf.write(f.read())
                f.close()
                outf.close()
            except:
                print "Could not download " + url
        else:
            open(filename, "a").close()  # "Touch" the file
        if filename in self.images:
            self.images[filename] += 1
        else:
            self.images[filename] = 1
        return filename

    def removeImage(self, key, filename):
        #filename = self.configdir+key+".d/"+getId(url)
        try:
            self.images[filename] -= 1
        except KeyError:
            self.images[filename] = 0  # Untracked image: mark for deletion
        try:
            if self.images[filename] == 0:
                remove(filename)  # os.remove
                del self.images[filename]
        except:
            print "Could not remove image %s" % filename

class Feed:
    def __init__(self, uniqueId, name, url, imageHandler):
        self.titles = []
        self.entries = {}
        self.ids = []
        self.readItems = {}
        self.name = name
        self.url = url
        self.countUnread = 0
        self.updateTime = "Never"
        self.uniqueId = uniqueId
        self.imageHandler = imageHandler

    def editFeed(self, url):
        self.url = url

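    # Persistence: saveFeed() pickles the whole Feed object to
    # <configdir>/<uniqueId>.d/feed; the read flags are saved separately
    # to .../unread so they can be reloaded without unpickling the feed.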
    def saveFeed(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        f = open(configdir+self.uniqueId+".d/feed", "w")
        pickle.dump(self, f)
        f.close()
        self.saveUnread(configdir)

    def saveUnread(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        f = open(configdir+self.uniqueId+".d/unread", "w")
        pickle.dump(self.readItems, f)
        f.close()

    def reloadUnread(self, configdir):
        try:
            f = open(configdir+self.uniqueId+".d/unread", "r")
            self.readItems = pickle.load(f)
            f.close()
            self.countUnread = 0
            for id in self.getIds():
                if self.readItems[id] == False:
                    self.countUnread += 1
        except:
            pass
        return self.countUnread

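    # updateFeed() reparses the feed and rebuilds self.entries/self.ids:
    # new entries are cached to disk (and their images, if imageCache is
    # set); entries that have dropped out of the feed upstream are kept
    # until they are older than expiryTime hours (if read) or twice that
    # (if unread); read flags are then recounted and pruned.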
    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Expiry time is in hours
        if proxy is None:
            tmp = feedparser.parse(self.url)
        else:
            tmp = feedparser.parse(self.url, handlers=[proxy])
        expiry = float(expiryTime) * 3600.
        # Check if the parse was successful (number of entries > 0, else do nothing)
        if len(tmp["entries"]) > 0:
            #reversedEntries = self.getEntries()
            #reversedEntries.reverse()
            if not isdir(configdir+self.uniqueId+".d"):
                mkdir(configdir+self.uniqueId+".d")
            currentTime = time.time()
            tmpEntries = {}
            tmpIds = []
            for entry in tmp["entries"]:
                (dateTuple, date) = self.extractDate(entry)
                if "title" not in entry:
                    entry["title"] = "No Title"
                if "link" not in entry:
                    entry["link"] = ""
                tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                            "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[] }
                id = self.generateUniqueId(tmpEntry)

                #articleTime = time.mktime(self.entries[id]["dateTuple"])
                if not id in self.ids:
                    soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
                    images = soup('img')
                    baseurl = tmpEntry["link"]
                    if imageCache:
                        for img in images:
                            try:
                                filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
                                img['src'] = filename
                                tmpEntry["images"].append(filename)
                            except:
                                print "Error downloading image %s" % img
                    tmpEntry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    f = open(tmpEntry["contentLink"], "w")
                    f.write(soup.prettify())
                    f.close()
                    tmpEntries[id] = tmpEntry
                    tmpIds.append(id)
                    if id not in self.readItems:
                        self.readItems[id] = False
                else:
                    tmpEntries[id] = self.entries[id]
                    tmpIds.append(id)

            oldIds = self.ids[:]
            for entryId in oldIds:
                if not entryId in tmpIds:
                    try:
                        articleTime = time.mktime(self.entries[entryId]["dateTuple"])
                        if (currentTime - articleTime > 2*expiry):
                            self.removeEntry(entryId)
                            continue
                        if (currentTime - articleTime > expiry) and (self.isEntryRead(entryId)):
                            # Entry is older than the expiry time, and already read
                            self.removeEntry(entryId)
                            continue
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)
                    except:
                        print "Error purging old articles %s" % entryId
                        self.removeEntry(entryId)

            self.entries = tmpEntries
            self.ids = tmpIds
            tmpUnread = 0

            ids = self.ids[:]
            for id in ids:
                if id not in self.readItems:
                    self.readItems[id] = False
                if self.readItems[id] == False:
                    tmpUnread += 1
            keys = self.readItems.keys()
            for id in keys:
                if not id in self.ids:
                    del self.readItems[id]
            del tmp
            self.countUnread = tmpUnread
            self.updateTime = time.asctime()
            self.saveFeed(configdir)

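    # extractContent() returns the longest text available for an entry:
    # the summary, a longer content[0].value if present, or the
    # description as a last resort.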
    def extractContent(self, entry):
        content = ""
        if 'summary' in entry:
            content = entry.get('summary', '')
        if 'content' in entry:
            if len(entry.content[0].value) > len(content):
                content = entry.content[0].value
        if content == "":
            content = entry.get('description', '')
        return content

    def extractDate(self, entry):
        if "updated_parsed" in entry:
            dateTuple = entry["updated_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["updated_parsed"])
        elif "published_parsed" in entry:
            dateTuple = entry["published_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["published_parsed"])
        else:
            dateTuple = ""
            date = ""
        #print dateTuple, date
        return (dateTuple, date)

    def setEntryRead(self, id):
        if self.readItems[id] == False:
            self.countUnread -= 1
            self.readItems[id] = True

    def setEntryUnread(self, id):
        if self.readItems[id] == True:
            self.countUnread += 1
            self.readItems[id] = False

    def isEntryRead(self, id):
        return self.readItems[id]

    def getTitle(self, id):
        return self.entries[id]["title"]

    def getContentLink(self, id):
        # Return the cached HTML file if the entry has one, the external link otherwise.
        if "contentLink" in self.entries[id]:
            return self.entries[id]["contentLink"]
        return self.entries[id]["link"]

    def getExternalLink(self, id):
        return self.entries[id]["link"]

    def getDate(self, id):
        return self.entries[id]["date"]

    def getDateTuple(self, id):
        return self.entries[id]["dateTuple"]

    def getUniqueId(self, index):
        return self.ids[index]

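    # Note: entry ids are derived from date+title, so if either changes
    # upstream the entry is treated as new on the next update.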
    def generateUniqueId(self, entry):
        return getId(entry["date"] + entry["title"])

    def getUpdateTime(self):
        return self.updateTime

    def getEntries(self):
        return self.entries

    def getIds(self):
        return self.ids

    def getNextId(self, id):
        return self.ids[(self.ids.index(id)+1) % self.getNumberOfEntries()]

    def getPreviousId(self, id):
        return self.ids[(self.ids.index(id)-1) % self.getNumberOfEntries()]

    def getNumberOfUnreadItems(self):
        return self.countUnread

    def getNumberOfEntries(self):
        return len(self.ids)

    def getItem(self, id):
        try:
            return self.entries[id]
        except KeyError:
            return []

    def getContent(self, id):
        if "contentLink" in self.entries[id]:
            f = open(self.entries[id]["contentLink"])
            content = f.read()
            f.close()
            return content
        return self.entries[id]["content"]

    def removeEntry(self, id):
        if id in self.entries:
            entry = self.entries[id]
            if "images" in entry:
                for img in entry["images"]:
                    self.imageHandler.removeImage(self.uniqueId, img)
            if "contentLink" in entry:
                try:
                    remove(entry["contentLink"])  # os.remove
                except OSError:
                    print "File not found for deletion: %s" % entry["contentLink"]
            del self.entries[id]
        else:
            print "Entries has no %s key" % id
        if id in self.ids:
            self.ids.remove(id)
        else:
            print "Ids has no %s key" % id
        if id in self.readItems:
            if self.readItems[id] == False:
                self.countUnread -= 1
            del self.readItems[id]
        else:
            print "ReadItems has no %s key" % id

    def getArticle(self, entry):
        #self.setEntryRead(id)
        #entry = self.entries[id]
        title = entry['title']
        #content = entry.get('content', entry.get('summary_detail', {}))
        content = entry["content"]

        link = entry['link']
        date = entry["date"]

        #text = '''<div style="color: black; background-color: white;">'''
        text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
        text += "<html><head><title>" + title + "</title>"
        text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
        #text += '<style> body {-webkit-user-select: none;} </style>'
        text += '</head><body><div><a href="' + link + '">' + title + "</a>"
        text += "<BR /><small><i>Date: " + date + "</i></small></div>"
        text += "<BR /><BR />"
        text += content
        text += "</body></html>"
        return text

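# ArchivedArticles is a pseudo-feed holding articles saved from other
# feeds; its updateFeed() downloads the full article pages (instead of
# parsing an RSS url) and caches them like regular entries.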
class ArchivedArticles(Feed):
    def addArchivedArticle(self, title, link, updated_parsed, configdir):
        entry = {}
        entry["title"] = title
        entry["link"] = link
        entry["summary"] = '<a href="' + link + '">' + title + "</a>"
        entry["updated_parsed"] = updated_parsed
        entry["time"] = time.time()
        #print entry
        (dateTuple, date) = self.extractDate(entry)
        tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                    "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[],
                    "downloaded":False, "time":entry["time"] }
        id = self.generateUniqueId(tmpEntry)
        self.entries[id] = tmpEntry
        self.ids.append(id)
        self.readItems[id] = False
        self.countUnread += 1
        self.saveFeed(configdir)
        self.saveUnread(configdir)

    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        for id in self.getIds():
            entry = self.entries[id]
            if not entry["downloaded"]:
                try:
                    f = urllib2.urlopen(entry["link"])
                    #entry["content"] = f.read()
                    html = f.read()
                    f.close()
                    soup = BeautifulSoup(html)
                    images = soup('img')
                    baseurl = entry["link"]
                    for img in images:
                        filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
                        img['src'] = filename
                        entry["images"].append(filename)
                    entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    f = open(entry["contentLink"], "w")
                    f.write(soup.prettify())
                    f.close()
                    # Only mark as downloaded if something was actually fetched.
                    if len(html) > 0:
                        entry["downloaded"] = True
                        entry["time"] = time.time()
                        self.setEntryUnread(id)
                except:
                    # A failed download should not abort the whole update.
                    print "Error downloading article %s" % entry["link"]
            #currentTime = time.time()
            #expiry = float(expiryTime) * 3600
            #if currentTime - entry["time"] > expiry:
            #    if self.isEntryRead(id):
            #        self.removeEntry(id)
            #    else:
            #        if currentTime - entry["time"] > 2*expiry:
            #            self.removeEntry(id)
        self.updateTime = time.asctime()
        self.saveFeed(configdir)

    def purgeReadArticles(self):
        # Iterate over a copy: removeEntry() mutates self.ids.
        ids = self.getIds()[:]
        for id in ids:
            if self.isEntryRead(id):
                self.removeEntry(id)

    def removeArticle(self, id):
        self.removeEntry(id)

    def getArticle(self, index):
        self.setEntryRead(index)
        content = self.getContent(index)
        return content

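# Listing is the top-level container: it maps feed ids to their metadata
# in feeds.pickle, shares one ImageHandler (images.pickle) across feeds,
# and loads/saves the individual Feed pickles on demand.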
class Listing:
    # Lists all the feeds in a dictionary, and exposes the data
    def __init__(self, configdir):
        self.configdir = configdir
        #self.feeds = {}
        if isfile(self.configdir+"feeds.pickle"):
            f = open(self.configdir+"feeds.pickle")
            self.listOfFeeds = pickle.load(f)
            f.close()
        else:
            self.listOfFeeds = {getId("Slashdot"):{"title":"Slashdot", "url":"http://rss.slashdot.org/Slashdot/slashdot", "unread":0, "updateTime":"Never"}, }
        try:
            f = open(self.configdir+"images.pickle")
            self.imageHandler = pickle.load(f)
            f.close()
        except:
            self.imageHandler = ImageHandler(self.configdir)
        # "font" and "feedingit-order" are config entries, not feeds.
        if "font" in self.listOfFeeds:
            del self.listOfFeeds["font"]
        if "feedingit-order" in self.listOfFeeds:
            self.sortedKeys = self.listOfFeeds["feedingit-order"]
        else:
            self.sortedKeys = self.listOfFeeds.keys()
            if "font" in self.sortedKeys:
                self.sortedKeys.remove("font")
            self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj))
        #self.closeCurrentlyDisplayedFeed()

    def addArchivedArticle(self, key, index):
        feed = self.getFeed(key)
        title = feed.getTitle(index)
        link = feed.getExternalLink(index)
        date = feed.getDateTuple(index)
        if "ArchivedArticles" not in self.listOfFeeds:
            self.listOfFeeds["ArchivedArticles"] = {"title":"Archived Articles", "url":"", "unread":0, "updateTime":"Never"}
            self.sortedKeys.append("ArchivedArticles")
            #self.feeds["Archived Articles"] = ArchivedArticles("Archived Articles", "")
            self.saveConfig()
        archFeed = self.getFeed("ArchivedArticles")
        archFeed.addArchivedArticle(title, link, date, self.configdir)
        # Update the unread count of the archive feed, not of the source feed.
        self.listOfFeeds["ArchivedArticles"]["unread"] = archFeed.getNumberOfUnreadItems()

    def loadFeed(self, key):
        if isfile(self.configdir+key+".d/feed"):
            f = open(self.configdir+key+".d/feed")
            feed = pickle.load(f)
            f.close()
            try:
                feed.uniqueId
                feed.imageHandler
            except AttributeError:
                # Feed was pickled by an older version: fill in the new fields.
                feed.uniqueId = getId(feed.name)
                feed.imageHandler = self.imageHandler
            #feed.reloadUnread(self.configdir)
        else:
            #print key
            title = self.listOfFeeds[key]["title"]
            url = self.listOfFeeds[key]["url"]
            if key == "ArchivedArticles":
                feed = ArchivedArticles("ArchivedArticles", title, url, self.imageHandler)
            else:
                feed = Feed(getId(title), title, url, self.imageHandler)
        return feed

    def updateFeeds(self, expiryTime=24, proxy=None, imageCache=False):
        for key in self.getListOfFeeds():
            feed = self.loadFeed(key)
            feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
            self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
            self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def updateFeed(self, key, expiryTime=24, proxy=None, imageCache=False):
        feed = self.getFeed(key)
        feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
        self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
        self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def editFeed(self, key, title, url):
        self.listOfFeeds[key]["title"] = title
        self.listOfFeeds[key]["url"] = url
        feed = self.loadFeed(key)
        feed.editFeed(url)
        # Persist the new url; otherwise the pickled feed keeps the old one.
        feed.saveFeed(self.configdir)

    def getFeed(self, key):
        try:
            feed = self.loadFeed(key)
            feed.reloadUnread(self.configdir)
        except:
            # If the feed file gets corrupted, we need to reset the feed.
            import dbus
            bus = dbus.SessionBus()
            remote_object = bus.get_object("org.freedesktop.Notifications", # Connection name
                                           "/org/freedesktop/Notifications" # Object's path
                                          )
            iface = dbus.Interface(remote_object, 'org.freedesktop.Notifications')
            iface.SystemNoteInfoprint("Error opening feed %s, it has been reset." % self.getFeedTitle(key))
            if isdir(self.configdir+key+".d/"):
                rmtree(self.configdir+key+".d/")
            feed = self.loadFeed(key)
        return feed

    def getFeedUpdateTime(self, key):
        #print self.listOfFeeds.has_key(key)
        if "updateTime" not in self.listOfFeeds[key]:
            self.listOfFeeds[key]["updateTime"] = "Never"
        return self.listOfFeeds[key]["updateTime"]

    def getFeedNumberOfUnreadItems(self, key):
        if "unread" not in self.listOfFeeds[key]:
            self.listOfFeeds[key]["unread"] = 0
        return self.listOfFeeds[key]["unread"]

    def updateUnread(self, key, unreadItems):
        self.listOfFeeds[key]["unread"] = unreadItems

    def getFeedTitle(self, key):
        return self.listOfFeeds[key]["title"]

    def getFeedUrl(self, key):
        return self.listOfFeeds[key]["url"]

    def getListOfFeeds(self):
        return self.sortedKeys

    def addFeed(self, title, url):
        if getId(title) not in self.listOfFeeds:
            self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"}
            self.sortedKeys.append(getId(title))
            self.saveConfig()
            #self.feeds[getId(title)] = Feed(title, url)
            return True
        else:
            return False

    def removeFeed(self, key):
        del self.listOfFeeds[key]
        self.sortedKeys.remove(key)
        #del self.feeds[key]
        if isdir(self.configdir+key+".d/"):
            rmtree(self.configdir+key+".d/")
        self.saveConfig()

    def saveConfig(self):
        self.listOfFeeds["feedingit-order"] = self.sortedKeys
        f = open(self.configdir+"feeds.pickle", "w")
        pickle.dump(self.listOfFeeds, f)
        f.close()
        f = open(self.configdir+"images.pickle", "w")
        pickle.dump(self.imageHandler, f)
        f.close()

    def moveUp(self, key):
        index = self.sortedKeys.index(key)
        self.sortedKeys[index] = self.sortedKeys[index-1]
        self.sortedKeys[index-1] = key

    def moveDown(self, key):
        index = self.sortedKeys.index(key)
        index2 = (index+1) % len(self.sortedKeys)
        self.sortedKeys[index] = self.sortedKeys[index2]
        self.sortedKeys[index2] = key

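# A minimal usage sketch (the path assumes the Maemo default config dir;
# the Slashdot feed is the one Listing seeds on first run):
#   listing = Listing('/home/user/.feedingit/')
#   listing.addFeed("Slashdot", "http://rss.slashdot.org/Slashdot/slashdot")
#   listing.updateFeeds(expiryTime=24)
#   for key in listing.getListOfFeeds():
#       print listing.getFeedTitle(key), listing.getFeedNumberOfUnreadItems(key)
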
if __name__ == "__main__":
    listing = Listing('/home/user/.feedingit/')
    keys = listing.getListOfFeeds()[:]
    #keys.reverse()
    for key in keys:
        if key.startswith('d8'):
            print listing.getFeedUpdateTime(key)