Removed imageHandler class. Expiry of disk cache is time-based now
src/rss.py
#!/usr/bin/env python2.5

# Copyright (c) 2007-2008 INdT.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

# ============================================================================
# Name        : FeedingIt.py
# Author      : Yves Marcoz
# Version     : 0.5.4
# Description : Simple RSS Reader
# ============================================================================

from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove, utime
import pickle
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin

#CONFIGDIR="/home/user/.feedingit/"

def getId(string):
    return md5.new(string).hexdigest()

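# getId() doubles as an on-disk name: each feed's data lives under
# <configdir>/<getId(title)>.d/ and every cached image is stored as
# getId(<image url>) inside that directory.
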
#def getProxy():
#    import gconf
#    if gconf.client_get_default().get_bool('/system/http_proxy/use_http_proxy'):
#        port = gconf.client_get_default().get_int('/system/http_proxy/port')
#        http = gconf.client_get_default().get_string('/system/http_proxy/host')
#        proxy = urllib2.ProxyHandler( {"http":"http://%s:%s/"% (http,port)} )
#        return (True, proxy)
#    return (False, None)

# Enable proxy support for images and ArchivedArticles
#(proxy_support, proxy) = getProxy()
#if proxy_support:
#    opener = urllib2.build_opener(proxy)
#    urllib2.install_opener(opener)
# Entry = {"title":XXX, "content":XXX, "date":XXX, "link":XXX, "images":[] }

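# ImageHandler is an empty stub, presumably kept so Feed objects pickled by
# earlier releases (which carried an imageHandler attribute) can still be
# unpickled; Listing.loadFeed() strips the leftover attribute after loading.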
class ImageHandler:
    def __init__(self, configdir):
        pass

class Feed:
    def __init__(self, uniqueId, name, url):
        self.titles = []
        self.entries = {}
        self.ids = []
        self.readItems = {}
        self.name = name
        self.url = url
        self.countUnread = 0
        self.updateTime = "Never"
        self.uniqueId = uniqueId

    def addImage(self, configdir, key, baseurl, url):
        filename = configdir+key+".d/"+getId(url)
        if not isfile(filename):
            try:
                f = urllib2.urlopen(urljoin(baseurl, url))
                outf = open(filename, "w")
                outf.write(f.read())
                f.close()
                outf.close()
            except:
                print "Could not download " + url
        else:
            # "Touch" the cached copy so the time-based purge keeps it
            utime(filename, None)
        return filename

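    # Because addImage() refreshes the mtime of images that are already on
    # disk, the 3*expiry purge at the end of updateFeed() only deletes images
    # that no surviving article has referenced recently.
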
    def editFeed(self, url):
        self.url = url

    def saveFeed(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        file = open(configdir+self.uniqueId+".d/feed", "w")
        pickle.dump(self, file)
        file.close()
        self.saveUnread(configdir)

    def saveUnread(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        file = open(configdir+self.uniqueId+".d/unread", "w")
        pickle.dump(self.readItems, file)
        file.close()

    def reloadUnread(self, configdir):
        try:
            file = open(configdir+self.uniqueId+".d/unread", "r")
            self.readItems = pickle.load(file)
            file.close()
            self.countUnread = 0
            for id in self.getIds():
                if self.readItems[id] == False:
                    self.countUnread = self.countUnread + 1
        except:
            pass
        return self.countUnread

    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Expiry time is in hours
        if proxy == None:
            tmp = feedparser.parse(self.url)
        else:
            tmp = feedparser.parse(self.url, handlers=[proxy])
        expiry = float(expiryTime) * 3600.
        # Check if the parse was successful (number of entries > 0, else do nothing)
        if len(tmp["entries"]) > 0:
            if not isdir(configdir+self.uniqueId+".d"):
                mkdir(configdir+self.uniqueId+".d")
            # Grab the site's favicon so the UI can show it next to the feed
            try:
                f = urllib2.urlopen(urljoin(tmp["feed"]["link"], "/favicon.ico"))
                data = f.read()
                f.close()
                outf = open(configdir+self.uniqueId+".d/favicon.ico", "w")
                outf.write(data)
                outf.close()
                del data
            except:
                import traceback
                traceback.print_exc()

            currentTime = time.time()
            tmpEntries = {}
            tmpIds = []
            for entry in tmp["entries"]:
                (dateTuple, date) = self.extractDate(entry)
                try:
                    entry["title"]
                except:
                    entry["title"] = "No Title"
                try:
                    entry["link"]
                except:
                    entry["link"] = ""
                tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                            "date":date, "dateTuple":dateTuple, "link":entry["link"]}
                id = self.generateUniqueId(tmpEntry)

                if not id in self.ids:
                    # New entry: render it to an HTML file on disk, caching
                    # its images first if imageCache is enabled
                    soup = BeautifulSoup(self.getArticle(tmpEntry))
                    images = soup('img')
                    baseurl = tmpEntry["link"]
                    if imageCache:
                        for img in images:
                            try:
                                filename = self.addImage(configdir, self.uniqueId, baseurl, img['src'])
                                img['src'] = filename
                            except:
                                print "Error downloading image %s" % img
                    tmpEntry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    file = open(tmpEntry["contentLink"], "w")
                    file.write(soup.prettify())
                    file.close()
                    tmpEntries[id] = tmpEntry
                    tmpIds.append(id)
                    if id not in self.readItems:
                        self.readItems[id] = False
                else:
                    # Entry is still in the feed: touch its cached HTML so
                    # the time-based purge below leaves it alone
                    try:
                        filename = configdir+self.uniqueId+".d/"+id+".html"
                        utime(filename, None)
                    except:
                        pass
                    tmpEntries[id] = self.entries[id]
                    tmpIds.append(id)

            # Entries that dropped out of the feed are kept until they expire
            oldIds = self.ids[:]
            for entryId in oldIds:
                if not entryId in tmpIds:
                    try:
                        articleTime = time.mktime(self.entries[entryId]["dateTuple"])
                        if (currentTime - articleTime > 2*expiry):
                            self.removeEntry(entryId)
                            continue
                        if (currentTime - articleTime > expiry) and (self.isEntryRead(entryId)):
                            # Entry is older than the expiry time, and already read
                            self.removeEntry(entryId)
                            continue
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)
                    except:
                        print "Error purging old articles %s" % entryId
                        self.removeEntry(entryId)

            self.entries = tmpEntries
            self.ids = tmpIds
            tmpUnread = 0

            ids = self.ids[:]
            for id in ids:
                if not self.readItems.has_key(id):
                    self.readItems[id] = False
                if self.readItems[id] == False:
                    tmpUnread = tmpUnread + 1
            keys = self.readItems.keys()
            for id in keys:
                if not id in self.ids:
                    del self.readItems[id]
            del tmp
            self.countUnread = tmpUnread
            self.updateTime = time.asctime()
            self.saveFeed(configdir)
            # Time-based expiry of the disk cache: remove any file in the
            # feed's directory (cached HTML, images, favicon) whose mtime
            # has not been refreshed for three expiry periods
            from glob import glob
            from os import stat
            for file in glob(configdir+self.uniqueId+".d/*"):
                stats = stat(file)
                lastmodDate = stats[8]
                expDate = time.time() - expiry*3
                if expDate > lastmodDate:
                    try:
                        remove(file)
                    except OSError:
                        print 'Could not remove', file

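    # Illustrative timing with the default expiryTime=24: expiry is 86400 s,
    # so an unread entry that has dropped out of the feed is kept until it is
    # 48 h old, a read one until 24 h, and any cached file whose mtime is
    # older than 72 h is deleted.
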
    def extractContent(self, entry):
        content = ""
        if entry.has_key('summary'):
            content = entry.get('summary', '')
        if entry.has_key('content'):
            if len(entry.content[0].value) > len(content):
                content = entry.content[0].value
        if content == "":
            content = entry.get('description', '')
        return content

    def extractDate(self, entry):
        if entry.has_key("updated_parsed"):
            date1 = entry["updated_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["updated_parsed"])
        elif entry.has_key("published_parsed"):
            date1 = entry["published_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["published_parsed"])
        else:
            date1 = ""
            date = ""
        return (date1, date)

    def setEntryRead(self, id):
        if self.readItems[id] == False:
            self.countUnread = self.countUnread - 1
            self.readItems[id] = True

    def setEntryUnread(self, id):
        if self.readItems[id] == True:
            self.countUnread = self.countUnread + 1
            self.readItems[id] = False

    def isEntryRead(self, id):
        return self.readItems[id]

    def getTitle(self, id):
        return self.entries[id]["title"]

    def getContentLink(self, id):
        if self.entries[id].has_key("contentLink"):
            return self.entries[id]["contentLink"]
        return self.entries[id]["link"]

    def getExternalLink(self, id):
        return self.entries[id]["link"]

    def getDate(self, id):
        return self.entries[id]["date"]

    def getDateTuple(self, id):
        return self.entries[id]["dateTuple"]

    def getUniqueId(self, index):
        return self.ids[index]

    def generateUniqueId(self, entry):
        return getId(entry["date"] + entry["title"])

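    # Note: entry ids are the md5 of date+title, so if a feed edits an
    # entry's title or timestamp the item reappears as a new, unread entry.
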
    def getUpdateTime(self):
        return self.updateTime

    def getEntries(self):
        return self.entries

    def getIds(self):
        return self.ids

    def getNextId(self, id):
        return self.ids[(self.ids.index(id)+1) % self.getNumberOfEntries()]

    def getPreviousId(self, id):
        return self.ids[(self.ids.index(id)-1) % self.getNumberOfEntries()]

    def getNumberOfUnreadItems(self):
        return self.countUnread

    def getNumberOfEntries(self):
        return len(self.ids)

    def getItem(self, id):
        try:
            return self.entries[id]
        except:
            return []

    def getContent(self, id):
        if self.entries[id].has_key("contentLink"):
            file = open(self.entries[id]["contentLink"])
            content = file.read()
            file.close()
            return content
        return self.entries[id]["content"]

    def removeEntry(self, id):
        if self.entries.has_key(id):
            entry = self.entries[id]
            if entry.has_key("contentLink"):
                try:
                    remove(entry["contentLink"])
                except:
                    print "File not found for deletion: %s" % entry["contentLink"]
            del self.entries[id]
        else:
            print "Entries has no %s key" % id
        if id in self.ids:
            self.ids.remove(id)
        else:
            print "Ids has no %s key" % id
        if self.readItems.has_key(id):
            if self.readItems[id] == False:
                self.countUnread = self.countUnread - 1
            del self.readItems[id]
        else:
            print "ReadItems has no %s key" % id

    def getArticle(self, entry):
        # Wrap a bare entry in a minimal XHTML page for display
        title = entry['title']
        content = entry["content"]
        link = entry['link']
        date = entry["date"]

        text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
        text += "<html><head><title>" + title + "</title>"
        text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
        text += '</head><body><div><a href="' + link + '">' + title + "</a>"
        text += "<BR /><small><i>Date: " + date + "</i></small></div>"
        text += "<BR /><BR />"
        text += content
        text += "</body></html>"
        return text

class ArchivedArticles(Feed):
    def addArchivedArticle(self, title, link, updated_parsed, configdir):
        entry = {}
        entry["title"] = title
        entry["link"] = link
        entry["summary"] = '<a href="' + link + '">' + title + "</a>"
        entry["updated_parsed"] = updated_parsed
        entry["time"] = time.time()
        (dateTuple, date) = self.extractDate(entry)
        tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                    "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[], "downloaded":False, "time":entry["time"]}
        id = self.generateUniqueId(tmpEntry)
        self.entries[id] = tmpEntry
        self.ids.append(id)
        self.readItems[id] = False
        self.countUnread = self.countUnread + 1
        self.saveFeed(configdir)
        self.saveUnread(configdir)

    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # For archived articles, "updating" means downloading the full page
        # of each article that has not been fetched yet
        for id in self.getIds():
            entry = self.entries[id]
            if not entry["downloaded"]:
                try:
                    f = urllib2.urlopen(entry["link"])
                    html = f.read()
                    f.close()
                    soup = BeautifulSoup(html)
                    images = soup('img')
                    baseurl = entry["link"]
                    for img in images:
                        filename = self.addImage(configdir, self.uniqueId, baseurl, img['src'])
                        img['src'] = filename
                    entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    file = open(entry["contentLink"], "w")
                    file.write(soup.prettify())
                    file.close()
                    if len(entry["content"]) > 0:
                        entry["downloaded"] = True
                        entry["time"] = time.time()
                        self.setEntryUnread(id)
                except:
                    pass
        self.updateTime = time.asctime()
        self.saveFeed(configdir)

    def purgeReadArticles(self):
        ids = self.getIds()
        for id in ids:
            if self.isEntryRead(id):
                self.removeEntry(id)

    def removeArticle(self, id):
        self.removeEntry(id)

    def getArticle(self, index):
        self.setEntryRead(index)
        content = self.getContent(index)
        return content

class Listing:
    # Lists all the feeds in a dictionary, and exposes the data
    def __init__(self, configdir):
        self.configdir = configdir
        if isfile(self.configdir+"feeds.pickle"):
            file = open(self.configdir+"feeds.pickle")
            self.listOfFeeds = pickle.load(file)
            file.close()
        else:
            self.listOfFeeds = {getId("Slashdot"):{"title":"Slashdot", "url":"http://rss.slashdot.org/Slashdot/slashdot", "unread":0, "updateTime":"Never"}}
        if self.listOfFeeds.has_key("font"):
            del self.listOfFeeds["font"]
        if self.listOfFeeds.has_key("feedingit-order"):
            self.sortedKeys = self.listOfFeeds["feedingit-order"]
        else:
            self.sortedKeys = self.listOfFeeds.keys()
            if "font" in self.sortedKeys:
                self.sortedKeys.remove("font")
            self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj))

    def addArchivedArticle(self, key, index):
        feed = self.getFeed(key)
        title = feed.getTitle(index)
        link = feed.getExternalLink(index)
        date = feed.getDateTuple(index)
        if not self.listOfFeeds.has_key("ArchivedArticles"):
            self.listOfFeeds["ArchivedArticles"] = {"title":"Archived Articles", "url":"", "unread":0, "updateTime":"Never"}
            self.sortedKeys.append("ArchivedArticles")
            self.saveConfig()
        archFeed = self.getFeed("ArchivedArticles")
        archFeed.addArchivedArticle(title, link, date, self.configdir)
        self.listOfFeeds["ArchivedArticles"]["unread"] = archFeed.getNumberOfUnreadItems()

    def loadFeed(self, key):
        if isfile(self.configdir+key+".d/feed"):
            file = open(self.configdir+key+".d/feed")
            feed = pickle.load(file)
            file.close()
            try:
                feed.uniqueId
            except AttributeError:
                feed.uniqueId = getId(feed.name)
            try:
                # Feeds pickled by older versions carry an imageHandler
                # attribute; drop it now that the class has been removed
                del feed.imageHandler
            except:
                pass
        else:
            title = self.listOfFeeds[key]["title"]
            url = self.listOfFeeds[key]["url"]
            if key == "ArchivedArticles":
                feed = ArchivedArticles("ArchivedArticles", title, url)
            else:
                feed = Feed(getId(title), title, url)
        return feed

    def updateFeeds(self, expiryTime=24, proxy=None, imageCache=False):
        for key in self.getListOfFeeds():
            feed = self.loadFeed(key)
            feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
            self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
            self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def updateFeed(self, key, expiryTime=24, proxy=None, imageCache=False):
        feed = self.getFeed(key)
        feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
        self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
        self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def editFeed(self, key, title, url):
        self.listOfFeeds[key]["title"] = title
        self.listOfFeeds[key]["url"] = url
        feed = self.loadFeed(key)
        feed.editFeed(url)
        feed.saveFeed(self.configdir)  # persist the new URL

    def getFeed(self, key):
        try:
            feed = self.loadFeed(key)
            feed.reloadUnread(self.configdir)
        except:
            # If the feed file gets corrupted, we need to reset the feed.
            import traceback
            traceback.print_exc()
            import dbus
            bus = dbus.SessionBus()
            remote_object = bus.get_object("org.freedesktop.Notifications",  # Connection name
                                           "/org/freedesktop/Notifications")  # Object's path
            iface = dbus.Interface(remote_object, 'org.freedesktop.Notifications')
            iface.SystemNoteInfoprint("Error opening feed %s, it has been reset." % self.getFeedTitle(key))
            if isdir(self.configdir+key+".d/"):
                rmtree(self.configdir+key+".d/")
            feed = self.loadFeed(key)
        return feed

    def getFeedUpdateTime(self, key):
        if not self.listOfFeeds[key].has_key("updateTime"):
            self.listOfFeeds[key]["updateTime"] = "Never"
        return self.listOfFeeds[key]["updateTime"]

    def getFeedNumberOfUnreadItems(self, key):
        if not self.listOfFeeds[key].has_key("unread"):
            self.listOfFeeds[key]["unread"] = 0
        return self.listOfFeeds[key]["unread"]

    def updateUnread(self, key, unreadItems):
        self.listOfFeeds[key]["unread"] = unreadItems

    def getFeedTitle(self, key):
        return self.listOfFeeds[key]["title"]

    def getFeedUrl(self, key):
        return self.listOfFeeds[key]["url"]

    def getListOfFeeds(self):
        return self.sortedKeys

    def getFavicon(self, key):
        filename = self.configdir+key+".d/favicon.ico"
        if isfile(filename):
            return filename
        else:
            return False

    def addFeed(self, title, url):
        if not self.listOfFeeds.has_key(getId(title)):
            self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"}
            self.sortedKeys.append(getId(title))
            self.saveConfig()
            return True
        else:
            return False

    def removeFeed(self, key):
        del self.listOfFeeds[key]
        self.sortedKeys.remove(key)
        if isdir(self.configdir+key+".d/"):
            rmtree(self.configdir+key+".d/")
        self.saveConfig()

    def saveConfig(self):
        self.listOfFeeds["feedingit-order"] = self.sortedKeys
        file = open(self.configdir+"feeds.pickle", "w")
        pickle.dump(self.listOfFeeds, file)
        file.close()

    def moveUp(self, key):
        index = self.sortedKeys.index(key)
        self.sortedKeys[index] = self.sortedKeys[index-1]
        self.sortedKeys[index-1] = key

    def moveDown(self, key):
        index = self.sortedKeys.index(key)
        index2 = (index+1) % len(self.sortedKeys)
        self.sortedKeys[index] = self.sortedKeys[index2]
        self.sortedKeys[index2] = key

if __name__ == "__main__":
    listing = Listing('/home/user/.feedingit/')
    keys = listing.getListOfFeeds()[:]
    for key in keys:
        if key.startswith('d8'):
            print listing.getFeedUpdateTime(key)