0.5.3 - Image caching, first try
[feedingit] / src / rss.py
#!/usr/bin/env python2.5


# Copyright (c) 2007-2008 INdT.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

# ============================================================================
# Name        : rss.py
# Author      : Yves Marcoz
# Version     : 0.5.3
# Description : Simple RSS Reader
# ============================================================================

from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove
import pickle
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urlparse

#CONFIGDIR="/home/user/.feedingit/"

def getId(string):
    # md5 hex digest (32 chars) used as a filesystem-safe key
    return md5.new(string).hexdigest()

# Entry = {"title":XXX, "content":XXX, "date":XXX, "link":XXX, "images":[] }
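# For illustration only, a populated entry might look like:
# {"title": "Story headline", "content": "<p>Summary HTML</p>",
#  "date": "Mon, 01 Jan 2007 00:00:00", "dateTuple": time.struct_time(...),
#  "link": "http://example.com/story",
#  "images": ["/home/user/.feedingit/<feedId>.d/<md5-of-image-url>"]}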

class ImageHandler:
    # Downloads images into a feed's cache directory and reference-counts
    # them, so a file shared by several entries is only deleted once no
    # entry uses it.
    def __init__(self, configdir):
        self.configdir = configdir
        self.images = {}

    def addImage(self, key, baseurl, url):
        filename = self.configdir+key+".d/"+getId(url)
        if not isfile(filename):
            try:
                if url.startswith("http"):
                    f = urllib2.urlopen(url)
                else:
                    # Relative URL: resolve against the entry's base URL
                    f = urllib2.urlopen(baseurl+"/"+url)
                outf = open(filename, "wb")  # images are binary data
                outf.write(f.read())
                f.close()
                outf.close()
            except:
                print "Could not download " + url
        if filename in self.images:
            self.images[filename] += 1
        else:
            self.images[filename] = 1
        return filename

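    # Usage sketch (hypothetical values): cached files are keyed by the md5
    # of the source URL, so repeated references share a single download.
    #   handler = ImageHandler("/home/user/.feedingit/")
    #   local = handler.addImage(feedId, "http://example.com", "/img/logo.png")
    #   handler.removeImage(feedId, local)  # deletes the file once unreferenced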
    def removeImage(self, key, filename):
        try:
            self.images[filename] -= 1
        except KeyError:
            self.images[filename] = 0  # Unknown file: mark for deletion
        try:
            if self.images[filename] == 0:
                remove(filename)  # os.remove
                del self.images[filename]
        except:
            print "Could not remove image %s" % filename

class Feed:
    def __init__(self, uniqueId, name, url, imageHandler):
        self.titles = []
        self.entries = {}
        self.ids = []
        self.readItems = {}
        self.name = name
        self.url = url
        self.countUnread = 0
        self.updateTime = "Never"
        self.uniqueId = uniqueId
        self.imageHandler = imageHandler

    def editFeed(self, url):
        self.url = url

    def saveFeed(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        f = open(configdir+self.uniqueId+".d/feed", "w")
        pickle.dump(self, f)
        f.close()
        self.saveUnread(configdir)
        
    def saveUnread(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        f = open(configdir+self.uniqueId+".d/unread", "w")
        pickle.dump(self.readItems, f)
        f.close()

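    # On-disk layout (paths relative to configdir):
    #   feeds.pickle               - feed list and ordering (see Listing)
    #   images.pickle              - the shared ImageHandler
    #   <feedId>.d/feed            - pickled Feed object
    #   <feedId>.d/unread          - pickled read/unread map
    #   <feedId>.d/<entryId>.html  - rendered entry content
    #   <feedId>.d/<md5-of-url>    - cached image files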
    def reloadUnread(self, configdir):
        try:
            f = open(configdir+self.uniqueId+".d/unread", "r")
            self.readItems = pickle.load(f)
            f.close()
            self.countUnread = 0
            for id in self.getIds():
                # Ids missing from the unread map count as unread
                if self.readItems.get(id, False) == False:
                    self.countUnread = self.countUnread + 1
        except:
            pass
        return self.countUnread

    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Expiry time is in hours
        if proxy == None:
            tmp = feedparser.parse(self.url)
        else:
            tmp = feedparser.parse(self.url, handlers = [proxy])
        # Check if the parse was successful (number of entries > 0, else do nothing)
        if len(tmp["entries"]) > 0:
            if not isdir(configdir+self.uniqueId+".d"):
                mkdir(configdir+self.uniqueId+".d")
            tmpEntries = {}
            tmpIds = []
            for entry in tmp["entries"]:
                (dateTuple, date) = self.extractDate(entry)
                tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                            "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[] }
                id = self.generateUniqueId(tmpEntry)
                if id not in self.ids:
                    soup = BeautifulSoup(tmpEntry["content"])
                    images = soup('img')
                    baseurl = ''.join(urlparse(tmpEntry["link"])[:-1])
                    if imageCache:
                        # Rewrite each img tag to point at the locally cached copy
                        for img in images:
                            filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
                            img['src'] = filename
                            tmpEntry["images"].append(filename)
                    tmpEntry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    f = open(tmpEntry["contentLink"], "w")
                    f.write(soup.prettify())
                    f.close()
                    tmpEntries[id] = tmpEntry
                    tmpIds.append(id)

            # Keep recent entries; unread entries get twice the expiry time
            for entryId in self.getIds()[:]:
                currentTime = time.time()
                expiry = float(expiryTime) * 3600.
                try:
                    articleTime = time.mktime(self.entries[entryId]["dateTuple"])
                    if currentTime - articleTime < expiry:
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)
                    else:
                        if (not self.isEntryRead(entryId)) and (currentTime - articleTime < 2*expiry):
                            tmpEntries[entryId] = self.entries[entryId]
                            tmpIds.append(entryId)
                        else:
                            self.removeEntry(entryId)
                except:
                    self.removeEntry(entryId)
                    print "Error purging old articles %s" % entryId

            self.entries = tmpEntries
            self.ids = tmpIds
            self.countUnread = 0
            # Initialize the new articles to unread
            tmpReadItems = self.readItems
            self.readItems = {}
            for id in self.getIds():
                if not tmpReadItems.has_key(id):
                    self.readItems[id] = False
                else:
                    self.readItems[id] = tmpReadItems[id]
                if self.readItems[id] == False:
                    self.countUnread = self.countUnread + 1
            del tmp
            self.updateTime = time.asctime()
            self.saveFeed(configdir)

    def extractContent(self, entry):
        # Prefer the longer of summary/content; fall back to description
        content = ""
        if entry.has_key('summary'):
            content = entry.get('summary', '')
        if entry.has_key('content'):
            if len(entry.content[0].value) > len(content):
                content = entry.content[0].value
        if content == "":
            content = entry.get('description', '')
        return content
        
    def extractDate(self, entry):
        if entry.has_key("updated_parsed"):
            dateTuple = entry["updated_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["updated_parsed"])
        elif entry.has_key("published_parsed"):
            dateTuple = entry["published_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["published_parsed"])
        else:
            dateTuple = ""
            date = ""
        return (dateTuple, date)

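    # e.g. an entry updated at midnight on 2007-01-01 yields
    # date = "Mon, 01 Jan 2007 00:00:00" and dateTuple = the corresponding
    # time.struct_time, which updateFeed passes to time.mktime for expiry.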
    def setEntryRead(self, id):
        if self.readItems[id] == False:
            self.countUnread = self.countUnread - 1
            self.readItems[id] = True
            
    def setEntryUnread(self, id):
        if self.readItems[id] == True:
            self.countUnread = self.countUnread + 1
            self.readItems[id] = False
    
    def isEntryRead(self, id):
        return self.readItems[id]
    
    def getTitle(self, id):
        return self.entries[id]["title"]
    
    def getContentLink(self, id):
        if self.entries[id].has_key("contentLink"):
            return self.entries[id]["contentLink"]
        return self.entries[id]["link"]
    
    def getExternalLink(self, id):
        return self.entries[id]["link"]
    
    def getDate(self, id):
        return self.entries[id]["date"]

    def getDateTuple(self, id):
        return self.entries[id]["dateTuple"]
 
    def getUniqueId(self, index):
        return self.ids[index]
    
    def generateUniqueId(self, entry):
        return getId(entry["date"] + entry["title"])
    
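    # Note: entry ids are the md5 of date + title, so two items sharing a
    # title and timestamp collide, and a feed that re-dates an item will
    # produce a new id for it on the next update.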
    def getUpdateTime(self):
        return self.updateTime
    
    def getEntries(self):
        return self.entries
    
    def getIds(self):
        return self.ids
    
    def getNextId(self, id):
        # Modulo wraps from the last entry back to the first
        return self.ids[(self.ids.index(id)+1) % self.getNumberOfEntries()]
    
    def getPreviousId(self, id):
        return self.ids[(self.ids.index(id)-1) % self.getNumberOfEntries()]
    
    def getNumberOfUnreadItems(self):
        return self.countUnread
    
    def getNumberOfEntries(self):
        return len(self.ids)
    
    def getItem(self, id):
        try:
            return self.entries[id]
        except KeyError:
            return []
    
    def getContent(self, id):
        # Prefer the locally cached HTML (with rewritten image links)
        if self.entries[id].has_key("contentLink"):
            f = open(self.entries[id]["contentLink"])
            content = f.read()
            f.close()
            return content
        return self.entries[id]["content"]
    
    def removeEntry(self, id):
        if self.entries.has_key(id):
            entry = self.entries[id]
            # Drop this entry's references to cached images; the handler
            # only deletes a file once its reference count reaches zero
            if entry.has_key("images"):
                for img in entry["images"]:
                    self.imageHandler.removeImage(self.uniqueId, img)
            if entry.has_key("contentLink"):
                try:
                    remove(entry["contentLink"])  # os.remove
                except OSError:
                    print "File not found for deletion: %s" % entry["contentLink"]
            del self.entries[id]
        else:
            print "Entries has no %s key" % id
        if id in self.ids:
            self.ids.remove(id)
        else:
            print "Ids has no %s key" % id
        if self.readItems.has_key(id):
            if self.readItems[id] == False:
                self.countUnread = self.countUnread - 1
            del self.readItems[id]
        else:
            print "ReadItems has no %s key" % id
    
322     
323     def getArticle(self, id):
324         self.setEntryRead(id)
325         entry = self.entries[id]
326         title = entry['title']
327         #content = entry.get('content', entry.get('summary_detail', {}))
328         content = entry["content"]
329
330         link = entry['link']
331         date = entry["date"]
332
333         #text = '''<div style="color: black; background-color: white;">'''
334         text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
335         text += "<html><head><title>" + title + "</title>"
336         text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
337         #text += '<style> body {-webkit-user-select: none;} </style>'
338         text += '</head><body><div><a href=\"' + link + '\">' + title + "</a>"
339         text += "<BR /><small><i>Date: " + date + "</i></small></div>"
340         text += "<BR /><BR />"
341         text += content
342         text += "</body></html>"
343         return text
344         
class ArchivedArticles(Feed):
    def addArchivedArticle(self, title, link, updated_parsed, configdir):
        # Creates a stub entry (just a link to the article); the full page
        # is fetched on the next updateFeed
        entry = {}
        entry["title"] = title
        entry["link"] = link
        entry["summary"] = '<a href="' + link + '">' + title + "</a>"
        entry["updated_parsed"] = updated_parsed
        entry["time"] = time.time()
        (dateTuple, date) = self.extractDate(entry)
        tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                    "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[], "downloaded":False, "time":entry["time"] }
        id = self.generateUniqueId(tmpEntry)
        self.entries[id] = tmpEntry
        self.ids.append(id)
        self.readItems[id] = False
        self.countUnread = self.countUnread + 1
        self.saveFeed(configdir)
        self.saveUnread(configdir)
        
    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Downloads the full page for each archived stub, caching its images.
        # Iterate over a copy: removeEntry below mutates self.ids.
        for id in self.getIds()[:]:
            entry = self.entries[id]
            if not entry["downloaded"]:
                f = urllib2.urlopen(entry["link"])
                html = f.read()
                f.close()
                soup = BeautifulSoup(html)
                images = soup('img')
                baseurl = ''.join(urlparse(entry["link"])[:-1])
                for img in images:
                    filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
                    img['src'] = filename
                    entry["images"].append(filename)
                entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                f = open(entry["contentLink"], "w")
                f.write(soup.prettify())
                f.close()
                if len(entry["content"]) > 0:
                    entry["downloaded"] = True
                    entry["time"] = time.time()
                    self.setEntryUnread(id)
            currentTime = time.time()
            expiry = float(expiryTime) * 3600
            if currentTime - entry["time"] > expiry:
                if self.isEntryRead(id):
                    self.removeEntry(id)
                else:
                    if currentTime - entry["time"] > 2*expiry:
                        self.removeEntry(id)
        self.updateTime = time.asctime()
        self.saveFeed(configdir)

    def getArticle(self, index):
        self.setEntryRead(index)
        content = self.getContent(index)
        return content


class Listing:
    # Lists all the feeds in a dictionary, and exposes the data
    def __init__(self, configdir):
        self.configdir = configdir
        if isfile(self.configdir+"feeds.pickle"):
            f = open(self.configdir+"feeds.pickle")
            self.listOfFeeds = pickle.load(f)
            f.close()
        else:
            # Default configuration: a single sample feed
            self.listOfFeeds = {getId("Slashdot"):{"title":"Slashdot", "url":"http://rss.slashdot.org/Slashdot/slashdot", "unread":0, "updateTime":"Never"}, }
        if isfile(self.configdir+"images.pickle"):
            f = open(self.configdir+"images.pickle")
            self.imageHandler = pickle.load(f)
            f.close()
        else:
            self.imageHandler = ImageHandler(self.configdir)
        # "font" and "feedingit-order" are configuration keys, not feeds
        if self.listOfFeeds.has_key("font"):
            del self.listOfFeeds["font"]
        if self.listOfFeeds.has_key("feedingit-order"):
            self.sortedKeys = self.listOfFeeds["feedingit-order"]
        else:
            self.sortedKeys = self.listOfFeeds.keys()
            if "font" in self.sortedKeys:
                self.sortedKeys.remove("font")
            self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj))
        self.closeCurrentlyDisplayedFeed()

    def addArchivedArticle(self, key, index):
        feed = self.getFeed(key)
        title = feed.getTitle(index)
        link = feed.getExternalLink(index)
        date = feed.getDateTuple(index)
        if not self.listOfFeeds.has_key("ArchivedArticles"):
            self.listOfFeeds["ArchivedArticles"] = {"title":"Archived Articles", "url":"", "unread":0, "updateTime":"Never"}
            self.sortedKeys.append("ArchivedArticles")
            self.saveConfig()
        archFeed = self.getFeed("ArchivedArticles")
        archFeed.addArchivedArticle(title, link, date, self.configdir)
        self.listOfFeeds["ArchivedArticles"]["unread"] = archFeed.getNumberOfUnreadItems()
        
    def loadFeed(self, key):
        if isfile(self.configdir+key+".d/feed"):
            f = open(self.configdir+key+".d/feed")
            feed = pickle.load(f)
            f.close()
            try:
                feed.uniqueId
                feed.imageHandler
            except AttributeError:
                # Older pickles predate these attributes; fill them in
                feed.uniqueId = getId(feed.name)
                feed.imageHandler = self.imageHandler
        else:
            title = self.listOfFeeds[key]["title"]
            url = self.listOfFeeds[key]["url"]
            if key == "ArchivedArticles":
                feed = ArchivedArticles("ArchivedArticles", title, url, self.imageHandler)
            else:
                feed = Feed(getId(title), title, url, self.imageHandler)
        return feed
        
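    # Note: saveFeed pickles the whole Feed object, including its
    # imageHandler reference, so a feed loaded from disk carries its own
    # copy of the handler (and its reference counts) rather than the one
    # in images.pickle; only feeds hitting the AttributeError path above
    # are rebound to the shared handler.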
    def updateFeeds(self, expiryTime=24, proxy=None, imageCache=False):
        for key in self.getListOfFeeds():
            feed = self.loadFeed(key)
            feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
            self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
            self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()
            
    def updateFeed(self, key, expiryTime=24, proxy=None, imageCache=False):
        feed = self.getFeed(key)
        feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
        self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
        self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()
        
    def editFeed(self, key, title, url):
        self.listOfFeeds[key]["title"] = title
        self.listOfFeeds[key]["url"] = url
        feed = self.loadFeed(key)
        feed.editFeed(url)

    def getFeed(self, key):
        feed = self.loadFeed(key)
        feed.reloadUnread(self.configdir)
        return feed
    
    def getFeedUpdateTime(self, key):
        if not self.listOfFeeds[key].has_key("updateTime"):
            self.listOfFeeds[key]["updateTime"] = "Never"
        return self.listOfFeeds[key]["updateTime"]
    
    def getFeedNumberOfUnreadItems(self, key):
        if not self.listOfFeeds[key].has_key("unread"):
            self.listOfFeeds[key]["unread"] = 0
        return self.listOfFeeds[key]["unread"]

    def updateUnread(self, key, unreadItems):
        self.listOfFeeds[key]["unread"] = unreadItems
   
    def getFeedTitle(self, key):
        return self.listOfFeeds[key]["title"]
    
    def getFeedUrl(self, key):
        return self.listOfFeeds[key]["url"]
    
    def getListOfFeeds(self):
        return self.sortedKeys
    
    def addFeed(self, title, url):
        if not self.listOfFeeds.has_key(getId(title)):
            self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"}
            self.sortedKeys.append(getId(title))
            self.saveConfig()
            return True
        else:
            return False
        
    def removeFeed(self, key):
        del self.listOfFeeds[key]
        self.sortedKeys.remove(key)
        if isdir(self.configdir+key+".d/"):
            rmtree(self.configdir+key+".d/")
        self.saveConfig()
    
    def saveConfig(self):
        self.listOfFeeds["feedingit-order"] = self.sortedKeys
        f = open(self.configdir+"feeds.pickle", "w")
        pickle.dump(self.listOfFeeds, f)
        f.close()
        f = open(self.configdir+"images.pickle", "w")
        pickle.dump(self.imageHandler, f)
        f.close()
        
    def moveUp(self, key):
        index = self.sortedKeys.index(key)
        self.sortedKeys[index] = self.sortedKeys[index-1]
        self.sortedKeys[index-1] = key
        
    def moveDown(self, key):
        index = self.sortedKeys.index(key)
        index2 = (index+1) % len(self.sortedKeys)
        self.sortedKeys[index] = self.sortedKeys[index2]
        self.sortedKeys[index2] = key
        
    def setCurrentlyDisplayedFeed(self, key):
        self.currentlyDisplayedFeed = key

    def closeCurrentlyDisplayedFeed(self):
        self.currentlyDisplayedFeed = False

    def getCurrentlyDisplayedFeed(self):
        return self.currentlyDisplayedFeed

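# Typical usage (a sketch built from the methods above):
#   listing = Listing("/home/user/.feedingit/")
#   listing.addFeed("Slashdot", "http://rss.slashdot.org/Slashdot/slashdot")
#   listing.updateFeeds(expiryTime=24, imageCache=True)
#   feed = listing.getFeed(listing.getListOfFeeds()[0])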
if __name__ == "__main__":
    # Simple smoke test against the default configuration directory
    listing = Listing('/home/user/.feedingit/')
    for key in listing.getListOfFeeds():
        print listing.getFeedTitle(key), listing.getFeedUpdateTime(key)