#!/usr/bin/env python2.5

# Copyright (c) 2007-2008 INdT.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

# ============================================================================
# Name        : FeedingIt.py
# Author      : Yves Marcoz
# Version     : 0.5.4
# Description : Simple RSS Reader
# ============================================================================

from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove, utime
import pickle
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin

#CONFIGDIR="/home/user/.feedingit/"

def getId(string):
    return md5.new(string).hexdigest()
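
# For example, getId("Maemo News") returns the 32-character hex digest that is
# used below both as a feed's key in feeds.pickle and as its cache directory
# name (<configdir><id>.d/).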

#def getProxy():
#    import gconf
#    if gconf.client_get_default().get_bool('/system/http_proxy/use_http_proxy'):
#        port = gconf.client_get_default().get_int('/system/http_proxy/port')
#        http = gconf.client_get_default().get_string('/system/http_proxy/host')
#        proxy = urllib2.ProxyHandler( {"http":"http://%s:%s/"% (http,port)} )
#        return (True, proxy)
#    return (False, None)

# Enable proxy support for images and ArchivedArticles
#(proxy_support, proxy) = getProxy()
#if proxy_support:
#    opener = urllib2.build_opener(proxy)
#    urllib2.install_opener(opener)
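#
# updateFeed() below also accepts a urllib2 handler directly; an illustrative
# call with a hypothetical local proxy would be:
#   proxy = urllib2.ProxyHandler({"http": "http://localhost:8080/"})
#   feed.updateFeed(configdir, proxy=proxy)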

# Entry = {"title":XXX, "content":XXX, "date":XXX, "dateTuple":XXX, "link":XXX, "images":[] }
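# An illustrative entry (values invented for the example; the date string
# follows the "%a, %d %b %Y %H:%M:%S" format produced by extractDate):
#   {"title": "Item title", "content": "<p>...</p>",
#    "date": "Sat, 01 Jan 2011 12:00:00", "dateTuple": time.struct_time(...),
#    "link": "http://example.com/item", "images": []}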

# Retained for backward compatibility: feeds pickled by older versions may
# still reference this class (loadFeed() below deletes the leftover
# feed.imageHandler attribute when such a feed is loaded).
class ImageHandler:
    def __init__(self, configdir):
        pass

class Feed:
    def __init__(self, uniqueId, name, url):
        self.titles = []
        self.entries = {}
        self.ids = []
        self.readItems = {}
        self.name = name
        self.url = url
        self.countUnread = 0
        self.updateTime = "Never"
        self.uniqueId = uniqueId
        self.etag = None
        self.modified = None

    def addImage(self, configdir, key, baseurl, url):
        filename = configdir+key+".d/"+getId(url)
        if not isfile(filename):
            try:
                f = urllib2.urlopen(urljoin(baseurl, url))
                outf = open(filename, "w")
                outf.write(f.read())
                f.close()
                outf.close()
            except:
                print "Could not download " + url
        else:
            # "Touch" the cached image so the expiry sweep in updateFeed keeps it
            f = open(filename, "a")
            utime(filename, None)
            f.close()
        return filename

    def editFeed(self, url):
        self.url = url

    def saveFeed(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        f = open(configdir+self.uniqueId+".d/feed", "w")
        pickle.dump(self, f)
        f.close()
        self.saveUnread(configdir)

    def saveUnread(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        f = open(configdir+self.uniqueId+".d/unread", "w")
        pickle.dump(self.readItems, f)
        f.close()

    def reloadUnread(self, configdir):
        try:
            f = open(configdir+self.uniqueId+".d/unread", "r")
            self.readItems = pickle.load(f)
            f.close()
            self.countUnread = 0
            for id in self.getIds():
                if self.readItems[id] == False:
                    self.countUnread = self.countUnread + 1
        except:
            pass
        return self.countUnread

    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Expiry time is in hours
        if proxy == None:
            tmp = feedparser.parse(self.url, etag=self.etag, modified=self.modified)
        else:
            tmp = feedparser.parse(self.url, etag=self.etag, modified=self.modified, handlers=[proxy])
        try:
            self.etag = tmp["etag"]
        except KeyError:
            pass
        try:
            self.modified = tmp["modified"]
        except KeyError:
            pass
        expiry = float(expiryTime) * 3600.
        # Check whether the parse was successful; if there are no entries, do nothing
        if len(tmp["entries"]) > 0:
            if not isdir(configdir+self.uniqueId+".d"):
                mkdir(configdir+self.uniqueId+".d")
            try:
                f = urllib2.urlopen(urljoin(tmp["feed"]["link"], "/favicon.ico"))
                data = f.read()
                f.close()
                outf = open(configdir+self.uniqueId+".d/favicon.ico", "w")
                outf.write(data)
                outf.close()
                del data
            except:
                pass

            currentTime = time.time()
            tmpEntries = {}
            tmpIds = []
            for entry in tmp["entries"]:
                (dateTuple, date) = self.extractDate(entry)
                try:
                    entry["title"]
                except:
                    entry["title"] = "No Title"
                try:
                    entry["link"]
                except:
                    entry["link"] = ""
                tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                            "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[] }
                id = self.generateUniqueId(tmpEntry)

                if not id in self.ids:
                    # New entry: render it to HTML and cache it (and its images) on disk
                    soup = BeautifulSoup(self.getArticle(tmpEntry))
                    images = soup('img')
                    baseurl = tmpEntry["link"]
                    if imageCache:
                        for img in images:
                            try:
                                filename = self.addImage(configdir, self.uniqueId, baseurl, img['src'])
                                img['src'] = filename
                                tmpEntry["images"].append(filename)
                            except:
                                print "Error downloading image %s" % img
                    tmpEntry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    f = open(tmpEntry["contentLink"], "w")
                    f.write(soup.prettify())
                    f.close()
                    tmpEntries[id] = tmpEntry
                    tmpIds.append(id)
                    if id not in self.readItems:
                        self.readItems[id] = False
                else:
                    # Known entry: touch the cached article and images so they survive the expiry sweep
                    try:
                        filename = configdir+self.uniqueId+".d/"+id+".html"
                        f = open(filename, "a")
                        utime(filename, None)
                        f.close()
                        for image in self.entries[id]["images"]:
                            f = open(image, "a")
                            utime(image, None)
                            f.close()
                    except:
                        pass
                    tmpEntries[id] = self.entries[id]
                    tmpIds.append(id)

            oldIds = self.ids[:]
            for entryId in oldIds:
                if not entryId in tmpIds:
                    try:
                        articleTime = time.mktime(self.entries[entryId]["dateTuple"])
                        if (currentTime - articleTime > 2*expiry):
                            self.removeEntry(entryId)
                            continue
                        if (currentTime - articleTime > expiry) and (self.isEntryRead(entryId)):
                            # Entry is older than the expiry time and already read
                            self.removeEntry(entryId)
                            continue
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)
                    except:
                        print "Error purging old article %s" % entryId
                        self.removeEntry(entryId)

            self.entries = tmpEntries
            self.ids = tmpIds
            tmpUnread = 0

            ids = self.ids[:]
            for id in ids:
                if not self.readItems.has_key(id):
                    self.readItems[id] = False
                if self.readItems[id] == False:
                    tmpUnread = tmpUnread + 1
            keys = self.readItems.keys()
            for id in keys:
                if not id in self.ids:
                    del self.readItems[id]
            del tmp
            self.countUnread = tmpUnread
            self.updateTime = time.asctime()
            self.saveFeed(configdir)
            # Sweep the cache directory and remove files (articles, images,
            # favicon) whose last-modified time is more than three expiry
            # periods old
            from glob import glob
            from os import stat
            for fname in glob(configdir+self.uniqueId+".d/*"):
                stats = stat(fname)
                lastmodDate = stats[8]
                expDate = time.time() - expiry*3
                if expDate > lastmodDate:
                    try:
                        remove(fname)
                    except OSError:
                        print 'Could not remove', fname

    def extractContent(self, entry):
        content = ""
        if entry.has_key('summary'):
            content = entry.get('summary', '')
        if entry.has_key('content'):
            if len(entry.content[0].value) > len(content):
                content = entry.content[0].value
        if content == "":
            content = entry.get('description', '')
        return content

    def extractDate(self, entry):
        if entry.has_key("updated_parsed"):
            date1 = entry["updated_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["updated_parsed"])
        elif entry.has_key("published_parsed"):
            date1 = entry["published_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["published_parsed"])
        else:
            date1 = ""
            date = ""
        return (date1, date)

    def setEntryRead(self, id):
        if self.readItems[id] == False:
            self.countUnread = self.countUnread - 1
            self.readItems[id] = True

    def setEntryUnread(self, id):
        if self.readItems[id] == True:
            self.countUnread = self.countUnread + 1
            self.readItems[id] = False

    def isEntryRead(self, id):
        return self.readItems[id]

    def getTitle(self, id):
        return self.entries[id]["title"]

    def getContentLink(self, id):
        if self.entries[id].has_key("contentLink"):
            return self.entries[id]["contentLink"]
        return self.entries[id]["link"]

    def getExternalLink(self, id):
        return self.entries[id]["link"]

    def getDate(self, id):
        return self.entries[id]["date"]

    def getDateTuple(self, id):
        return self.entries[id]["dateTuple"]

    def getUniqueId(self, index):
        return self.ids[index]

    def generateUniqueId(self, entry):
        return getId(entry["date"] + entry["title"])

    def getUpdateTime(self):
        return self.updateTime

    def getEntries(self):
        return self.entries

    def getIds(self):
        return self.ids

    def getNextId(self, id):
        return self.ids[(self.ids.index(id)+1) % self.getNumberOfEntries()]

    def getPreviousId(self, id):
        return self.ids[(self.ids.index(id)-1) % self.getNumberOfEntries()]

    def getNumberOfUnreadItems(self):
        return self.countUnread

    def getNumberOfEntries(self):
        return len(self.ids)

    def getItem(self, id):
        try:
            return self.entries[id]
        except:
            return []

    def getContent(self, id):
        if self.entries[id].has_key("contentLink"):
            f = open(self.entries[id]["contentLink"])
            content = f.read()
            f.close()
            return content
        return self.entries[id]["content"]

    def removeEntry(self, id):
        if self.entries.has_key(id):
            entry = self.entries[id]
            if entry.has_key("contentLink"):
                try:
                    remove(entry["contentLink"])
                except:
                    print "File not found for deletion: %s" % entry["contentLink"]
            del self.entries[id]
        else:
            print "Entries has no %s key" % id
        if id in self.ids:
            self.ids.remove(id)
        else:
            print "Ids has no %s key" % id
        if self.readItems.has_key(id):
            if self.readItems[id] == False:
                self.countUnread = self.countUnread - 1
            del self.readItems[id]
        else:
            print "ReadItems has no %s key" % id

    def getArticle(self, entry):
        title = entry['title']
        content = entry["content"]
        link = entry['link']
        date = entry["date"]

        text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
        text += "<html><head><title>" + title + "</title>"
        text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
        #text += '<style> body {-webkit-user-select: none;} </style>'
        text += '</head><body><div><a href="' + link + '">' + title + "</a>"
        text += "<BR /><small><i>Date: " + date + "</i></small></div>"
        text += "<BR /><BR />"
        text += content
        text += "</body></html>"
        return text

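# A minimal usage sketch for Feed (illustrative only; assumes a writable
# configdir ending in "/" and a reachable feed URL):
#
#   feed = Feed(getId("Example"), "Example", "http://example.com/rss.xml")
#   feed.updateFeed("/home/user/.feedingit/")
#   for entryId in feed.getIds():
#       print feed.getTitle(entryId)
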
class ArchivedArticles(Feed):
    def addArchivedArticle(self, title, link, updated_parsed, configdir):
        entry = {}
        entry["title"] = title
        entry["link"] = link
        entry["summary"] = '<a href="' + link + '">' + title + "</a>"
        entry["updated_parsed"] = updated_parsed
        entry["time"] = time.time()
        (dateTuple, date) = self.extractDate(entry)
        tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                    "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[], "downloaded":False, "time":entry["time"] }
        id = self.generateUniqueId(tmpEntry)
        self.entries[id] = tmpEntry
        self.ids.append(id)
        self.readItems[id] = False
        self.countUnread = self.countUnread + 1
        self.saveFeed(configdir)
        self.saveUnread(configdir)

    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        for id in self.getIds():
            entry = self.entries[id]
            if not entry["downloaded"]:
                f = urllib2.urlopen(entry["link"])
                html = f.read()
                f.close()
                soup = BeautifulSoup(html)
                images = soup('img')
                baseurl = entry["link"]
                for img in images:
                    filename = self.addImage(configdir, self.uniqueId, baseurl, img['src'])
                    img['src'] = filename
                entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                f = open(entry["contentLink"], "w")
                f.write(soup.prettify())
                f.close()
                if len(entry["content"]) > 0:
                    entry["downloaded"] = True
                    entry["time"] = time.time()
                    self.setEntryUnread(id)
            # Expiry of archived articles is currently disabled:
            #currentTime = time.time()
            #expiry = float(expiryTime) * 3600
            #if currentTime - entry["time"] > expiry:
            #    if self.isEntryRead(id):
            #        self.removeEntry(id)
            #    else:
            #        if currentTime - entry["time"] > 2*expiry:
            #            self.removeEntry(id)
        self.updateTime = time.asctime()
        self.saveFeed(configdir)

    def purgeReadArticles(self):
        # Copy the id list: removeEntry() mutates self.ids during iteration
        ids = self.getIds()[:]
        for id in ids:
            if self.isEntryRead(id):
                self.removeEntry(id)

    def removeArticle(self, id):
        self.removeEntry(id)

    def getArticle(self, index):
        self.setEntryRead(index)
        content = self.getContent(index)
        return content

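# On-disk layout used by Feed and Listing (reconstructed from the code above;
# <id> is getId(title)):
#   <configdir>feeds.pickle           - feed metadata dict, keyed by <id>
#   <configdir><id>.d/feed            - pickled Feed object
#   <configdir><id>.d/unread          - pickled readItems dict
#   <configdir><id>.d/favicon.ico     - cached favicon
#   <configdir><id>.d/<entryId>.html  - rendered article content
#   <configdir><id>.d/<getId(url)>    - cached image referenced by an article
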
class Listing:
    # Lists all the feeds in a dictionary, and exposes the data
    def __init__(self, configdir):
        self.configdir = configdir
        if isfile(self.configdir+"feeds.pickle"):
            f = open(self.configdir+"feeds.pickle")
            self.listOfFeeds = pickle.load(f)
            f.close()
        else:
            self.listOfFeeds = {getId("Maemo News"):{"title":"Maemo News", "url":"http://maemo.org/news/items.xml", "unread":0, "updateTime":"Never"}, }
        if self.listOfFeeds.has_key("font"):
            del self.listOfFeeds["font"]
        if self.listOfFeeds.has_key("feedingit-order"):
            self.sortedKeys = self.listOfFeeds["feedingit-order"]
        else:
            self.sortedKeys = self.listOfFeeds.keys()
            if "font" in self.sortedKeys:
                self.sortedKeys.remove("font")
            self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj))

    def addArchivedArticle(self, key, index):
        feed = self.getFeed(key)
        title = feed.getTitle(index)
        link = feed.getExternalLink(index)
        date = feed.getDateTuple(index)
        if not self.listOfFeeds.has_key("ArchivedArticles"):
            self.listOfFeeds["ArchivedArticles"] = {"title":"Archived Articles", "url":"", "unread":0, "updateTime":"Never"}
            self.sortedKeys.append("ArchivedArticles")
            self.saveConfig()
        archFeed = self.getFeed("ArchivedArticles")
        archFeed.addArchivedArticle(title, link, date, self.configdir)
        # Update the unread count of the archive feed, not of the source feed
        self.listOfFeeds["ArchivedArticles"]["unread"] = archFeed.getNumberOfUnreadItems()

    def loadFeed(self, key):
        if isfile(self.configdir+key+".d/feed"):
            f = open(self.configdir+key+".d/feed")
            feed = pickle.load(f)
            f.close()
            # Backfill attributes that feeds pickled by older versions lack
            try:
                feed.uniqueId
            except AttributeError:
                feed.uniqueId = getId(feed.name)
            try:
                del feed.imageHandler
            except:
                pass
            try:
                feed.etag
            except AttributeError:
                feed.etag = None
            try:
                feed.modified
            except AttributeError:
                feed.modified = None
        else:
            title = self.listOfFeeds[key]["title"]
            url = self.listOfFeeds[key]["url"]
            if key == "ArchivedArticles":
                feed = ArchivedArticles("ArchivedArticles", title, url)
            else:
                feed = Feed(getId(title), title, url)
        return feed

    def updateFeeds(self, expiryTime=24, proxy=None, imageCache=False):
        for key in self.getListOfFeeds():
            feed = self.loadFeed(key)
            feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
            self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
            self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def updateFeed(self, key, expiryTime=24, proxy=None, imageCache=False):
        feed = self.getFeed(key)
        feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
        self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
        self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def editFeed(self, key, title, url):
        self.listOfFeeds[key]["title"] = title
        self.listOfFeeds[key]["url"] = url
        feed = self.loadFeed(key)
        feed.editFeed(url)

    def getFeed(self, key):
        try:
            feed = self.loadFeed(key)
            feed.reloadUnread(self.configdir)
        except:
            # If the feed file gets corrupted, we need to reset the feed.
            import traceback
            traceback.print_exc()
            import dbus
            bus = dbus.SessionBus()
            remote_object = bus.get_object("org.freedesktop.Notifications", # Connection name
                                           "/org/freedesktop/Notifications" # Object's path
                                          )
            iface = dbus.Interface(remote_object, 'org.freedesktop.Notifications')
            iface.SystemNoteInfoprint("Error opening feed %s, it has been reset." % self.getFeedTitle(key))
            if isdir(self.configdir+key+".d/"):
                rmtree(self.configdir+key+".d/")
            feed = self.loadFeed(key)
        return feed

    def getFeedUpdateTime(self, key):
        if not self.listOfFeeds[key].has_key("updateTime"):
            self.listOfFeeds[key]["updateTime"] = "Never"
        return self.listOfFeeds[key]["updateTime"]

    def getFeedNumberOfUnreadItems(self, key):
        if not self.listOfFeeds[key].has_key("unread"):
            self.listOfFeeds[key]["unread"] = 0
        return self.listOfFeeds[key]["unread"]

    def updateUnread(self, key, unreadItems):
        self.listOfFeeds[key]["unread"] = unreadItems

    def getFeedTitle(self, key):
        return self.listOfFeeds[key]["title"]

    def getFeedUrl(self, key):
        return self.listOfFeeds[key]["url"]

    def getListOfFeeds(self):
        return self.sortedKeys

    def getFavicon(self, key):
        filename = self.configdir+key+".d/favicon.ico"
        if isfile(filename):
            return filename
        else:
            return False

    def addFeed(self, title, url):
        if not self.listOfFeeds.has_key(getId(title)):
            self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"}
            self.sortedKeys.append(getId(title))
            self.saveConfig()
            return True
        else:
            return False

    def removeFeed(self, key):
        del self.listOfFeeds[key]
        self.sortedKeys.remove(key)
        if isdir(self.configdir+key+".d/"):
            rmtree(self.configdir+key+".d/")
        self.saveConfig()

    def saveConfig(self):
        self.listOfFeeds["feedingit-order"] = self.sortedKeys
        f = open(self.configdir+"feeds.pickle", "w")
        pickle.dump(self.listOfFeeds, f)
        f.close()

    def moveUp(self, key):
        # Swap the key with its predecessor (wraps from first to last)
        index = self.sortedKeys.index(key)
        self.sortedKeys[index] = self.sortedKeys[index-1]
        self.sortedKeys[index-1] = key

    def moveDown(self, key):
        # Swap the key with its successor (wraps from last to first)
        index = self.sortedKeys.index(key)
        index2 = (index+1) % len(self.sortedKeys)
        self.sortedKeys[index] = self.sortedKeys[index2]
        self.sortedKeys[index2] = key

if __name__ == "__main__":
    # Ad-hoc test: print the update time of feeds whose id starts with 'd8'
    listing = Listing('/home/user/.feedingit/')
    keys = listing.getListOfFeeds()[:]
    #keys.reverse()
    for key in keys:
        if key.startswith('d8'):
            print listing.getFeedUpdateTime(key)