444c918e229fd4996522871b120b5d0660e41613
[feedingit] / src / rss_sqlite.py
1 #!/usr/bin/env python2.5
2
3
4 # Copyright (c) 2007-2008 INdT.
5 # Copyright (c) 2011 Neal H. Walfield
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Lesser General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 #  This program is distributed in the hope that it will be useful,
12 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
13 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 #  GNU Lesser General Public License for more details.
15 #
16 #  You should have received a copy of the GNU Lesser General Public License
17 #  along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 #
19
20 # ============================================================================
21 # Name        : FeedingIt.py
22 # Author      : Yves Marcoz
23 # Version     : 0.5.4
24 # Description : Simple RSS Reader
25 # ============================================================================
26
27 from __future__ import with_statement
28
import sqlite3
from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove, utime
import os
import hashlib
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin
from calendar import timegm
import threading
import traceback
from wc import wc, wc_init, woodchuck
import subprocess
import dbus
from updatedbus import update_server_object

from jobmanager import JobManager
import mainthread
from httpprogresshandler import HTTPProgressHandler
import random
import sys
import logging
54 logger = logging.getLogger(__name__)
55
def getId(string):
    """Return the md5 hex digest of *string*.

    Used as a stable key for article ids and cached-image filenames.
    Uses hashlib (stdlib since Python 2.5) instead of the deprecated
    md5 module; the digests are identical.
    """
    return hashlib.md5(string).hexdigest()
58
def download_callback(connection):
    """Progress hook invoked during downloads; aborts the transfer by
    raising KeyboardInterrupt once a shutdown has been requested."""
    manager = JobManager()
    if manager.do_quit:
        raise KeyboardInterrupt
62
def downloader(progress_handler=None, proxy=None):
    """Build a urllib2 opener for feed/image downloads.

    Uses the given progress handler if supplied, otherwise a fresh
    HTTPProgressHandler wired to download_callback; a proxy handler is
    appended when one is configured.
    """
    if progress_handler is None:
        progress_handler = HTTPProgressHandler(download_callback)
    handlers = [progress_handler]
    if proxy:
        handlers.append(proxy)
    return urllib2.build_opener(*handlers)
75
# If not None, a subprocess.Popen object corresponding to a
# update_feeds.py process (spawned when no update daemon answers over
# D-Bus; see Feed.updateFeed).
update_feed_process = None

# Cached dbus.Interface proxy for org.marcoz.feedingit; reset to None
# whenever a call fails so the next attempt reconnects.
update_feeds_iface = None

# NOTE(review): not referenced anywhere in this file -- presumably
# maintained by the update daemon; confirm before removing.
jobs_at_start = 0
83
class Feed:
    # Class-level lock serializing the database-heavy portions of feed
    # updates across all Feed instances.
    serial_execution_lock = threading.Lock()

    def _getdb(self):
        """Return this thread's sqlite connection to the feed database,
        creating and caching it on first use (sqlite3 connections must
        not be shared across threads)."""
        try:
            return self.tls.db
        except AttributeError:
            conn = sqlite3.connect("%s/%s.db" % (self.dir, self.key),
                                   timeout=120)
            self.tls.db = conn
            return conn

    # Thread-local connection exposed as a read-only property.
    db = property(_getdb)
95
96     def __init__(self, configdir, key):
97         self.key = key
98         self.configdir = configdir
99         self.dir = "%s/%s.d" %(self.configdir, self.key)
100         self.tls = threading.local ()
101
102         if not isdir(self.dir):
103             mkdir(self.dir)
104         if not isfile("%s/%s.db" %(self.dir, self.key)):
105             self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
106             self.db.execute("CREATE TABLE images (id text, imagePath text);")
107             self.db.commit()
108
    def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
        """Download the image at *url* (resolved against *baseurl*) into the
        feed's cache directory and return the local filename.

        The file is named after the md5 of the url (getId), so repeated
        requests for the same image are served from disk.  Returns None
        when the download fails with an expected network/IO error;
        re-raises unexpected errors after removing the partial file.
        If the file already exists its mtime is refreshed so the expiry
        sweep in _updateFeed keeps it alive.
        """
        filename = configdir+key+".d/"+getId(url)
        if not isfile(filename):
            try:
                if not opener:
                    opener = downloader(proxy=proxy)

                abs_url = urljoin(baseurl,url)
                f = opener.open(abs_url)
                try:
                    with open(filename, "w") as outf:
                        for data in f:
                            outf.write(data)
                finally:
                    f.close()
            except (urllib2.HTTPError, urllib2.URLError, IOError), exception:
                logger.info("Could not download image %s: %s"
                            % (abs_url, str (exception)))
                return None
            except:
                exception = sys.exc_info()[0]

                logger.info("Downloading image %s: %s" %
                            (abs_url, traceback.format_exc()))
                # Don't leave a truncated file behind: a later call would
                # see it and assume the download succeeded.
                try:
                    remove(filename)
                except OSError:
                    pass

                raise exception
        else:
            # "Touch" the file so the expiry sweep treats it as fresh.
            file = open(filename,"a")
            utime(filename, None)
            file.close()
        return filename
145
    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
        """Update this feed.

        When running inside the update_feeds.py daemon, queue the real
        work (_updateFeed) on the JobManager.  Otherwise ask a running
        daemon to do it over D-Bus, spawning one if necessary and
        retrying the request for a few seconds while it starts up.
        """
        if (os.path.basename(sys.argv[0]) == 'update_feeds.py'):
            # We ARE the daemon: run the update in-process via the job queue.
            def doit():
                def it():
                    self._updateFeed(configdir, url, etag, modified, expiryTime, proxy, imageCache, postFeedUpdateFunc, *postFeedUpdateFuncArgs)
                return it
            JobManager().execute(doit(), self.key, priority=priority)
        else:
            def send_update_request():
                # Ask the daemon over D-Bus to update this feed.  Returns
                # True on success; on failure drops the cached proxy so
                # the next attempt reconnects.
                global update_feeds_iface
                if update_feeds_iface is None:
                    bus=dbus.SessionBus()
                    remote_object = bus.get_object(
                        "org.marcoz.feedingit", # Connection name
                        "/org/marcoz/feedingit/update" # Object's path
                        )
                    update_feeds_iface = dbus.Interface(
                        remote_object, 'org.marcoz.feedingit')

                try:
                    update_feeds_iface.Update(self.key)
                except Exception, e:
                    logger.error("Invoking org.marcoz.feedingit.Update: %s"
                                 % str(e))
                    update_feeds_iface = None
                else:
                    return True

            if send_update_request():
                # Success!  It seems we were able to start the update
                # daemon via dbus (or, it was already running).
                return

            global update_feed_process
            if (update_feed_process is None
                or update_feed_process.poll() is not None):
                # The update_feeds process is not running.  Start it.
                update_feeds = os.path.join(os.path.dirname(__file__),
                                            'update_feeds.py')
                argv = ['/usr/bin/env', 'python', update_feeds, '--daemon' ]
                logger.debug("Starting update_feeds: running %s"
                             % (str(argv),))
                update_feed_process = subprocess.Popen(argv)
                # Make sure the dbus calls go to the right process:
                # rebind.
                update_feeds_iface = None

            # Give the daemon up to ~5 seconds to come up, retrying the
            # D-Bus request once a second.
            for _ in xrange(5):
                if send_update_request():
                    break
                time.sleep(1)
197
    def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
        """Synchronously fetch and process the feed at *url* (worker thread).

        - Downloads the feed with feedparser, honouring etag/modified.
        - Stores new/updated articles (and, when imageCache is set, their
          images) in the per-feed database and as cached HTML files.
        - Reports progress and failures to Woodchuck when available.
        - Expires old articles and stale cache files.
        - Always finishes by invoking postFeedUpdateFunc(key, updateTime,
          etag, modified, title, *postFeedUpdateFuncArgs).

        expiryTime is in hours.
        """
        success = False
        have_serial_execution_lock = False
        try:
            download_start = time.time ()

            progress_handler = HTTPProgressHandler(download_callback)

            openers = [progress_handler]
            if proxy:
                openers.append (proxy)
            kwargs = {'handlers':openers}

            tmp=feedparser.parse(url, etag=etag, modified=modified, **kwargs)
            download_duration = time.time () - download_start

            opener = downloader(progress_handler, proxy)

            if JobManager().do_quit:
                raise KeyboardInterrupt

            process_start = time.time()

            # Expiry time is in hours
            expiry = float(expiryTime) * 3600.

            currentTime = 0

            have_woodchuck = mainthread.execute (wc().available)

            def wc_success():
                # Register a successful stream update with Woodchuck.
                try:
                    wc().stream_register (self.key, "", 6 * 60 * 60)
                except woodchuck.ObjectExistsError:
                    pass
                try:
                    wc()[self.key].updated (
                        indicator=(woodchuck.Indicator.ApplicationVisual
                                   |woodchuck.Indicator.StreamWide),
                        transferred_down=progress_handler.stats['received'],
                        transferred_up=progress_handler.stats['sent'],
                        transfer_time=download_start,
                        transfer_duration=download_duration,
                        new_objects=len (tmp.entries),
                        objects_inline=len (tmp.entries))
                except KeyError:
                    logger.warn(
                        "Failed to register update of %s with woodchuck!"
                        % (self.key))

            http_status = tmp.get ('status', 200)

            # Check if the parse was succesful.  If the http status code
            # is 304, then the download was successful, but there is
            # nothing new.  Indeed, no content is returned.  This make a
            # 304 look like an error because there are no entries and the
            # parse fails.  But really, everything went great!  Check for
            # this first.
            if http_status == 304:
                logger.debug("%s: No changes to feed." % (self.key,))
                mainthread.execute (wc_success, async=True)
                success = True
            elif len(tmp["entries"])==0 and not tmp.version:
                # An error occured fetching or parsing the feed.  (Version
                # will be either None if e.g. the connection timed our or
                # '' if the data is not a proper feed)
                logger.error(
                    "Error fetching %s: version is: %s: error: %s"
                    % (url, str (tmp.version),
                       str (tmp.get ('bozo_exception', 'Unknown error'))))
                logger.debug(tmp)
                if have_woodchuck:
                    def e():
                        logger.debug("%s: stream update failed!" % self.key)

                        try:
                            # It's not easy to get the feed's title from here.
                            # At the latest, the next time the application is
                            # started, we'll fix up the human readable name.
                            wc().stream_register (self.key, "", 6 * 60 * 60)
                        except woodchuck.ObjectExistsError:
                            pass
                        # Map the HTTP status class to a Woodchuck error code.
                        ec = woodchuck.TransferStatus.TransientOther
                        if 300 <= http_status and http_status < 400:
                            ec = woodchuck.TransferStatus.TransientNetwork
                        if 400 <= http_status and http_status < 500:
                            ec = woodchuck.TransferStatus.FailureGone
                        if 500 <= http_status and http_status < 600:
                            ec = woodchuck.TransferStatus.TransientNetwork
                        wc()[self.key].update_failed(ec)
                    mainthread.execute (e, async=True)
            else:
               currentTime = time.time()
               # The etag and modified value should only be updated if the content was not null
               try:
                   etag = tmp["etag"]
               except KeyError:
                   etag = None
               try:
                   modified = tmp["modified"]
               except KeyError:
                   modified = None
               # Best-effort favicon refresh; failures are only logged.
               try:
                   abs_url = urljoin(tmp["feed"]["link"],"/favicon.ico")
                   f = opener.open(abs_url)
                   data = f.read()
                   f.close()
                   outf = open(self.dir+"/favicon.ico", "w")
                   outf.write(data)
                   outf.close()
                   del data
               except (urllib2.HTTPError, urllib2.URLError), exception:
                   logger.debug("Could not download favicon %s: %s"
                                % (abs_url, str (exception)))

               self.serial_execution_lock.acquire ()
               have_serial_execution_lock = True

               ids = self.getIds()

               # Process oldest entries first.
               tmp["entries"].reverse()
               for entry in tmp["entries"]:
                   # Yield so as to make the main thread a bit more
                   # responsive.
                   time.sleep(0)

                   if JobManager().do_quit:
                       raise KeyboardInterrupt

                   received_base = progress_handler.stats['received']
                   sent_base = progress_handler.stats['sent']
                   object_size = 0

                   date = self.extractDate(entry)
                   # Fill in defaults for fields the feed may omit.
                   try:
                       entry["title"]
                   except KeyError:
                       entry["title"] = "No Title"
                   try :
                       entry["link"]
                   except KeyError:
                       entry["link"] = ""
                   try:
                       entry["author"]
                   except KeyError:
                       entry["author"] = None
                   if(not(entry.has_key("id"))):
                       entry["id"] = None
                   content = self.extractContent(entry)
                   object_size = len (content)
                   # The feed body was already counted by the progress
                   # handler; don't count it again per-article.
                   received_base -= len (content)
                   tmpEntry = {"title":entry["title"], "content":content,
                                "date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
                   id = self.generateUniqueId(tmpEntry)

                   soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
                   images = soup('img')
                   baseurl = tmpEntry["link"]
                   if imageCache and len(images) > 0:
                       # Image downloads don't touch the database; release
                       # the serialization lock while they run.
                       self.serial_execution_lock.release ()
                       have_serial_execution_lock = False
                       for img in images:
                           filename = self.addImage(
                               configdir, self.key, baseurl, img['src'],
                               opener=opener)
                           if filename:
                                # Rewrite the img tag to point at the cache.
                                img['src']="file://%s" %filename
                                count = self.db.execute("SELECT count(1) FROM images where id=? and imagePath=?;", (id, filename )).fetchone()[0]
                                if count == 0:
                                    self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
                                    self.db.commit()

                                try:
                                    object_size += os.path.getsize (filename)
                                except os.error, exception:
                                    logger.error ("Error getting size of %s: %s"
                                                  % (filename, exception))
                       self.serial_execution_lock.acquire ()
                       have_serial_execution_lock = True

                   tmpEntry["contentLink"] = configdir+self.key+".d/"+id+".html"
                   file = open(tmpEntry["contentLink"], "w")
                   file.write(soup.prettify())
                   file.close()
                   if id in ids:
                       self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
                       self.db.commit()
                   else:
                       values = (id, tmpEntry["title"], tmpEntry["contentLink"], tmpEntry["date"], currentTime, tmpEntry["link"], 0)
                       self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
                       self.db.commit()

                   # Register the object with Woodchuck and mark it as
                   # downloaded.
                   if have_woodchuck:
                       def e():
                           try:
                               obj = wc()[self.key].object_register(
                                   object_identifier=id,
                                   human_readable_name=tmpEntry["title"])
                           except woodchuck.ObjectExistsError:
                               obj = wc()[self.key][id]
                           else:
                               # If the entry does not contain a publication
                               # time, the attribute won't exist.
                               pubtime = entry.get ('date_parsed', None)
                               if pubtime:
                                   obj.publication_time = time.mktime (pubtime)

                               received = (progress_handler.stats['received']
                                           - received_base)
                               sent = progress_handler.stats['sent'] - sent_base
                               obj.transferred (
                                   indicator=(woodchuck.Indicator.ApplicationVisual
                                              |woodchuck.Indicator.StreamWide),
                                   transferred_down=received,
                                   transferred_up=sent,
                                   object_size=object_size)
                       mainthread.execute(e, async=True)
               self.db.commit()

               logger.debug (
                   "%s: Update successful: transferred: %d/%d; objects: %d)"
                   % (self.key,
                      progress_handler.stats['sent'],
                      progress_handler.stats['received'],
                      len (tmp.entries)))
               mainthread.execute (wc_success, async=True)
               success = True

            # Expire articles: unread after twice the expiry interval,
            # read after one.
            rows = self.db.execute("SELECT id FROM feed WHERE (read=0 AND updated<?) OR (read=1 AND updated<?);", (currentTime-2*expiry, currentTime-expiry))
            for row in rows:
               self.removeEntry(row[0])

            # Sweep the cache directory: drop any file (HTML, image,
            # favicon) not modified within three expiry intervals.
            from glob import glob
            from os import stat
            for file in glob(configdir+self.key+".d/*"):
                stats = stat(file)
                # put the two dates into matching format
                lastmodDate = stats[8]
                expDate = time.time()-expiry*3
                # check if image-last-modified-date is outdated
                if expDate > lastmodDate:
                    try:
                        # XXX: Tell woodchuck.
                        remove(file)
                    except OSError, exception:
                        logger.error('Could not remove %s: %s'
                                     % (file, str (exception)))
            logger.debug("updated %s: %fs in download, %fs in processing"
                         % (self.key, download_duration,
                            time.time () - process_start))
        except:
            logger.error("Updating %s: %s" % (self.key, traceback.format_exc()))
        finally:
            self.db.commit ()

            if have_serial_execution_lock:
                self.serial_execution_lock.release ()

            updateTime = 0
            try:
                rows = self.db.execute("SELECT MAX(date) FROM feed;")
                for row in rows:
                    updateTime=row[0]
            except Exception, e:
                logger.error("Fetching update time: %s: %s"
                             % (str(e), traceback.format_exc()))
            finally:
                # On failure, clear etag/modified so the next attempt
                # re-downloads unconditionally.
                if not success:
                    etag = None
                    modified = None
                title = None
                try:
                    # tmp may be unbound if parsing never happened.
                    title = tmp.feed.title
                except (AttributeError, UnboundLocalError), exception:
                    pass
                if postFeedUpdateFunc is not None:
                    postFeedUpdateFunc (self.key, updateTime, etag, modified,
                                        title, *postFeedUpdateFuncArgs)
509
    def setEntryRead(self, id):
        """Mark article *id* as read."""
        self.db.execute("UPDATE feed SET read=1 WHERE id=?;", (id,) )
        self.db.commit()

        # NOTE(review): e() is defined but never scheduled -- compare
        # removeEntry, which runs its callback via
        # mainthread.execute(e, async=True).  As written, the Woodchuck
        # used() notification is never sent; presumably a missing call.
        def e():
            if wc().available():
                try:
                    wc()[self.key][id].used()
                except KeyError:
                    pass
520
521     def setEntryUnread(self, id):
522         self.db.execute("UPDATE feed SET read=0 WHERE id=?;", (id,) )
523         self.db.commit()     
524         
525     def markAllAsRead(self):
526         self.db.execute("UPDATE feed SET read=1 WHERE read=0;")
527         self.db.commit()
528
529     def isEntryRead(self, id):
530         read_status = self.db.execute("SELECT read FROM feed WHERE id=?;", (id,) ).fetchone()[0]
531         return read_status==1  # Returns True if read==1, and False if read==0
532     
533     def getTitle(self, id):
534         return self.db.execute("SELECT title FROM feed WHERE id=?;", (id,) ).fetchone()[0]
535     
536     def getContentLink(self, id):
537         return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,) ).fetchone()[0]
538     
539     def getExternalLink(self, id):
540         return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,) ).fetchone()[0]
541     
542     def getDate(self, id):
543         dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
544         return time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(dateStamp))
545
546     def getDateTuple(self, id):
547         dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
548         return time.localtime(dateStamp)
549     
550     def getDateStamp(self, id):
551         return self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
552     
553     def generateUniqueId(self, entry):
554         """
555         Generate a stable identifier for the article.  For the same
556         entry, this should result in the same identifier.  If
557         possible, the identifier should remain the same even if the
558         article is updated.
559         """
560         # Prefer the entry's id, which is supposed to be globally
561         # unique.
562         key = entry.get('id', None)
563         if not key:
564             # Next, try the link to the content.
565             key = entry.get('link', None)
566         if not key:
567             # Ok, the title and the date concatenated are likely to be
568             # relatively stable.
569             key = entry.get('title', None) + entry.get('date', None)
570         if not key:
571             # Hmm, the article's content will at least guarantee no
572             # false negatives (i.e., missing articles)
573             key = entry.get('content', None)
574         if not key:
575             # If all else fails, just use a random number.
576             key = str (random.random ())
577         return getId (key)
578     
579     def getIds(self, onlyUnread=False):
580         if onlyUnread:
581             rows = self.db.execute("SELECT id FROM feed where read=0 ORDER BY date DESC;").fetchall()
582         else:
583             rows = self.db.execute("SELECT id FROM feed ORDER BY date DESC;").fetchall()
584         ids = []
585         for row in rows:
586             ids.append(row[0])
587         #ids.reverse()
588         return ids
589     
590     def getNextId(self, id, forward=True):
591         if forward:
592             delta = 1
593         else:
594             delta = -1
595         ids = self.getIds()
596         index = ids.index(id)
597         return ids[(index + delta) % len(ids)]
598         
599     def getPreviousId(self, id):
600         return self.getNextId(id, forward=False)
601     
602     def getNumberOfUnreadItems(self):
603         return self.db.execute("SELECT count(*) FROM feed WHERE read=0;").fetchone()[0]
604     
605     def getNumberOfEntries(self):
606         return self.db.execute("SELECT count(*) FROM feed;").fetchone()[0]
607
608     def getArticle(self, entry):
609         #self.setEntryRead(id)
610         #entry = self.entries[id]
611         title = entry['title']
612         #content = entry.get('content', entry.get('summary_detail', {}))
613         content = entry["content"]
614
615         link = entry['link']
616         author = entry['author']
617         date = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(entry["date"]) )
618
619         #text = '''<div style="color: black; background-color: white;">'''
620         text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
621         text += "<html><head><title>" + title + "</title>"
622         text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
623         #text += '<style> body {-webkit-user-select: none;} </style>'
624         text += '</head><body bgcolor=\"#ffffff\"><div><a href=\"' + link + '\">' + title + "</a>"
625         if author != None:
626             text += "<BR /><small><i>Author: " + author + "</i></small>"
627         text += "<BR /><small><i>Date: " + date + "</i></small></div>"
628         text += "<BR /><BR />"
629         text += content
630         text += "</body></html>"
631         return text
632    
633     def getContent(self, id):
634         contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
635         try:
636             file = open(self.entries[id]["contentLink"])
637             content = file.read()
638             file.close()
639         except:
640             content = "Content unavailable"
641         return content
642     
643     def extractDate(self, entry):
644         if entry.has_key("updated_parsed"):
645             return timegm(entry["updated_parsed"])
646         elif entry.has_key("published_parsed"):
647             return timegm(entry["published_parsed"])
648         else:
649             return time.time()
650         
651     def extractContent(self, entry):
652         content = ""
653         if entry.has_key('summary'):
654             content = entry.get('summary', '')
655         if entry.has_key('content'):
656             if len(entry.content[0].value) > len(content):
657                 content = entry.content[0].value
658         if content == "":
659             content = entry.get('description', '')
660         return content
661     
    def removeEntry(self, id):
        """Delete article *id*: its cached HTML file, its feed and image
        rows, and (asynchronously) its Woodchuck registration.

        Note: image rows are deleted but the image FILES are not; they
        are reclaimed by the mtime-based sweep in _updateFeed (or by
        ArchivedArticles.removeArticle, which deletes them explicitly).
        """
        contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
        if contentLink:
            try:
                remove(contentLink)
            except OSError, exception:
                logger.error("Deleting %s: %s" % (contentLink, str (exception)))
        self.db.execute("DELETE FROM feed WHERE id=?;", (id,) )
        self.db.execute("DELETE FROM images WHERE id=?;", (id,) )
        self.db.commit()

        def e():
            # Tell Woodchuck the files are gone, then drop the object.
            if wc().available():
                try:
                    wc()[self.key][id].files_deleted (
                        woodchuck.DeletionResponse.Deleted)
                    del wc()[self.key][id]
                except KeyError:
                    pass
        mainthread.execute (e, async=True)
682  
class ArchivedArticles(Feed):
    """A pseudo-feed holding articles the user explicitly saved.

    Articles are added with just a title and link; updateFeed() later
    fetches the page body (and its images) for every entry that has not
    been downloaded yet.
    """
    def addArchivedArticle(self, title, link, date, configdir):
        # updated=0 marks the article as "not yet downloaded"; see
        # updateFeed below, which selects on updated=0.
        id = self.generateUniqueId({"date":date, "title":title})
        values = (id, title, link, date, 0, link, 0)
        self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
        self.db.commit()

    # NOTE(review): signature differs from Feed.updateFeed (no priority /
    # postFeedUpdateFunc parameters) and it returns a tuple instead of
    # dispatching jobs -- callers must not treat the two alike.
    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False):
        """Download the page body for every pending archived article."""
        currentTime = 0
        rows = self.db.execute("SELECT id, link FROM feed WHERE updated=0;")
        for row in rows:
            currentTime = time.time()
            id = row[0]
            link = row[1]
            f = urllib2.urlopen(link)
            html = f.read()
            f.close()
            soup = BeautifulSoup(html)
            images = soup('img')
            baseurl = link
            for img in images:
                # Cache each image and rewrite the tag to the local copy.
                filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
                img['src']=filename
                self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
                self.db.commit()
            contentLink = configdir+self.key+".d/"+id+".html"
            file = open(contentLink, "w")
            file.write(soup.prettify())
            file.close()

            # A non-zero updated stamp marks the article as downloaded.
            self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
            self.db.commit()
        return (currentTime, None, None)

    def purgeReadArticles(self):
        """Remove every archived article already marked as read."""
        rows = self.db.execute("SELECT id FROM feed WHERE read=1;")
        for row in rows:
            self.removeArticle(row[0])

    def removeArticle(self, id):
        """Remove article *id*, deleting cached image files that no other
        article still references, then the database rows via removeEntry."""
        rows = self.db.execute("SELECT imagePath FROM images WHERE id=?;", (id,) )
        for row in rows:
            try:
                count = self.db.execute("SELECT count(*) FROM images WHERE id!=? and imagePath=?;", (id,row[0]) ).fetchone()[0]
                if count == 0:
                    os.remove(row[0])
            except:
                # Best effort -- a missing file is fine.
                pass
        self.removeEntry(id)
734
735 class Listing:
    def _getdb(self):
        """Return this thread's sqlite connection, creating it on first use.

        sqlite3 connections must not be shared across threads, so one
        connection per thread is cached in thread-local storage (self.tls).
        """
        try:
            db = self.tls.db
        except AttributeError:
            # First access from this thread: open a dedicated connection.
            # The long timeout (120 s) reduces "database is locked" errors
            # when several threads write concurrently.
            db = sqlite3.connect("%s/feeds.db" % self.configdir, timeout=120)
            self.tls.db = db
        return db
    # Expose the per-thread connection as a read-only attribute.
    db = property(_getdb)
744
    # Lists all the feeds in a dictionary, and exposes the data
746     def __init__(self, config, configdir):
747         self.config = config
748         self.configdir = configdir
749
750         self.tls = threading.local ()
751         
752         try:
753             table = self.db.execute("SELECT sql FROM sqlite_master").fetchone()
754             if table == None:
755                 self.db.execute("CREATE TABLE feeds(id text, url text, title text, unread int, updateTime float, rank int, etag text, modified text, widget int, category int);")
756                 self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
757                 self.addCategory("Default Category")
758                 if isfile(self.configdir+"feeds.pickle"):
759                     self.importOldFormatFeeds()
760                 else:
761                     self.addFeed("Maemo News", "http://maemo.org/news/items.xml")    
762             else:
763                 from string import find, upper
764                 if find(upper(table[0]), "WIDGET")<0:
765                     self.db.execute("ALTER TABLE feeds ADD COLUMN widget int;")
766                     self.db.execute("UPDATE feeds SET widget=1;")
767                     self.db.commit()
768                 if find(upper(table[0]), "CATEGORY")<0:
769                     self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
770                     self.addCategory("Default Category")
771                     self.db.execute("ALTER TABLE feeds ADD COLUMN category int;")
772                     self.db.execute("UPDATE feeds SET category=1;")
773             self.db.commit()
774         except:
775             pass
776
777         # Check that Woodchuck's state is up to date with respect our
778         # state.
779         updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
780         wc_init (self, True if updater else False)
781         if wc().available() and updater:
782             # The list of known streams.
783             streams = wc().streams_list ()
784             stream_ids = [s.identifier for s in streams]
785
786             # Register any unknown streams.  Remove known streams from
787             # STREAMS_IDS.
788             for key in self.getListOfFeeds():
789                 title = self.getFeedTitle(key)
790                 # XXX: We should also check whether the list of
791                 # articles/objects in each feed/stream is up to date.
792                 if key not in stream_ids:
793                     logger.debug(
794                         "Registering previously unknown channel: %s (%s)"
795                         % (key, title,))
796                     # Use a default refresh interval of 6 hours.
797                     wc().stream_register (key, title, 6 * 60 * 60)
798                 else:
799                     # Make sure the human readable name is up to date.
800                     if wc()[key].human_readable_name != title:
801                         wc()[key].human_readable_name = title
802                     stream_ids.remove (key)
803                     
804
805             # Unregister any streams that are no longer subscribed to.
806             for id in stream_ids:
807                 logger.debug("Unregistering %s" % (id,))
808                 w.stream_unregister (id)
809
810     def importOldFormatFeeds(self):
811         """This function loads feeds that are saved in an outdated format, and converts them to sqlite"""
812         import rss
813         listing = rss.Listing(self.configdir)
814         rank = 0
815         for id in listing.getListOfFeeds():
816             try:
817                 rank += 1
818                 values = (id, listing.getFeedTitle(id) , listing.getFeedUrl(id), 0, time.time(), rank, None, "None", 1)
819                 self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?, 1);", values)
820                 self.db.commit()
821                 
822                 feed = listing.getFeed(id)
823                 new_feed = self.getFeed(id)
824                 
825                 items = feed.getIds()[:]
826                 items.reverse()
827                 for item in items:
828                         if feed.isEntryRead(item):
829                             read_status = 1
830                         else:
831                             read_status = 0 
832                         date = timegm(feed.getDateTuple(item))
833                         title = feed.getTitle(item)
834                         newId = new_feed.generateUniqueId({"date":date, "title":title})
835                         values = (newId, title , feed.getContentLink(item), date, tuple(time.time()), feed.getExternalLink(item), read_status)
836                         new_feed.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
837                         new_feed.db.commit()
838                         try:
839                             images = feed.getImages(item)
840                             for image in images:
841                                 new_feed.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (item, image) )
842                                 new_feed.db.commit()
843                         except:
844                             pass
845                 self.updateUnread(id)
846             except:
847                 logger.error("importOldFormatFeeds: %s"
848                              % (traceback.format_exc(),))
849         remove(self.configdir+"feeds.pickle")
850                 
851         
852     def addArchivedArticle(self, key, index):
853         feed = self.getFeed(key)
854         title = feed.getTitle(index)
855         link = feed.getExternalLink(index)
856         date = feed.getDate(index)
857         count = self.db.execute("SELECT count(*) FROM feeds where id=?;", ("ArchivedArticles",) ).fetchone()[0]
858         if count == 0:
859             self.addFeed("Archived Articles", "", id="ArchivedArticles")
860
861         archFeed = self.getFeed("ArchivedArticles")
862         archFeed.addArchivedArticle(title, link, date, self.configdir)
863         self.updateUnread("ArchivedArticles")
864         
865     def updateFeed(self, key, expiryTime=None, proxy=None, imageCache=None,
866                    priority=0):
867         if expiryTime is None:
868             expiryTime = self.config.getExpiry()
869         if not expiryTime:
870             # Default to 24 hours
871             expriyTime = 24
872         if proxy is None:
873             (use_proxy, proxy) = self.config.getProxy()
874             if not use_proxy:
875                 proxy = None
876         if imageCache is None:
877             imageCache = self.config.getImageCache()
878
879         feed = self.getFeed(key)
880         (url, etag, modified) = self.db.execute("SELECT url, etag, modified FROM feeds WHERE id=?;", (key,) ).fetchone()
881         try:
882             modified = time.struct_time(eval(modified))
883         except:
884             modified = None
885         feed.updateFeed(
886             self.configdir, url, etag, modified, expiryTime, proxy, imageCache,
887             priority, postFeedUpdateFunc=self._queuePostFeedUpdate)
888
    def _queuePostFeedUpdate(self, *args, **kwargs):
        # Marshal _postFeedUpdate onto the main thread (asynchronously);
        # presumably so the D-Bus signalling it performs happens on the
        # main loop's thread -- TODO confirm against mainthread.execute.
        mainthread.execute (self._postFeedUpdate, async=True, *args, **kwargs)
891
892     def _postFeedUpdate(self, key, updateTime, etag, modified, title):
893         if modified==None:
894             modified="None"
895         else:
896             modified=str(tuple(modified))
897         if updateTime > 0:
898             self.db.execute("UPDATE feeds SET updateTime=?, etag=?, modified=? WHERE id=?;", (updateTime, etag, modified, key) )
899         else:
900             self.db.execute("UPDATE feeds SET etag=?, modified=? WHERE id=?;", (etag, modified, key) )
901
902         if title is not None:
903             self.db.execute("UPDATE feeds SET title=(case WHEN title=='' THEN ? ELSE title END) where id=?;",
904                             (title, key))
905         self.db.commit()
906         self.updateUnread(key)
907
908         update_server_object().ArticleCountUpdated()
909
910         stats = JobManager().stats()
911         global jobs_at_start
912         completed = stats['jobs-completed'] - jobs_at_start
913         in_progress = stats['jobs-in-progress']
914         queued = stats['jobs-queued']
915
916         percent = (100 * ((completed + in_progress / 2.))
917                    / (completed + in_progress + queued))
918
919         update_server_object().UpdateProgress(
920             percent, completed, in_progress, queued, 0, 0, 0, key)
921
922         if in_progress == 0 and queued == 0:
923             jobs_at_start = stats['jobs-completed']
924         
925     def getFeed(self, key):
926         if key == "ArchivedArticles":
927             return ArchivedArticles(self.configdir, key)
928         return Feed(self.configdir, key)
929         
930     def editFeed(self, key, title, url, category=None):
931         if category:
932             self.db.execute("UPDATE feeds SET title=?, url=?, category=? WHERE id=?;", (title, url, category, key))
933         else:
934             self.db.execute("UPDATE feeds SET title=?, url=? WHERE id=?;", (title, url, key))
935         self.db.commit()
936
937         if wc().available():
938             try:
939                 wc()[key].human_readable_name = title
940             except KeyError:
941                 logger.debug("Feed %s (%s) unknown." % (key, title))
942         
943     def getFeedUpdateTime(self, key):
944         return time.ctime(self.db.execute("SELECT updateTime FROM feeds WHERE id=?;", (key,)).fetchone()[0])
945         
946     def getFeedNumberOfUnreadItems(self, key):
947         return self.db.execute("SELECT unread FROM feeds WHERE id=?;", (key,)).fetchone()[0]
948         
949     def getFeedTitle(self, key):
950         (title, url) = self.db.execute("SELECT title, url FROM feeds WHERE id=?;", (key,)).fetchone()
951         if title:
952             return title
953         return url
954         
955     def getFeedUrl(self, key):
956         return self.db.execute("SELECT url FROM feeds WHERE id=?;", (key,)).fetchone()[0]
957     
958     def getFeedCategory(self, key):
959         return self.db.execute("SELECT category FROM feeds WHERE id=?;", (key,)).fetchone()[0]
960         
961     def getListOfFeeds(self, category=None):
962         if category:
963             rows = self.db.execute("SELECT id FROM feeds WHERE category=? ORDER BY rank;", (category, ) )
964         else:
965             rows = self.db.execute("SELECT id FROM feeds ORDER BY rank;" )
966         keys = []
967         for row in rows:
968             if row[0]:
969                 keys.append(row[0])
970         return keys
971     
972     def getListOfCategories(self):
973         rows = self.db.execute("SELECT id FROM categories ORDER BY rank;" )
974         keys = []
975         for row in rows:
976             if row[0]:
977                 keys.append(row[0])
978         return keys
979     
980     def getCategoryTitle(self, id):
981         row = self.db.execute("SELECT title FROM categories WHERE id=?;", (id, )).fetchone()
982         return row[0]
983     
984     def getSortedListOfKeys(self, order, onlyUnread=False, category=1):
985         if   order == "Most unread":
986             tmp = "ORDER BY unread DESC"
987             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1], reverse=True)
988         elif order == "Least unread":
989             tmp = "ORDER BY unread"
990             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1])
991         elif order == "Most recent":
992             tmp = "ORDER BY updateTime DESC"
993             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2], reverse=True)
994         elif order == "Least recent":
995             tmp = "ORDER BY updateTime"
996             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2])
997         else: # order == "Manual" or invalid value...
998             tmp = "ORDER BY rank"
999             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][0])
1000         if onlyUnread:
1001             sql = "SELECT id FROM feeds WHERE unread>0 AND category=%s " %category + tmp 
1002         else:
1003             sql = "SELECT id FROM feeds WHERE category=%s " %category + tmp
1004         rows = self.db.execute(sql)
1005         keys = []
1006         for row in rows:
1007             if row[0]:
1008                 keys.append(row[0])
1009         return keys
1010     
1011     def getFavicon(self, key):
1012         filename = "%s%s.d/favicon.ico" % (self.configdir, key)
1013         if isfile(filename):
1014             return filename
1015         else:
1016             return False
1017         
1018     def updateUnread(self, key):
1019         feed = self.getFeed(key)
1020         self.db.execute("UPDATE feeds SET unread=? WHERE id=?;", (feed.getNumberOfUnreadItems(), key))
1021         self.db.commit()
1022
1023     def addFeed(self, title, url, id=None, category=1):
1024         if not id:
1025             id = getId(url)
1026         count = self.db.execute("SELECT count(*) FROM feeds WHERE id=?;", (id,) ).fetchone()[0]
1027         if count == 0:
1028             max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
1029             if max_rank == None:
1030                 max_rank = 0
1031             values = (id, title, url, 0, 0, max_rank+1, None, "None", 1, category)
1032             self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?,?);", values)
1033             self.db.commit()
1034             # Ask for the feed object, it will create the necessary tables
1035             self.getFeed(id)
1036
1037             if wc().available():
1038                 # Register the stream with Woodchuck.  Update approximately
1039                 # every 6 hours.
1040                 wc().stream_register(stream_identifier=id,
1041                                      human_readable_name=title,
1042                                      freshness=6*60*60)
1043
1044             return True
1045         else:
1046             return False
1047         
1048     def addCategory(self, title):
1049         rank = self.db.execute("SELECT MAX(rank)+1 FROM categories;").fetchone()[0]
1050         if rank==None:
1051             rank=1
1052         id = self.db.execute("SELECT MAX(id)+1 FROM categories;").fetchone()[0]
1053         if id==None:
1054             id=1
1055         self.db.execute("INSERT INTO categories (id, title, unread, rank) VALUES (?, ?, 0, ?)", (id, title, rank))
1056         self.db.commit()
1057     
1058     def removeFeed(self, key):
1059         if wc().available ():
1060             try:
1061                 del wc()[key]
1062             except KeyError:
1063                 logger.debug("Removing unregistered feed %s failed" % (key,))
1064
1065         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,) ).fetchone()[0]
1066         self.db.execute("DELETE FROM feeds WHERE id=?;", (key, ))
1067         self.db.execute("UPDATE feeds SET rank=rank-1 WHERE rank>?;", (rank,) )
1068         self.db.commit()
1069
1070         if isdir(self.configdir+key+".d/"):
1071            rmtree(self.configdir+key+".d/")
1072            
1073     def removeCategory(self, key):
1074         if self.db.execute("SELECT count(*) FROM categories;").fetchone()[0] > 1:
1075             rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,) ).fetchone()[0]
1076             self.db.execute("DELETE FROM categories WHERE id=?;", (key, ))
1077             self.db.execute("UPDATE categories SET rank=rank-1 WHERE rank>?;", (rank,) )
1078             self.db.execute("UPDATE feeds SET category=1 WHERE category=?;", (key,) )
1079             self.db.commit()
1080         
1081     #def saveConfig(self):
1082     #    self.listOfFeeds["feedingit-order"] = self.sortedKeys
1083     #    file = open(self.configdir+"feeds.pickle", "w")
1084     #    pickle.dump(self.listOfFeeds, file)
1085     #    file.close()
1086         
1087     def moveUp(self, key):
1088         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
1089         if rank>0:
1090             self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank-1) )
1091             self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank-1, key) )
1092             self.db.commit()
1093             
1094     def moveCategoryUp(self, key):
1095         rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
1096         if rank>0:
1097             self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank-1) )
1098             self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank-1, key) )
1099             self.db.commit()
1100         
1101     def moveDown(self, key):
1102         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
1103         max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
1104         if rank<max_rank:
1105             self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank+1) )
1106             self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank+1, key) )
1107             self.db.commit()
1108             
1109     def moveCategoryDown(self, key):
1110         rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
1111         max_rank = self.db.execute("SELECT MAX(rank) FROM categories;").fetchone()[0]
1112         if rank<max_rank:
1113             self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank+1) )
1114             self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank+1, key) )
1115             self.db.commit()
1116             
1117