In downloader, don't check whether progress_handler == False; check whether it is None.
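
Previously a caller-supplied handler was dropped whenever it compared equal
to False; testing against None means the default HTTPProgressHandler is only
installed when no handler was passed at all.  A minimal sketch of the
intended behaviour:

    opener = downloader()                    # falls back to HTTPProgressHandler(download_callback)
    opener = downloader(progress_handler=h)  # always uses h, even if h compares equal to False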
[feedingit] / src / rss_sqlite.py
1 #!/usr/bin/env python2.5
2
3
4 # Copyright (c) 2007-2008 INdT.
5 # Copyright (c) 2011 Neal H. Walfield
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Lesser General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 #  This program is distributed in the hope that it will be useful,
12 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
13 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 #  GNU Lesser General Public License for more details.
15 #
16 #  You should have received a copy of the GNU Lesser General Public License
17 #  along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 #
19
20 # ============================================================================
21 # Name        : FeedingIt.py
22 # Author      : Yves Marcoz
23 # Version     : 0.5.4
24 # Description : Simple RSS Reader
25 # ============================================================================
26
27 import sqlite3
28 from os.path import isfile, isdir
29 from shutil import rmtree
30 from os import mkdir, remove, utime
31 import os
32 import md5
33 import feedparser
34 import time
35 import urllib2
36 from BeautifulSoup import BeautifulSoup
37 from urlparse import urljoin
38 from calendar import timegm
39 import threading
40 import traceback
41 from wc import wc, wc_init, woodchuck
42 import subprocess
43 import dbus
44 from updatedbus import update_server_object
45
46 from jobmanager import JobManager
47 import mainthread
48 from httpprogresshandler import HTTPProgressHandler
49 import random
50 import sys
51 import logging
52 logger = logging.getLogger(__name__)
53
54 def getId(string):
55     return md5.new(string).hexdigest()
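
# getId() maps an arbitrary string (typically a URL, or a composite key
# built from an entry) to a fixed-length, filesystem-safe name.  Sketch:
#
#   getId("http://example.com/feed.xml")   # -> 32-character hex md5 digest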
56
57 def download_callback(connection):
58     if JobManager().do_quit:
59         raise KeyboardInterrupt
60
61 def downloader(progress_handler=None, proxy=None):
62     openers = []
63
64     if progress_handler is not None:
65         openers.append(progress_handler)
66     else:
67         openers.append(HTTPProgressHandler(download_callback))
68
69     if proxy:
70         openers.append(proxy)
71
72     return urllib2.build_opener(*openers)
73
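# Minimal usage sketch (the URL is only an example); downloader() returns a
# standard urllib2 OpenerDirector, so the usual opener API applies:
#
#   opener = downloader()
#   f = opener.open("http://example.com/feed.xml")
#   data = f.read()
#   f.close()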
74 # If not None, a subprocess.Popen object corresponding to an
75 # update_feeds.py process.
76 update_feed_process = None
77
78 update_feeds_iface = None
79
80 jobs_at_start = 0
81
82 class Feed:
83     serial_execution_lock = threading.Lock()
84
85     def _getdb(self):
86         try:
87             db = self.tls.db
88         except AttributeError:
89             db = sqlite3.connect("%s/%s.db" % (self.dir, self.key), timeout=120)
90             self.tls.db = db
91         return db
92     db = property(_getdb)
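    # sqlite3 connections must not be shared across threads, so each thread
    # lazily creates and caches its own connection via threading.local()
    # storage on first access.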
93
94     def __init__(self, configdir, key):
95         self.key = key
96         self.configdir = configdir
97         self.dir = "%s/%s.d" %(self.configdir, self.key)
98         self.tls = threading.local ()
99
100         if not isdir(self.dir):
101             mkdir(self.dir)
102         if not isfile("%s/%s.db" %(self.dir, self.key)):
103             self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
104             self.db.execute("CREATE TABLE images (id text, imagePath text);")
105             self.db.commit()
106
107     def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
108         filename = configdir+key+".d/"+getId(url)
109         if not isfile(filename):
110             try:
111                 if not opener:
112                     opener = downloader(proxy=proxy)
113
114                 abs_url = urljoin(baseurl,url)
115                 f = opener.open(abs_url)
116                 outf = open(filename, "w")
117                 outf.write(f.read())
118                 f.close()
119                 outf.close()
120             except (urllib2.HTTPError, urllib2.URLError, IOError), exception:
121                 logger.info("Could not download image %s: %s"
122                             % (abs_url, str (exception)))
123                 return None
124             except:
125                 exception = sys.exc_info()[1]  # the instance, not just the type
126
127                 logger.info("Error downloading image %s: %s" %
128                             (abs_url, traceback.format_exc()))
129                 try:
130                     remove(filename)
131                 except OSError:
132                     pass
133
134                 raise exception
135         else:
136             #open(filename,"a").close()  # "Touch" the file
137             file = open(filename,"a")
138             utime(filename, None)
139             file.close()
140         return filename
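    # Contract (sketch): the image is stored as <configdir><key>.d/<getId(url)>
    # and that path is returned, or None if the download failed; files that
    # already exist are merely touched so the expiry sweep keeps them.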
141
142     def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
143         if (os.path.basename(sys.argv[0]) == 'update_feeds.py'):
144             def doit():
145                 def it():
146                     self._updateFeed(configdir, url, etag, modified, expiryTime, proxy, imageCache, postFeedUpdateFunc, *postFeedUpdateFuncArgs)
147                 return it
148             JobManager().execute(doit(), self.key, priority=priority)
149         else:
150             def send_update_request():
151                 global update_feeds_iface
152                 if update_feeds_iface is None:
153                     bus=dbus.SessionBus()
154                     remote_object = bus.get_object(
155                         "org.marcoz.feedingit", # Connection name
156                         "/org/marcoz/feedingit/update" # Object's path
157                         )
158                     update_feeds_iface = dbus.Interface(
159                         remote_object, 'org.marcoz.feedingit')
160
161                 try:
162                     update_feeds_iface.Update(self.key)
163                 except Exception, e:
164                     logger.error("Invoking org.marcoz.feedingit.Update: %s"
165                                  % str(e))
166                     update_feeds_iface = None
167                 else:
168                     return True
169
170             if send_update_request():
171                 # Success!  It seems we were able to start the update
172                 # daemon via dbus (or, it was already running).
173                 return
174
175             global update_feed_process
176             if (update_feed_process is None
177                 or update_feed_process.poll() is not None):
178                 # The update_feeds process is not running.  Start it.
179                 update_feeds = os.path.join(os.path.dirname(__file__),
180                                             'update_feeds.py')
181                 argv = ['/usr/bin/env', 'python', update_feeds, '--daemon' ]
182                 logger.debug("Starting update_feeds: running %s"
183                              % (str(argv),))
184                 update_feed_process = subprocess.Popen(argv)
185                 # Make sure the dbus calls go to the right process:
186                 # rebind.
187                 update_feeds_iface = None
188
189             for _ in xrange(5):
190                 if send_update_request():
191                     break
192                 time.sleep(1)
193
194     def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
195         success = False
196         have_serial_execution_lock = False
197         try:
198             download_start = time.time ()
199
200             progress_handler = HTTPProgressHandler(download_callback)
201
202             openers = [progress_handler]
203             if proxy:
204                 openers.append (proxy)
205             kwargs = {'handlers':openers}
206             
207             tmp=feedparser.parse(url, etag=etag, modified=modified, **kwargs)
208             download_duration = time.time () - download_start
209     
210             opener = downloader(progress_handler, proxy)
211
212             if JobManager().do_quit:
213                 raise KeyboardInterrupt
214
215             process_start = time.time()
216
217             # Expiry time is in hours
218             expiry = float(expiryTime) * 3600.
219     
220             currentTime = 0
221     
222             have_woodchuck = mainthread.execute (wc().available)
223
224             def wc_success():
225                 try:
226                     wc().stream_register (self.key, "", 6 * 60 * 60)
227                 except woodchuck.ObjectExistsError:
228                     pass
229                 try:
230                     wc()[self.key].updated (
231                         indicator=(woodchuck.Indicator.ApplicationVisual
232                                    |woodchuck.Indicator.StreamWide),
233                         transferred_down=progress_handler.stats['received'],
234                         transferred_up=progress_handler.stats['sent'],
235                         transfer_time=download_start,
236                         transfer_duration=download_duration,
237                         new_objects=len (tmp.entries),
238                         objects_inline=len (tmp.entries))
239                 except KeyError:
240                     logger.warn(
241                         "Failed to register update of %s with woodchuck!"
242                         % (self.key))
243     
244             http_status = tmp.get ('status', 200)
245     
246             # Check if the parse was successful.  If the http status code
247             # is 304, then the download was successful, but there is
248             # nothing new.  Indeed, no content is returned.  This makes a
249             # 304 look like an error because there are no entries and the
250             # parse fails.  But really, everything went great!  Check for
251             # this first.
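            # (With feedparser, a 304 typically shows up as tmp.status == 304
            # together with tmp.entries == [] and no feed version -- hence
            # the status is inspected before the entry count.)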
252             if http_status == 304:
253                 logger.debug("%s: No changes to feed." % (self.key,))
254                 mainthread.execute (wc_success, async=True)
255                 success = True
256             elif len(tmp["entries"])==0 and not tmp.version:
257                 # An error occurred fetching or parsing the feed.  (Version
258                 # will be either None if e.g. the connection timed out or
259                 # '' if the data is not a proper feed)
260                 logger.error(
261                     "Error fetching %s: version is: %s: error: %s"
262                     % (url, str (tmp.version),
263                        str (tmp.get ('bozo_exception', 'Unknown error'))))
264                 logger.debug(tmp)
265                 if have_woodchuck:
266                     def e():
267                         logger.debug("%s: stream update failed!" % self.key)
268     
269                         try:
270                             # It's not easy to get the feed's title from here.
271                             # At the latest, the next time the application is
272                             # started, we'll fix up the human readable name.
273                             wc().stream_register (self.key, "", 6 * 60 * 60)
274                         except woodchuck.ObjectExistsError:
275                             pass
276                         ec = woodchuck.TransferStatus.TransientOther
277                         if 300 <= http_status < 400:
278                             ec = woodchuck.TransferStatus.TransientNetwork
279                         elif 400 <= http_status < 500:
280                             ec = woodchuck.TransferStatus.FailureGone
281                         elif 500 <= http_status < 600:
282                             ec = woodchuck.TransferStatus.TransientNetwork
283                         wc()[self.key].update_failed(ec)
284                     mainthread.execute (e, async=True)
285             else:
286                currentTime = time.time()
287                # The etag and modified values should only be updated if the content was not null
288                try:
289                    etag = tmp["etag"]
290                except KeyError:
291                    etag = None
292                try:
293                    modified = tmp["modified"]
294                except KeyError:
295                    modified = None
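               # The etag/modified saved here are handed back to
               # feedparser.parse() (top of this function) on the next
               # update, so the server can answer 304 instead of resending
               # the whole feed.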
296                try:
297                    abs_url = urljoin(tmp["feed"]["link"],"/favicon.ico")
298                    f = opener.open(abs_url)
299                    data = f.read()
300                    f.close()
301                    outf = open(self.dir+"/favicon.ico", "w")
302                    outf.write(data)
303                    outf.close()
304                    del data
305                except (urllib2.HTTPError, urllib2.URLError), exception:
306                    logger.debug("Could not download favicon %s: %s"
307                                 % (abs_url, str (exception)))
308     
309                self.serial_execution_lock.acquire ()
310                have_serial_execution_lock = True
311
312                #reversedEntries = self.getEntries()
313                #reversedEntries.reverse()
314     
315                ids = self.getIds()
316     
317                tmp["entries"].reverse()
318                for entry in tmp["entries"]:
319                    # Yield so as to make the main thread a bit more
320                    # responsive.
321                    time.sleep(0)
322     
323                    if JobManager().do_quit:
324                        raise KeyboardInterrupt
325
326                    received_base = progress_handler.stats['received']
327                    sent_base = progress_handler.stats['sent']
328                    object_size = 0
329
330                    date = self.extractDate(entry)
331                    try:
332                        entry["title"]
333                    except KeyError:
334                        entry["title"] = "No Title"
335                    try :
336                        entry["link"]
337                    except KeyError:
338                        entry["link"] = ""
339                    try:
340                        entry["author"]
341                    except KeyError:
342                        entry["author"] = None
343                    if not entry.has_key("id"):
344                        entry["id"] = None
345                    content = self.extractContent(entry)
346                    object_size = len (content)
347                    received_base -= len (content)
348                    tmpEntry = {"title":entry["title"], "content":content,
349                                 "date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
350                    id = self.generateUniqueId(tmpEntry)
351                    
352                    #articleTime = time.mktime(self.entries[id]["dateTuple"])
353                    soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
354                    images = soup('img')
355                    baseurl = tmpEntry["link"]
356                    #if not id in ids:
357                    if imageCache and len(images) > 0:
358                        self.serial_execution_lock.release ()
359                        have_serial_execution_lock = False
360                        for img in images:
361                            filename = self.addImage(
362                                configdir, self.key, baseurl, img['src'],
363                                opener=opener)
364                            if filename:
365                                 img['src']="file://%s" %filename
366                                 count = self.db.execute("SELECT count(1) FROM images where id=? and imagePath=?;", (id, filename )).fetchone()[0]
367                                 if count == 0:
368                                     self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
369                                     self.db.commit()
370     
371                                 try:
372                                     object_size += os.path.getsize (filename)
373                                 except os.error, exception:
374                                     logger.error ("Error getting size of %s: %s"
375                                                   % (filename, exception))
376                        self.serial_execution_lock.acquire ()
377                        have_serial_execution_lock = True
378     
379                    tmpEntry["contentLink"] = configdir+self.key+".d/"+id+".html"
380                    file = open(tmpEntry["contentLink"], "w")
381                    file.write(soup.prettify())
382                    file.close()
383                    if id in ids:
384                        self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
385                        self.db.commit()
386                    else:
387                        values = (id, tmpEntry["title"], tmpEntry["contentLink"], tmpEntry["date"], currentTime, tmpEntry["link"], 0)
388                        self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
389                        self.db.commit()
390 #                   else:
391 #                       try:
392 #                           self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
393 #                           self.db.commit()
394 #                           filename = configdir+self.key+".d/"+id+".html"
395 #                           file = open(filename,"a")
396 #                           utime(filename, None)
397 #                           file.close()
398 #                           images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()
399 #                           for image in images:
400 #                                file = open(image[0],"a")
401 #                                utime(image[0], None)
402 #                                file.close()
403 #                       except:
404 #                           pass
405     
406                    # Register the object with Woodchuck and mark it as
407                    # downloaded.
408                    if have_woodchuck:
409                        def e():
410                            try:
411                                obj = wc()[self.key].object_register(
412                                    object_identifier=id,
413                                    human_readable_name=tmpEntry["title"])
414                            except woodchuck.ObjectExistsError:
415                                obj = wc()[self.key][id]
416                            else:
417                                # If the entry does not contain a publication
418                                # time, the attribute won't exist.
419                                pubtime = entry.get ('date_parsed', None)
420                                if pubtime:
421                                    obj.publication_time = time.mktime (pubtime)
422         
423                                received = (progress_handler.stats['received']
424                                            - received_base)
425                                sent = progress_handler.stats['sent'] - sent_base
426                                obj.transferred (
427                                    indicator=(woodchuck.Indicator.ApplicationVisual
428                                               |woodchuck.Indicator.StreamWide),
429                                    transferred_down=received,
430                                    transferred_up=sent,
431                                    object_size=object_size)
432                        mainthread.execute(e, async=True)
433                self.db.commit()
434
435                logger.debug (
436                    "%s: Update successful: transferred: %d sent / %d received; objects: %d"
437                    % (self.key,
438                       progress_handler.stats['sent'],
439                       progress_handler.stats['received'],
440                       len (tmp.entries)))
441                mainthread.execute (wc_success, async=True)
442                success = True
443
444             rows = self.db.execute("SELECT id FROM feed WHERE (read=0 AND updated<?) OR (read=1 AND updated<?);", (currentTime-2*expiry, currentTime-expiry))
445             for row in rows:
446                self.removeEntry(row[0])
447             
448             from glob import glob
449             from os import stat
450             for file in glob(configdir+self.key+".d/*"):
451                 #
452                 stats = stat(file)
453                 #
454                 # stats[8] is st_mtime: the last-modification time, in seconds since the epoch
455                 #
456                 lastmodDate = stats[8]
457                 #
458                 expDate = time.time()-expiry*3
459                 # check whether the cached file's last-modified date is outdated
460                 #
461                 if expDate > lastmodDate:
462                     #
463                     try:
464                         #
465                         #print 'Removing', file
466                         #
467                         # XXX: Tell woodchuck.
468                         remove(file)
469                         #
470                     except OSError, exception:
471                         #
472                         logger.error('Could not remove %s: %s'
473                                      % (file, str (exception)))
474             logger.debug("updated %s: %fs in download, %fs in processing"
475                          % (self.key, download_duration,
476                             time.time () - process_start))
477         except:
478             logger.error("Updating %s: %s" % (self.key, traceback.format_exc()))
479         finally:
480             self.db.commit ()
481
482             if have_serial_execution_lock:
483                 self.serial_execution_lock.release ()
484
485             updateTime = 0
486             try:
487                 rows = self.db.execute("SELECT MAX(date) FROM feed;")
488                 for row in rows:
489                     updateTime=row[0]
490             except Exception, e:
491                 logger.error("Fetching update time: %s: %s"
492                              % (str(e), traceback.format_exc()))
493             finally:
494                 if not success:
495                     etag = None
496                     modified = None
497                 title = None
498                 try:
499                     title = tmp.feed.title
500                 except (AttributeError, UnboundLocalError), exception:
501                     pass
502                 if postFeedUpdateFunc is not None:
503                     postFeedUpdateFunc (self.key, updateTime, etag, modified,
504                                         title, *postFeedUpdateFuncArgs)
505
506     def setEntryRead(self, id):
507         self.db.execute("UPDATE feed SET read=1 WHERE id=?;", (id,) )
508         self.db.commit()
509
510         def e():
511             if wc().available():
512                 try:
513                     wc()[self.key][id].used()
514                 except KeyError:
515                     pass
516         mainthread.execute(e, async=True)
517     def setEntryUnread(self, id):
518         self.db.execute("UPDATE feed SET read=0 WHERE id=?;", (id,) )
519         self.db.commit()     
520         
521     def markAllAsRead(self):
522         self.db.execute("UPDATE feed SET read=1 WHERE read=0;")
523         self.db.commit()
524
525     def isEntryRead(self, id):
526         read_status = self.db.execute("SELECT read FROM feed WHERE id=?;", (id,) ).fetchone()[0]
527         return read_status==1  # Returns True if read==1, and False if read==0
528     
529     def getTitle(self, id):
530         return self.db.execute("SELECT title FROM feed WHERE id=?;", (id,) ).fetchone()[0]
531     
532     def getContentLink(self, id):
533         return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,) ).fetchone()[0]
534     
535     def getExternalLink(self, id):
536         return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,) ).fetchone()[0]
537     
538     def getDate(self, id):
539         dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
540         return time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(dateStamp))
541
542     def getDateTuple(self, id):
543         dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
544         return time.localtime(dateStamp)
545     
546     def getDateStamp(self, id):
547         return self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
548     
549     def generateUniqueId(self, entry):
550         """
551         Generate a stable identifier for the article.  For the same
552         entry, this should result in the same identifier.  If
553         possible, the identifier should remain the same even if the
554         article is updated.
555         """
556         # Prefer the entry's id, which is supposed to be globally
557         # unique.
558         key = entry.get('id', None)
559         if not key:
560             # Next, try the link to the content.
561             key = entry.get('link', None)
562         if not key:
563             # Ok, the title and the date concatenated are likely to be
564             # relatively stable.
565             key = (entry.get('title') or '') + (entry.get('date') or '')
566         if not key:
567             # Hmm, the article's content will at least guarantee no
568             # false negatives (i.e., missing articles)
569             key = entry.get('content', None)
570         if not key:
571             # If all else fails, just use a random number.
572             key = str (random.random ())
573         return getId (key)
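    # A sketch of the fallback order, using hypothetical entries:
    #
    #   generateUniqueId({"id": "urn:uuid:1234"})         # keyed on the id
    #   generateUniqueId({"link": "http://example.com"})  # keyed on the link
    #   generateUniqueId({"title": "A", "date": "D"})     # keyed on title+date
    #
    # The same entry thus always maps to the same key whenever one of the
    # stable fields is present.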
574     
575     def getIds(self, onlyUnread=False):
576         if onlyUnread:
577             rows = self.db.execute("SELECT id FROM feed where read=0 ORDER BY date DESC;").fetchall()
578         else:
579             rows = self.db.execute("SELECT id FROM feed ORDER BY date DESC;").fetchall()
580         ids = []
581         for row in rows:
582             ids.append(row[0])
583         #ids.reverse()
584         return ids
585     
586     def getNextId(self, id, forward=True):
587         if forward:
588             delta = 1
589         else:
590             delta = -1
591         ids = self.getIds()
592         index = ids.index(id)
593         return ids[(index + delta) % len(ids)]
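    # The modulo wraps navigation around the list: with ids [a, b, c],
    # getNextId(c) returns a, and getPreviousId(a) returns c.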
594         
595     def getPreviousId(self, id):
596         return self.getNextId(id, forward=False)
597     
598     def getNumberOfUnreadItems(self):
599         return self.db.execute("SELECT count(*) FROM feed WHERE read=0;").fetchone()[0]
600     
601     def getNumberOfEntries(self):
602         return self.db.execute("SELECT count(*) FROM feed;").fetchone()[0]
603
604     def getArticle(self, entry):
605         #self.setEntryRead(id)
606         #entry = self.entries[id]
607         title = entry['title']
608         #content = entry.get('content', entry.get('summary_detail', {}))
609         content = entry["content"]
610
611         link = entry['link']
612         author = entry['author']
613         date = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(entry["date"]) )
614
615         #text = '''<div style="color: black; background-color: white;">'''
616         text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
617         text += "<html><head><title>" + title + "</title>"
618         text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
619         #text += '<style> body {-webkit-user-select: none;} </style>'
620         text += '</head><body bgcolor=\"#ffffff\"><div><a href=\"' + link + '\">' + title + "</a>"
621         if author != None:
622             text += "<BR /><small><i>Author: " + author + "</i></small>"
623         text += "<BR /><small><i>Date: " + date + "</i></small></div>"
624         text += "<BR /><BR />"
625         text += content
626         text += "</body></html>"
627         return text
628    
629     def getContent(self, id):
630         contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
631         try:
632             file = open(contentLink)
633             content = file.read()
634             file.close()
635         except:
636             content = "Content unavailable"
637         return content
638     
639     def extractDate(self, entry):
640         if entry.has_key("updated_parsed"):
641             return timegm(entry["updated_parsed"])
642         elif entry.has_key("published_parsed"):
643             return timegm(entry["published_parsed"])
644         else:
645             return time.time()
646         
647     def extractContent(self, entry):
648         content = ""
649         if entry.has_key('summary'):
650             content = entry.get('summary', '')
651         if entry.has_key('content'):
652             if len(entry.content[0].value) > len(content):
653                 content = entry.content[0].value
654         if content == "":
655             content = entry.get('description', '')
656         return content
657     
658     def removeEntry(self, id):
659         contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
660         if contentLink:
661             try:
662                 remove(contentLink)
663             except OSError, exception:
664                 logger.error("Deleting %s: %s" % (contentLink, str (exception)))
665         self.db.execute("DELETE FROM feed WHERE id=?;", (id,) )
666         self.db.execute("DELETE FROM images WHERE id=?;", (id,) )
667         self.db.commit()
668
669         def e():
670             if wc().available():
671                 try:
672                     wc()[self.key][id].files_deleted (
673                         woodchuck.DeletionResponse.Deleted)
674                     del wc()[self.key][id]
675                 except KeyError:
676                     pass
677         mainthread.execute (e, async=True)
678  
679 class ArchivedArticles(Feed):    
680     def addArchivedArticle(self, title, link, date, configdir):
681         id = self.generateUniqueId({"date":date, "title":title})
682         values = (id, title, link, date, 0, link, 0)
683         self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
684         self.db.commit()
685
686     def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False):
687         currentTime = 0
688         rows = self.db.execute("SELECT id, link FROM feed WHERE updated=0;")
689         for row in rows:
690             currentTime = time.time()
691             id = row[0]
692             link = row[1]
693             f = urllib2.urlopen(link)
694             #entry["content"] = f.read()
695             html = f.read()
696             f.close()
697             soup = BeautifulSoup(html)
698             images = soup('img')
699             baseurl = link
700             for img in images:
701                 filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
702                 img['src']=filename
703                 self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
704                 self.db.commit()
705             contentLink = configdir+self.key+".d/"+id+".html"
706             file = open(contentLink, "w")
707             file.write(soup.prettify())
708             file.close()
709             
710             self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
711             self.db.commit()
712         return (currentTime, None, None)
713     
714     def purgeReadArticles(self):
715         rows = self.db.execute("SELECT id FROM feed WHERE read=1;")
716         #ids = self.getIds()
717         for row in rows:
718             self.removeArticle(row[0])
719
720     def removeArticle(self, id):
721         rows = self.db.execute("SELECT imagePath FROM images WHERE id=?;", (id,) )
722         for row in rows:
723             try:
724                 count = self.db.execute("SELECT count(*) FROM images WHERE id!=? and imagePath=?;", (id,row[0]) ).fetchone()[0]
725                 if count == 0:
726                     os.remove(row[0])
727             except:
728                 pass
729         self.removeEntry(id)
730
731 class Listing:
732     def _getdb(self):
733         try:
734             db = self.tls.db
735         except AttributeError:
736             db = sqlite3.connect("%s/feeds.db" % self.configdir, timeout=120)
737             self.tls.db = db
738         return db
739     db = property(_getdb)
740
741     # Lists all the feeds stored in the database and exposes the data
742     def __init__(self, config, configdir):
743         self.config = config
744         self.configdir = configdir
745
746         self.tls = threading.local ()
747         
748         try:
749             table = self.db.execute("SELECT sql FROM sqlite_master").fetchone()
750             if table == None:
751                 self.db.execute("CREATE TABLE feeds(id text, url text, title text, unread int, updateTime float, rank int, etag text, modified text, widget int, category int);")
752                 self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
753                 self.addCategory("Default Category")
754                 if isfile(self.configdir+"feeds.pickle"):
755                     self.importOldFormatFeeds()
756                 else:
757                     self.addFeed("Maemo News", "http://maemo.org/news/items.xml")    
758             else:
759                 table_sql = table[0].upper()
760                 if "WIDGET" not in table_sql:
761                     self.db.execute("ALTER TABLE feeds ADD COLUMN widget int;")
762                     self.db.execute("UPDATE feeds SET widget=1;")
763                     self.db.commit()
764                 if "CATEGORY" not in table_sql:
765                     self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
766                     self.addCategory("Default Category")
767                     self.db.execute("ALTER TABLE feeds ADD COLUMN category int;")
768                     self.db.execute("UPDATE feeds SET category=1;")
769             self.db.commit()
770         except:
771             logger.error("Initializing feeds db: %s" % (traceback.format_exc(),))
772
773         # Check that Woodchuck's state is up to date with respect to our
774         # state.
775         updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
776         wc_init (self, True if updater else False)
777         if wc().available() and updater:
778             # The list of known streams.
779             streams = wc().streams_list ()
780             stream_ids = [s.identifier for s in streams]
781
782             # Register any unknown streams.  Remove known streams from
783             # STREAM_IDS.
784             for key in self.getListOfFeeds():
785                 title = self.getFeedTitle(key)
786                 # XXX: We should also check whether the list of
787                 # articles/objects in each feed/stream is up to date.
788                 if key not in stream_ids:
789                     logger.debug(
790                         "Registering previously unknown channel: %s (%s)"
791                         % (key, title,))
792                     # Use a default refresh interval of 6 hours.
793                     wc().stream_register (key, title, 6 * 60 * 60)
794                 else:
795                     # Make sure the human readable name is up to date.
796                     if wc()[key].human_readable_name != title:
797                         wc()[key].human_readable_name = title
798                     stream_ids.remove (key)
799                     
800
801             # Unregister any streams that are no longer subscribed to.
802             for id in stream_ids:
803                 logger.debug("Unregistering %s" % (id,))
804                 wc().stream_unregister (id)
805
806     def importOldFormatFeeds(self):
807         """This function loads feeds that are saved in an outdated format, and converts them to sqlite"""
808         import rss
809         listing = rss.Listing(self.configdir)
810         rank = 0
811         for id in listing.getListOfFeeds():
812             try:
813                 rank += 1
814                 values = (id, listing.getFeedTitle(id) , listing.getFeedUrl(id), 0, time.time(), rank, None, "None", 1)
815                 self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?, 1);", values)
816                 self.db.commit()
817                 
818                 feed = listing.getFeed(id)
819                 new_feed = self.getFeed(id)
820                 
821                 items = feed.getIds()[:]
822                 items.reverse()
823                 for item in items:
824                         if feed.isEntryRead(item):
825                             read_status = 1
826                         else:
827                             read_status = 0 
828                         date = timegm(feed.getDateTuple(item))
829                         title = feed.getTitle(item)
830                         newId = new_feed.generateUniqueId({"date":date, "title":title})
831                         values = (newId, title, feed.getContentLink(item), date, time.time(), feed.getExternalLink(item), read_status)
832                         new_feed.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
833                         new_feed.db.commit()
834                         try:
835                             images = feed.getImages(item)
836                             for image in images:
837                                 new_feed.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (item, image) )
838                                 new_feed.db.commit()
839                         except:
840                             pass
841                 self.updateUnread(id)
842             except:
843                 logger.error("importOldFormatFeeds: %s"
844                              % (traceback.format_exc(),))
845         remove(self.configdir+"feeds.pickle")
846                 
847         
848     def addArchivedArticle(self, key, index):
849         feed = self.getFeed(key)
850         title = feed.getTitle(index)
851         link = feed.getExternalLink(index)
852         date = feed.getDate(index)
853         count = self.db.execute("SELECT count(*) FROM feeds where id=?;", ("ArchivedArticles",) ).fetchone()[0]
854         if count == 0:
855             self.addFeed("Archived Articles", "", id="ArchivedArticles")
856
857         archFeed = self.getFeed("ArchivedArticles")
858         archFeed.addArchivedArticle(title, link, date, self.configdir)
859         self.updateUnread("ArchivedArticles")
860         
861     def updateFeed(self, key, expiryTime=None, proxy=None, imageCache=None,
862                    priority=0):
863         if expiryTime is None:
864             expiryTime = self.config.getExpiry()
865         if not expiryTime:
866             # Default to 24 hours
867             expiryTime = 24
868         if proxy is None:
869             (use_proxy, proxy) = self.config.getProxy()
870             if not use_proxy:
871                 proxy = None
872         if imageCache is None:
873             imageCache = self.config.getImageCache()
874
875         feed = self.getFeed(key)
876         (url, etag, modified) = self.db.execute("SELECT url, etag, modified FROM feeds WHERE id=?;", (key,) ).fetchone()
877         try:
878             modified = time.struct_time(eval(modified))
879         except:
880             modified = None
881         feed.updateFeed(
882             self.configdir, url, etag, modified, expiryTime, proxy, imageCache,
883             priority, postFeedUpdateFunc=self._queuePostFeedUpdate)
884
885     def _queuePostFeedUpdate(self, *args, **kwargs):
886         mainthread.execute (self._postFeedUpdate, async=True, *args, **kwargs)
887
888     def _postFeedUpdate(self, key, updateTime, etag, modified, title):
889         if modified==None:
890             modified="None"
891         else:
892             modified=str(tuple(modified))
893         if updateTime > 0:
894             self.db.execute("UPDATE feeds SET updateTime=?, etag=?, modified=? WHERE id=?;", (updateTime, etag, modified, key) )
895         else:
896             self.db.execute("UPDATE feeds SET etag=?, modified=? WHERE id=?;", (etag, modified, key) )
897
898         if title is not None:
899             self.db.execute("UPDATE feeds SET title=(case WHEN title=='' THEN ? ELSE title END) where id=?;",
900                             (title, key))
901         self.db.commit()
902         self.updateUnread(key)
903
904         update_server_object().ArticleCountUpdated()
905
906         stats = JobManager().stats()
907         global jobs_at_start
908         completed = stats['jobs-completed'] - jobs_at_start
909         in_progress = stats['jobs-in-progress']
910         queued = stats['jobs-queued']
911
912         percent = (100 * ((completed + in_progress / 2.))
913                    / (completed + in_progress + queued))
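        # An in-progress job counts as half completed; e.g. with 3 completed,
        # 2 in progress and 5 queued: 100 * (3 + 1) / 10 = 40%.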
914
915         update_server_object().UpdateProgress(
916             percent, completed, in_progress, queued, 0, 0, 0, key)
917
918         if in_progress == 0 and queued == 0:
919             jobs_at_start = stats['jobs-completed']
920         
921     def getFeed(self, key):
922         if key == "ArchivedArticles":
923             return ArchivedArticles(self.configdir, key)
924         return Feed(self.configdir, key)
925         
926     def editFeed(self, key, title, url, category=None):
927         if category:
928             self.db.execute("UPDATE feeds SET title=?, url=?, category=? WHERE id=?;", (title, url, category, key))
929         else:
930             self.db.execute("UPDATE feeds SET title=?, url=? WHERE id=?;", (title, url, key))
931         self.db.commit()
932
933         if wc().available():
934             try:
935                 wc()[key].human_readable_name = title
936             except KeyError:
937                 logger.debug("Feed %s (%s) unknown." % (key, title))
938         
939     def getFeedUpdateTime(self, key):
940         return time.ctime(self.db.execute("SELECT updateTime FROM feeds WHERE id=?;", (key,)).fetchone()[0])
941         
942     def getFeedNumberOfUnreadItems(self, key):
943         return self.db.execute("SELECT unread FROM feeds WHERE id=?;", (key,)).fetchone()[0]
944         
945     def getFeedTitle(self, key):
946         (title, url) = self.db.execute("SELECT title, url FROM feeds WHERE id=?;", (key,)).fetchone()
947         if title:
948             return title
949         return url
950         
951     def getFeedUrl(self, key):
952         return self.db.execute("SELECT url FROM feeds WHERE id=?;", (key,)).fetchone()[0]
953     
954     def getFeedCategory(self, key):
955         return self.db.execute("SELECT category FROM feeds WHERE id=?;", (key,)).fetchone()[0]
956         
957     def getListOfFeeds(self, category=None):
958         if category:
959             rows = self.db.execute("SELECT id FROM feeds WHERE category=? ORDER BY rank;", (category, ) )
960         else:
961             rows = self.db.execute("SELECT id FROM feeds ORDER BY rank;" )
962         keys = []
963         for row in rows:
964             if row[0]:
965                 keys.append(row[0])
966         return keys
967     
968     def getListOfCategories(self):
969         rows = self.db.execute("SELECT id FROM categories ORDER BY rank;" )
970         keys = []
971         for row in rows:
972             if row[0]:
973                 keys.append(row[0])
974         return keys
975     
976     def getCategoryTitle(self, id):
977         row = self.db.execute("SELECT title FROM categories WHERE id=?;", (id, )).fetchone()
978         return row[0]
979     
980     def getSortedListOfKeys(self, order, onlyUnread=False, category=1):
981         if   order == "Most unread":
982             tmp = "ORDER BY unread DESC"
983             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1], reverse=True)
984         elif order == "Least unread":
985             tmp = "ORDER BY unread"
986             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1])
987         elif order == "Most recent":
988             tmp = "ORDER BY updateTime DESC"
989             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2], reverse=True)
990         elif order == "Least recent":
991             tmp = "ORDER BY updateTime"
992             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2])
993         else: # order == "Manual" or invalid value...
994             tmp = "ORDER BY rank"
995             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][0])
996         if onlyUnread:
997             sql = "SELECT id FROM feeds WHERE unread>0 AND category=? " + tmp
998         else:
999             sql = "SELECT id FROM feeds WHERE category=? " + tmp
1000         rows = self.db.execute(sql, (category,))
1001         keys = []
1002         for row in rows:
1003             if row[0]:
1004                 keys.append(row[0])
1005         return keys
1006     
1007     def getFavicon(self, key):
1008         filename = "%s%s.d/favicon.ico" % (self.configdir, key)
1009         if isfile(filename):
1010             return filename
1011         else:
1012             return False
1013         
1014     def updateUnread(self, key):
1015         feed = self.getFeed(key)
1016         self.db.execute("UPDATE feeds SET unread=? WHERE id=?;", (feed.getNumberOfUnreadItems(), key))
1017         self.db.commit()
1018
1019     def addFeed(self, title, url, id=None, category=1):
1020         if not id:
1021             id = getId(url)
1022         count = self.db.execute("SELECT count(*) FROM feeds WHERE id=?;", (id,) ).fetchone()[0]
1023         if count == 0:
1024             max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
1025             if max_rank == None:
1026                 max_rank = 0
1027             values = (id, title, url, 0, 0, max_rank+1, None, "None", 1, category)
1028             self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?,?);", values)
1029             self.db.commit()
1030             # Ask for the feed object, it will create the necessary tables
1031             self.getFeed(id)
1032
1033             if wc().available():
1034                 # Register the stream with Woodchuck.  Update approximately
1035                 # every 6 hours.
1036                 wc().stream_register(stream_identifier=id,
1037                                      human_readable_name=title,
1038                                      freshness=6*60*60)
1039
1040             return True
1041         else:
1042             return False
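    # Returns True if the feed was new and was added; False if a feed with
    # this id already existed (in which case nothing is changed).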
1043         
1044     def addCategory(self, title):
1045         rank = self.db.execute("SELECT MAX(rank)+1 FROM categories;").fetchone()[0]
1046         if rank==None:
1047             rank=1
1048         id = self.db.execute("SELECT MAX(id)+1 FROM categories;").fetchone()[0]
1049         if id==None:
1050             id=1
1051         self.db.execute("INSERT INTO categories (id, title, unread, rank) VALUES (?, ?, 0, ?)", (id, title, rank))
1052         self.db.commit()
1053     
1054     def removeFeed(self, key):
1055         if wc().available ():
1056             try:
1057                 del wc()[key]
1058             except KeyError:
1059                 logger.debug("Removing unregistered feed %s failed" % (key,))
1060
1061         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,) ).fetchone()[0]
1062         self.db.execute("DELETE FROM feeds WHERE id=?;", (key, ))
1063         self.db.execute("UPDATE feeds SET rank=rank-1 WHERE rank>?;", (rank,) )
1064         self.db.commit()
1065
1066         if isdir(self.configdir+key+".d/"):
1067            rmtree(self.configdir+key+".d/")
1068            
1069     def removeCategory(self, key):
1070         if self.db.execute("SELECT count(*) FROM categories;").fetchone()[0] > 1:
1071             rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,) ).fetchone()[0]
1072             self.db.execute("DELETE FROM categories WHERE id=?;", (key, ))
1073             self.db.execute("UPDATE categories SET rank=rank-1 WHERE rank>?;", (rank,) )
1074             self.db.execute("UPDATE feeds SET category=1 WHERE category=?;", (key,) )
1075             self.db.commit()
1076         
1077     #def saveConfig(self):
1078     #    self.listOfFeeds["feedingit-order"] = self.sortedKeys
1079     #    file = open(self.configdir+"feeds.pickle", "w")
1080     #    pickle.dump(self.listOfFeeds, file)
1081     #    file.close()
1082         
1083     def moveUp(self, key):
1084         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
1085         if rank>0:
1086             self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank-1) )
1087             self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank-1, key) )
1088             self.db.commit()
1089             
1090     def moveCategoryUp(self, key):
1091         rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
1092         if rank>0:
1093             self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank-1) )
1094             self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank-1, key) )
1095             self.db.commit()
1096         
1097     def moveDown(self, key):
1098         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
1099         max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
1100         if rank<max_rank:
1101             self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank+1) )
1102             self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank+1, key) )
1103             self.db.commit()
1104             
1105     def moveCategoryDown(self, key):
1106         rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
1107         max_rank = self.db.execute("SELECT MAX(rank) FROM categories;").fetchone()[0]
1108         if rank<max_rank:
1109             self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank+1) )
1110             self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank+1, key) )
1111             self.db.commit()
1112             
1113