444c918e229fd4996522871b120b5d0660e41613
[feedingit] / src / rss_sqlite.py
1 #!/usr/bin/env python2.5
2
3
4 # Copyright (c) 2007-2008 INdT.
5 # Copyright (c) 2011 Neal H. Walfield
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Lesser General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 #  This program is distributed in the hope that it will be useful,
12 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
13 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 #  GNU Lesser General Public License for more details.
15 #
16 #  You should have received a copy of the GNU Lesser General Public License
17 #  along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 #
19
20 # ============================================================================
21 # Name        : FeedingIt.py
22 # Author      : Yves Marcoz
23 # Version     : 0.5.4
24 # Description : Simple RSS Reader
25 # ============================================================================
26
27 from __future__ import with_statement
28
import sqlite3
from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove, utime
import os
import hashlib
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin
from calendar import timegm
import threading
import traceback
from wc import wc, wc_init, woodchuck
import subprocess
import dbus
from updatedbus import update_server_object

from jobmanager import JobManager
import mainthread
from httpprogresshandler import HTTPProgressHandler
import random
import sys
import logging
54 logger = logging.getLogger(__name__)
55
def getId(string):
    """Return the md5 hex digest of *string*.

    Used as a stable key for article ids and cached-image filenames.
    Uses hashlib (stdlib since Python 2.5) instead of the deprecated
    md5 module; the digests are identical.
    """
    return hashlib.md5(string).hexdigest()
58
def download_callback(connection):
    """Progress hook invoked during downloads; aborts the transfer by
    raising KeyboardInterrupt once a shutdown has been requested."""
    manager = JobManager()
    if manager.do_quit:
        raise KeyboardInterrupt
62
def downloader(progress_handler=None, proxy=None):
    """Build a urllib2 opener for feed/image downloads.

    Uses the given progress handler if supplied, otherwise a fresh
    HTTPProgressHandler wired to download_callback; a proxy handler is
    appended when one is configured.
    """
    if progress_handler is None:
        progress_handler = HTTPProgressHandler(download_callback)
    handlers = [progress_handler]
    if proxy:
        handlers.append(proxy)
    return urllib2.build_opener(*handlers)
75
# If not None, a subprocess.Popen object corresponding to a
# update_feeds.py process (spawned when no update daemon answers over
# D-Bus; see Feed.updateFeed).
update_feed_process = None

# Cached dbus.Interface proxy for org.marcoz.feedingit; reset to None
# whenever a call fails so the next attempt reconnects.
update_feeds_iface = None

# NOTE(review): not referenced anywhere in this file -- presumably
# maintained by the update daemon; confirm before removing.
jobs_at_start = 0
83
class Feed:
    # Class-level lock serializing the database-heavy portions of feed
    # updates across all Feed instances.
    serial_execution_lock = threading.Lock()

    def _getdb(self):
        """Return this thread's sqlite connection to the feed database,
        creating and caching it on first use (sqlite3 connections must
        not be shared across threads)."""
        try:
            return self.tls.db
        except AttributeError:
            conn = sqlite3.connect("%s/%s.db" % (self.dir, self.key),
                                   timeout=120)
            self.tls.db = conn
            return conn

    # Thread-local connection exposed as a read-only property.
    db = property(_getdb)
95
96     def __init__(self, configdir, key):
97         self.key = key
98         self.configdir = configdir
99         self.dir = "%s/%s.d" %(self.configdir, self.key)
100         self.tls = threading.local ()
101
102         if not isdir(self.dir):
103             mkdir(self.dir)
104         if not isfile("%s/%s.db" %(self.dir, self.key)):
105             self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
106             self.db.execute("CREATE TABLE images (id text, imagePath text);")
107             self.db.commit()
108
    def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
        """Download the image at *url* (resolved against *baseurl*) into the
        feed's cache directory and return the local filename.

        The file is named after the md5 of the url (getId), so repeated
        requests for the same image are served from disk.  Returns None
        when the download fails with an expected network/IO error;
        re-raises unexpected errors after removing the partial file.
        If the file already exists its mtime is refreshed so the expiry
        sweep in _updateFeed keeps it alive.
        """
        filename = configdir+key+".d/"+getId(url)
        if not isfile(filename):
            try:
                if not opener:
                    opener = downloader(proxy=proxy)

                abs_url = urljoin(baseurl,url)
                f = opener.open(abs_url)
                try:
                    with open(filename, "w") as outf:
                        for data in f:
                            outf.write(data)
                finally:
                    f.close()
            except (urllib2.HTTPError, urllib2.URLError, IOError), exception:
                logger.info("Could not download image %s: %s"
                            % (abs_url, str (exception)))
                return None
            except:
                exception = sys.exc_info()[0]

                logger.info("Downloading image %s: %s" %
                            (abs_url, traceback.format_exc()))
                # Don't leave a truncated file behind: a later call would
                # see it and assume the download succeeded.
                try:
                    remove(filename)
                except OSError:
                    pass

                raise exception
        else:
            # "Touch" the file so the expiry sweep treats it as fresh.
            file = open(filename,"a")
            utime(filename, None)
            file.close()
        return filename
145
    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
        """Update this feed.

        When running inside the update_feeds.py daemon, queue the real
        work (_updateFeed) on the JobManager.  Otherwise ask a running
        daemon to do it over D-Bus, spawning one if necessary and
        retrying the request for a few seconds while it starts up.
        """
        if (os.path.basename(sys.argv[0]) == 'update_feeds.py'):
            # We ARE the daemon: run the update in-process via the job queue.
            def doit():
                def it():
                    self._updateFeed(configdir, url, etag, modified, expiryTime, proxy, imageCache, postFeedUpdateFunc, *postFeedUpdateFuncArgs)
                return it
            JobManager().execute(doit(), self.key, priority=priority)
        else:
            def send_update_request():
                # Ask the daemon over D-Bus to update this feed.  Returns
                # True on success; on failure drops the cached proxy so
                # the next attempt reconnects.
                global update_feeds_iface
                if update_feeds_iface is None:
                    bus=dbus.SessionBus()
                    remote_object = bus.get_object(
                        "org.marcoz.feedingit", # Connection name
                        "/org/marcoz/feedingit/update" # Object's path
                        )
                    update_feeds_iface = dbus.Interface(
                        remote_object, 'org.marcoz.feedingit')

                try:
                    update_feeds_iface.Update(self.key)
                except Exception, e:
                    logger.error("Invoking org.marcoz.feedingit.Update: %s"
                                 % str(e))
                    update_feeds_iface = None
                else:
                    return True

            if send_update_request():
                # Success!  It seems we were able to start the update
                # daemon via dbus (or, it was already running).
                return

            global update_feed_process
            if (update_feed_process is None
                or update_feed_process.poll() is not None):
                # The update_feeds process is not running.  Start it.
                update_feeds = os.path.join(os.path.dirname(__file__),
                                            'update_feeds.py')
                argv = ['/usr/bin/env', 'python', update_feeds, '--daemon' ]
                logger.debug("Starting update_feeds: running %s"
                             % (str(argv),))
                update_feed_process = subprocess.Popen(argv)
                # Make sure the dbus calls go to the right process:
                # rebind.
                update_feeds_iface = None

            # Give the daemon up to ~5 seconds to come up, retrying the
            # D-Bus request once a second.
            for _ in xrange(5):
                if send_update_request():
                    break
                time.sleep(1)
197
    def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
        """Synchronously fetch and process the feed at *url* (worker thread).

        - Downloads the feed with feedparser, honouring etag/modified.
        - Stores new/updated articles (and, when imageCache is set, their
          images) in the per-feed database and as cached HTML files.
        - Reports progress and failures to Woodchuck when available.
        - Expires old articles and stale cache files.
        - Always finishes by invoking postFeedUpdateFunc(key, updateTime,
          etag, modified, title, *postFeedUpdateFuncArgs).

        expiryTime is in hours.
        """
        success = False
        have_serial_execution_lock = False
        try:
            download_start = time.time ()

            progress_handler = HTTPProgressHandler(download_callback)

            openers = [progress_handler]
            if proxy:
                openers.append (proxy)
            kwargs = {'handlers':openers}

            tmp=feedparser.parse(url, etag=etag, modified=modified, **kwargs)
            download_duration = time.time () - download_start

            opener = downloader(progress_handler, proxy)

            if JobManager().do_quit:
                raise KeyboardInterrupt

            process_start = time.time()

            # Expiry time is in hours
            expiry = float(expiryTime) * 3600.

            currentTime = 0

            have_woodchuck = mainthread.execute (wc().available)

            def wc_success():
                # Register a successful stream update with Woodchuck.
                try:
                    wc().stream_register (self.key, "", 6 * 60 * 60)
                except woodchuck.ObjectExistsError:
                    pass
                try:
                    wc()[self.key].updated (
                        indicator=(woodchuck.Indicator.ApplicationVisual
                                   |woodchuck.Indicator.StreamWide),
                        transferred_down=progress_handler.stats['received'],
                        transferred_up=progress_handler.stats['sent'],
                        transfer_time=download_start,
                        transfer_duration=download_duration,
                        new_objects=len (tmp.entries),
                        objects_inline=len (tmp.entries))
                except KeyError:
                    logger.warn(
                        "Failed to register update of %s with woodchuck!"
                        % (self.key))

            http_status = tmp.get ('status', 200)

            # Check if the parse was succesful.  If the http status code
            # is 304, then the download was successful, but there is
            # nothing new.  Indeed, no content is returned.  This make a
            # 304 look like an error because there are no entries and the
            # parse fails.  But really, everything went great!  Check for
            # this first.
            if http_status == 304:
                logger.debug("%s: No changes to feed." % (self.key,))
                mainthread.execute (wc_success, async=True)
                success = True
            elif len(tmp["entries"])==0 and not tmp.version:
                # An error occured fetching or parsing the feed.  (Version
                # will be either None if e.g. the connection timed our or
                # '' if the data is not a proper feed)
                logger.error(
                    "Error fetching %s: version is: %s: error: %s"
                    % (url, str (tmp.version),
                       str (tmp.get ('bozo_exception', 'Unknown error'))))
                logger.debug(tmp)
                if have_woodchuck:
                    def e():
                        logger.debug("%s: stream update failed!" % self.key)

                        try:
                            # It's not easy to get the feed's title from here.
                            # At the latest, the next time the application is
                            # started, we'll fix up the human readable name.
                            wc().stream_register (self.key, "", 6 * 60 * 60)
                        except woodchuck.ObjectExistsError:
                            pass
                        # Map the HTTP status class to a Woodchuck error code.
                        ec = woodchuck.TransferStatus.TransientOther
                        if 300 <= http_status and http_status < 400:
                            ec = woodchuck.TransferStatus.TransientNetwork
                        if 400 <= http_status and http_status < 500:
                            ec = woodchuck.TransferStatus.FailureGone
                        if 500 <= http_status and http_status < 600:
                            ec = woodchuck.TransferStatus.TransientNetwork
                        wc()[self.key].update_failed(ec)
                    mainthread.execute (e, async=True)
            else:
               currentTime = time.time()
               # The etag and modified value should only be updated if the content was not null
               try:
                   etag = tmp["etag"]
               except KeyError:
                   etag = None
               try:
                   modified = tmp["modified"]
               except KeyError:
                   modified = None
               # Best-effort favicon refresh; failures are only logged.
               try:
                   abs_url = urljoin(tmp["feed"]["link"],"/favicon.ico")
                   f = opener.open(abs_url)
                   data = f.read()
                   f.close()
                   outf = open(self.dir+"/favicon.ico", "w")
                   outf.write(data)
                   outf.close()
                   del data
               except (urllib2.HTTPError, urllib2.URLError), exception:
                   logger.debug("Could not download favicon %s: %s"
                                % (abs_url, str (exception)))

               self.serial_execution_lock.acquire ()
               have_serial_execution_lock = True

               ids = self.getIds()

               # Process oldest entries first.
               tmp["entries"].reverse()
               for entry in tmp["entries"]:
                   # Yield so as to make the main thread a bit more
                   # responsive.
                   time.sleep(0)

                   if JobManager().do_quit:
                       raise KeyboardInterrupt

                   received_base = progress_handler.stats['received']
                   sent_base = progress_handler.stats['sent']
                   object_size = 0

                   date = self.extractDate(entry)
                   # Fill in defaults for fields the feed may omit.
                   try:
                       entry["title"]
                   except KeyError:
                       entry["title"] = "No Title"
                   try :
                       entry["link"]
                   except KeyError:
                       entry["link"] = ""
                   try:
                       entry["author"]
                   except KeyError:
                       entry["author"] = None
                   if(not(entry.has_key("id"))):
                       entry["id"] = None
                   content = self.extractContent(entry)
                   object_size = len (content)
                   # The feed body was already counted by the progress
                   # handler; don't count it again per-article.
                   received_base -= len (content)
                   tmpEntry = {"title":entry["title"], "content":content,
                                "date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
                   id = self.generateUniqueId(tmpEntry)

                   soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
                   images = soup('img')
                   baseurl = tmpEntry["link"]
                   if imageCache and len(images) > 0:
                       # Image downloads don't touch the database; release
                       # the serialization lock while they run.
                       self.serial_execution_lock.release ()
                       have_serial_execution_lock = False
                       for img in images:
                           filename = self.addImage(
                               configdir, self.key, baseurl, img['src'],
                               opener=opener)
                           if filename:
                                # Rewrite the img tag to point at the cache.
                                img['src']="file://%s" %filename
                                count = self.db.execute("SELECT count(1) FROM images where id=? and imagePath=?;", (id, filename )).fetchone()[0]
                                if count == 0:
                                    self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
                                    self.db.commit()

                                try:
                                    object_size += os.path.getsize (filename)
                                except os.error, exception:
                                    logger.error ("Error getting size of %s: %s"
                                                  % (filename, exception))
                       self.serial_execution_lock.acquire ()
                       have_serial_execution_lock = True

                   tmpEntry["contentLink"] = configdir+self.key+".d/"+id+".html"
                   file = open(tmpEntry["contentLink"], "w")
                   file.write(soup.prettify())
                   file.close()
                   if id in ids:
                       self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
                       self.db.commit()
                   else:
                       values = (id, tmpEntry["title"], tmpEntry["contentLink"], tmpEntry["date"], currentTime, tmpEntry["link"], 0)
                       self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
                       self.db.commit()

                   # Register the object with Woodchuck and mark it as
                   # downloaded.
                   if have_woodchuck:
                       def e():
                           try:
                               obj = wc()[self.key].object_register(
                                   object_identifier=id,
                                   human_readable_name=tmpEntry["title"])
                           except woodchuck.ObjectExistsError:
                               obj = wc()[self.key][id]
                           else:
                               # If the entry does not contain a publication
                               # time, the attribute won't exist.
                               pubtime = entry.get ('date_parsed', None)
                               if pubtime:
                                   obj.publication_time = time.mktime (pubtime)

                               received = (progress_handler.stats['received']
                                           - received_base)
                               sent = progress_handler.stats['sent'] - sent_base
                               obj.transferred (
                                   indicator=(woodchuck.Indicator.ApplicationVisual
                                              |woodchuck.Indicator.StreamWide),
                                   transferred_down=received,
                                   transferred_up=sent,
                                   object_size=object_size)
                       mainthread.execute(e, async=True)
               self.db.commit()

               logger.debug (
                   "%s: Update successful: transferred: %d/%d; objects: %d)"
                   % (self.key,
                      progress_handler.stats['sent'],
                      progress_handler.stats['received'],
                      len (tmp.entries)))
               mainthread.execute (wc_success, async=True)
               success = True

            # Expire articles: unread after twice the expiry interval,
            # read after one.
            rows = self.db.execute("SELECT id FROM feed WHERE (read=0 AND updated<?) OR (read=1 AND updated<?);", (currentTime-2*expiry, currentTime-expiry))
            for row in rows:
               self.removeEntry(row[0])

            # Sweep the cache directory: drop any file (HTML, image,
            # favicon) not modified within three expiry intervals.
            from glob import glob
            from os import stat
            for file in glob(configdir+self.key+".d/*"):
                stats = stat(file)
                # put the two dates into matching format
                lastmodDate = stats[8]
                expDate = time.time()-expiry*3
                # check if image-last-modified-date is outdated
                if expDate > lastmodDate:
                    try:
                        # XXX: Tell woodchuck.
                        remove(file)
                    except OSError, exception:
                        logger.error('Could not remove %s: %s'
                                     % (file, str (exception)))
            logger.debug("updated %s: %fs in download, %fs in processing"
                         % (self.key, download_duration,
                            time.time () - process_start))
        except:
            logger.error("Updating %s: %s" % (self.key, traceback.format_exc()))
        finally:
            self.db.commit ()

            if have_serial_execution_lock:
                self.serial_execution_lock.release ()

            updateTime = 0
            try:
                rows = self.db.execute("SELECT MAX(date) FROM feed;")
                for row in rows:
                    updateTime=row[0]
            except Exception, e:
                logger.error("Fetching update time: %s: %s"
                             % (str(e), traceback.format_exc()))
            finally:
                # On failure, clear etag/modified so the next attempt
                # re-downloads unconditionally.
                if not success:
                    etag = None
                    modified = None
                title = None
                try:
                    # tmp may be unbound if parsing never happened.
                    title = tmp.feed.title
                except (AttributeError, UnboundLocalError), exception:
                    pass
                if postFeedUpdateFunc is not None:
                    postFeedUpdateFunc (self.key, updateTime, etag, modified,
                                        title, *postFeedUpdateFuncArgs)
509
    def setEntryRead(self, id):
        """Mark article *id* as read."""
        self.db.execute("UPDATE feed SET read=1 WHERE id=?;", (id,) )
        self.db.commit()

        # NOTE(review): e() is defined but never scheduled -- compare
        # removeEntry, which runs its callback via
        # mainthread.execute(e, async=True).  As written, the Woodchuck
        # used() notification is never sent; presumably a missing call.
        def e():
            if wc().available():
                try:
                    wc()[self.key][id].used()
                except KeyError:
                    pass
520
521     def setEntryUnread(self, id):
522         self.db.execute("UPDATE feed SET read=0 WHERE id=?;", (id,) )
523         self.db.commit()     
524         
525     def markAllAsRead(self):
526         self.db.execute("UPDATE feed SET read=1 WHERE read=0;")
527         self.db.commit()
528
529     def isEntryRead(self, id):
530         read_status = self.db.execute("SELECT read FROM feed WHERE id=?;", (id,) ).fetchone()[0]
531         return read_status==1  # Returns True if read==1, and False if read==0
532     
533     def getTitle(self, id):
534         return self.db.execute("SELECT title FROM feed WHERE id=?;", (id,) ).fetchone()[0]
535     
536     def getContentLink(self, id):
537         return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,) ).fetchone()[0]
538     
539     def getExternalLink(self, id):
540         return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,) ).fetchone()[0]
541     
542     def getDate(self, id):
543         dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
544         return time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(dateStamp))
545
546     def getDateTuple(self, id):
547         dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
548         return time.localtime(dateStamp)
549     
550     def getDateStamp(self, id):
551         return self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
552     
553     def generateUniqueId(self, entry):
554         """
555         Generate a stable identifier for the article.  For the same
556         entry, this should result in the same identifier.  If
557         possible, the identifier should remain the same even if the
558         article is updated.
559         """
560         # Prefer the entry's id, which is supposed to be globally
561         # unique.
562         key = entry.get('id', None)
563         if not key:
564             # Next, try the link to the content.
565             key = entry.get('link', None)
566         if not key:
567             # Ok, the title and the date concatenated are likely to be
568             # relatively stable.
569             key = entry.get('title', None) + entry.get('date', None)
570         if not key:
571             # Hmm, the article's content will at least guarantee no
572             # false negatives (i.e., missing articles)
573             key = entry.get('content', None)
574         if not key:
575             # If all else fails, just use a random number.
576             key = str (random.random ())
577         return getId (key)
578     
579     def getIds(self, onlyUnread=False):
580         if onlyUnread:
581             rows = self.db.execute("SELECT id FROM feed where read=0 ORDER BY date DESC;").fetchall()
582         else:
583             rows = self.db.execute("SELECT id FROM feed ORDER BY date DESC;").fetchall()
584         ids = []
585         for row in rows:
586             ids.append(row[0])
587         #ids.reverse()
588         return ids
589     
590     def getNextId(self, id, forward=True):
591         if forward:
592             delta = 1
593         else:
594             delta = -1
595         ids = self.getIds()
596         index = ids.index(id)
597         return ids[(index + delta) % len(ids)]
598         
599     def getPreviousId(self, id):
600         return self.getNextId(id, forward=False)
601     
602     def getNumberOfUnreadItems(self):
603         return self.db.execute("SELECT count(*) FROM feed WHERE read=0;").fetchone()[0]
604     
605     def getNumberOfEntries(self):
606         return self.db.execute("SELECT count(*) FROM feed;").fetchone()[0]
607
608     def getArticle(self, entry):
609         #self.setEntryRead(id)
610         #entry = self.entries[id]
611         title = entry['title']
612         #content = entry.get('content', entry.get('summary_detail', {}))
613         content = entry["content"]
614
615         link = entry['link']
616         author = entry['author']
617         date = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(entry["date"]) )
618
619         #text = '''<div style="color: black; background-color: white;">'''
620         text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
621         text += "<html><head><title>" + title + "</title>"
622         text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
623         #text += '<style> body {-webkit-user-select: none;} </style>'
624         text += '</head><body bgcolor=\"#ffffff\"><div><a href=\"' + link + '\">' + title + "</a>"
625         if author != None:
626             text += "<BR /><small><i>Author: " + author + "</i></small>"
627         text += "<BR /><small><i>Date: " + date + "</i></small></div>"
628         text += "<BR /><BR />"
629         text += content
630         text += "</body></html>"
631         return text
632    
633     def getContent(self, id):
634         contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
635         try:
636             file = open(self.entries[id]["contentLink"])
637             content = file.read()
638             file.close()
639         except:
640             content = "Content unavailable"
641         return content
642     
643     def extractDate(self, entry):
644         if entry.has_key("updated_parsed"):
645             return timegm(entry["updated_parsed"])
646         elif entry.has_key("published_parsed"):
647             return timegm(entry["published_parsed"])
648         else:
649             return time.time()
650         
651     def extractContent(self, entry):
652         content = ""
653         if entry.has_key('summary'):
654             content = entry.get('summary', '')
655         if entry.has_key('content'):
656             if len(entry.content[0].value) > len(content):
657                 content = entry.content[0].value
658         if content == "":
659             content = entry.get('description', '')
660         return content
661     
    def removeEntry(self, id):
        """Delete article *id*: its cached HTML file, its feed and image
        rows, and (asynchronously) its Woodchuck registration.

        Note: image rows are deleted but the image FILES are not; they
        are reclaimed by the mtime-based sweep in _updateFeed (or by
        ArchivedArticles.removeArticle, which deletes them explicitly).
        """
        contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
        if contentLink:
            try:
                remove(contentLink)
            except OSError, exception:
                logger.error("Deleting %s: %s" % (contentLink, str (exception)))
        self.db.execute("DELETE FROM feed WHERE id=?;", (id,) )
        self.db.execute("DELETE FROM images WHERE id=?;", (id,) )
        self.db.commit()

        def e():
            # Tell Woodchuck the files are gone, then drop the object.
            if wc().available():
                try:
                    wc()[self.key][id].files_deleted (
                        woodchuck.DeletionResponse.Deleted)
                    del wc()[self.key][id]
                except KeyError:
                    pass
        mainthread.execute (e, async=True)
682  
class ArchivedArticles(Feed):
    """A pseudo-feed holding articles the user explicitly saved.

    Articles are added with just a title and link; updateFeed() later
    fetches the page body (and its images) for every entry that has not
    been downloaded yet.
    """
    def addArchivedArticle(self, title, link, date, configdir):
        # updated=0 marks the article as "not yet downloaded"; see
        # updateFeed below, which selects on updated=0.
        id = self.generateUniqueId({"date":date, "title":title})
        values = (id, title, link, date, 0, link, 0)
        self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
        self.db.commit()

    # NOTE(review): signature differs from Feed.updateFeed (no priority /
    # postFeedUpdateFunc parameters) and it returns a tuple instead of
    # dispatching jobs -- callers must not treat the two alike.
    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False):
        """Download the page body for every pending archived article."""
        currentTime = 0
        rows = self.db.execute("SELECT id, link FROM feed WHERE updated=0;")
        for row in rows:
            currentTime = time.time()
            id = row[0]
            link = row[1]
            f = urllib2.urlopen(link)
            html = f.read()
            f.close()
            soup = BeautifulSoup(html)
            images = soup('img')
            baseurl = link
            for img in images:
                # Cache each image and rewrite the tag to the local copy.
                filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
                img['src']=filename
                self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
                self.db.commit()
            contentLink = configdir+self.key+".d/"+id+".html"
            file = open(contentLink, "w")
            file.write(soup.prettify())
            file.close()

            # A non-zero updated stamp marks the article as downloaded.
            self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
            self.db.commit()
        return (currentTime, None, None)

    def purgeReadArticles(self):
        """Remove every archived article already marked as read."""
        rows = self.db.execute("SELECT id FROM feed WHERE read=1;")
        for row in rows:
            self.removeArticle(row[0])

    def removeArticle(self, id):
        """Remove article *id*, deleting cached image files that no other
        article still references, then the database rows via removeEntry."""
        rows = self.db.execute("SELECT imagePath FROM images WHERE id=?;", (id,) )
        for row in rows:
            try:
                count = self.db.execute("SELECT count(*) FROM images WHERE id!=? and imagePath=?;", (id,row[0]) ).fetchone()[0]
                if count == 0:
                    os.remove(row[0])
            except:
                # Best effort -- a missing file is fine.
                pass
        self.removeEntry(id)
734
735 class Listing:
    def _getdb(self):
        """Return this thread's sqlite connection, creating it on first use.

        sqlite3 connections must not be shared across threads, so one
        connection per thread is cached in thread-local storage (self.tls).
        """
        try:
            db = self.tls.db
        except AttributeError:
            # First access from this thread: open a dedicated connection.
            # The long timeout (120 s) reduces "database is locked" errors
            # when several threads write concurrently.
            db = sqlite3.connect("%s/feeds.db" % self.configdir, timeout=120)
            self.tls.db = db
        return db
    # Expose the per-thread connection as a read-only attribute.
    db = property(_getdb)
744
    # Lists all the feeds in a dictionary, and exposes the data
746     def __init__(self, config, configdir):
747         self.config = config
748         self.configdir = configdir
749
750         self.tls = threading.local ()
751         
752         try:
753             table = self.db.execute("SELECT sql FROM sqlite_master").fetchone()
754             if table == None:
755                 self.db.execute("CREATE TABLE feeds(id text, url text, title text, unread int, updateTime float, rank int, etag text, modified text, widget int, category int);")
756                 self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
757                 self.addCategory("Default Category")
758                 if isfile(self.configdir+"feeds.pickle"):
759                     self.importOldFormatFeeds()
760                 else:
761                     self.addFeed("Maemo News", "http://maemo.org/news/items.xml")    
762             else:
763                 from string import find, upper
764                 if find(upper(table[0]), "WIDGET")<0:
765                     self.db.execute("ALTER TABLE feeds ADD COLUMN widget int;")
766                     self.db.execute("UPDATE feeds SET widget=1;")
767                     self.db.commit()
768                 if find(upper(table[0]), "CATEGORY")<0:
769                     self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
770                     self.addCategory("Default Category")
771                     self.db.execute("ALTER TABLE feeds ADD COLUMN category int;")
772                     self.db.execute("UPDATE feeds SET category=1;")
773             self.db.commit()
774         except:
775             pass
776
777         # Check that Woodchuck's state is up to date with respect our
778         # state.
779         updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
780         wc_init (self, True if updater else False)
781         if wc().available() and updater:
782             # The list of known streams.
783             streams = wc().streams_list ()
784             stream_ids = [s.identifier for s in streams]
785
786             # Register any unknown streams.  Remove known streams from
787             # STREAMS_IDS.
788             for key in self.getListOfFeeds():
789                 title = self.getFeedTitle(key)
790                 # XXX: We should also check whether the list of
791                 # articles/objects in each feed/stream is up to date.
792                 if key not in stream_ids:
793                     logger.debug(
794                         "Registering previously unknown channel: %s (%s)"
795                         % (key, title,))
796                     # Use a default refresh interval of 6 hours.
797                     wc().stream_register (key, title, 6 * 60 * 60)
798                 else:
799                     # Make sure the human readable name is up to date.
800                     if wc()[key].human_readable_name != title:
801                         wc()[key].human_readable_name = title
802                     stream_ids.remove (key)
803                     
804
805             # Unregister any streams that are no longer subscribed to.
806             for id in stream_ids:
807                 logger.debug("Unregistering %s" % (id,))
808                 w.stream_unregister (id)
809
810     def importOldFormatFeeds(self):
811         """This function loads feeds that are saved in an outdated format, and converts them to sqlite"""
812         import rss
813         listing = rss.Listing(self.configdir)
814         rank = 0
815         for id in listing.getListOfFeeds():
816             try:
817                 rank += 1
818                 values = (id, listing.getFeedTitle(id) , listing.getFeedUrl(id), 0, time.time(), rank, None, "None", 1)
819                 self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?, 1);", values)
820                 self.db.commit()
821                 
822                 feed = listing.getFeed(id)
823                 new_feed = self.getFeed(id)
824                 
825                 items = feed.getIds()[:]
826                 items.reverse()
827                 for item in items:
828                         if feed.isEntryRead(item):
829                             read_status = 1
830                         else:
831                             read_status = 0 
832                         date = timegm(feed.getDateTuple(item))
833                         title = feed.getTitle(item)
834                         newId = new_feed.generateUniqueId({"date":date, "title":title})
835                         values = (newId, title , feed.getContentLink(item), date, tuple(time.time()), feed.getExternalLink(item), read_status)
836                         new_feed.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
837                         new_feed.db.commit()
838                         try:
839                             images = feed.getImages(item)
840                             for image in images:
841                                 new_feed.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (item, image) )
842                                 new_feed.db.commit()
843                         except:
844                             pass
845                 self.updateUnread(id)
846             except:
847                 logger.error("importOldFormatFeeds: %s"
848                              % (traceback.format_exc(),))
849         remove(self.configdir+"feeds.pickle")
850                 
851         
852     def addArchivedArticle(self, key, index):
853         feed = self.getFeed(key)
854         title = feed.getTitle(index)
855         link = feed.getExternalLink(index)
856         date = feed.getDate(index)
857         count = self.db.execute("SELECT count(*) FROM feeds where id=?;", ("ArchivedArticles",) ).fetchone()[0]
858         if count == 0:
859             self.addFeed("Archived Articles", "", id="ArchivedArticles")
860
861         archFeed = self.getFeed("ArchivedArticles")
862         archFeed.addArchivedArticle(title, link, date, self.configdir)
863         self.updateUnread("ArchivedArticles")
864         
865     def updateFeed(self, key, expiryTime=None, proxy=None, imageCache=None,
866                    priority=0):
867         if expiryTime is None:
868             expiryTime = self.config.getExpiry()
869         if not expiryTime:
870             # Default to 24 hours
871             expriyTime = 24
872         if proxy is None:
873             (use_proxy, proxy) = self.config.getProxy()
874             if not use_proxy:
875                 proxy = None
876         if imageCache is None:
877             imageCache = self.config.getImageCache()
878
879         feed = self.getFeed(key)
880         (url, etag, modified) = self.db.execute("SELECT url, etag, modified FROM feeds WHERE id=?;", (key,) ).fetchone()
881         try:
882             modified = time.struct_time(eval(modified))
883         except:
884             modified = None
885         feed.updateFeed(
886             self.configdir, url, etag, modified, expiryTime, proxy, imageCache,
887             priority, postFeedUpdateFunc=self._queuePostFeedUpdate)
888
    def _queuePostFeedUpdate(self, *args, **kwargs):
        # Marshal _postFeedUpdate onto the main thread (asynchronously);
        # presumably so the D-Bus signalling it performs happens on the
        # main loop's thread -- TODO confirm against mainthread.execute.
        mainthread.execute (self._postFeedUpdate, async=True, *args, **kwargs)
891
892     def _postFeedUpdate(self, key, updateTime, etag, modified, title):
893         if modified==None:
894             modified="None"
895         else:
896             modified=str(tuple(modified))
897         if updateTime > 0:
898             self.db.execute("UPDATE feeds SET updateTime=?, etag=?, modified=? WHERE id=?;", (updateTime, etag, modified, key) )
899         else:
900             self.db.execute("UPDATE feeds SET etag=?, modified=? WHERE id=?;", (etag, modified, key) )
901
902         if title is not None:
903             self.db.execute("UPDATE feeds SET title=(case WHEN title=='' THEN ? ELSE title END) where id=?;",
904                             (title, key))
905         self.db.commit()
906         self.updateUnread(key)
907
908         update_server_object().ArticleCountUpdated()
909
910         stats = JobManager().stats()
911         global jobs_at_start
912         completed = stats['jobs-completed'] - jobs_at_start
913         in_progress = stats['jobs-in-progress']
914         queued = stats['jobs-queued']
915
916         percent = (100 * ((completed + in_progress / 2.))
917                    / (completed + in_progress + queued))
918
919         update_server_object().UpdateProgress(
920             percent, completed, in_progress, queued, 0, 0, 0, key)
921
922         if in_progress == 0 and queued == 0:
923             jobs_at_start = stats['jobs-completed']
924         
925     def getFeed(self, key):
926         if key == "ArchivedArticles":
927             return ArchivedArticles(self.configdir, key)
928         return Feed(self.configdir, key)
929         
930     def editFeed(self, key, title, url, category=None):
931         if category:
932             self.db.execute("UPDATE feeds SET title=?, url=?, category=? WHERE id=?;", (title, url, category, key))
933         else:
934             self.db.execute("UPDATE feeds SET title=?, url=? WHERE id=?;", (title, url, key))
935         self.db.commit()
936
937         if wc().available():
938             try:
939                 wc()[key].human_readable_name = title
940             except KeyError:
941                 logger.debug("Feed %s (%s) unknown." % (key, title))
942         
943     def getFeedUpdateTime(self, key):
944         return time.ctime(self.db.execute("SELECT updateTime FROM feeds WHERE id=?;", (key,)).fetchone()[0])
945         
946     def getFeedNumberOfUnreadItems(self, key):
947         return self.db.execute("SELECT unread FROM feeds WHERE id=?;", (key,)).fetchone()[0]
948         
949     def getFeedTitle(self, key):
950         (title, url) = self.db.execute("SELECT title, url FROM feeds WHERE id=?;", (key,)).fetchone()
951         if title:
952             return title
953         return url
954         
955     def getFeedUrl(self, key):
956         return self.db.execute("SELECT url FROM feeds WHERE id=?;", (key,)).fetchone()[0]
957     
958     def getFeedCategory(self, key):
959         return self.db.execute("SELECT category FROM feeds WHERE id=?;", (key,)).fetchone()[0]
960         
961     def getListOfFeeds(self, category=None):
962         if category:
963             rows = self.db.execute("SELECT id FROM feeds WHERE category=? ORDER BY rank;", (category, ) )
964         else:
965             rows = self.db.execute("SELECT id FROM feeds ORDER BY rank;" )
966         keys = []
967         for row in rows:
968             if row[0]:
969                 keys.append(row[0])
970         return keys
971     
972     def getListOfCategories(self):
973         rows = self.db.execute("SELECT id FROM categories ORDER BY rank;" )
974         keys = []
975         for row in rows:
976             if row[0]:
977                 keys.append(row[0])
978         return keys
979     
980     def getCategoryTitle(self, id):
981         row = self.db.execute("SELECT title FROM categories WHERE id=?;", (id, )).fetchone()
982         return row[0]
983     
984     def getSortedListOfKeys(self, order, onlyUnread=False, category=1):
985         if   order == "Most unread":
986             tmp = "ORDER BY unread DESC"
987             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1], reverse=True)
988         elif order == "Least unread":
989             tmp = "ORDER BY unread"
990             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1])
991         elif order == "Most recent":
992             tmp = "ORDER BY updateTime DESC"
993             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2], reverse=True)
994         elif order == "Least recent":
995             tmp = "ORDER BY updateTime"
996             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2])
997         else: # order == "Manual" or invalid value...
998             tmp = "ORDER BY rank"
999             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][0])
1000         if onlyUnread:
1001             sql = "SELECT id FROM feeds WHERE unread>0 AND category=%s " %category + tmp 
1002         else:
1003             sql = "SELECT id FROM feeds WHERE category=%s " %category + tmp
1004         rows = self.db.execute(sql)
1005         keys = []
1006         for row in rows:
1007             if row[0]:
1008                 keys.append(row[0])
1009         return keys
1010     
1011     def getFavicon(self, key):
1012         filename = "%s%s.d/favicon.ico" % (self.configdir, key)
1013         if isfile(filename):
1014             return filename
1015         else:
1016             return False
1017         
1018     def updateUnread(self, key):
1019         feed = self.getFeed(key)
1020         self.db.execute("UPDATE feeds SET unread=? WHERE id=?;", (feed.getNumberOfUnreadItems(), key))
1021         self.db.commit()
1022
1023     def addFeed(self, title, url, id=None, category=1):
1024         if not id:
1025             id = getId(url)
1026         count = self.db.execute("SELECT count(*) FROM feeds WHERE id=?;", (id,) ).fetchone()[0]
1027         if count == 0:
1028             max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
1029             if max_rank == None:
1030                 max_rank = 0
1031             values = (id, title, url, 0, 0, max_rank+1, None, "None", 1, category)
1032             self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?,?);", values)
1033             self.db.commit()
1034             # Ask for the feed object, it will create the necessary tables
1035             self.getFeed(id)
1036
1037             if wc().available():
1038                 # Register the stream with Woodchuck.  Update approximately
1039                 # every 6 hours.
1040                 wc().stream_register(stream_identifier=id,
1041                                      human_readable_name=title,
1042                                      freshness=6*60*60)
1043
1044             return True
1045         else:
1046             return False
1047         
1048     def addCategory(self, title):
1049         rank = self.db.execute("SELECT MAX(rank)+1 FROM categories;").fetchone()[0]
1050         if rank==None:
1051             rank=1
1052         id = self.db.execute("SELECT MAX(id)+1 FROM categories;").fetchone()[0]
1053         if id==None:
1054             id=1
1055         self.db.execute("INSERT INTO categories (id, title, unread, rank) VALUES (?, ?, 0, ?)", (id, title, rank))
1056         self.db.commit()
1057     
1058     def removeFeed(self, key):
1059         if wc().available ():
1060             try:
1061                 del wc()[key]
1062             except KeyError:
1063                 logger.debug("Removing unregistered feed %s failed" % (key,))
1064
1065         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,) ).fetchone()[0]
1066         self.db.execute("DELETE FROM feeds WHERE id=?;", (key, ))
1067         self.db.execute("UPDATE feeds SET rank=rank-1 WHERE rank>?;", (rank,) )
1068         self.db.commit()
1069
1070         if isdir(self.configdir+key+".d/"):
1071            rmtree(self.configdir+key+".d/")
1072            
1073     def removeCategory(self, key):
1074         if self.db.execute("SELECT count(*) FROM categories;").fetchone()[0] > 1:
1075             rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,) ).fetchone()[0]
1076             self.db.execute("DELETE FROM categories WHERE id=?;", (key, ))
1077             self.db.execute("UPDATE categories SET rank=rank-1 WHERE rank>?;", (rank,) )
1078             self.db.execute("UPDATE feeds SET category=1 WHERE category=?;", (key,) )
1079             self.db.commit()
1080         
1081     #def saveConfig(self):
1082     #    self.listOfFeeds["feedingit-order"] = self.sortedKeys
1083     #    file = open(self.configdir+"feeds.pickle", "w")
1084     #    pickle.dump(self.listOfFeeds, file)
1085     #    file.close()
1086         
1087     def moveUp(self, key):
1088         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
1089         if rank>0:
1090             self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank-1) )
1091             self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank-1, key) )
1092             self.db.commit()
1093             
1094     def moveCategoryUp(self, key):
1095         rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
1096         if rank>0:
1097             self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank-1) )
1098             self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank-1, key) )
1099             self.db.commit()
1100         
1101     def moveDown(self, key):
1102         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
1103         max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
1104         if rank<max_rank:
1105             self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank+1) )
1106             self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank+1, key) )
1107             self.db.commit()
1108             
1109     def moveCategoryDown(self, key):
1110         rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
1111         max_rank = self.db.execute("SELECT MAX(rank) FROM categories;").fetchone()[0]
1112         if rank<max_rank:
1113             self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank+1) )
1114             self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank+1, key) )
1115             self.db.commit()
1116             
1117