IMDb: Split downloading from parsing into SQLite database
author Philipp Zabel <philipp.zabel@gmail.com>
Sat, 22 May 2010 13:41:55 +0000 (15:41 +0200)
committer Philipp Zabel <philipp.zabel@gmail.com>
Wed, 14 Jul 2010 21:34:08 +0000 (23:34 +0200)
This takes about 135 MB more storage, but it will minimize the download
time to reduce energy consumption.

Makefile.am
src/imdb/ftp-downloader.vala [new file with mode: 0644]
src/imdb/imdb-downloader-cli.vala
src/imdb/imdb-ftp-downloader.vala [deleted file]
src/imdb/imdb-gzip-parser.vala [new file with mode: 0644]
src/imdb/imdb-plaintext-downloader.vala

index 4ba24c9..0f7b627 100644 (file)
@@ -222,14 +222,18 @@ src/backends/moviepilot/moviepilot-backend.c: ${cinaest_moviepilot_backend_VALAS
 
 imdb_plaintext_downloader_SOURCES = \
         src/imdb/imdb-plaintext-downloader.c \
-       src/imdb/imdb-ftp-downloader.c \
+       src/imdb/ftp-downloader.c \
+       src/imdb/gzip-input-stream.c \
+       src/imdb/imdb-gzip-parser.c \
        src/imdb/imdb-line-parser.c \
         src/imdb/imdb-sqlite.c \
         src/imdb/plaintext-downloader-interface.c
 
 imdb_plaintext_downloader_VALASOURCES = \
         src/imdb/imdb-plaintext-downloader.vala \
-       src/imdb/imdb-ftp-downloader.vala \
+       src/imdb/ftp-downloader.vala \
+       src/imdb/gzip-input-stream.vala \
+       src/imdb/imdb-gzip-parser.vala \
        src/imdb/imdb-line-parser.vala \
         src/imdb/imdb-sqlite.vala \
         src/imdb/plaintext-downloader-interface.vala
diff --git a/src/imdb/ftp-downloader.vala b/src/imdb/ftp-downloader.vala
new file mode 100644 (file)
index 0000000..2e873c9
--- /dev/null
@@ -0,0 +1,139 @@
+class FtpDownloader {
+       private Curl.EasyHandle curl;
+       private Cancellable cancellable;
+       private FileStream file;
+       private string dirname;
+       private HashTable<string,int> file_size;
+
+       public FtpDownloader (Cancellable? _cancellable) {
+               cancellable = _cancellable;
+               curl = new Curl.EasyHandle ();
+       }
+
+       [CCode (instance_pos = -1)]
+       size_t write_callback (void *buffer, size_t size, size_t nmemb) {
+               if (cancellable != null && cancellable.is_cancelled ())
+                       return 0;
+
+               unowned uint8[] buf = (uint8[]) buffer;
+               buf.length = (int) (size * nmemb);
+
+               file.write (buf);
+
+               return buf.length;
+       }
+
+       private int last_dlnow;
+
+       int progress_callback (double dltotal, double dlnow, double ultotal, double ulnow) {
+               if (cancellable != null && cancellable.is_cancelled ())
+                       return 1;
+               if (last_dlnow != (int) dlnow) {
+                       last_dlnow = (int) dlnow;
+                       progress ((int) dltotal, last_dlnow);
+               }
+               return 0;
+       }
+
+       public void download (string url, string filename) throws IOError {
+               print ("download (\"%s\", \"%s\")\n", url, filename);
+               download_dir (Path.get_dirname (url) + "/");
+               string basename = Path.get_basename (url);
+               int size = file_size.lookup (basename);
+               if (size > 0) {
+                       Posix.Stat st;
+                       Posix.stat (filename, out st);
+                       if (size == st.st_size) {
+                               return;
+                       }
+               }
+
+               curl.setopt (Curl.Option.URL, url);
+               curl.setopt (Curl.Option.WRITEFUNCTION, write_callback);
+               curl.setopt (Curl.Option.WRITEDATA, this);
+               curl.setopt (Curl.Option.NOPROGRESS, 0L);
+               curl.setopt (Curl.Option.PROGRESSFUNCTION, progress_callback);
+               curl.setopt (Curl.Option.PROGRESSDATA, this);
+
+               last_dlnow = -1;
+               file = FileStream.open (filename, "w");
+
+               var res = curl.perform ();
+               if (Curl.Code.ABORTED_BY_CALLBACK == res) {
+                               throw new IOError.CANCELLED ("Download cancelled.");
+               } else if (res != 0) {
+                       stderr.printf ("cURL performed: %d\n", res);
+               }
+
+               file = null;
+       }
+
+       void parse_dir_entry (string line) {
+               try {
+                       Regex re_dir_entry = new Regex ("^.* ([0-9]*) [A-Z][a-z]* *[0-9]* [0-9]* [0-9]*:[0-9]* ([^ ]*)$");
+                       MatchInfo match_info;
+                       if (re_dir_entry.match (line, 0, out match_info)) {
+                               string name = match_info.fetch (2);
+                               int size = match_info.fetch (1).to_int ();
+                               file_size.insert (name, size);
+                       }
+               } catch (RegexError e) {
+               }
+       }
+
+       string last_line = null;
+       [CCode (instance_pos = -1)]
+       size_t dir_callback (void *buffer, size_t size, size_t nmemb) {
+               if (cancellable != null && cancellable.is_cancelled ())
+                       return 0;
+
+               unowned char[] buf = (char[]) buffer;
+               buf.length = (int) (size * nmemb);
+
+               char *p = buf;
+               int i;
+               int j;
+               for (i = 0, j = 0; i < buf.length; i++, j++) {
+                       if (buf[i] == '\n') {
+                               buf[i] = 0;
+                               if (last_line != null) {
+                                       parse_dir_entry (last_line + (string) p);
+                                       last_line = null;
+                               } else {
+                                       parse_dir_entry ((string) p);
+                               }
+                               p += j + 1;
+                               j = -1;
+                       }
+               }
+               if (j > 0)
+                       last_line = ((string) p).ndup (j);
+
+               return buf.length;
+       }
+
+       public void download_dir (string url) throws IOError {
+               if (dirname != null && dirname == url)
+                       return;
+               print ("download_dir (\"%s\")\n", url);
+
+               curl.setopt (Curl.Option.URL, url);
+               curl.setopt (Curl.Option.WRITEFUNCTION, dir_callback);
+               curl.setopt (Curl.Option.WRITEDATA, this);
+               curl.setopt (Curl.Option.NOPROGRESS, 1L);
+               curl.setopt (Curl.Option.PROGRESSFUNCTION, null);
+               curl.setopt (Curl.Option.PROGRESSDATA, null);
+
+               file_size = new HashTable<string, int> (str_hash, int_equal);
+
+               var res = curl.perform ();
+               if (Curl.Code.ABORTED_BY_CALLBACK == res) {
+                               throw new IOError.CANCELLED ("Dir listing cancelled.");
+               } else if (res != 0) {
+                       stderr.printf ("cURL performed: %d\n", res);
+               }
+               dirname = url;
+       }
+
+       public signal void progress (int dltotal, int dlnow);
+}
index 07da9ea..393a21f 100644 (file)
@@ -55,7 +55,11 @@ class IMDbDownloaderCLI : Object, IMDbDownloader {
        // Private methods
 
        private bool do_download () {
-               download ("ftp.fu-berlin.de/pub/misc/movies/database/", MOVIES | GENRES | RATINGS | AKAS | PLOTS);
+               try {
+                       download ("ftp.fu-berlin.de/pub/misc/movies/database/", MOVIES | GENRES | RATINGS | AKAS | PLOTS);
+               } catch (Error e) {
+                       print ("Error: %s\n", e.message);
+               }
 
                return false;
        }
@@ -79,32 +83,60 @@ class IMDbDownloaderCLI : Object, IMDbDownloader {
                        var aka_parser = new AkaLineParser (sqlite);
                        var plot_parser = new PlotLineParser (sqlite);
 
-                       var downloader = new IMDbFtpDownloader (cancellable);
-                       downloader.progress_changed.connect (on_progress_changed);
+                       var downloader = new FtpDownloader (cancellable);
+
+                       var parser = new IMDbGzipParser (cancellable);
+
+                       downloader.progress.connect (on_progress);
 
                        if (MOVIES in flags) {
                                description_changed ("Downloading movie list ...");
-                               downloader.download (url + "movies.list.gz", movie_parser);
+                               downloader.download (url + "movies.list.gz", Path.build_filename (cache_dir, "movies.list.gz"));
                        }
                        percent_finished = 20;
                        if (GENRES in flags) {
                                description_changed ("Downloading genre data ...");
-                               downloader.download (url + "genres.list.gz", genre_parser);
+                               downloader.download (url + "genres.list.gz", Path.build_filename (cache_dir, "genres.list.gz"));
                        }
                        percent_finished = 40;
                        if (RATINGS in flags) {
                                description_changed ("Downloading rating data ...");
-                               downloader.download (url + "ratings.list.gz", rating_parser);
+                               downloader.download (url + "ratings.list.gz", Path.build_filename (cache_dir, "ratings.list.gz"));
                        }
                        percent_finished = 60;
                        if (AKAS in flags) {
                                description_changed ("Downloading alternative titles ...");
-                               downloader.download (url + "aka-titles.list.gz", aka_parser);
+                               downloader.download (url + "aka-titles.list.gz", Path.build_filename (cache_dir, "aka-titles.list.gz"));
                        }
                        percent_finished = 80;
                        if (PLOTS in flags) {
                                description_changed ("Downloading plots ...");
-                               downloader.download (url + "plot.list.gz", plot_parser);
+                               downloader.download (url + "plot.list.gz", Path.build_filename (cache_dir, "plot.list.gz"));
+                       }
+
+                       if (MOVIES in flags) {
+                               description_changed ("Parsing movie list ...");
+                               parser.parse (Path.build_filename (cache_dir, "movies.list.gz"), movie_parser);
+                       }
+                       percent_finished = 20;
+                       if (GENRES in flags) {
+                               description_changed ("Parsing genre data ...");
+                               parser.parse (Path.build_filename (cache_dir, "genres.list.gz"), genre_parser);
+                       }
+                       percent_finished = 40;
+                       if (RATINGS in flags) {
+                               description_changed ("Parsing rating data ...");
+                               parser.parse (Path.build_filename (cache_dir, "ratings.list.gz"), rating_parser);
+                       }
+                       percent_finished = 60;
+                       if (AKAS in flags) {
+                               description_changed ("Parsing alternative titles ...");
+                               parser.parse (Path.build_filename (cache_dir, "aka-titles.list.gz"), aka_parser);
+                       }
+                       percent_finished = 80;
+                       if (PLOTS in flags) {
+                               description_changed ("Parsing plots ...");
+                               parser.parse (Path.build_filename (cache_dir, "plot.list.gz"), plot_parser);
                        }
                } catch (Error e2) {
                        if (e2 is IOError.CANCELLED)
@@ -132,9 +164,12 @@ class IMDbDownloaderCLI : Object, IMDbDownloader {
                return null;
        }
 
-       private void on_progress_changed (int percent) {
+       private void on_progress (int dltotal, int dlnow) {
+       /*
                progress (percent_finished + percent / 5);
                stdout.printf ("%d %%\r", percent_finished + percent / 5);
+       */
+               stdout.printf ("%d / %d\r", dlnow, dltotal);
                stdout.flush ();
        }
 
@@ -153,11 +188,16 @@ class IMDbDownloaderCLI : Object, IMDbDownloader {
                loop.run ();
        }
 
+       public void show_desc (string desc) {
+               print ("DESC: \"%s\"\n", desc);
+       }
+
        public static void main () {
                Curl.global_init (Curl.GLOBAL_DEFAULT);
 
                // Start server
                var downloader = new IMDbDownloaderCLI ();
+               downloader.description_changed.connect (downloader.show_desc);
 
                Idle.add (downloader.do_download);
                downloader.run ();
diff --git a/src/imdb/imdb-ftp-downloader.vala b/src/imdb/imdb-ftp-downloader.vala
deleted file mode 100644 (file)
index 4c78aac..0000000
+++ /dev/null
@@ -1,98 +0,0 @@
-class IMDbFtpDownloader {
-       Curl.EasyHandle curl;
-       private ZLib.InflateStream strm;
-       private int percent;
-       private char[] buf_out;
-       private uint have;
-       private LineParser parser;
-       private Cancellable cancellable;
-
-       [CCode (instance_pos = -1)]
-       size_t write_callback (void *buffer, size_t size, size_t nmemb) {
-               if (cancellable != null && cancellable.is_cancelled ())
-                       return 0;
-               strm.next_in = buffer;
-               strm.avail_in = (uint) (size * nmemb);
-               if (strm.avail_in == 0)
-                       return 0;
-
-               do {
-                       strm.next_out = (char*) buf_out + have;
-                       strm.avail_out = buf_out.length - have;
-
-                       char* p = (char*) buf_out;
-
-                       var ret = strm.inflate (ZLib.Flush.NO_FLUSH);
-                        assert (ret != ZLib.Status.STREAM_ERROR);
-                       if (ret == ZLib.Status.NEED_DICT)
-                               ret = ZLib.Status.DATA_ERROR;
-                       switch (ret) {
-                       case ZLib.Status.DATA_ERROR:
-                       case ZLib.Status.MEM_ERROR:
-                               return ret;
-                       }
-
-                       have = buf_out.length - strm.avail_out;
-
-                       char* l = p;
-                       int j = 0;
-                       for (int i = 0; i < have; i++, j++) {
-                               if (p[i] == '\n') {
-                                       p[i] = 0;
-                                       if (parser != null)
-                                               parser.parse_line ((string) l);
-                                       j = -1;
-                                       l = p + i + 1;
-                               }
-                       }
-                       if (j > 0) {
-                               Memory.copy (p, l, j);
-                               have = j;
-                       } else {
-                               have = 0;
-                       }
-               } while (strm.avail_out == 0);
-
-               return size * nmemb;
-       }
-
-       int progress_callback (double dltotal, double dlnow, double ultotal, double ulnow) {
-               if (cancellable != null && cancellable.is_cancelled ())
-                       return 1;
-               if (dltotal > 0) {
-                       int p = (int) (100 * dlnow / dltotal);
-                       if (p > percent) {
-                               percent = p;
-                               progress_changed (p);
-                       }
-               }
-               return 0;
-       }
-
-       public IMDbFtpDownloader (Cancellable? _cancellable) {
-               cancellable = _cancellable;
-               curl = new Curl.EasyHandle ();
-               curl.setopt (Curl.Option.WRITEFUNCTION, write_callback);
-               curl.setopt (Curl.Option.WRITEDATA, this);
-               curl.setopt (Curl.Option.NOPROGRESS, 0L);
-               curl.setopt (Curl.Option.PROGRESSFUNCTION, progress_callback);
-               curl.setopt (Curl.Option.PROGRESSDATA, this);
-               buf_out = new char[16384];
-       }
-
-       public void download (string url, LineParser? _parser) throws IOError {
-               curl.setopt (Curl.Option.URL, url);
-               percent = 0;
-               parser = _parser;
-               have = 0;
-
-               strm = ZLib.InflateStream.full (15 | 32);
-
-               var res = curl.perform ();
-               if (Curl.Code.ABORTED_BY_CALLBACK == res) {
-                               throw new IOError.CANCELLED ("Download cancelled.");
-               }
-       }
-
-       public signal void progress_changed (int percent);
-}
diff --git a/src/imdb/imdb-gzip-parser.vala b/src/imdb/imdb-gzip-parser.vala
new file mode 100644 (file)
index 0000000..a729206
--- /dev/null
@@ -0,0 +1,32 @@
+class IMDbGzipParser {
+       private LineParser parser;
+       private Cancellable cancellable;
+
+       public IMDbGzipParser (Cancellable? _cancellable) {
+               cancellable = _cancellable;
+       }
+
+       public void parse (string path, LineParser? _parser) throws IOError {
+               parser = _parser;
+
+               var file = File.new_for_path (path);
+               var gz_stream = new GzipInputStream (file.read (cancellable));
+               var stream = new DataInputStream (gz_stream);
+
+               int total_in = 0;
+               size_t length;
+               string line;
+               progress (0, 0);
+               line = stream.read_line (out length, cancellable);
+               while (line != null) {
+                       parser.parse_line (line);
+                       line = stream.read_line (out length, cancellable);
+                       if (gz_stream.total_in () > total_in) {
+                               total_in = (int) gz_stream.total_in ();
+                               progress (0, total_in);
+                       }
+               }
+       }
+
+       public signal void progress (int total, int now);
+}
index a7c28e7..a8462c0 100644 (file)
@@ -75,33 +75,59 @@ class IMDbDownloadServer : Object, IMDbDownloader {
                        var aka_parser = new AkaLineParser (sqlite);
                        var plot_parser = new PlotLineParser (sqlite);
 
-                       var downloader = new IMDbFtpDownloader (cancellable);
-                       downloader.progress_changed.connect (on_progress_changed);
+                       var downloader = new FtpDownloader (cancellable);
+                       downloader.progress.connect (on_progress);
+
+                       var parser = new IMDbGzipParser (cancellable);
 
                        if (MOVIES in flags) {
                                description_changed ("Downloading movie list ...");
-                               downloader.download (url + "movies.list.gz", movie_parser);
+                               downloader.download (url + "movies.list.gz", Path.build_filename (cache_dir, "movies.list.gz"));
                        }
                        percent_finished = 20;
                        if (GENRES in flags) {
                                description_changed ("Downloading genre data ...");
-                               downloader.download (url + "genres.list.gz", genre_parser);
+                               downloader.download (url + "genres.list.gz", Path.build_filename (cache_dir, "genres.list.gz"));
                        }
                        percent_finished = 40;
                        if (RATINGS in flags) {
                                description_changed ("Downloading rating data ...");
-                               downloader.download (url + "ratings.list.gz", rating_parser);
+                               downloader.download (url + "ratings.list.gz", Path.build_filename (cache_dir, "ratings.list.gz"));
                        }
                        percent_finished = 60;
                        if (AKAS in flags) {
                                description_changed ("Downloading alternative titles ...");
-                               downloader.download (url + "aka-titles.list.gz", aka_parser);
+                               downloader.download (url + "aka-titles.list.gz", Path.build_filename (cache_dir, "aka-titles.list.gz"));
                        }
                        percent_finished = 80;
                        if (PLOTS in flags) {
                                description_changed ("Downloading plots ...");
-                               print ("Downloading Plots");
-                               downloader.download (url + "plot.list.gz", plot_parser);
+                               downloader.download (url + "plot.list.gz", Path.build_filename (cache_dir, "plot.list.gz"));
+                       }
+
+                       if (MOVIES in flags) {
+                               description_changed ("Parsing movie list ...");
+                               parser.parse (Path.build_filename (cache_dir, "movies.list.gz"), movie_parser);
+                       }
+                       percent_finished = 20;
+                       if (GENRES in flags) {
+                               description_changed ("Parsing genre data ...");
+                               parser.parse (Path.build_filename (cache_dir, "genres.list.gz"), genre_parser);
+                       }
+                       percent_finished = 40;
+                       if (RATINGS in flags) {
+                               description_changed ("Parsing rating data ...");
+                               parser.parse (Path.build_filename (cache_dir, "ratings.list.gz"), rating_parser);
+                       }
+                       percent_finished = 60;
+                       if (AKAS in flags) {
+                               description_changed ("Parsing alternative titles ...");
+                               parser.parse (Path.build_filename (cache_dir, "aka-titles.list.gz"), aka_parser);
+                       }
+                       percent_finished = 80;
+                       if (PLOTS in flags) {
+                               description_changed ("Parsing plots ...");
+                               parser.parse (Path.build_filename (cache_dir, "plot.list.gz"), plot_parser);
                        }
                } catch (Error e2) {
                        if (e2 is IOError.CANCELLED)
@@ -129,8 +155,10 @@ class IMDbDownloadServer : Object, IMDbDownloader {
                return null;
        }
 
-       private void on_progress_changed (int percent) {
-               progress (percent_finished + percent / 5);
+       private void on_progress (int dltotal, int dlnow) {
+               stdout.printf ("%d / %d\r", dlnow, dltotal);
+               if (dltotal > 0)
+                       progress (99*dlnow/dltotal/100);
        }
 
        private void timeout_quit () {