From e80999788c0626ccfef0995c7b924189c342d7a3 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Sat, 22 May 2010 15:41:55 +0200 Subject: [PATCH 1/1] IMDb: Split downloading from parsing into SQLite database This takes about 135 MB more storage, but it will minimize the download time to reduce energy consumption. --- Makefile.am | 8 +- src/imdb/ftp-downloader.vala | 139 +++++++++++++++++++++++++++++++ src/imdb/imdb-downloader-cli.vala | 58 +++++++++++-- src/imdb/imdb-ftp-downloader.vala | 98 ---------------------- src/imdb/imdb-gzip-parser.vala | 32 +++++++ src/imdb/imdb-plaintext-downloader.vala | 48 ++++++++--- 6 files changed, 264 insertions(+), 119 deletions(-) create mode 100644 src/imdb/ftp-downloader.vala delete mode 100644 src/imdb/imdb-ftp-downloader.vala create mode 100644 src/imdb/imdb-gzip-parser.vala diff --git a/Makefile.am b/Makefile.am index 4ba24c9..0f7b627 100644 --- a/Makefile.am +++ b/Makefile.am @@ -222,14 +222,18 @@ src/backends/moviepilot/moviepilot-backend.c: ${cinaest_moviepilot_backend_VALAS imdb_plaintext_downloader_SOURCES = \ src/imdb/imdb-plaintext-downloader.c \ - src/imdb/imdb-ftp-downloader.c \ + src/imdb/ftp-downloader.c \ + src/imdb/gzip-input-stream.c \ + src/imdb/imdb-gzip-parser.c \ src/imdb/imdb-line-parser.c \ src/imdb/imdb-sqlite.c \ src/imdb/plaintext-downloader-interface.c imdb_plaintext_downloader_VALASOURCES = \ src/imdb/imdb-plaintext-downloader.vala \ - src/imdb/imdb-ftp-downloader.vala \ + src/imdb/ftp-downloader.vala \ + src/imdb/gzip-input-stream.vala \ + src/imdb/imdb-gzip-parser.vala \ src/imdb/imdb-line-parser.vala \ src/imdb/imdb-sqlite.vala \ src/imdb/plaintext-downloader-interface.vala diff --git a/src/imdb/ftp-downloader.vala b/src/imdb/ftp-downloader.vala new file mode 100644 index 0000000..2e873c9 --- /dev/null +++ b/src/imdb/ftp-downloader.vala @@ -0,0 +1,139 @@ +class FtpDownloader { + private Curl.EasyHandle curl; + private Cancellable cancellable; + private FileStream file; + private string dirname; + 
private HashTable<string, int> file_size; /* remote file name -> size in bytes, taken from the FTP directory listing */ + + public FtpDownloader (Cancellable? _cancellable) { + cancellable = _cancellable; + curl = new Curl.EasyHandle (); + } + + [CCode (instance_pos = -1)] + size_t write_callback (void *buffer, size_t size, size_t nmemb) { + if (cancellable != null && cancellable.is_cancelled ()) + return 0; + + unowned uint8[] buf = (uint8[]) buffer; + buf.length = (int) (size * nmemb); + + size_t written = file.write (buf); /* report the real count so a short write (e.g. disk full) aborts the transfer */ + + return written; + } + + private int last_dlnow; + + int progress_callback (double dltotal, double dlnow, double ultotal, double ulnow) { + if (cancellable != null && cancellable.is_cancelled ()) + return 1; + if (last_dlnow != (int) dlnow) { + last_dlnow = (int) dlnow; + progress ((int) dltotal, last_dlnow); + } + return 0; + } + + public void download (string url, string filename) throws IOError { + print ("download (\"%s\", \"%s\")\n", url, filename); + download_dir (Path.get_dirname (url) + "/"); + string basename = Path.get_basename (url); + int size = file_size.lookup (basename); + if (size > 0) { + Posix.Stat st; + bool cached = Posix.stat (filename, out st) == 0; /* st is only valid when stat () succeeds */ + if (cached && size == st.st_size) { + return; + } + } + + curl.setopt (Curl.Option.URL, url); + curl.setopt (Curl.Option.WRITEFUNCTION, write_callback); + curl.setopt (Curl.Option.WRITEDATA, this); + curl.setopt (Curl.Option.NOPROGRESS, 0L); + curl.setopt (Curl.Option.PROGRESSFUNCTION, progress_callback); + curl.setopt (Curl.Option.PROGRESSDATA, this); + + last_dlnow = -1; + file = FileStream.open (filename, "w"); + if (file == null) throw new IOError.FAILED ("Could not open \"%s\" for writing.", filename); + var res = curl.perform (); + file = null; /* close the output file before any error is thrown */ + + if (Curl.Code.ABORTED_BY_CALLBACK == res) { + throw new IOError.CANCELLED ("Download cancelled."); + } else if (res != 0) { + stderr.printf ("cURL performed: %d\n", res); + } + } + + void parse_dir_entry (string line) { + try { + Regex re_dir_entry = new Regex ("^.* ([0-9]*) [A-Z][a-z]* *[0-9]* [0-9]* [0-9]*:[0-9]* ([^ ]*)$"); + MatchInfo match_info; + if (re_dir_entry.match (line, 0, out match_info)) { + string name = match_info.fetch (2); +
int size = match_info.fetch (1).to_int (); + file_size.insert (name, size); + } + } catch (RegexError e) { + } + } + + string last_line = null; + [CCode (instance_pos = -1)] + size_t dir_callback (void *buffer, size_t size, size_t nmemb) { + if (cancellable != null && cancellable.is_cancelled ()) + return 0; + + unowned char[] buf = (char[]) buffer; + buf.length = (int) (size * nmemb); + + char *p = buf; + int i; + int j; + for (i = 0, j = 0; i < buf.length; i++, j++) { + if (buf[i] == '\n') { + buf[i] = 0; + if (last_line != null) { + parse_dir_entry (last_line + (string) p); + last_line = null; + } else { + parse_dir_entry ((string) p); + } + p += j + 1; + j = -1; + } + } + if (j > 0) /* keep the unterminated tail; extend an earlier tail if this chunk held no newline at all */ + last_line = (last_line != null ? last_line : "") + ((string) p).ndup (j); + + return buf.length; + } + + public void download_dir (string url) throws IOError { + if (dirname != null && dirname == url) + return; + print ("download_dir (\"%s\")\n", url); + + curl.setopt (Curl.Option.URL, url); + curl.setopt (Curl.Option.WRITEFUNCTION, dir_callback); + curl.setopt (Curl.Option.WRITEDATA, this); + curl.setopt (Curl.Option.NOPROGRESS, 1L); + curl.setopt (Curl.Option.PROGRESSFUNCTION, null); + curl.setopt (Curl.Option.PROGRESSDATA, null); + + file_size = new HashTable<string, int> (str_hash, str_equal); /* string keys need str_equal; int_equal would compare the first bytes as an int */ + + var res = curl.perform (); + if (Curl.Code.ABORTED_BY_CALLBACK == res) { + throw new IOError.CANCELLED ("Dir listing cancelled."); + } else if (res != 0) { + stderr.printf ("cURL performed: %d\n", res); + } + dirname = url; + } + + public signal void progress (int dltotal, int dlnow); +} diff --git a/src/imdb/imdb-downloader-cli.vala b/src/imdb/imdb-downloader-cli.vala index 07da9ea..393a21f 100644 --- a/src/imdb/imdb-downloader-cli.vala +++ b/src/imdb/imdb-downloader-cli.vala @@ -55,7 +55,11 @@ class IMDbDownloaderCLI : Object, IMDbDownloader { // Private methods private bool do_download () { - download ("ftp.fu-berlin.de/pub/misc/movies/database/", MOVIES | GENRES | RATINGS | AKAS | PLOTS); + try { + download
("ftp.fu-berlin.de/pub/misc/movies/database/", MOVIES | GENRES | RATINGS | AKAS | PLOTS); + } catch (Error e) { + print ("Error: %s\n", e.message); + } return false; } @@ -79,32 +83,60 @@ class IMDbDownloaderCLI : Object, IMDbDownloader { var aka_parser = new AkaLineParser (sqlite); var plot_parser = new PlotLineParser (sqlite); - var downloader = new IMDbFtpDownloader (cancellable); - downloader.progress_changed.connect (on_progress_changed); + var downloader = new FtpDownloader (cancellable); + + var parser = new IMDbGzipParser (cancellable); + + downloader.progress.connect (on_progress); if (MOVIES in flags) { description_changed ("Downloading movie list ..."); - downloader.download (url + "movies.list.gz", movie_parser); + downloader.download (url + "movies.list.gz", Path.build_filename (cache_dir, "movies.list.gz")); } percent_finished = 20; if (GENRES in flags) { description_changed ("Downloading genre data ..."); - downloader.download (url + "genres.list.gz", genre_parser); + downloader.download (url + "genres.list.gz", Path.build_filename (cache_dir, "genres.list.gz")); } percent_finished = 40; if (RATINGS in flags) { description_changed ("Downloading rating data ..."); - downloader.download (url + "ratings.list.gz", rating_parser); + downloader.download (url + "ratings.list.gz", Path.build_filename (cache_dir, "ratings.list.gz")); } percent_finished = 60; if (AKAS in flags) { description_changed ("Downloading alternative titles ..."); - downloader.download (url + "aka-titles.list.gz", aka_parser); + downloader.download (url + "aka-titles.list.gz", Path.build_filename (cache_dir, "aka-titles.list.gz")); } percent_finished = 80; if (PLOTS in flags) { description_changed ("Downloading plots ..."); - downloader.download (url + "plot.list.gz", plot_parser); + downloader.download (url + "plot.list.gz", Path.build_filename (cache_dir, "plot.list.gz")); + } + + if (MOVIES in flags) { + description_changed ("Parsing movie list ..."); + parser.parse 
(Path.build_filename (cache_dir, "movies.list.gz"), movie_parser); + } + percent_finished = 20; + if (GENRES in flags) { + description_changed ("Parsing genre data ..."); + parser.parse (Path.build_filename (cache_dir, "genres.list.gz"), genre_parser); + } + percent_finished = 40; + if (RATINGS in flags) { + description_changed ("Parsing rating data ..."); + parser.parse (Path.build_filename (cache_dir, "ratings.list.gz"), rating_parser); + } + percent_finished = 60; + if (AKAS in flags) { + description_changed ("Parsing alternative titles ..."); + parser.parse (Path.build_filename (cache_dir, "aka-titles.list.gz"), aka_parser); + } + percent_finished = 80; + if (PLOTS in flags) { + description_changed ("Parsing plots ..."); + parser.parse (Path.build_filename (cache_dir, "plot.list.gz"), plot_parser); } } catch (Error e2) { if (e2 is IOError.CANCELLED) @@ -132,9 +164,12 @@ class IMDbDownloaderCLI : Object, IMDbDownloader { return null; } - private void on_progress_changed (int percent) { + private void on_progress (int dltotal, int dlnow) { + /* progress (percent_finished + percent / 5); stdout.printf ("%d %%\r", percent_finished + percent / 5); + */ + stdout.printf ("%d / %d\r", dlnow, dltotal); stdout.flush (); } @@ -153,11 +188,16 @@ class IMDbDownloaderCLI : Object, IMDbDownloader { loop.run (); } + public void show_desc (string desc) { + print ("DESC: \"%s\"\n", desc); + } + public static void main () { Curl.global_init (Curl.GLOBAL_DEFAULT); // Start server var downloader = new IMDbDownloaderCLI (); + downloader.description_changed.connect (downloader.show_desc); Idle.add (downloader.do_download); downloader.run (); diff --git a/src/imdb/imdb-ftp-downloader.vala b/src/imdb/imdb-ftp-downloader.vala deleted file mode 100644 index 4c78aac..0000000 --- a/src/imdb/imdb-ftp-downloader.vala +++ /dev/null @@ -1,98 +0,0 @@ -class IMDbFtpDownloader { - Curl.EasyHandle curl; - private ZLib.InflateStream strm; - private int percent; - private char[] buf_out; - private 
uint have; - private LineParser parser; - private Cancellable cancellable; - - [CCode (instance_pos = -1)] - size_t write_callback (void *buffer, size_t size, size_t nmemb) { - if (cancellable != null && cancellable.is_cancelled ()) - return 0; - strm.next_in = buffer; - strm.avail_in = (uint) (size * nmemb); - if (strm.avail_in == 0) - return 0; - - do { - strm.next_out = (char*) buf_out + have; - strm.avail_out = buf_out.length - have; - - char* p = (char*) buf_out; - - var ret = strm.inflate (ZLib.Flush.NO_FLUSH); - assert (ret != ZLib.Status.STREAM_ERROR); - if (ret == ZLib.Status.NEED_DICT) - ret = ZLib.Status.DATA_ERROR; - switch (ret) { - case ZLib.Status.DATA_ERROR: - case ZLib.Status.MEM_ERROR: - return ret; - } - - have = buf_out.length - strm.avail_out; - - char* l = p; - int j = 0; - for (int i = 0; i < have; i++, j++) { - if (p[i] == '\n') { - p[i] = 0; - if (parser != null) - parser.parse_line ((string) l); - j = -1; - l = p + i + 1; - } - } - if (j > 0) { - Memory.copy (p, l, j); - have = j; - } else { - have = 0; - } - } while (strm.avail_out == 0); - - return size * nmemb; - } - - int progress_callback (double dltotal, double dlnow, double ultotal, double ulnow) { - if (cancellable != null && cancellable.is_cancelled ()) - return 1; - if (dltotal > 0) { - int p = (int) (100 * dlnow / dltotal); - if (p > percent) { - percent = p; - progress_changed (p); - } - } - return 0; - } - - public IMDbFtpDownloader (Cancellable? _cancellable) { - cancellable = _cancellable; - curl = new Curl.EasyHandle (); - curl.setopt (Curl.Option.WRITEFUNCTION, write_callback); - curl.setopt (Curl.Option.WRITEDATA, this); - curl.setopt (Curl.Option.NOPROGRESS, 0L); - curl.setopt (Curl.Option.PROGRESSFUNCTION, progress_callback); - curl.setopt (Curl.Option.PROGRESSDATA, this); - buf_out = new char[16384]; - } - - public void download (string url, LineParser? 
_parser) throws IOError { - curl.setopt (Curl.Option.URL, url); - percent = 0; - parser = _parser; - have = 0; - - strm = ZLib.InflateStream.full (15 | 32); - - var res = curl.perform (); - if (Curl.Code.ABORTED_BY_CALLBACK == res) { - throw new IOError.CANCELLED ("Download cancelled."); - } - } - - public signal void progress_changed (int percent); -} diff --git a/src/imdb/imdb-gzip-parser.vala b/src/imdb/imdb-gzip-parser.vala new file mode 100644 index 0000000..a729206 --- /dev/null +++ b/src/imdb/imdb-gzip-parser.vala @@ -0,0 +1,32 @@ +class IMDbGzipParser { + private LineParser parser; + private Cancellable cancellable; + + public IMDbGzipParser (Cancellable? _cancellable) { + cancellable = _cancellable; + } + + public void parse (string path, LineParser? _parser) throws Error { /* file.read () can fail with errors outside IOError, e.g. a missing cache file */ + parser = _parser; + + var file = File.new_for_path (path); + var gz_stream = new GzipInputStream (file.read (cancellable)); + var stream = new DataInputStream (gz_stream); + + int total_in = 0; + size_t length; + string line; + progress (0, 0); + line = stream.read_line (out length, cancellable); + while (line != null) { + if (parser != null) parser.parse_line (line); /* the parser argument is nullable; still drain the stream without it */ + line = stream.read_line (out length, cancellable); + if (gz_stream.total_in () > total_in) { + total_in = (int) gz_stream.total_in (); + progress (0, total_in); + } + } + } + + public signal void progress (int total, int now); +} diff --git a/src/imdb/imdb-plaintext-downloader.vala b/src/imdb/imdb-plaintext-downloader.vala index a7c28e7..a8462c0 100644 --- a/src/imdb/imdb-plaintext-downloader.vala +++ b/src/imdb/imdb-plaintext-downloader.vala @@ -75,33 +75,59 @@ class IMDbDownloadServer : Object, IMDbDownloader { var aka_parser = new AkaLineParser (sqlite); var plot_parser = new PlotLineParser (sqlite); - var downloader = new IMDbFtpDownloader (cancellable); - downloader.progress_changed.connect (on_progress_changed); + var downloader = new FtpDownloader (cancellable); + downloader.progress.connect (on_progress); + + var parser = new
IMDbGzipParser (cancellable); if (MOVIES in flags) { description_changed ("Downloading movie list ..."); - downloader.download (url + "movies.list.gz", movie_parser); + downloader.download (url + "movies.list.gz", Path.build_filename (cache_dir, "movies.list.gz")); } percent_finished = 20; if (GENRES in flags) { description_changed ("Downloading genre data ..."); - downloader.download (url + "genres.list.gz", genre_parser); + downloader.download (url + "genres.list.gz", Path.build_filename (cache_dir, "genres.list.gz")); } percent_finished = 40; if (RATINGS in flags) { description_changed ("Downloading rating data ..."); - downloader.download (url + "ratings.list.gz", rating_parser); + downloader.download (url + "ratings.list.gz", Path.build_filename (cache_dir, "ratings.list.gz")); } percent_finished = 60; if (AKAS in flags) { description_changed ("Downloading alternative titles ..."); - downloader.download (url + "aka-titles.list.gz", aka_parser); + downloader.download (url + "aka-titles.list.gz", Path.build_filename (cache_dir, "aka-titles.list.gz")); } percent_finished = 80; if (PLOTS in flags) { description_changed ("Downloading plots ..."); - print ("Downloading Plots"); - downloader.download (url + "plot.list.gz", plot_parser); + downloader.download (url + "plot.list.gz", Path.build_filename (cache_dir, "plot.list.gz")); + } + + if (MOVIES in flags) { + description_changed ("Parsing movie list ..."); + parser.parse (Path.build_filename (cache_dir, "movies.list.gz"), movie_parser); + } + percent_finished = 20; + if (GENRES in flags) { + description_changed ("Parsing genre data ..."); + parser.parse (Path.build_filename (cache_dir, "genres.list.gz"), genre_parser); + } + percent_finished = 40; + if (RATINGS in flags) { + description_changed ("Parsing rating data ..."); + parser.parse (Path.build_filename (cache_dir, "ratings.list.gz"), rating_parser); + } + percent_finished = 60; + if (AKAS in flags) { + description_changed ("Parsing alternative titles ..."); 
+ parser.parse (Path.build_filename (cache_dir, "aka-titles.list.gz"), aka_parser); + } + percent_finished = 80; + if (PLOTS in flags) { + description_changed ("Parsing plots ..."); + parser.parse (Path.build_filename (cache_dir, "plot.list.gz"), plot_parser); } } catch (Error e2) { if (e2 is IOError.CANCELLED) @@ -129,8 +155,10 @@ class IMDbDownloadServer : Object, IMDbDownloader { return null; } - private void on_progress_changed (int percent) { - progress (percent_finished + percent / 5); + private void on_progress (int dltotal, int dlnow) { + stdout.printf ("%d / %d\r", dlnow, dltotal); + if (dltotal > 0) /* each of the five files contributes 20 %; floating point avoids int overflow on large byte counts */ + progress (percent_finished + (int) (20.0 * dlnow / dltotal)); } private void timeout_quit () { -- 1.7.9.5