Google backend: replace parser with a libxml-2.0 based one
author Philipp Zabel <philipp.zabel@gmail.com>
Tue, 13 Jul 2010 17:29:06 +0000 (19:29 +0200)
committer Philipp Zabel <philipp.zabel@gmail.com>
Tue, 13 Jul 2010 17:29:06 +0000 (19:29 +0200)
Makefile.am
configure.ac
src/backends/google/google-backend.vala
src/backends/google/google-parser.vala

index 31183a3..8236c8d 100644 (file)
@@ -209,9 +209,9 @@ cinaest_google_backend_VALASOURCES = \
        src/backends/google/google-parser.vala
 
 cinaest_google_backend_VALAFLAGS = --vapidir ./vapi --pkg dbus-glib-1 \
-       --pkg gconf-2.0 --pkg gee-1.0 --pkg gio-2.0
-cinaest_google_backend_CFLAGS = ${DBUS_CFLAGS} ${GCONF_CFLAGS} ${GEE_CFLAGS} ${GIO_CFLAGS}
-cinaest_google_backend_LDADD = ${DBUS_LIBS} ${GCONF_LIBS} ${GEE_LIBS} ${GIO_LIBS}
+       --pkg gconf-2.0 --pkg gee-1.0 --pkg gio-2.0 --pkg libxml-2.0
+cinaest_google_backend_CFLAGS = ${DBUS_CFLAGS} ${GCONF_CFLAGS} ${GEE_CFLAGS} ${GIO_CFLAGS} ${XML_CFLAGS}
+cinaest_google_backend_LDADD = ${DBUS_LIBS} ${GCONF_LIBS} ${GEE_LIBS} ${GIO_LIBS} ${XML_LIBS}
 
 src/backends/google/google-backend.c: ${cinaest_google_backend_VALASOURCES}
        ${VALAC} -C ${cinaest_google_backend_VALASOURCES} ${cinaest_google_backend_VALAFLAGS}
index 76919d6..f595916 100644 (file)
@@ -94,6 +94,10 @@ PKG_CHECK_MODULES(DBUS, dbus-glib-1 >= 0.78)
 AC_SUBST(DBUS_LIBS)
 AC_SUBST(DBUS_CFLAGS)
 
+PKG_CHECK_MODULES(XML, libxml-2.0)
+AC_SUBST(XML_LIBS)
+AC_SUBST(XML_CFLAGS)
+
 PKG_CHECK_MODULES(OSSOSETTINGS, osso-af-settings >= 0.9.2)
 
 localedir=`$PKG_CONFIG osso-af-settings --variable=localedir`
index 7e4d89e..1815970 100644 (file)
@@ -73,7 +73,7 @@ public class MovieSearch : Object {
                var m = new string[results.length ()];
                int i = 0;
                for (unowned GLib.List<GoogleMovie> node = results.first (); node != null; node = node.next) {
-                       m[i++] = "{\"title\":\"%s\",\"rating\":%f,\"runtime\":%d,\"showtimes\":\"%s\",\"cinema_name\":\"%s\",\"cinema_phone\":\"%s\"}".printf (node.data.title, node.data.rating, node.data.runtime, node.data.showtimes, node.data.cinema.name, node.data.cinema.phone);
+                       m[i++] = "{\"title\":\"%s\",\"rating\":%f,\"runtime\":%d,\"showtimes\":\"%s\",\"cinema_name\":\"%s\",\"cinema_phone\":\"%s\"}".printf (node.data.title, node.data.rating, node.data.runtime, node.data.showtimes, node.data.theater.name, node.data.theater.phone);
                }
                movies_found (m, true);
                service.timeout_quit ();
index e18a00b..439c0a3 100644 (file)
  * along with Cinaest. If not, see <http://www.gnu.org/licenses/>.
  */
 
-errordomain ParserError {
-       WRONG_TAG,
-       EOF
-}
-
-public class Cinema {
+public class Theater {
        public string name;
        public string address;
        public string phone;
-
-       public Cinema (string _name) {
-               name = _name;
-       }
 }
 
 public class GoogleMovie {
        public string title;
        public int rating;
-       public Cinema cinema;
+       public Theater theater;
        public int runtime;
        public string fsk;
        public string showtimes;
 }
 
-public class GoogleParser : Object {
-       char *current;
-       Cinema last_cinema;
+class GoogleParser : Object {
+       int movies;
        public string location;
        string _title;
        PatternSpec pattern;
@@ -50,227 +40,112 @@ public class GoogleParser : Object {
        public delegate void ReceiveMovie (GoogleMovie movie);
        public ReceiveMovie _get_callback;
 
-       public int next_tag_offset () {
-               int i = -1;
-               while (current[++i] != '<' && current[i] != 0);
-               return i;
-       }
-
-       public void next_tag () {
-               if (current[0] == 0)
-                       return;
-               current += next_tag_offset ();
-       }
-
-       public void finish_tag () {
-               while (current[0] != '>' && current[0] != 0)
-                       current++;
-               if (current[0] == '>')
-                       current++;
-       }
-
-       public unowned string parse_tag (bool finish = true) throws Error {
-               unowned string tag;
-               next_tag ();
-               int i = 1;
-               while (current[++i].isalnum ());
-               if (current[i] == 0)
-                       throw new ParserError.EOF ("EOF in tag");
-               if (current[i] == '>')
-                       finish = false;
-               current[i] = 0;
-               tag = (string) (current + 1);
-               current += i + 1;
-               if (finish)
-                       finish_tag ();
-               return tag;
-       }
-
-       public void expect_tag (string tag) throws Error {
-               var found = parse_tag (true);
-               if (tag != found) {
-                       throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"",
-                                                        found, tag);
-               }
-       }
-
-       public string parse_text () {
-               string text = ((string) current).ndup (next_tag_offset ());
-               next_tag ();
-               return text;
+       /**
+        * Hands the raw page bytes in buf to libxml2's HTML reader with
+        * parser error and warning output switched off, and returns the
+        * resulting document pointer unchanged (parse () checks it for null).
+        */
+       private Html.Doc* get_html_document (ref char[] buf) {
+               int size = (int) buf.length;
+               int quiet = Html.ParserOption.NOERROR | Html.ParserOption.NOWARNING;
+               Html.Doc* document = Html.Doc.read_memory (buf, size, "http://movies.google.de", null, quiet);
+               return document;
        }
 
-       public void parse_attribute (string _attr, out string value) {
-               string attr;
-               if (current[0] == 0)
-                       return;
-               int i = -1;
-               while (current[++i] != '=' && current[i] != '>' && current[i] != 0) {
-
-               }
-               attr = ((string) current).ndup (i);
-               current += i;
-               if (current[0] == 0)
-                       return;
-               current++;
-               i = -1;
-               while (!current[++i].isspace () && current[i] != '>' && current[i] != 0) {
-                       if (current[i] == '"')
-                               while (current[++i] != '"' && current[i] != 0);
-               }
-               if (attr == _attr) {
-                       if (current[0] == '"')
-                               value = ((string) current).substring (1, i - 2);
-                       else
-                               value = ((string) current).ndup (i);
+       /**
+        * Parses a downloaded result page and reports every matching movie
+        * through _get_callback.
+        *
+        * Walks from the first "theater" div across its following siblings,
+        * passing each node to parse_theater (). The HTML document is freed
+        * before returning; the strings stored in Theater and GoogleMovie are
+        * copies, so they stay valid afterwards.
+        *
+        * @param buf raw bytes of the page
+        * @return number of movies handed to the callback, 0 if the page
+        *         could not be parsed or contains no theater
+        */
+       public int parse (ref char[] buf) throws Error {
+               var doc = get_html_document (ref buf);
+               if (doc == null) {
+                       stderr.printf ("Error: parsing failed\n");
+                       return 0;
                }
-               current += i;
-       }

-       public void skip_whitespace () {
-               if (current[0] == 0)
-                       return;
-               int i = -1;
-               while (current[++i].isspace () && current[i] != 0);
-               current += i;
-       }
+               // TODO: set up location
+               location = "";

-       public string? parse_tag_attribute (string tag, string attribute) throws Error {
-               var found = parse_tag (false);
-               if (tag != found) {
-                       throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"",
-                                                        found, tag);
+               // search_tag_by_class () requires a non-null start node, so an
+               // empty document must be handled here.
+               Xml.Node* theater = null;
+               if (doc->children != null) {
+                       theater = search_tag_by_class (doc->children, "div", "theater");
+               }
+               if (theater == null) {
+                       stderr.printf ("Error: does not contain theater\n");
+                       // The document is held through a raw pointer and must be freed by hand.
+                       delete doc;
+                       return 0;
                }
-
-               string? value = null;
-               skip_whitespace ();
-               while (current[0] != '>' && current[0] != 0) {
-                       parse_attribute (attribute, out value);
-                       skip_whitespace ();
+               movies = 0;
+               while (theater != null) {
+                       theater = parse_theater (theater);
                }
-               // Skip the closing '>' bracket
-               if (current[0] != 0)
-                       current++;
-
-               return value;
+               // All node pointers are dead after this point.
+               delete doc;
+               return movies;
        }
 
-       public string unescape_unicode (string s) {
-               string result = "";
-               int i, j;
-               long l = s.length;
-
-               for (i = 0; i < l; i++) {
-                       if (s[i] == '&' && s[i + 1] == '#') {
-                               for (j = i + 2; j < l; j++) {
-                                       if (!s[j].isdigit ())
-                                               break;
-                                       if (s[j] == ';')
-                                               break;
-                               }
-                               if (s[j] == ';') {
-                                       int codepoint = s.substring (i + 2, j - i - 2).to_int ();
-                                       char[] buf = new char[6];
-                                       ((unichar) codepoint).to_utf8 ((string) buf);
-                                       result += (string) buf;
-                                       i = j;
-                                       continue;
+       /**
+        * Parses one theater node: reads name, address and phone from its
+        * "desc" child, then parses every movie found in the "show_left" and
+        * "show_right" columns of the following "showtimes" div.
+        *
+        * Nodes that do not have the expected structure are skipped without
+        * reporting anything.
+        *
+        * @param t node to inspect (parse () passes the first theater div
+        *          and then each of its following siblings)
+        * @return the next sibling of t, or null at the end of the list
+        */
+       private Xml.Node* parse_theater (Xml.Node* t) {
+               var theater = new Theater ();
+               var desc = t->children;
+               if (desc != null && desc->name == "div" && desc->get_prop ("class") == "desc") {
+                       var name = desc->children;
+                       if (name != null && name->name == "h2" && name->get_prop ("class") == "name") {
+                               var a = name->children;
+                               if (a != null && a->name == "a") {
+                                       theater.name = get_child_text_content (a);
+                                       print ("THEATER \"%s\"\n", theater.name);
+                               }
+                       }
+                       // name is null when the desc div is empty; only follow
+                       // the sibling link when there is a node to follow it from.
+                       Xml.Node* info = null;
+                       if (name != null)
+                               info = name->next;
+                       if (info != null && info->name == "div" && info->get_prop ("class") == "info") {
+                               var text = info->children;
+                               if (text != null && text->name == "text") {
+                                       var address_and_phone = text->content.split (" - ");
+                                       if (address_and_phone.length >= 2) {
+                                               theater.address = address_and_phone[0];
+                                               theater.phone = address_and_phone[1].replace (" ", "").replace ("-", "");
+                                       }
                                }
                        }
-                       if (s.offset (i).has_prefix ("&amp;")) {
-                               result += "&";
-                               i += 4;
-                               continue;
+               }
+               // t may have no children at all (every sibling of the first
+               // theater div ends up here), so desc can be null.
+               Xml.Node* showtimes = null;
+               if (desc != null)
+                       showtimes = desc->next;
+               if (showtimes != null && showtimes->children != null && showtimes->name == "div" && showtimes->get_prop ("class") == "showtimes") {
+                       var left = search_tag_by_class (showtimes->children, "div", "show_left");
+                       if (left != null && left->children != null) {
+                               print ("LEFT\n");
+                               var movie = search_tag_by_class (left->children, "div", "movie");
+                               while (movie != null) {
+                                       movie = parse_movie (movie, theater);
+                               }
                        }
-                       if (s.offset (i).has_prefix ("&quot;")) {
-                               result += "\"";
-                               i += 5;
-                               continue;
+                       // The right column is searched for after the left one;
+                       // without a left column start from the first child instead.
+                       Xml.Node* right_start = showtimes->children;
+                       if (left != null)
+                               right_start = left->next;
+                       Xml.Node* right = null;
+                       if (right_start != null)
+                               right = search_tag_by_class (right_start, "div", "show_right");
+                       if (right != null && right->children != null) {
+                               print ("RIGHT\n");
+                               var movie = search_tag_by_class (right->children, "div", "movie");
+                               while (movie != null) {
+                                       movie = parse_movie (movie, theater);
+                               }
                        }
-                       result += s.substring (i, 1);
-               }

-               return result;
+               }
+               return t->next;
        }
 
-       public void parse_movie () throws Error {
-               expect_tag ("div"); // class=movie
-               expect_tag ("div"); // class=name
-               expect_tag ("a"); // href="/movies?near=city&amp;mid=..."
-               expect_tag ("span"); // dir=ltr
-               var title = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME
-               expect_tag ("/span");
-               expect_tag ("/a");
-               expect_tag ("/div");
-               expect_tag ("span"); // class=info
-               string info_text = parse_text ().replace ("&#8206;", "");
-               string[] runtime_and_fsk = {};
-               double rating = 0.0;
-               var tag = parse_tag ();
-               if (tag == "a") {
-                       // Trailer
-                       expect_tag ("/a");
-                       tag = parse_tag ();
-               }
-               if (tag == "a") {
-                       // IMDb
-                       expect_tag ("/a");
-                       tag = parse_tag ();
-               }
-               if (tag == "nobr") {
-                       expect_tag ("nobr");
-                       string rating_string = parse_tag_attribute ("img", "alt").offset (6); // "Rated " ->"0.0 out of 5.0"
-                       rating = rating_string.to_double ();
-                       expect_tag ("img");
-                       expect_tag ("img");
-                       expect_tag ("img");
-                       expect_tag ("img");
-                       expect_tag ("/nobr");
-                       expect_tag ("/nobr");
-                       info_text = parse_text ().replace ("&#8206;", "").offset (3);
-                       if (parse_tag () == "a") {
-                               // Trailer
-                               expect_tag ("/a");
-                               if (parse_tag () == "a") {
-                                       // IMDb link
-                                       expect_tag ("/a");
-                                       expect_tag ("/span");
-                               }
-                       }
-               }
-               runtime_and_fsk = info_text.split (" - ");
-               expect_tag ("div"); // class=times
-               var showtimes = parse_text ().replace ("&nbsp;", ",");
-               while (parse_tag () == "a") {
-                       showtimes += parse_text () + ",";
-                       expect_tag ("/a");
+       /**
+        * Parses one movie node and, if its title matches the current query
+        * (prefix match on _title, or pattern when one is set), passes the
+        * resulting GoogleMovie to _get_callback and counts it.
+        *
+        * @param m node to inspect; its "name", "info" and "times" children
+        *          supply title, rating and showtimes
+        * @param theater theater the movie is shown in
+        * @return the next sibling of m, or null at the end of the list
+        */
+       private Xml.Node* parse_movie (Xml.Node* m, Theater theater) {
+               var movie = new GoogleMovie ();
+               movie.theater = theater;
+               Xml.Node* n;
+               for (n = m->children; n != null; n = n->next) {
+                       if (n->name == "div" && n->get_prop ("class") == "name")
+                               movie.title = parse_movie_name (n);
+                       if (n->name == "span" && n->get_prop ("class") == "info")
+                               parse_movie_info (n, movie);
+                       if (n->name == "div" && n->get_prop ("class") == "times")
+                               parse_movie_times (n, movie);
                }
+               // Nodes without a recognisable name child yield no title; they
+               // cannot match the query and must not reach the string calls below.
+               if (movie.title == null)
+                       return m->next;
-
                if (pattern == null) {
-                       if (!title.has_prefix (_title))
-                               return;
+                       if (!movie.title.has_prefix (_title))
+                               return m->next;
                } else {
-                       if (!pattern.match ((uint) title.length, title, null))
-                               return;
+                       if (!pattern.match ((uint) movie.title.length, movie.title, null))
+                               return m->next;
                }
+               // The backend embeds the title in a JSON string literal without
+               // escaping it, so escape double quotes here after matching, as
+               // the previous parser did.
+               movie.title = movie.title.replace ("\"", "\\\"");
+               _get_callback (movie);
+               movies++;
+               return m->next;
+       }
 
-               var movie = new GoogleMovie ();
-
-               movie.title = strip_tags (title).replace ("\"", "\\\"");
-               movie.rating = (int) (rating * 10);
-
-               movie.cinema = last_cinema;
-               movie.runtime = 0;
-               if (runtime_and_fsk.length >= 2) {
-                       unowned string runtime = runtime_and_fsk[0];
-                       movie.runtime = 3600 * runtime.to_int ();
-                       runtime = runtime.str ("hr ");
-                       if (runtime != null)
-                               movie.runtime += 60 * runtime.offset (3).to_int ();
-                       movie.fsk = runtime_and_fsk[1];
+       /**
+        * Extracts the movie title from a "name" div.
+        *
+        * Expects the div's first child to be an <a> element whose first
+        * child is a text node; the text is passed through strip_tags ().
+        *
+        * @param n the "name" div
+        * @return the title, or null if the expected structure is missing
+        */
+       private string? parse_movie_name (Xml.Node* n) {
+               var a = n->children;
+               if (a != null && a->name == "a") {
+                       var text = a->children;
+                       // Braces matter: the return must only run when text is a
+                       // non-null text node, otherwise text->content is invalid.
+                       if (text != null && text->name == "text") {
+                               print ("\"%s\"\n", text->content);
+                               return strip_tags (text->content);
+                       }
                }
-               movie.showtimes = showtimes;
-               _get_callback (movie);
+               return null;
        }
 
        // FIXME - this is specific for Germany
@@ -284,69 +159,63 @@ public class GoogleParser : Object {
                return title.dup ();
        }
 
-       public void parse_cinema () throws Error {
-               expect_tag ("div"); // class=theater
-               expect_tag ("div"); // class=desc id=theater_...
-               expect_tag ("h2"); // class=name
-               expect_tag ("a"); // href="/movies?near=city&amp;tid=..."
-               expect_tag ("span"); // dir=ltr
-               var name = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME
-               expect_tag ("/span");
-               expect_tag ("/a");
-               expect_tag ("/h2");
-               expect_tag ("div"); // class=info
-               var address_and_phone = parse_text ().replace ("&nbsp;", " ").split (" - ");
-               string address = null;
-               string phone = null;
-               if (address_and_phone.length >= 2) {
-                       address = address_and_phone[0];
-                       phone = address_and_phone[1].replace (" ", "").replace ("-", "");
+       /**
+        * Reads the rating from an "info" span and stores it in movie.rating.
+        *
+        * The first <nobr> child of the span is passed to parse_rating ();
+        * a result of 0 is stored as -1. When the span has no <nobr> child
+        * movie.rating is left untouched.
+        *
+        * @param i the "info" span
+        * @param movie movie to update
+        */
+       private void parse_movie_info (Xml.Node* i, GoogleMovie movie) {
+               var text = i->children;
+               if (text != null && text->name == "text") {
+                       print ("\t\"%s\"\n", text->content);
+                       // TODO: parse movie.runtime from this text
+               }
+               // Scan from the first child: the span may be empty, or may
+               // start directly with the <nobr> element instead of a text node.
+               for (var n = i->children; n != null; n = n->next) {
+                       if (n->name == "nobr") {
+                               movie.rating = parse_rating (n);
+                               if (movie.rating == 0)
+                                       movie.rating = -1;
+                               break;
+                       }
                }
-               expect_tag ("a"); // target=_top
-               expect_tag ("/a");
-               expect_tag ("/div");
-               expect_tag ("/div");
+       }
 
-               last_cinema = new Cinema (name);
-               last_cinema.address = address;
-               last_cinema.phone = phone;
+       /**
+        * Extracts the rating from the nested <nobr> elements of an info span.
+        *
+        * Looks for the first <img> inside an inner <nobr> whose alt text is
+        * longer than the six-character "Rated " prefix and converts the
+        * number that follows to tenths.
+        *
+        * @param nobr the outer <nobr> element
+        * @return ten times the number found after the prefix (truncated to
+        *         int), or 0 if no usable alt text was found
+        */
+       private int parse_rating (Xml.Node* nobr) {
+               for (var n = nobr->children; n != null; n = n->next) {
+                       if (n->name != "nobr")
+                               continue;
+                       for (var img = n->children; img != null; img = img->next) {
+                               if (img->name != "img")
+                                       continue;
+                               var alt = img->get_prop ("alt"); // "Rated 0.0 out of 5.0"
+                               // Skipping six characters is only valid when the text is longer than that.
+                               if (alt != null && alt.length > 6)
+                                       return (int) (10 * alt.offset (6).to_double ());
+                       }
+               }
+               return 0;
        }
 
-       public int parse (ref char[] buf) throws Error {
-               int movies = 0;
+       /**
+        * Stores the showtimes text of a "times" div in movie.showtimes,
+        * with every no-break space replaced by a comma.
+        *
+        * movie.showtimes is left untouched when the div does not start
+        * with a text node.
+        *
+        * @param node the "times" div
+        * @param movie movie to update
+        */
+       private void parse_movie_times (Xml.Node* node, GoogleMovie movie) {
+               unowned string? times = get_child_text_content (node);
+               if (times == null)
+                       return;
+               movie.showtimes = times.replace ("\xc2\xa0", ","); // U+00A0 = &nbsp;
+       }
 
-               current = buf;
-               next_tag ();
-               while (location == null && current[0] != 0) {
-                       int i = 1;
-                       while (current[i++] != '>');
-                       if (((string) current).has_prefix ("<a href=\"/movies?near=")) {
-                               string href = parse_tag_attribute ("a", "href");
-                               char* p = (char*) href.offset (13); // skip "/movies?near="
-                               int j = -1;
+       /**
+        * Returns the content of n's first child if that child is a text
+        * node, and null otherwise. The string is not copied.
+        */
+       unowned string? get_child_text_content (Xml.Node *n) {
+               Xml.Node* first = n->children;
+               if (first == null || first->name != "text")
+                       return null;
+               return first->content;
+       }
 
-                               while (p[++j] != '&' && p[j] != 0);
-                               p[0] = p[0].toupper ();
-                               location = ((string) p).ndup (j);
+       /**
+        * Depth-first search over node, its following siblings and all of
+        * their descendants for the first element named tag whose attribute
+        * prop equals val. Returns null when nothing matches.
+        */
+       Xml.Node* search_tag_by_property (Xml.Node* node, string tag, string prop, string val) requires (node != null) {
+               Xml.Node* cur = node;
+               while (cur != null) {
+                       if (cur->name == tag && cur->get_prop (prop) == val)
+                               return cur;
+                       Xml.Node* kids = cur->children;
+                       if (kids != null) {
+                               // Descend before moving on to the next sibling.
+                               Xml.Node* hit = search_tag_by_property (kids, tag, prop, val);
+                               if (hit != null)
+                                       return hit;
                        }
-                       current += i;
-                       next_tag ();
-               }
-               while (current[0] != 0) {
-                       int i = 1;
-                       while (current[i++] != '>');
-                       if (((string) current).has_prefix ("<div class=movie>")) {
-                               parse_movie ();
-                               movies++;
-                       } else if (((string) current).has_prefix("<div class=theater>")) {
-                               parse_cinema ();
-                       } else {
-                               current += i;
-                       }
-                       next_tag ();
+                       cur = cur->next;
                }
+               return null;
+       }
 
-               return movies;
+       /**
+        * Shorthand for search_tag_by_property () on the "class" attribute:
+        * finds the first tag element with the given class value.
+        */
+       Xml.Node* search_tag_by_class (Xml.Node* node, string tag, string @class) requires (node != null) {
+               Xml.Node* match = search_tag_by_property (node, tag, "class", @class);
+               return match;
        }
 
        public async int query (string title, string? location, ReceiveMovie callback, Cancellable? cancellable = null) {