X-Git-Url: http://git.maemo.org/git/?a=blobdiff_plain;f=src%2Fbackends%2Fgoogle%2Fgoogle-parser.vala;h=d8e51a117004e3deb19008408e372aa6d6330197;hb=823f663fd396540cd7acf44513626d8e29d852a9;hp=e18a00ba28592bea2fb62355f6a8eae701a94328;hpb=7afe5bd1f0a35e393087ab8f553b82af0f2ebde0;p=cinaest diff --git a/src/backends/google/google-parser.vala b/src/backends/google/google-parser.vala index e18a00b..d8e51a1 100644 --- a/src/backends/google/google-parser.vala +++ b/src/backends/google/google-parser.vala @@ -16,261 +16,147 @@ * along with Cinaest. If not, see . */ -errordomain ParserError { - WRONG_TAG, - EOF -} - -public class Cinema { +public class Theater { public string name; public string address; public string phone; - - public Cinema (string _name) { - name = _name; - } } public class GoogleMovie { public string title; public int rating; - public Cinema cinema; + public Theater theater; public int runtime; public string fsk; public string showtimes; } -public class GoogleParser : Object { - char *current; - Cinema last_cinema; +class GoogleParser : Object { + int movies; public string location; string _title; PatternSpec pattern; + CurlWrapper curlwrapper; + Regex re_runtime; public delegate void ReceiveMovie (GoogleMovie movie); public ReceiveMovie _get_callback; - public int next_tag_offset () { - int i = -1; - while (current[++i] != '<' && current[i] != 0); - return i; - } - - public void next_tag () { - if (current[0] == 0) - return; - current += next_tag_offset (); - } - - public void finish_tag () { - while (current[0] != '>' && current[0] != 0) - current++; - if (current[0] == '>') - current++; - } - - public unowned string parse_tag (bool finish = true) throws Error { - unowned string tag; - next_tag (); - int i = 1; - while (current[++i].isalnum ()); - if (current[i] == 0) - throw new ParserError.EOF ("EOF in tag"); - if (current[i] == '>') - finish = false; - current[i] = 0; - tag = (string) (current + 1); - current += i + 1; - if (finish) - finish_tag (); - return tag; - } - - public void expect_tag (string tag) throws Error { - var found = parse_tag (true); - if (tag != found) { - throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"", - found, tag); + construct { + curlwrapper = new CurlWrapper (); + try { + re_runtime = new Regex ("([0-9]+)hr ([0-9]+)min"); + } catch (RegexError e) { + critical ("Failed to initialize regex: %s\n", e.message); } } - public string parse_text () { - string text = ((string) current).ndup (next_tag_offset ()); - next_tag (); - return text; + private Html.Doc* get_html_document (string buf) { + return Html.Doc.read_memory ((char[]) buf, (int) buf.length, + "http://movies.google.de", null, Html.ParserOption.NOERROR | Html.ParserOption.NOWARNING); } - public void parse_attribute (string _attr, out string value) { - string attr; - if (current[0] == 0) - return; - int i = -1; - while (current[++i] != '=' && current[i] != '>' && current[i] != 0) { - - } - attr = ((string) current).ndup (i); - current += i; - if (current[0] == 0) - return; - current++; - i = -1; - while (!current[++i].isspace () && current[i] != '>' && current[i] != 0) { - if (current[i] == '"') - while (current[++i] != '"' && current[i] != 0); + public int parse (string buf) throws Error { + var doc = get_html_document (buf); + if (doc == null) { + stderr.printf ("Error: parsing failed\n"); + return 0; } - if (attr == _attr) { - if (current[0] == '"') - value = ((string) current).substring (1, i - 2); - else - value = ((string) current).ndup (i); - } - current += i; - } - public void skip_whitespace () { - if (current[0] == 0) - return; - int i = -1; - while (current[++i].isspace () && current[i] != 0); - current += i; - } + // TODO: set up location + location = ""; - public string? parse_tag_attribute (string tag, string attribute) throws Error { - var found = parse_tag (false); - if (tag != found) { - throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"", - found, tag); + var theater = search_tag_by_class (doc->children, "div", "theater"); + if (theater == null) { + stderr.printf ("Error: does not contain theater\n"); + return 0; } - - string? value = null; - skip_whitespace (); - while (current[0] != '>' && current[0] != 0) { - parse_attribute (attribute, out value); - skip_whitespace (); + movies = 0; + while (theater != null) { + theater = parse_theater (theater); } - // Skip the closing '>' bracket - if (current[0] != 0) - current++; - - return value; + return movies; } - public string unescape_unicode (string s) { - string result = ""; - int i, j; - long l = s.length; - - for (i = 0; i < l; i++) { - if (s[i] == '&' && s[i + 1] == '#') { - for (j = i + 2; j < l; j++) { - if (!s[j].isdigit ()) - break; - if (s[j] == ';') - break; - } - if (s[j] == ';') { - int codepoint = s.substring (i + 2, j - i - 2).to_int (); - char[] buf = new char[6]; - ((unichar) codepoint).to_utf8 ((string) buf); - result += (string) buf; - i = j; - continue; + private Xml.Node* parse_theater (Xml.Node* t) { + var theater = new Theater (); + var desc = t->children; + if (desc != null && desc->name == "div" && desc->get_prop ("class") == "desc") { + var name = desc->children; + if (name != null && name->name == "h2" && name->get_prop ("class") == "name") { + var a = name->children; + if (a != null && a->name == "a") + theater.name = get_child_text_content (a); + print ("THEATER \"%s\"\n", theater.name); + } + var info = name->next; + if (info != null && info->name == "div" && info->get_prop ("class") == "info") { + var text = info->children; + if (text != null && text->name == "text") { + var address_and_phone = text->content.split (" - "); + if (address_and_phone.length >= 2) { + theater.address = address_and_phone[0]; + theater.phone = address_and_phone[1].replace (" ", "").replace ("-", ""); + } } } - if (s.offset (i).has_prefix ("&")) { - result += "&"; - i += 4; - continue; + } + var showtimes = desc->next; + if (showtimes != null && showtimes->name == "div" && showtimes->get_prop ("class") == "showtimes") { + var left = search_tag_by_class (showtimes->children, "div", "show_left"); + if (left != null && left->children != null) { + print ("LEFT\n"); + var movie = search_tag_by_class (left->children, "div", "movie"); + while (movie != null) { + movie = parse_movie (movie, theater); + } } - if (s.offset (i).has_prefix (""")) { - result += "\""; - i += 5; - continue; + var right = search_tag_by_class (left->next, "div", "show_right"); + if (right != null && right->children != null) { + print ("RIGHT\n"); + var movie = search_tag_by_class (right->children, "div", "movie"); + while (movie != null) { + movie = parse_movie (movie, theater); + } } - result += s.substring (i, 1); - } - return result; + } + return t->next; } - public void parse_movie () throws Error { - expect_tag ("div"); // class=movie - expect_tag ("div"); // class=name - expect_tag ("a"); // href="/movies?near=city&mid=..." - expect_tag ("span"); // dir=ltr - var title = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME - expect_tag ("/span"); - expect_tag ("/a"); - expect_tag ("/div"); - expect_tag ("span"); // class=info - string info_text = parse_text ().replace ("‎", ""); - string[] runtime_and_fsk = {}; - double rating = 0.0; - var tag = parse_tag (); - if (tag == "a") { - // Trailer - expect_tag ("/a"); - tag = parse_tag (); - } - if (tag == "a") { - // IMDb - expect_tag ("/a"); - tag = parse_tag (); - } - if (tag == "nobr") { - expect_tag ("nobr"); - string rating_string = parse_tag_attribute ("img", "alt").offset (6); // "Rated " ->"0.0 out of 5.0" - rating = rating_string.to_double (); - expect_tag ("img"); - expect_tag ("img"); - expect_tag ("img"); - expect_tag ("img"); - expect_tag ("/nobr"); - expect_tag ("/nobr"); - info_text = parse_text ().replace ("‎", "").offset (3); - if (parse_tag () == "a") { - // Trailer - expect_tag ("/a"); - if (parse_tag () == "a") { - // IMDb link - expect_tag ("/a"); - expect_tag ("/span"); - } - } - } - runtime_and_fsk = info_text.split (" - "); - expect_tag ("div"); // class=times - var showtimes = parse_text ().replace (" ", ","); - while (parse_tag () == "a") { - showtimes += parse_text () + ","; - expect_tag ("/a"); + private Xml.Node* parse_movie (Xml.Node* m, Theater theater) { + var movie = new GoogleMovie (); + movie.theater = theater; + Xml.Node* n; + for (n = m->children; n != null; n = n->next) { + if (n->name == "div" && n->get_prop ("class") == "name") + movie.title = parse_movie_name (n); + if (n->name == "span" && n->get_prop ("class") == "info") + parse_movie_info (n, movie); + if (n->name == "div" && n->get_prop ("class") == "times") + parse_movie_times (n, movie); } - if (pattern == null) { - if (!title.has_prefix (_title)) - return; + if (!movie.title.has_prefix (_title)) + return m->next; } else { - if (!pattern.match ((uint) title.length, title, null)) - return; + if (!pattern.match ((uint) movie.title.length, movie.title, null)) + return m->next; } + _get_callback (movie); + movies++; + return m->next; + } - var movie = new GoogleMovie (); - - movie.title = strip_tags (title).replace ("\"", "\\\""); - movie.rating = (int) (rating * 10); - - movie.cinema = last_cinema; - movie.runtime = 0; - if (runtime_and_fsk.length >= 2) { - unowned string runtime = runtime_and_fsk[0]; - movie.runtime = 3600 * runtime.to_int (); - runtime = runtime.str ("hr "); - if (runtime != null) - movie.runtime += 60 * runtime.offset (3).to_int (); - movie.fsk = runtime_and_fsk[1]; + private string? parse_movie_name (Xml.Node* n) { + var a = n->children; + if (a != null && a->name == "a") { + var text = a->children; + if (text != null && text->name == "text") + print ("\"%s\"\n", text->content); + return strip_tags (text->content); } - movie.showtimes = showtimes; - _get_callback (movie); + return null; } // FIXME - this is specific for Germany @@ -284,69 +170,69 @@ public class GoogleParser : Object { return title.dup (); } - public void parse_cinema () throws Error { - expect_tag ("div"); // class=theater - expect_tag ("div"); // class=desc id=theater_... - expect_tag ("h2"); // class=name - expect_tag ("a"); // href="/movies?near=city&tid=..." - expect_tag ("span"); // dir=ltr - var name = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME - expect_tag ("/span"); - expect_tag ("/a"); - expect_tag ("/h2"); - expect_tag ("div"); // class=info - var address_and_phone = parse_text ().replace (" ", " ").split (" - "); - string address = null; - string phone = null; - if (address_and_phone.length >= 2) { - address = address_and_phone[0]; - phone = address_and_phone[1].replace (" ", "").replace ("-", ""); + private void parse_movie_info (Xml.Node* i, GoogleMovie movie) { + var text = i->children; + if (text != null && text->name == "text") { + MatchInfo match; + print ("\t\"%s\"\n", text->content); + if (re_runtime.match (text->content, 0, out match)) { + movie.runtime = match.fetch (1).to_int () * 3600 + + match.fetch (2).to_int () * 60; + } + movie.fsk = text->content.str ("Rated ").replace (" - ", ""); } - expect_tag ("a"); // target=_top - expect_tag ("/a"); - expect_tag ("/div"); - expect_tag ("/div"); + for (var n = text->next; n != null; n = n->next) { + if (n->name == "nobr") { + movie.rating = parse_rating (n); + if (movie.rating == 0) + movie.rating = -1; + break; + } + } + } - last_cinema = new Cinema (name); - last_cinema.address = address; - last_cinema.phone = phone; + private int parse_rating (Xml.Node* nobr) { + for (var n = nobr->children; n != null; n = n->next) { + if (n->name == "nobr") { + for (var img = n->children; img != null; img = img->next) { + if (img->name == "img") { + var alt = img->get_prop ("alt"); // "Rated 0.0 out of 5.0" + if (alt != null && alt != "") // ^ + print ("\trating: %s - %f\n", alt, alt.offset (6).to_double ()); + return (int) (10 * alt.offset (6).to_double ()); + } + } + } + } + return 0; } - public int parse (ref char[] buf) throws Error { - int movies = 0; + private void parse_movie_times (Xml.Node* node, GoogleMovie movie) { + movie.showtimes = get_child_text_content (node).replace ("\xc2\xa0", ","); // U+00A0 =   + } - current = buf; - next_tag (); - while (location == null && current[0] != 0) { - int i = 1; - while (current[i++] != '>'); - if (((string) current).has_prefix ("children->content; + else + return null; + } - while (p[++j] != '&' && p[j] != 0); - p[0] = p[0].toupper (); - location = ((string) p).ndup (j); - } - current += i; - next_tag (); - } - while (current[0] != 0) { - int i = 1; - while (current[i++] != '>'); - if (((string) current).has_prefix ("
")) { - parse_movie (); - movies++; - } else if (((string) current).has_prefix("
")) { - parse_cinema (); - } else { - current += i; + Xml.Node* search_tag_by_property (Xml.Node* node, string tag, string prop, string val) requires (node != null) { + for (var n = node; n != null; n = n->next) { + if (n->name == tag && n->get_prop (prop) == val) + return n; + if (n->children != null) { + var found = search_tag_by_property (n->children, tag, prop, val); + if (found != null) + return found; } - next_tag (); } + return null; + } - return movies; + Xml.Node* search_tag_by_class (Xml.Node* node, string tag, string @class) requires (node != null) { + return search_tag_by_property (node, tag, "class", @class); } public async int query (string title, string? location, ReceiveMovie callback, Cancellable? cancellable = null) { @@ -365,22 +251,8 @@ public class GoogleParser : Object { stdout.printf ("GET: %s\n", uri); - File file = File.new_for_uri (uri); - InputStream stream = yield file.read_async (Priority.DEFAULT_IDLE, null); - - char[] buf = new char[256*1024]; - size_t nread; - size_t total = 0; - while (total < 256*1024) { - nread = yield stream.read_async ((char *)buf + total, 256*1024 - total, Priority.DEFAULT_IDLE, cancellable); - total += nread; - if (cancellable.is_cancelled ()) - return 0; - if (nread == 0) - break; - } - buf[total] = 0; - return parse (ref buf); + string buf = yield curlwrapper.http_get (uri); + return parse (buf); } catch (Error e) { stderr.printf ("Error: %s\n", e.message); }