* along with Cinaest. If not, see <http://www.gnu.org/licenses/>.
*/
-errordomain ParserError {
- WRONG_TAG,
- EOF
-}
-
-public class Cinema {
+/**
+ * A theater and its contact details, filled in by GoogleParser.parse_theater ().
+ */
+public class Theater {
+    // Text of the link inside the theater's <h2 class="name"> heading
public string name;
+    // Part of the info text before the " - " separator
public string address;
+    // Part of the info text after the " - " separator, with spaces and dashes removed
public string phone;
-
- public Cinema (string _name) {
- name = _name;
- }
}
+/**
+ * One movie showing at one theater, as handed to GoogleParser's ReceiveMovie callback.
+ */
public class GoogleMovie {
+    // Title as returned by parse_movie_name ()
public string title;
+    // Ten times the rating parsed from the rating image's "alt" text;
+    // set to -1 when a rating element exists but yields 0
public int rating;
- public Cinema cinema;
+    // Theater this showing belongs to
+ public Theater theater;
+    // Runtime in seconds (hours * 3600 + minutes * 60)
public int runtime;
+    // Info text starting at "Rated ", with " - " removed
public string fsk;
+    // Text of the times <div> with every U+00A0 replaced by ","
public string showtimes;
}
-public class GoogleParser : Object {
- char *current;
- Cinema last_cinema;
+class GoogleParser : Object {
+ int movies;
public string location;
string _title;
PatternSpec pattern;
+ CurlWrapper curlwrapper;
+ Regex re_runtime;
public delegate void ReceiveMovie (GoogleMovie movie);
public ReceiveMovie _get_callback;
- public int next_tag_offset () {
- int i = -1;
- while (current[++i] != '<' && current[i] != 0);
- return i;
- }
-
- public void next_tag () {
- if (current[0] == 0)
- return;
- current += next_tag_offset ();
- }
-
- public void finish_tag () {
- while (current[0] != '>' && current[0] != 0)
- current++;
- if (current[0] == '>')
- current++;
- }
-
- public unowned string parse_tag (bool finish = true) throws Error {
- unowned string tag;
- next_tag ();
- int i = 1;
- while (current[++i].isalnum ());
- if (current[i] == 0)
- throw new ParserError.EOF ("EOF in tag");
- if (current[i] == '>')
- finish = false;
- current[i] = 0;
- tag = (string) (current + 1);
- current += i + 1;
- if (finish)
- finish_tag ();
- return tag;
- }
-
- public void expect_tag (string tag) throws Error {
- var found = parse_tag (true);
- if (tag != found) {
- throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"",
- found, tag);
+    // Prepares the helpers used later: the runtime regex for parse_movie_info ()
+    // and the HTTP wrapper for query ().
+    construct {
+        try {
+            // Group 1 is the hour count, group 2 the minute count, e.g. "1hr 45min"
+            re_runtime = new Regex ("([0-9]+)hr ([0-9]+)min");
+        } catch (RegexError err) {
+            critical ("Failed to initialize regex: %s\n", err.message);
         }
+        curlwrapper = new CurlWrapper ();
     }
- public string parse_text () {
- string text = ((string) current).ndup (next_tag_offset ());
- next_tag ();
- return text;
+    /**
+     * Builds an HTML document tree from a page held in memory.
+     *
+     * Parser errors and warnings are suppressed via the NOERROR and NOWARNING options.
+     *
+     * @param buf the HTML text to parse
+     * @return the value returned by Html.Doc.read_memory (); callers check it for null
+     */
+    private Html.Doc* get_html_document (string buf) {
+        unowned string base_url = "http://movies.google.de";
+        var options = Html.ParserOption.NOERROR | Html.ParserOption.NOWARNING;
+        return Html.Doc.read_memory ((char[]) buf, (int) buf.length, base_url, null, options);
     }
- public void parse_attribute (string _attr, out string value) {
- string attr;
- if (current[0] == 0)
- return;
- int i = -1;
- while (current[++i] != '=' && current[i] != '>' && current[i] != 0) {
-
- }
- attr = ((string) current).ndup (i);
- current += i;
- if (current[0] == 0)
- return;
- current++;
- i = -1;
- while (!current[++i].isspace () && current[i] != '>' && current[i] != 0) {
- if (current[i] == '"')
- while (current[++i] != '"' && current[i] != 0);
+    /**
+     * Parses a showtimes page and reports every matching movie through the callback.
+     *
+     * Locates the first <div class="theater"> and hands it, and then whatever node
+     * parse_theater () returns, to parse_theater () until that returns null.
+     * The document tree is released before returning; the reported objects only
+     * hold copies of the strings they need.
+     *
+     * @param buf the HTML text of the page
+     * @return the number of movies passed to the callback, or 0 when the page
+     *         could not be parsed or contains no theater
+     */
+    public int parse (string buf) throws Error {
+        var doc = get_html_document (buf);
+        if (doc == null) {
+            stderr.printf ("Error: parsing failed\n");
+            return 0;
         }
- if (attr == _attr) {
- if (current[0] == '"')
- value = ((string) current).substring (1, i - 2);
- else
- value = ((string) current).ndup (i);
- }
- current += i;
- }
- public void skip_whitespace () {
- if (current[0] == 0)
- return;
- int i = -1;
- while (current[++i].isspace () && current[i] != 0);
- current += i;
- }
+        // TODO: set up location
+        location = "";
- public string? parse_tag_attribute (string tag, string attribute) throws Error {
- var found = parse_tag (false);
- if (tag != found) {
- throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"",
- found, tag);
+        // search_tag_by_class () requires a non-null start node, so an empty
+        // document is treated like a page without theaters.
+        Xml.Node* theater = null;
+        if (doc->children != null) {
+            theater = search_tag_by_class (doc->children, "div", "theater");
+        }
+        if (theater == null) {
+            stderr.printf ("Error: does not contain theater\n");
+            delete doc;
+            return 0;
         }
-
- string? value = null;
- skip_whitespace ();
- while (current[0] != '>' && current[0] != 0) {
- parse_attribute (attribute, out value);
- skip_whitespace ();
+        movies = 0;
+        while (theater != null) {
+            theater = parse_theater (theater);
         }
- // Skip the closing '>' bracket
- if (current[0] != 0)
- current++;
-
- return value;
+        // The tree is referenced through a raw pointer and must be freed explicitly.
+        delete doc;
+        return movies;
     }
- public string unescape_unicode (string s) {
- string result = "";
- int i, j;
- long l = s.length;
-
- for (i = 0; i < l; i++) {
- if (s[i] == '&' && s[i + 1] == '#') {
- for (j = i + 2; j < l; j++) {
- if (!s[j].isdigit ())
- break;
- if (s[j] == ';')
- break;
- }
- if (s[j] == ';') {
- int codepoint = s.substring (i + 2, j - i - 2).to_int ();
- char[] buf = new char[6];
- ((unichar) codepoint).to_utf8 ((string) buf);
- result += (string) buf;
- i = j;
- continue;
+    /**
+     * Reads one theater block and all movies listed in it.
+     *
+     * Fills a Theater from the "desc" <div> (name from the <h2 class="name"> link,
+     * address and phone from the "info" text split on " - "), then feeds every
+     * movie found in the "show_left" and "show_right" columns of the following
+     * "showtimes" <div> to parse_movie (). Missing or unexpected nodes are
+     * skipped instead of being dereferenced.
+     *
+     * @param t the node to read, normally a <div class="theater">
+     * @return the next sibling of t, or null when there is none
+     */
+    private Xml.Node* parse_theater (Xml.Node* t) {
+        var theater = new Theater ();
+        var desc = t->children;
+        if (desc != null && desc->name == "div" && desc->get_prop ("class") == "desc") {
+            var name = desc->children;
+            if (name != null && name->name == "h2" && name->get_prop ("class") == "name") {
+                var a = name->children;
+                if (a != null && a->name == "a") {
+                    theater.name = get_child_text_content (a);
+                }
+                if (theater.name != null) {
+                    print ("THEATER \"%s\"\n", theater.name);
+                }
+            }
+            // The info <div> is looked up as the sibling following the first child;
+            // an empty "desc" block has no such sibling.
+            Xml.Node* info = null;
+            if (name != null) {
+                info = name->next;
+            }
+            if (info != null && info->name == "div" && info->get_prop ("class") == "info") {
+                var text = info->children;
+                if (text != null && text->name == "text") {
+                    var address_and_phone = text->content.split (" - ");
+                    if (address_and_phone.length >= 2) {
+                        theater.address = address_and_phone[0];
+                        theater.phone = address_and_phone[1].replace (" ", "").replace ("-", "");
+                    }
                 }
             }
- if (s.offset (i).has_prefix ("&")) {
- result += "&";
- i += 4;
- continue;
+        }
+        Xml.Node* showtimes = null;
+        if (desc != null) {
+            showtimes = desc->next;
+        }
+        if (showtimes != null && showtimes->name == "div" && showtimes->get_prop ("class") == "showtimes") {
+            // search_tag_by_class () requires a non-null start node
+            Xml.Node* left = null;
+            if (showtimes->children != null) {
+                left = search_tag_by_class (showtimes->children, "div", "show_left");
+            }
+            if (left != null && left->children != null) {
+                print ("LEFT\n");
+                var movie = search_tag_by_class (left->children, "div", "movie");
+                while (movie != null) {
+                    movie = parse_movie (movie, theater);
+                }
             }
- if (s.offset (i).has_prefix (""")) {
- result += "\"";
- i += 5;
- continue;
+            // Look for the right column after the left one; without a left
+            // column fall back to scanning the whole showtimes block.
+            Xml.Node* right_start = showtimes->children;
+            if (left != null) {
+                right_start = left->next;
+            }
+            Xml.Node* right = null;
+            if (right_start != null) {
+                right = search_tag_by_class (right_start, "div", "show_right");
+            }
+            if (right != null && right->children != null) {
+                print ("RIGHT\n");
+                var movie = search_tag_by_class (right->children, "div", "movie");
+                while (movie != null) {
+                    movie = parse_movie (movie, theater);
+                }
             }
- result += s.substring (i, 1);
- }
- return result;
+        }
+        return t->next;
     }
- public void parse_movie () throws Error {
- expect_tag ("div"); // class=movie
- expect_tag ("div"); // class=name
- expect_tag ("a"); // href="/movies?near=city&mid=..."
- expect_tag ("span"); // dir=ltr
- var title = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME
- expect_tag ("/span");
- expect_tag ("/a");
- expect_tag ("/div");
- expect_tag ("span"); // class=info
- string info_text = parse_text ().replace ("‎", "");
- string[] runtime_and_fsk = {};
- double rating = 0.0;
- var tag = parse_tag ();
- if (tag == "a") {
- // Trailer
- expect_tag ("/a");
- tag = parse_tag ();
- }
- if (tag == "a") {
- // IMDb
- expect_tag ("/a");
- tag = parse_tag ();
- }
- if (tag == "nobr") {
- expect_tag ("nobr");
- string rating_string = parse_tag_attribute ("img", "alt").offset (6); // "Rated " ->"0.0 out of 5.0"
- rating = rating_string.to_double ();
- expect_tag ("img");
- expect_tag ("img");
- expect_tag ("img");
- expect_tag ("img");
- expect_tag ("/nobr");
- expect_tag ("/nobr");
- info_text = parse_text ().replace ("‎", "").offset (3);
- if (parse_tag () == "a") {
- // Trailer
- expect_tag ("/a");
- if (parse_tag () == "a") {
- // IMDb link
- expect_tag ("/a");
- expect_tag ("/span");
- }
- }
- }
- runtime_and_fsk = info_text.split (" - ");
- expect_tag ("div"); // class=times
- var showtimes = parse_text ().replace (" ", ",");
- while (parse_tag () == "a") {
- showtimes += parse_text () + ",";
- expect_tag ("/a");
+    /**
+     * Reads one movie block and reports it if its title matches the current query.
+     *
+     * The title, info and showtimes are taken from the children with class
+     * "name", "info" and "times". A block without a readable title is skipped.
+     * When no pattern is set the title must start with _title, otherwise it
+     * must match the pattern. Matching movies are passed to _get_callback and
+     * counted in movies.
+     *
+     * @param m the node to read, normally a <div class="movie">
+     * @param theater the theater the movie is shown in
+     * @return the next sibling of m, or null when there is none
+     */
+    private Xml.Node* parse_movie (Xml.Node* m, Theater theater) {
+        var movie = new GoogleMovie ();
+        movie.theater = theater;
+        Xml.Node* n;
+        for (n = m->children; n != null; n = n->next) {
+            if (n->name == "div" && n->get_prop ("class") == "name") {
+                movie.title = parse_movie_name (n);
+            }
+            if (n->name == "span" && n->get_prop ("class") == "info") {
+                parse_movie_info (n, movie);
+            }
+            if (n->name == "div" && n->get_prop ("class") == "times") {
+                parse_movie_times (n, movie);
+            }
         }
-
+        // parse_movie_name () returns null for unexpected markup, and the loop in
+        // parse_theater () may hand over siblings that are not movie blocks at all.
+        if (movie.title == null) {
+            return m->next;
+        }
         if (pattern == null) {
- if (!title.has_prefix (_title))
- return;
+            if (!movie.title.has_prefix (_title)) {
+                return m->next;
+            }
         } else {
- if (!pattern.match ((uint) title.length, title, null))
- return;
+            if (!pattern.match ((uint) movie.title.length, movie.title, null)) {
+                return m->next;
+            }
         }
+        _get_callback (movie);
+        movies++;
+        return m->next;
+    }
- var movie = new GoogleMovie ();
-
- movie.title = strip_tags (title).replace ("\"", "\\\"");
- movie.rating = (int) (rating * 10);
-
- movie.cinema = last_cinema;
- movie.runtime = 0;
- if (runtime_and_fsk.length >= 2) {
- unowned string runtime = runtime_and_fsk[0];
- movie.runtime = 3600 * runtime.to_int ();
- runtime = runtime.str ("hr ");
- if (runtime != null)
- movie.runtime += 60 * runtime.offset (3).to_int ();
- movie.fsk = runtime_and_fsk[1];
+    /**
+     * Extracts the movie title from a <div class="name"> node.
+     *
+     * The title is the text directly inside the first child when that child
+     * is an <a> element; it is passed through strip_tags ().
+     *
+     * @param n the name node
+     * @return the title, or null when the expected link or its text is missing
+     */
+    private string? parse_movie_name (Xml.Node* n) {
+        var a = n->children;
+        if (a != null && a->name == "a") {
+            // Both the debug print and the return depend on the text being present.
+            unowned string? title = get_child_text_content (a);
+            if (title != null) {
+                print ("\"%s\"\n", title);
+                return strip_tags (title);
+            }
         }
- movie.showtimes = showtimes;
- _get_callback (movie);
+        return null;
     }
// FIXME - this is specific for Germany
return title.dup ();
}
- public void parse_cinema () throws Error {
- expect_tag ("div"); // class=theater
- expect_tag ("div"); // class=desc id=theater_...
- expect_tag ("h2"); // class=name
- expect_tag ("a"); // href="/movies?near=city&tid=..."
- expect_tag ("span"); // dir=ltr
- var name = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME
- expect_tag ("/span");
- expect_tag ("/a");
- expect_tag ("/h2");
- expect_tag ("div"); // class=info
- var address_and_phone = parse_text ().replace (" ", " ").split (" - ");
- string address = null;
- string phone = null;
- if (address_and_phone.length >= 2) {
- address = address_and_phone[0];
- phone = address_and_phone[1].replace (" ", "").replace ("-", "");
+    /**
+     * Fills runtime, fsk and rating of a movie from its <span class="info"> node.
+     *
+     * When the first child is a text node, the runtime is read with re_runtime
+     * and stored in seconds, and fsk is set to the text starting at "Rated "
+     * (with " - " removed) if that marker occurs. The rating comes from the
+     * first <nobr> child; a result of 0 from parse_rating () is stored as -1.
+     *
+     * @param i the info node
+     * @param movie the movie to update
+     */
+    private void parse_movie_info (Xml.Node* i, GoogleMovie movie) {
+        var text = i->children;
+        if (text != null && text->name == "text") {
+            MatchInfo match;
+            print ("\t\"%s\"\n", text->content);
+            if (re_runtime.match (text->content, 0, out match)) {
+                movie.runtime = match.fetch (1).to_int () * 3600 +
+                                match.fetch (2).to_int () * 60;
+            }
+            // str () yields null when the marker is absent
+            unowned string? rated = text->content.str ("Rated ");
+            if (rated != null) {
+                movie.fsk = rated.replace (" - ", "");
+            }
         }
- expect_tag ("a"); // target=_top
- expect_tag ("/a");
- expect_tag ("/div");
- expect_tag ("/div");
+        // Scan all children so a missing leading text node cannot be dereferenced;
+        // a text node is never named "nobr", so starting at the first child is safe.
+        for (var n = i->children; n != null; n = n->next) {
+            if (n->name == "nobr") {
+                movie.rating = parse_rating (n);
+                if (movie.rating == 0) {
+                    movie.rating = -1;
+                }
+                break;
+            }
+        }
+    }
- last_cinema = new Cinema (name);
- last_cinema.address = address;
- last_cinema.phone = phone;
+    /**
+     * Extracts the rating from the outer <nobr> element of a movie's info.
+     *
+     * Looks inside each nested <nobr> for the first <img> whose "alt" attribute
+     * starts with "Rated " (e.g. "Rated 0.0 out of 5.0") and parses the number
+     * that follows. Images without such an attribute are skipped.
+     *
+     * @param nobr the outer <nobr> node
+     * @return ten times the parsed rating, or 0 when no usable rating image exists
+     */
+    private int parse_rating (Xml.Node* nobr) {
+        for (var n = nobr->children; n != null; n = n->next) {
+            if (n->name == "nobr") {
+                for (var img = n->children; img != null; img = img->next) {
+                    if (img->name == "img") {
+                        var alt = img->get_prop ("alt"); // "Rated 0.0 out of 5.0"
+                        // Only skip the 6-byte "Rated " prefix when it is really there.
+                        if (alt != null && alt.has_prefix ("Rated ")) {
+                            double rating = alt.offset (6).to_double ();
+                            print ("\trating: %s - %f\n", alt, rating);
+                            return (int) (10 * rating);
+                        }
+                    }
+                }
+            }
+        }
+        return 0;
     }
- public int parse (ref char[] buf) throws Error {
- int movies = 0;
+    /**
+     * Stores the showtimes text of a <div class="times"> node in the movie.
+     *
+     * Leaves movie.showtimes untouched when the node has no leading text child.
+     *
+     * @param node the times node
+     * @param movie the movie to update
+     */
+    private void parse_movie_times (Xml.Node* node, GoogleMovie movie) {
+        unowned string? times = get_child_text_content (node);
+        if (times == null) {
+            return;
+        }
+        // "\xc2\xa0" is the UTF-8 encoding of U+00A0 (no-break space)
+        movie.showtimes = times.replace ("\xc2\xa0", ",");
+    }
- current = buf;
- next_tag ();
- while (location == null && current[0] != 0) {
- int i = 1;
- while (current[i++] != '>');
- if (((string) current).has_prefix ("<a href=\"/movies?near=")) {
- string href = parse_tag_attribute ("a", "href");
- char* p = (char*) href.offset (13); // skip "/movies?near="
- int j = -1;
+    /**
+     * Returns the content of a node's first child if that child is a text node.
+     *
+     * @param n the parent node
+     * @return the text owned by the document tree, or null when the first child
+     *         is missing or is not named "text"
+     */
+    unowned string? get_child_text_content (Xml.Node *n) {
+        var first = n->children;
+        if (first == null || first->name != "text") {
+            return null;
+        }
+        return first->content;
+    }
- while (p[++j] != '&' && p[j] != 0);
- p[0] = p[0].toupper ();
- location = ((string) p).ndup (j);
- }
- current += i;
- next_tag ();
- }
- while (current[0] != 0) {
- int i = 1;
- while (current[i++] != '>');
- if (((string) current).has_prefix ("<div class=movie>")) {
- parse_movie ();
- movies++;
- } else if (((string) current).has_prefix("<div class=theater>")) {
- parse_cinema ();
- } else {
- current += i;
+    /**
+     * Depth-first search for an element with a given name and attribute value.
+     *
+     * Visits node and each of its following siblings in order; every candidate
+     * is tested before its descendants are searched.
+     *
+     * @param node the first node to inspect, must not be null
+     * @param tag the element name to look for
+     * @param prop the attribute name
+     * @param val the attribute value that must match exactly
+     * @return the first matching node, or null when there is none
+     */
+    Xml.Node* search_tag_by_property (Xml.Node* node, string tag, string prop, string val) requires (node != null) {
+        for (var cur = node; cur != null; cur = cur->next) {
+            if (cur->name == tag && cur->get_prop (prop) == val) {
+                return cur;
+            }
+            if (cur->children != null) {
+                var hit = search_tag_by_property (cur->children, tag, prop, val);
+                if (hit != null) {
+                    return hit;
+                }
             }
- next_tag ();
         }
+        return null;
+    }
- return movies;
+    /**
+     * Finds the first element with the given name and "class" attribute.
+     *
+     * Shorthand for search_tag_by_property () with the property fixed to "class".
+     *
+     * @param node the first node to inspect, must not be null
+     * @param tag the element name to look for
+     * @param class the class attribute value that must match exactly
+     * @return the first matching node, or null when there is none
+     */
+    Xml.Node* search_tag_by_class (Xml.Node* node, string tag, string @class) requires (node != null) {
+        Xml.Node* match = search_tag_by_property (node, tag, "class", @class);
+        return match;
     }
public async int query (string title, string? location, ReceiveMovie callback, Cancellable? cancellable = null) {
stdout.printf ("GET: %s\n", uri);
- File file = File.new_for_uri (uri);
- InputStream stream = yield file.read_async (Priority.DEFAULT_IDLE, null);
-
- char[] buf = new char[256*1024];
- size_t nread;
- size_t total = 0;
- while (total < 256*1024) {
- nread = yield stream.read_async ((char *)buf + total, 256*1024 - total, Priority.DEFAULT_IDLE, cancellable);
- total += nread;
- if (cancellable.is_cancelled ())
- return 0;
- if (nread == 0)
- break;
- }
- buf[total] = 0;
- return parse (ref buf);
+ string buf = yield curlwrapper.http_get (uri);
+ return parse (buf);
} catch (Error e) {
stderr.printf ("Error: %s\n", e.message);
}