Google backend: use libcurl, parse runtimes
[cinaest] / src / backends / google / google-parser.vala
1 /* This file is part of Cinaest.
2  *
3  * Copyright (C) 2009 Philipp Zabel
4  *
5  * Cinaest is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Cinaest is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with Cinaest. If not, see <http://www.gnu.org/licenses/>.
17  */
18
/**
 * Plain data holder describing one cinema found on a showtimes page.
 *
 * GoogleParser fills an instance per "theater" block and attaches it to
 * every GoogleMovie parsed from that block.
 */
public class Theater {
        /* Text of the theater's heading link. */
        public string name;

        /* Part of the info line before the " - " separator. */
        public string address;

        /* Part of the info line after the " - " separator, with spaces and dashes removed. */
        public string phone;
}
24
/**
 * Plain data holder for one movie entry of a theater's showtimes listing.
 *
 * Instances are created by GoogleParser and handed to the ReceiveMovie
 * callback.
 */
public class GoogleMovie {
        /* Movie title with a trailing " (OmU)" / " (OV)" tag stripped. */
        public string title;

        /* Star rating multiplied by ten; the parser stores -1 when no rating was found. */
        public int rating;

        /* The theater this listing belongs to. */
        public Theater theater;

        /* Running time in seconds (hours * 3600 + minutes * 60). */
        public int runtime;

        /* Age rating text starting at "Rated ", with " - " separators removed. */
        public string fsk;

        /* Showtimes text with non-breaking spaces replaced by commas. */
        public string showtimes;
}
33
/**
 * Downloads a Google Movies showtimes page and extracts the movies it lists.
 *
 * query() fetches the page through CurlWrapper, parse() walks the HTML tree
 * and every movie whose title matches the requested title (prefix match, or
 * glob match when the title contains '*') is passed to the ReceiveMovie
 * callback.
 *
 * The parser prints a trace of what it finds to stdout.
 */
class GoogleParser : Object {
        int movies;
        public string location;
        string _title;
        PatternSpec pattern;
        CurlWrapper curlwrapper;
        Regex re_runtime;

        /**
         * Callback invoked once for every matching movie.
         *
         * @param movie the parsed movie, including its theater
         */
        public delegate void ReceiveMovie (GoogleMovie movie);
        public ReceiveMovie _get_callback;

        construct {
                curlwrapper = new CurlWrapper ();
                try {
                        // Compiled once and reused for every movie info line.
                        re_runtime = new Regex ("([0-9]+)hr ([0-9]+)min");
                } catch (RegexError e) {
                        // re_runtime stays null; parse_movie_info () then skips runtime parsing.
                        critical ("Failed to initialize regex: %s\n", e.message);
                }
        }

        /**
         * Parses //buf// as HTML with libxml2, suppressing parser errors and warnings.
         *
         * @param buf the HTML source
         * @return the parsed document (to be released with delete), or null on failure
         */
        private Html.Doc* get_html_document (string buf) {
                return Html.Doc.read_memory ((char[]) buf, (int) buf.length,
                                             "http://movies.google.de", null,
                                             Html.ParserOption.NOERROR | Html.ParserOption.NOWARNING);
        }

        /**
         * Parses a showtimes page and reports every matching movie to the callback.
         *
         * @param buf the HTML source of the page
         * @return the number of movies reported, 0 if the page could not be
         *         parsed or contains no theater
         */
        public int parse (string buf) throws Error {
                var doc = get_html_document (buf);
                if (doc == null) {
                        stderr.printf ("Error: parsing failed\n");
                        return 0;
                }

                // TODO: set up location
                location = "";
                movies = 0;

                // search_tag_by_class () requires a non-null start node.
                Xml.Node* theater = null;
                if (doc->children != null) {
                        theater = search_tag_by_class (doc->children, "div", "theater");
                }
                if (theater == null) {
                        stderr.printf ("Error: does not contain theater\n");
                }
                while (theater != null) {
                        theater = parse_theater (theater);
                }

                // The document is a raw pointer and must be released manually.
                // All strings stored in Theater / GoogleMovie are copies, so
                // nothing handed to the callback points into the tree.
                delete doc;
                return movies;
        }

        /**
         * Tells whether //n// is a //tag// element whose class attribute is //css_class//.
         *
         * @param n the node to test, may be null
         * @param tag the expected element name
         * @param css_class the expected value of the class attribute
         * @return true only for a non-null node matching both
         */
        private bool is_element (Xml.Node* n, string tag, string css_class) {
                return n != null && n->name == tag && n->get_prop ("class") == css_class;
        }

        /**
         * Parses the movie found by searching from //first// and all its following siblings.
         *
         * @param first non-null node at which the search for the first "movie" div starts
         * @param theater the theater the movies belong to
         */
        private void parse_movies (Xml.Node* first, Theater theater) {
                Xml.Node* movie = search_tag_by_class (first, "div", "movie");
                while (movie != null) {
                        movie = parse_movie (movie, theater);
                }
        }

        /**
         * Parses one theater node: its description and both showtimes columns.
         *
         * Nodes that do not have the expected structure (for example text nodes
         * between theaters) are skipped without side effects.
         *
         * @param t the node to parse, must not be null
         * @return the next sibling of //t//, or null at the end of the list
         */
        private Xml.Node* parse_theater (Xml.Node* t) {
                var theater = new Theater ();
                Xml.Node* desc = t->children;
                if (is_element (desc, "div", "desc")) {
                        Xml.Node* name = desc->children;
                        if (is_element (name, "h2", "name")) {
                                Xml.Node* a = name->children;
                                if (a != null && a->name == "a") {
                                        unowned string? theater_name = get_child_text_content (a);
                                        if (theater_name != null) {
                                                theater.name = theater_name;
                                                print ("THEATER \"%s\"\n", theater.name);
                                        }
                                }
                        }
                        // The info line is expected right after the first child of the description.
                        Xml.Node* info = null;
                        if (name != null) {
                                info = name->next;
                        }
                        if (is_element (info, "div", "info")) {
                                Xml.Node* text = info->children;
                                if (text != null && text->name == "text") {
                                        var address_and_phone = text->content.split (" - ");
                                        if (address_and_phone.length >= 2) {
                                                theater.address = address_and_phone[0];
                                                theater.phone = address_and_phone[1].replace (" ", "").replace ("-", "");
                                        }
                                }
                        }
                }

                // The showtimes block is the sibling following the first child.
                Xml.Node* showtimes = null;
                if (desc != null) {
                        showtimes = desc->next;
                }
                if (is_element (showtimes, "div", "showtimes") && showtimes->children != null) {
                        Xml.Node* left = search_tag_by_class (showtimes->children, "div", "show_left");
                        if (left != null && left->children != null) {
                                print ("LEFT\n");
                                parse_movies (left->children, theater);
                        }

                        // The right column is searched after the left one; without a
                        // left column fall back to the start of the showtimes block.
                        Xml.Node* right_start;
                        if (left != null) {
                                right_start = left->next;
                        } else {
                                right_start = showtimes->children;
                        }
                        Xml.Node* right = null;
                        if (right_start != null) {
                                right = search_tag_by_class (right_start, "div", "show_right");
                        }
                        if (right != null && right->children != null) {
                                print ("RIGHT\n");
                                parse_movies (right->children, theater);
                        }
                }
                return t->next;
        }

        /**
         * Parses one movie node and reports it if its title matches the query.
         *
         * Nodes without a parsable title are skipped.
         *
         * @param m the node to parse, must not be null
         * @param theater the theater the movie is shown in
         * @return the next sibling of //m//, or null at the end of the list
         */
        private Xml.Node* parse_movie (Xml.Node* m, Theater theater) {
                var movie = new GoogleMovie ();
                movie.theater = theater;
                for (Xml.Node* n = m->children; n != null; n = n->next) {
                        if (is_element (n, "div", "name")) {
                                movie.title = parse_movie_name (n);
                        } else if (is_element (n, "span", "info")) {
                                parse_movie_info (n, movie);
                        } else if (is_element (n, "div", "times")) {
                                parse_movie_times (n, movie);
                        }
                }

                // Siblings of movie nodes (e.g. text nodes) yield no title.
                if (movie.title == null) {
                        return m->next;
                }

                bool matches;
                if (pattern != null) {
                        matches = pattern.match ((uint) movie.title.length, movie.title, null);
                } else {
                        // Without a query title (parse () called directly) every movie matches.
                        matches = _title == null || movie.title.has_prefix (_title);
                }
                if (!matches) {
                        return m->next;
                }

                if (_get_callback != null) {
                        _get_callback (movie);
                }
                movies++;
                return m->next;
        }

        /**
         * Extracts the movie title from a "name" div.
         *
         * @param n the "name" div, must not be null
         * @return the title with language tags stripped, or null if the div
         *         does not start with a link containing text
         */
        private string? parse_movie_name (Xml.Node* n) {
                Xml.Node* a = n->children;
                if (a == null || a->name != "a") {
                        return null;
                }
                Xml.Node* text = a->children;
                if (text == null || text->name != "text") {
                        return null;
                }
                print ("\"%s\"\n", text->content);
                return strip_tags (text->content);
        }

        /**
         * Removes a trailing " (OmU)" or " (OV)" tag from a title.
         *
         * @param title the title as listed
         * @return a copy of the title without the tag
         */
        // FIXME - this is specific for Germany
        private string strip_tags (string title) {
                // " (OmU)": original audio with subtitles, " (OV)": original audio
                string[] tag_suffixes = { " (OmU)", " (OV)" };
                foreach (string tag_suffix in tag_suffixes) {
                        if (title.has_suffix (tag_suffix)) {
                                return title.substring (0, title.length - tag_suffix.length);
                        }
                }
                return title.dup ();
        }

        /**
         * Fills runtime, age rating and star rating of //movie// from an "info" span.
         *
         * The runtime and age rating come from the leading text node, the star
         * rating from the first following "nobr" sibling. A star rating of 0 is
         * stored as -1.
         *
         * @param i the "info" span, must not be null
         * @param movie the movie to update
         */
        private void parse_movie_info (Xml.Node* i, GoogleMovie movie) {
                Xml.Node* text = i->children;
                if (text == null) {
                        return;
                }
                if (text->name == "text") {
                        MatchInfo match;
                        print ("\t\"%s\"\n", text->content);
                        if (re_runtime != null && re_runtime.match (text->content, 0, out match)) {
                                movie.runtime = match.fetch (1).to_int () * 3600 +
                                                match.fetch (2).to_int () * 60;
                        }
                        // Not every info line carries an age rating.
                        unowned string? rated = text->content.str ("Rated ");
                        if (rated != null) {
                                movie.fsk = rated.replace (" - ", "");
                        }
                }
                for (Xml.Node* n = text->next; n != null; n = n->next) {
                        if (n->name == "nobr") {
                                movie.rating = parse_rating (n);
                                if (movie.rating == 0) {
                                        movie.rating = -1;
                                }
                                break;
                        }
                }
        }

        /**
         * Reads the star rating from the images nested in a "nobr" element.
         *
         * @param nobr the outer "nobr" element, must not be null
         * @return ten times the rating parsed from the first usable image
         *         alt text, or 0 if there is none
         */
        private int parse_rating (Xml.Node* nobr) {
                for (Xml.Node* n = nobr->children; n != null; n = n->next) {
                        if (n->name != "nobr") {
                                continue;
                        }
                        for (Xml.Node* img = n->children; img != null; img = img->next) {
                                if (img->name != "img") {
                                        continue;
                                }
                                var alt = img->get_prop ("alt"); // "Rated 0.0 out of 5.0"
                                                                 //        ^ offset 6
                                // Skip images whose alt text is missing or too short
                                // to contain a number at offset 6.
                                if (alt != null && alt.length > 6) {
                                        double stars = alt.offset (6).to_double ();
                                        print ("\trating: %s - %f\n", alt, stars);
                                        return (int) (10 * stars);
                                }
                        }
                }
                return 0;
        }

        /**
         * Stores the showtimes text of a "times" div in //movie//.
         *
         * Non-breaking spaces are replaced by commas. The field is left
         * untouched when the div does not start with a text node.
         *
         * @param node the "times" div, must not be null
         * @param movie the movie to update
         */
        private void parse_movie_times (Xml.Node* node, GoogleMovie movie) {
                unowned string? times = get_child_text_content (node);
                if (times != null) {
                        movie.showtimes = times.replace ("\xc2\xa0", ","); // U+00A0 = &nbsp;
                }
        }

        /**
         * Returns the content of the first child of //n// if that child is a text node.
         *
         * @param n the parent node, must not be null
         * @return the text owned by the document tree, or null
         */
        unowned string? get_child_text_content (Xml.Node *n) {
                if (n->children != null && n->children->name == "text") {
                        return n->children->content;
                }
                return null;
        }

        /**
         * Depth-first search for an element with a given attribute value.
         *
         * The search covers //node//, its following siblings and all of their
         * descendants.
         *
         * @param node the node to start at, must not be null
         * @param tag the element name to look for
         * @param prop the attribute name
         * @param val the required attribute value
         * @return the first matching node, or null
         */
        Xml.Node* search_tag_by_property (Xml.Node* node, string tag, string prop, string val) requires (node != null) {
                for (Xml.Node* n = node; n != null; n = n->next) {
                        if (n->name == tag && n->get_prop (prop) == val) {
                                return n;
                        }
                        if (n->children != null) {
                                Xml.Node* found = search_tag_by_property (n->children, tag, prop, val);
                                if (found != null) {
                                        return found;
                                }
                        }
                }
                return null;
        }

        /**
         * Shorthand for search_tag_by_property () on the "class" attribute.
         *
         * @param node the node to start at, must not be null
         * @param tag the element name to look for
         * @param class the required class attribute value
         * @return the first matching node, or null
         */
        Xml.Node* search_tag_by_class (Xml.Node* node, string tag, string @class) requires (node != null) {
                return search_tag_by_property (node, tag, "class", @class);
        }

        /**
         * Fetches the showtimes page and reports all movies matching //title//.
         *
         * A title containing '*' is used as a glob pattern, any other title as
         * a prefix. Errors are printed to stderr and result in a return value of 0.
         *
         * @param title the title prefix or glob pattern to match
         * @param location optional location appended to the request as "near" parameter
         * @param callback called once for every matching movie
         * @param cancellable currently unused
         * @return the number of movies reported
         */
        public async int query (string title, string? location, ReceiveMovie callback, Cancellable? cancellable = null) {
                _get_callback = callback;
                _title = title;
                // -1 scans the whole NUL-terminated string regardless of
                // whether string.length counts characters or bytes.
                if (title.chr (-1, '*') != null) {
                        pattern = new PatternSpec (title);
                } else {
                        pattern = null;
                }
                try {
                        // TODO - use google.de in Germany, also provides genres
                        string uri = "http://google.com/movies";
                        if (location != null && location != "") {
                                // NOTE(review): location is appended unescaped; confirm whether
                                // callers or CurlWrapper already URI-escape it.
                                uri += "?near=" + location;
                        }

                        stdout.printf ("GET: %s\n", uri);

                        string buf = yield curlwrapper.http_get (uri);
                        return parse (buf);
                } catch (Error e) {
                        stderr.printf ("Error: %s\n", e.message);
                }

                return 0;
        }
}