Google backend: replace parser with a libxml-2.0 based one
[cinaest] / src / backends / google / google-parser.vala
1 /* This file is part of Cinaest.
2  *
3  * Copyright (C) 2009 Philipp Zabel
4  *
5  * Cinaest is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Cinaest is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with Cinaest. If not, see <http://www.gnu.org/licenses/>.
17  */
18
/**
 * A cinema as listed on a showtimes page.
 *
 * Plain data holder; every field stays unset (null) until a parser fills it.
 */
public class Theater {
        /** Display name of the theater. */
        public string name;

        /** Postal address of the theater. */
        public string address;

        /** Phone number of the theater. */
        public string phone;
}
24
/**
 * One movie entry of a theater's showtimes listing.
 *
 * Plain data holder; fields keep their default value (null / 0) until a
 * parser fills them.
 */
public class GoogleMovie {
        /** Movie title. */
        public string title;

        /** Numeric rating of the movie. */
        public int rating;

        /** The theater that shows this movie. */
        public Theater theater;

        /** Running time; not assigned anywhere in this file. */
        public int runtime;

        /** Age rating (FSK); not assigned anywhere in this file. */
        public string fsk;

        /** Textual list of showtimes. */
        public string showtimes;
}
33
/**
 * Downloads a Google Movies showtimes page, parses it with the libxml2 HTML
 * parser and reports every movie whose title matches the current filter
 * through a callback.
 */
class GoogleParser : Object {
        // Size of the download buffer, including one byte reserved for the
        // terminating NUL.
        private const int BUFFER_SIZE = 256 * 1024;

        // Number of movies reported during the current parse () run.
        int movies;
        public string location;
        // Title filter: used as a plain prefix when 'pattern' is null.
        string _title;
        // Glob pattern built from the title when it contains a '*', else null.
        PatternSpec pattern;

        public delegate void ReceiveMovie (GoogleMovie movie);
        public ReceiveMovie _get_callback;

        /**
         * Runs the HTML parser over buf with errors and warnings silenced.
         *
         * @return the parsed document (to be freed by the caller) or null
         */
        private Html.Doc* get_html_document (ref char[] buf) {
                // NOTE(review): the full array length is handed to the parser,
                // including any unused tail after the downloaded data - confirm
                // that this is intended.
                return Html.Doc.read_memory (buf, (int) buf.length,
                                             "http://movies.google.de", null, Html.ParserOption.NOERROR | Html.ParserOption.NOWARNING);
        }

        /**
         * Parses a showtimes page and invokes the callback for every movie
         * that passes the title filter.
         *
         * @param buf raw HTML of the page
         * @return number of matching movies, 0 if the page could not be
         *         parsed or contains no theater
         */
        public int parse (ref char[] buf) throws Error {
                movies = 0;

                var doc = get_html_document (ref buf);
                if (doc == null) {
                        stderr.printf ("Error: parsing failed\n");
                        return 0;
                }

                // TODO: set up location
                location = "";

                Xml.Node* theater = null;
                if (doc->children != null) {
                        theater = search_tag_by_class (doc->children, "div", "theater");
                }
                if (theater == null) {
                        stderr.printf ("Error: does not contain theater\n");
                }
                // Walk the first theater and all of its following siblings.
                while (theater != null) {
                        theater = parse_theater (theater);
                }

                // All strings handed out so far are copies, so the tree can go.
                delete doc;
                return movies;
        }

        /**
         * Tells whether n is a non-null node with the given tag name and
         * "class" attribute.
         */
        private bool is_element (Xml.Node* n, string tag, string @class) {
                return n != null && n->name == tag && n->get_prop ("class") == @class;
        }

        /**
         * Parses one theater node: its description (first child) and its
         * showtimes (second child).
         *
         * @return the next sibling of t, or null at the end of the list
         */
        private Xml.Node* parse_theater (Xml.Node* t) {
                var theater = new Theater ();
                var desc = t->children;
                if (desc == null) {
                        // Childless sibling (e.g. a text node): nothing to parse.
                        return t->next;
                }
                if (is_element (desc, "div", "desc")) {
                        parse_theater_description (desc, theater);
                }
                var showtimes = desc->next;
                if (is_element (showtimes, "div", "showtimes")) {
                        parse_showtimes (showtimes, theater);
                }
                return t->next;
        }

        /**
         * Fills name, address and phone of theater from a div.desc node.
         */
        private void parse_theater_description (Xml.Node* desc, Theater theater) {
                var name = desc->children;
                if (name == null) {
                        return;
                }
                if (is_element (name, "h2", "name")) {
                        var a = name->children;
                        if (a != null && a->name == "a") {
                                theater.name = get_child_text_content (a);
                                if (theater.name != null) {
                                        print ("THEATER \"%s\"\n", theater.name);
                                }
                        }
                }
                var info = name->next;
                if (is_element (info, "div", "info")) {
                        var text = info->children;
                        if (text != null && text->name == "text" && text->content != null) {
                                // The info text has the form "<address> - <phone>".
                                var address_and_phone = text->content.split (" - ");
                                if (address_and_phone.length >= 2) {
                                        theater.address = address_and_phone[0];
                                        theater.phone = address_and_phone[1].replace (" ", "").replace ("-", "");
                                }
                        }
                }
        }

        /**
         * Parses the left and right movie columns of a div.showtimes node.
         */
        private void parse_showtimes (Xml.Node* showtimes, Theater theater) {
                if (showtimes->children == null) {
                        return;
                }
                var left = search_tag_by_class (showtimes->children, "div", "show_left");
                if (left != null) {
                        parse_movie_column (left, "LEFT", theater);
                }
                // Look for the right column after the left one; without a left
                // column fall back to searching all children.
                Xml.Node* start = (left != null) ? left->next : showtimes->children;
                if (start != null) {
                        var right = search_tag_by_class (start, "div", "show_right");
                        if (right != null) {
                                parse_movie_column (right, "RIGHT", theater);
                        }
                }
        }

        /**
         * Parses every movie found in one showtimes column.
         *
         * @param label text printed as a progress marker for this column
         */
        private void parse_movie_column (Xml.Node* column, string label, Theater theater) {
                if (column->children == null) {
                        return;
                }
                print ("%s\n", label);
                var movie = search_tag_by_class (column->children, "div", "movie");
                while (movie != null) {
                        movie = parse_movie (movie, theater);
                }
        }

        /**
         * Parses one movie node and reports it through the callback if its
         * title passes the filter.
         *
         * @return the next sibling of m, or null at the end of the list
         */
        private Xml.Node* parse_movie (Xml.Node* m, Theater theater) {
                var movie = new GoogleMovie ();
                movie.theater = theater;
                for (Xml.Node* n = m->children; n != null; n = n->next) {
                        if (is_element (n, "div", "name")) {
                                movie.title = parse_movie_name (n);
                        } else if (is_element (n, "span", "info")) {
                                parse_movie_info (n, movie);
                        } else if (is_element (n, "div", "times")) {
                                parse_movie_times (n, movie);
                        }
                }

                // Siblings that are not movie entries yield no title; skip them.
                if (movie.title == null) {
                        return m->next;
                }

                bool matches;
                if (pattern == null) {
                        // No filter set at all means every title is accepted.
                        matches = (_title == null) || movie.title.has_prefix (_title);
                } else {
                        matches = pattern.match ((uint) movie.title.length, movie.title, null);
                }
                if (matches) {
                        if (_get_callback != null) {
                                _get_callback (movie);
                        }
                        movies++;
                }
                return m->next;
        }

        /**
         * Extracts the movie title from a div.name node.
         *
         * @return the title without language tags, or null if the node does
         *         not have the expected a/text structure
         */
        private string? parse_movie_name (Xml.Node* n) {
                var a = n->children;
                if (a != null && a->name == "a") {
                        var text = a->children;
                        if (text != null && text->name == "text" && text->content != null) {
                                print ("\"%s\"\n", text->content);
                                return strip_tags (text->content);
                        }
                }
                return null;
        }

        // FIXME - this is specific for Germany
        /**
         * Removes a trailing " (OmU)" or " (OV)" tag from a title.
         *
         * @return a copy of title without the tag
         */
        private string strip_tags (string title) {
                // " (OmU)": original audio with subtitles, " (OV)": original audio
                string[] tag_suffixes = { " (OmU)", " (OV)" };
                foreach (var tag_suffix in tag_suffixes) {
                        if (title.has_suffix (tag_suffix)) {
                                return title.substring (0, title.length - tag_suffix.length);
                        }
                }
                return title.dup ();
        }

        /**
         * Reads the rating from a span.info node into movie.rating; a rating
         * of 0 is stored as -1.
         */
        private void parse_movie_info (Xml.Node* i, GoogleMovie movie) {
                var text = i->children;
                if (text == null) {
                        return;
                }
                if (text->name == "text" && text->content != null) {
                        print ("\t\"%s\"\n", text->content);
                        //      movie.runtime
                }
                for (var n = text->next; n != null; n = n->next) {
                        if (n->name == "nobr") {
                                movie.rating = parse_rating (n);
                                if (movie.rating == 0) {
                                        movie.rating = -1;
                                }
                                break;
                        }
                }
        }

        /**
         * Extracts the rating from the first img with a usable "alt" text
         * found below a nested nobr element.
         *
         * @return ten times the number that follows the first six characters
         *         of the alt text, or 0 if no rating was found
         */
        private int parse_rating (Xml.Node* nobr) {
                for (var n = nobr->children; n != null; n = n->next) {
                        if (n->name != "nobr") {
                                continue;
                        }
                        for (var img = n->children; img != null; img = img->next) {
                                if (img->name != "img") {
                                        continue;
                                }
                                var alt = img->get_prop ("alt"); // "Rated 0.0 out of 5.0"
                                // The number starts at offset 6; shorter texts
                                // carry no rating and must not be read past.
                                if (alt != null && alt.length > 6) {
                                        return (int) (10 * alt.offset (6).to_double ());
                                }
                        }
                }
                return 0;
        }

        /**
         * Stores the text of node as movie.showtimes, with non-breaking
         * spaces replaced by commas. Leaves the field alone if node has no
         * text.
         */
        private void parse_movie_times (Xml.Node* node, GoogleMovie movie) {
                unowned string? times = get_child_text_content (node);
                if (times != null) {
                        movie.showtimes = times.replace ("\xc2\xa0", ","); // U+00A0 = &nbsp;
                }
        }

        /**
         * Returns the content of the first child of n if that child is a
         * text node, null otherwise. The string belongs to the document.
         */
        unowned string? get_child_text_content (Xml.Node *n) {
                if (n->children != null && n->children->name == "text") {
                        return n->children->content;
                } else {
                        return null;
                }
        }

        /**
         * Depth-first search through node, its following siblings and all of
         * their descendants for an element named tag whose attribute prop
         * equals val.
         *
         * @return the first match or null
         */
        Xml.Node* search_tag_by_property (Xml.Node* node, string tag, string prop, string val) requires (node != null) {
                for (var n = node; n != null; n = n->next) {
                        if (n->name == tag && n->get_prop (prop) == val) {
                                return n;
                        }
                        if (n->children != null) {
                                var found = search_tag_by_property (n->children, tag, prop, val);
                                if (found != null) {
                                        return found;
                                }
                        }
                }
                return null;
        }

        /**
         * Like search_tag_by_property (), matching on the "class" attribute.
         */
        Xml.Node* search_tag_by_class (Xml.Node* node, string tag, string @class) requires (node != null) {
                return search_tag_by_property (node, tag, "class", @class);
        }

        /**
         * Downloads the showtimes page and reports matching movies.
         *
         * @param title title filter; a glob pattern if it contains '*',
         *        otherwise a prefix
         * @param location optional place appended to the request as "near"
         * @param callback invoked once per matching movie
         * @param cancellable optional object to abort the download
         * @return number of matching movies, 0 on error or cancellation
         */
        public async int query (string title, string? location, ReceiveMovie callback, Cancellable? cancellable = null) {
                _get_callback = callback;
                _title = title;
                if (title.chr(title.length, '*') != null) {
                        pattern = new PatternSpec (title);
                } else {
                        pattern = null;
                }
                try {
                        // TODO - use google.de in Germany, also provides genres
                        string uri = "http://google.com/movies";
                        // NOTE(review): location is appended without URI escaping -
                        // confirm whether callers pass an already escaped value.
                        if (location != null && location != "") {
                                uri += "?near=" + location;
                        }

                        stdout.printf ("GET: %s\n", uri);

                        File file = File.new_for_uri (uri);
                        InputStream stream = yield file.read_async (Priority.DEFAULT_IDLE, cancellable);

                        char[] buf = new char[BUFFER_SIZE];
                        // Keep the last byte free for the terminating NUL.
                        size_t capacity = BUFFER_SIZE - 1;
                        size_t nread;
                        size_t total = 0;
                        while (total < capacity) {
                                nread = yield stream.read_async ((char *)buf + total, capacity - total, Priority.DEFAULT_IDLE, cancellable);
                                total += nread;
                                if (cancellable != null && cancellable.is_cancelled ()) {
                                        return 0;
                                }
                                if (nread == 0) {
                                        break;
                                }
                        }
                        buf[total] = 0;
                        return parse (ref buf);
                } catch (Error e) {
                        stderr.printf ("Error: %s\n", e.message);
                }

                return 0;
        }
}