4 from HTMLParser import HTMLParser
6 class StanTimetableParser(HTMLParser):
# Parser set-up statements.
# NOTE(review): the enclosing `def` line and some neighbouring statements are
# not visible in this excerpt; only what is shown below is documented.
# Initialise the HTMLParser base class.
9 HTMLParser.__init__(self)
# Cells of the timetable row being collected; None until handle_starttag
# replaces it with a fresh list.
17 self.current_tt_line = None
# Names of the values stored in self.state by the handlers:
#   TT_CAPTURING_TIMETABLE  - set after <table id="linehour"> is seen
#   TT_CAPTURING_NAVIGATION - set after <div class="goatResultTop"> is seen
#   TT_STOP_HOUR            - set on a <td> whose class contains 'hour'
#   TT_STOP_NAME            - the next text chunk is stored as a stop name
#   TT_NAVIG_PREV           - set on <div class="linehourPrev">
#   TT_NAVIG_NEXT           - set on <div class="linehourNext">
19 self.TT_CAPTURING_TIMETABLE = 'TT_CAPTURING_TIMETABLE'
20 self.TT_CAPTURING_NAVIGATION = 'TT_CAPTURING_NAVIGATION'
21 self.TT_STOP_HOUR = 'TT_STOP_HOUR'
22 self.TT_STOP_NAME = 'TT_STOP_NAME'
23 self.TT_NAVIG_PREV = 'TT_NAVIG_PREV'
24 self.TT_NAVIG_NEXT = 'TT_NAVIG_NEXT'
# NOTE(review): the two *_LINK states are not referenced by any handler code
# visible in this excerpt - confirm whether they are still needed.
25 self.TT_NAVIG_PREV_LINK = 'TT_NAVIG_PREV_LINK'
26 self.TT_NAVIG_NEXT_LINK = 'TT_NAVIG_NEXT_LINK'
# Opening-tag handler: moves self.state through the navigation / timetable
# states and records the href of station and prev/next navigation links in
# self.result.
# NOTE(review): some lines of this method (including a few branch
# conditions) are not visible in this excerpt.
# NOTE(review): dict.has_key() exists only in Python 2.
28 def handle_starttag(self, tag, attrs):
# Copy each (name, value) pair from attrs into the attributes mapping.
32 attributes[attr[0]] = attr[1]
# <div class="goatResultTop"> switches to the navigation section.
34 if tag == 'div' and attributes.has_key('class') and attributes['class'] == 'goatResultTop':
35 self.state = self.TT_CAPTURING_NAVIGATION
# <table id="linehour"> switches to the timetable section.
37 elif tag == 'table' and attributes.has_key('id') and attributes['id'] == 'linehour':
38 self.state = self.TT_CAPTURING_TIMETABLE
# Tags seen while inside the timetable.
40 elif self.state == self.TT_CAPTURING_TIMETABLE:
# Start a fresh row buffer (guarding condition not visible here).
42 self.current_tt_line = []
# The next text chunk will be stored as the stop name by handle_data
# (guarding condition not visible here).
44 self.state = self.TT_STOP_NAME
# An <a> directly following a <td> whose class contains 'plan': record its
# href in the stations list.
# NOTE(review): raises KeyError when the previous <td> had no class
# attribute, or when the <a> has no href.
45 elif tag == 'a' and self.last_tag[0] == 'td' and 'plan' in self.last_tag[1]['class']:
46 self.result['stations'].append(attributes['href'])
47 #self.current_tt_line.append(attributes['href'])
# A <td> whose class contains 'hour': the next text chunk is stored as an
# hour cell by handle_data.
48 elif tag == 'td' and attributes.has_key('class') and 'hour' in attributes['class']:
49 self.state = self.TT_STOP_HOUR
# Tags seen while inside the navigation section: pick the prev/next container.
51 elif self.state == self.TT_CAPTURING_NAVIGATION:
52 if tag == 'div' and attributes.has_key('class') and attributes['class'] == 'linehourPrev':
53 self.state = self.TT_NAVIG_PREV
54 elif tag == 'div' and attributes.has_key('class') and attributes['class'] == 'linehourNext':
55 self.state = self.TT_NAVIG_NEXT
# First <a> after the "previous" container: keep its href, then return to
# the navigation state.
57 elif self.state == self.TT_NAVIG_PREV and tag == 'a':
58 self.result['navigation']['prev'] = attributes['href']
59 self.state = self.TT_CAPTURING_NAVIGATION
# First <a> after the "next" container: keep its href, then return to the
# navigation state.
61 elif self.state == self.TT_NAVIG_NEXT and tag == 'a':
62 self.result['navigation']['next'] = attributes['href']
63 self.state = self.TT_CAPTURING_NAVIGATION
# Remember this tag and its attributes so the next call can inspect its
# predecessor (used by the 'plan' link test above).
65 self.last_tag = [ tag, attributes ]
def handle_data(self, data):
    """Store the text of a stop-name or hour cell in the current row.

    The text is consumed only when ``self.state`` is ``TT_STOP_HOUR`` or
    ``TT_STOP_NAME``: it is appended to ``self.current_tt_line`` and the
    state returns to ``TT_CAPTURING_TIMETABLE``.  In any other state the
    text is ignored.

    data -- the text chunk handed over by HTMLParser.
    """
    if self.state == self.TT_STOP_HOUR:
        self.current_tt_line.append(data)
        self.state = self.TT_CAPTURING_TIMETABLE

    elif self.state == self.TT_STOP_NAME:
        # Keep only what precedes the first "(" (drops the parenthesised
        # part).  When the text itself starts with "(" there is nothing in
        # front of it and the pattern does not match; the text is then kept
        # unchanged instead of raising AttributeError on a None match.
        before_paren = re.match(r'[^(]+', data)
        if before_paren is not None:
            data = before_paren.group()

        # " - " separators become line breaks inside the stored name.
        data = data.replace(" - ", "\n")
        self.current_tt_line.append(data)
        self.state = self.TT_CAPTURING_TIMETABLE
86 def handle_endtag(self, tag):
87 if tag == 'tr' and self.state == self.TT_CAPTURING_TIMETABLE and self.current_tt_line is not None and len(self.current_tt_line) > 0:
88 self.result['timetable'].append(self.current_tt_line)
90 elif tag == 'table' and self.state == self.TT_CAPTURING_TIMETABLE: