4 from HTMLParser import HTMLParser
6 class StanTimetableParser(HTMLParser):
9 HTMLParser.__init__(self)
16 self.current_tt_line = None
18 self.TT_CAPTURING_TIMETABLE = 'TT_CAPTURING_TIMETABLE'
19 self.TT_CAPTURING_NAVIGATION = 'TT_CAPTURING_NAVIGATION'
20 self.TT_STOP_HOUR = 'TT_STOP_HOUR'
21 self.TT_STOP_NAME = 'TT_STOP_NAME'
22 self.TT_NAVIG_PREV = 'TT_NAVIG_PREV'
23 self.TT_NAVIG_NEXT = 'TT_NAVIG_NEXT'
24 self.TT_NAVIG_PREV_LINK = 'TT_NAVIG_PREV_LINK'
25 self.TT_NAVIG_NEXT_LINK = 'TT_NAVIG_NEXT_LINK'
27 def handle_starttag(self, tag, attrs):
# Start-tag callback: moves the parser's state machine forward based on the
# tag name, its attributes and the current value of self.state.
#
# NOTE(review): the statements that create `attributes` and iterate over
# `attrs` are not visible in this listing. The line below stores, for each
# `attr`, its first element as the key and its second element as the value.
32 attributes[attr[0]] = attr[1]
# A <div> whose class is exactly 'goatResultTop' switches to navigation capture.
# NOTE(review): `has_key` only exists on Python 2 dicts -- presumably
# `attributes` is a plain dict; confirm before porting to Python 3.
34 if tag == 'div' and attributes.has_key('class') and attributes['class'] == 'goatResultTop':
35 self.state = self.TT_CAPTURING_NAVIGATION
# A <table> whose id is exactly 'linehour' switches to timetable capture.
37 elif tag == 'table' and attributes.has_key('id') and attributes['id'] == 'linehour':
38 self.state = self.TT_CAPTURING_TIMETABLE
# While capturing the timetable, start tags either reset the current row
# buffer or select the state used for the next cell.
# NOTE(review): the conditions guarding the two assignments below
# (resetting current_tt_line, selecting TT_STOP_NAME) are not visible in
# this listing.
40 elif self.state == self.TT_CAPTURING_TIMETABLE:
42 self.current_tt_line = []
44 self.state = self.TT_STOP_NAME
# Substring test: any <td> whose class attribute contains 'hour' selects
# the TT_STOP_HOUR state.
45 elif tag == 'td' and attributes.has_key('class') and 'hour' in attributes['class']:
46 self.state = self.TT_STOP_HOUR
# While capturing navigation, the 'linehourPrev' / 'linehourNext' divs
# select which link the next <a> tag will be recorded as.
48 elif self.state == self.TT_CAPTURING_NAVIGATION:
49 if tag == 'div' and attributes.has_key('class') and attributes['class'] == 'linehourPrev':
50 self.state = self.TT_NAVIG_PREV
51 elif tag == 'div' and attributes.has_key('class') and attributes['class'] == 'linehourNext':
52 self.state = self.TT_NAVIG_NEXT
# An <a> seen in the PREV state: store its href as the 'prev' navigation
# link, then return to navigation capture. The href is read without a
# presence check (presumably a KeyError if the anchor has no href).
54 elif self.state == self.TT_NAVIG_PREV and tag == 'a':
55 self.result['navigation']['prev'] = attributes['href']
56 self.state = self.TT_CAPTURING_NAVIGATION
# Same handling for the NEXT state, stored under the 'next' key.
58 elif self.state == self.TT_NAVIG_NEXT and tag == 'a':
59 self.result['navigation']['next'] = attributes['href']
60 self.state = self.TT_CAPTURING_NAVIGATION
def handle_data(self, data):
    """Record the text of the timetable cell that is currently being read.

    Text is only consumed when the parser is in one of the two cell states:

    * ``TT_STOP_HOUR`` -- ``data`` is appended to the current row unchanged.
    * ``TT_STOP_NAME`` -- only the part of ``data`` that precedes the first
      ``(`` is appended, which drops a trailing parenthesised remark.

    In both cases the state then goes back to ``TT_CAPTURING_TIMETABLE``.
    In any other state the text is ignored.

    :param data: raw text chunk passed to this callback.
    :return: ``None``; the result is stored in ``self.current_tt_line``.
    """
    if self.state == self.TT_STOP_HOUR:
        cell = data
    elif self.state == self.TT_STOP_NAME:
        # Equivalent to matching '^[^(]+' for every text that has at least
        # one character before "(". Unlike the regex it does not fail when
        # the text is empty or starts with "(": the name is then recorded as
        # an empty string, so the row keeps its column layout.
        cell = data.partition('(')[0]
    else:
        return

    self.current_tt_line.append(cell)
    self.state = self.TT_CAPTURING_TIMETABLE
77 def handle_endtag(self, tag):
78 if tag == 'tr' and self.state == self.TT_CAPTURING_TIMETABLE and self.current_tt_line is not None and len(self.current_tt_line) > 0:
79 self.result['timetable'].append(self.current_tt_line)
81 elif tag == 'table' and self.state == self.TT_CAPTURING_TIMETABLE: