--- /dev/null
+
+import re
+
+from HTMLParser import HTMLParser
+
class StanTimetableParser(HTMLParser):
    """Event-driven parser extracting navigation links and timetable rows.

    After feeding a page, ``self.result`` holds:

    * ``result['navigation']`` -- a dict that may contain ``'prev'`` and
      ``'next'``: the ``href`` of the first link found inside
      ``<div class="linehourPrev">`` / ``<div class="linehourNext">``,
      themselves looked for after a ``<div class="goatResultTop">``.
    * ``result['timetable']`` -- one list of strings per non-empty ``<tr>``
      of ``<table id="linehour">``.  A row collects, in document order, the
      text of each ``<strong>`` (cut at the first ``(``) and the text of
      each ``<td>`` whose class attribute contains ``hour``.

    The parser is a small state machine; ``self.state`` is ``None`` or one
    of the ``TT_*`` constants below.
    """

    # State names.  They are class attributes so that they are built once;
    # ``self.TT_*`` lookups keep working exactly as before.
    TT_CAPTURING_TIMETABLE = 'TT_CAPTURING_TIMETABLE'
    TT_CAPTURING_NAVIGATION = 'TT_CAPTURING_NAVIGATION'
    TT_STOP_HOUR = 'TT_STOP_HOUR'
    TT_STOP_NAME = 'TT_STOP_NAME'
    TT_NAVIG_PREV = 'TT_NAVIG_PREV'
    TT_NAVIG_NEXT = 'TT_NAVIG_NEXT'
    # Not used by this class; kept so existing references do not break.
    TT_NAVIG_PREV_LINK = 'TT_NAVIG_PREV_LINK'
    TT_NAVIG_NEXT_LINK = 'TT_NAVIG_NEXT_LINK'

    # CSS class of a navigation <div> -> state waiting for its link.
    _NAV_DIV_STATES = {
        'linehourPrev': TT_NAVIG_PREV,
        'linehourNext': TT_NAVIG_NEXT,
    }
    # Link-waiting state -> key written into result['navigation'].
    _NAV_RESULT_KEYS = {
        TT_NAVIG_PREV: 'prev',
        TT_NAVIG_NEXT: 'next',
    }

    def __init__(self):
        # Explicit base call (not super()): the Python 2 HTMLParser is an
        # old-style class, and this form works on Python 3 as well.
        HTMLParser.__init__(self)

        self.result = {
            'navigation': {},
            'timetable': [],
        }

        self.current_tt_line = None  # cells of the <tr> being read, if any
        self.state = None
        self.last_tag = None  # name of the most recent start tag
        # Number of <div> elements currently open since entering a
        # linehourPrev/linehourNext div (that div included).
        self._nav_div_depth = 0

    def handle_starttag(self, tag, attrs):
        """Advance the state machine on an opening tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser; when a name is repeated the last value wins.
        """
        self.last_tag = tag

        attributes = dict(attrs)
        css_class = attributes.get('class')

        if tag == 'div' and css_class == 'goatResultTop':
            self.state = self.TT_CAPTURING_NAVIGATION

        elif tag == 'table' and attributes.get('id') == 'linehour':
            self.state = self.TT_CAPTURING_TIMETABLE

        elif self.state == self.TT_CAPTURING_TIMETABLE:
            if tag == 'tr':
                self.current_tt_line = []
            elif tag == 'strong':
                self.state = self.TT_STOP_NAME
            elif tag == 'td' and css_class and 'hour' in css_class:
                self.state = self.TT_STOP_HOUR

        elif self.state == self.TT_CAPTURING_NAVIGATION:
            if tag == 'div' and css_class in self._NAV_DIV_STATES:
                self._enter_nav_div(css_class)

        elif self.state in self._NAV_RESULT_KEYS:
            if tag == 'div':
                if css_class in self._NAV_DIV_STATES:
                    # The previous navigation div held no link (and was
                    # possibly never closed): switch to the new one rather
                    # than filing its link under the wrong key.
                    self._enter_nav_div(css_class)
                else:
                    self._nav_div_depth += 1
            elif tag == 'a' and attributes.get('href') is not None:
                # Anchors without an href are skipped instead of raising
                # KeyError; the first real link is the one recorded.
                key = self._NAV_RESULT_KEYS[self.state]
                self.result['navigation'][key] = attributes['href']
                self.state = self.TT_CAPTURING_NAVIGATION

    def _enter_nav_div(self, css_class):
        """Start waiting for the link of the given navigation div."""
        self.state = self._NAV_DIV_STATES[css_class]
        self._nav_div_depth = 1

    def handle_data(self, data):
        """Store the first text chunk of a stop-name or hour cell."""
        if self.state == self.TT_STOP_HOUR:
            cell = data
        elif self.state == self.TT_STOP_NAME:
            # Drop the parenthesised suffix.  Text starting with '(' would
            # leave nothing, so it is kept whole instead of failing.
            cell = data.split('(', 1)[0] or data
        else:
            return

        # A cell met outside any <tr> has no row to go to; ignore it.
        if self.current_tt_line is not None:
            self.current_tt_line.append(cell)
        self.state = self.TT_CAPTURING_TIMETABLE

    def handle_endtag(self, tag):
        """Close cells, flush finished rows and leave finished sections."""
        # A cell that closes without having produced text (for instance an
        # empty <td class="hour"></td>) must not leave the parser waiting
        # for data: the row would never be flushed and unrelated text would
        # be captured later.  No placeholder is stored for such a cell.
        if self.state == self.TT_STOP_HOUR:
            if tag in ('td', 'tr', 'table'):
                self.state = self.TT_CAPTURING_TIMETABLE
        elif self.state == self.TT_STOP_NAME:
            if tag in ('strong', 'td', 'tr', 'table'):
                self.state = self.TT_CAPTURING_TIMETABLE

        if self.state == self.TT_CAPTURING_TIMETABLE:
            if tag == 'tr':
                if self.current_tt_line:
                    self.result['timetable'].append(self.current_tt_line)
                # Never keep a reference to a row that is already stored.
                self.current_tt_line = None
            elif tag == 'table':
                self.state = None

        elif tag == 'div' and self.state in self._NAV_RESULT_KEYS:
            # The navigation div ended without a link: stop waiting so that
            # a later, unrelated <a> is not recorded.
            self._nav_div_depth -= 1
            if self._nav_div_depth <= 0:
                self._nav_div_depth = 0
                self.state = self.TT_CAPTURING_NAVIGATION