base version
[pystan] / src / pystan / lib / timetable_parser.py
diff --git a/src/pystan/lib/timetable_parser.py b/src/pystan/lib/timetable_parser.py
new file mode 100644 (file)
index 0000000..95fff2d
--- /dev/null
@@ -0,0 +1,91 @@
+
+import re
+
+from HTMLParser import HTMLParser
+
class StanTimetableParser(HTMLParser):
    """Event-driven scraper for a timetable HTML page.

    Feed the page with ``feed()``; the extracted data accumulates in
    ``self.result``, a dict with three keys:

    * ``'navigation'`` -- dict holding the ``'prev'`` / ``'next'`` hrefs
      found inside the ``div.goatResultTop`` block,
    * ``'timetable'`` -- list of rows, one list of strings per ``<tr>`` of
      ``table#linehour`` (stop name first when a ``<strong>`` is present,
      then the text of every ``td`` whose class contains ``hour``),
    * ``'stations'`` -- hrefs of the links that directly follow a ``td``
      whose class contains ``plan``.

    The parser is a small state machine; ``self.state`` is either ``None``
    or one of the ``TT_*`` constants below.  Markup that lacks an expected
    attribute is skipped rather than raising.
    """

    # State names.  They are class attributes, so they stay reachable both
    # as ``self.TT_*`` and as ``StanTimetableParser.TT_*``.
    TT_CAPTURING_TIMETABLE = 'TT_CAPTURING_TIMETABLE'
    TT_CAPTURING_NAVIGATION = 'TT_CAPTURING_NAVIGATION'
    TT_STOP_HOUR = 'TT_STOP_HOUR'
    TT_STOP_NAME = 'TT_STOP_NAME'
    TT_NAVIG_PREV = 'TT_NAVIG_PREV'
    TT_NAVIG_NEXT = 'TT_NAVIG_NEXT'
    TT_NAVIG_PREV_LINK = 'TT_NAVIG_PREV_LINK'
    TT_NAVIG_NEXT_LINK = 'TT_NAVIG_NEXT_LINK'

    # Every state that means "we are somewhere inside the navigation block".
    _NAVIGATION_STATES = (
        TT_CAPTURING_NAVIGATION,
        TT_NAVIG_PREV,
        TT_NAVIG_NEXT,
    )

    # CSS class of a navigation <div> -> state entered when it opens.
    _NAVIGATION_DIRECTIONS = {
        'linehourPrev': TT_NAVIG_PREV,
        'linehourNext': TT_NAVIG_NEXT,
    }

    # Leading part of a stop name, up to (not including) the first "(".
    _NAME_BEFORE_PAREN = re.compile(r'^[^(]+')

    def __init__(self):
        HTMLParser.__init__(self)

        self.result = {
            'navigation': {},
            'timetable': [],
            'stations': [],
        }

        # Row currently being filled; None until the first row starts.
        self.current_tt_line = None
        self.state = None
        # [tag, attributes] of the most recent start tag.  Initialised here
        # so that it can always be unpacked safely.
        self.last_tag = [None, {}]

    def handle_starttag(self, tag, attrs):
        """Update the state machine for an opening tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; when a name is repeated the last value wins.
        """
        attributes = dict(attrs)
        # A missing or valueless ``class`` attribute is treated as empty.
        css_class = attributes.get('class') or ''

        if tag == 'div' and css_class == 'goatResultTop':
            self.state = self.TT_CAPTURING_NAVIGATION
        elif tag == 'table' and attributes.get('id') == 'linehour':
            self.state = self.TT_CAPTURING_TIMETABLE
        elif self.state == self.TT_CAPTURING_TIMETABLE:
            self._handle_timetable_tag(tag, attributes, css_class)
        elif self.state in self._NAVIGATION_STATES:
            self._handle_navigation_tag(tag, attributes, css_class)

        self.last_tag = [tag, attributes]

    def _handle_timetable_tag(self, tag, attributes, css_class):
        """Process a start tag seen while scanning ``table#linehour``."""
        if tag == 'tr':
            self.current_tt_line = []
        elif tag == 'strong':
            self.state = self.TT_STOP_NAME
        elif tag == 'a' and self._follows_plan_cell():
            href = attributes.get('href')
            # Anchors without a target carry no station link.
            if href is not None:
                self.result['stations'].append(href)
        elif tag == 'td' and 'hour' in css_class:
            self.state = self.TT_STOP_HOUR

    def _handle_navigation_tag(self, tag, attributes, css_class):
        """Process a start tag seen inside the navigation block."""
        if tag == 'div' and css_class in self._NAVIGATION_DIRECTIONS:
            # Also honoured while still waiting for a link, so a direction
            # div without any link cannot steal the link of the next one.
            self.state = self._NAVIGATION_DIRECTIONS[css_class]
        elif tag == 'a' and self.state != self.TT_CAPTURING_NAVIGATION:
            href = attributes.get('href')
            if href is None:
                # Not a real link: keep waiting for one.
                return
            if self.state == self.TT_NAVIG_PREV:
                direction = 'prev'
            else:
                direction = 'next'
            self.result['navigation'][direction] = href
            self.state = self.TT_CAPTURING_NAVIGATION

    def _follows_plan_cell(self):
        """Return True if the previous start tag was a ``td`` of class ``plan``."""
        previous_tag, previous_attributes = self.last_tag
        if previous_tag != 'td':
            return False
        return 'plan' in (previous_attributes.get('class') or '')

    def handle_data(self, data):
        """Capture the text of an hour cell or of a stop name."""
        if self.state == self.TT_STOP_HOUR:
            self._append_cell(data)

        elif self.state == self.TT_STOP_NAME:
            # Drop everything from the first "(" onwards.  A name that
            # starts with "(" has no such prefix and is kept untouched.
            prefix = self._NAME_BEFORE_PAREN.match(data)
            if prefix is not None:
                data = prefix.group()

            # Double names ("A - B") are stored one per line.
            self._append_cell(data.replace(" - ", "\n"))

    def _append_cell(self, text):
        """Add ``text`` to the current row and resume scanning the table."""
        if self.current_tt_line is None:
            # Cell text arrived before any <tr>: start a row implicitly.
            self.current_tt_line = []
        self.current_tt_line.append(text)
        self.state = self.TT_CAPTURING_TIMETABLE

    def handle_endtag(self, tag):
        """Store a finished row on ``</tr>``; stop scanning on ``</table>``."""
        if self.state != self.TT_CAPTURING_TIMETABLE:
            return

        if tag == 'tr':
            # Rows that collected no cell are not recorded.
            if self.current_tt_line:
                self.result['timetable'].append(self.current_tt_line)
        elif tag == 'table':
            self.state = None