From 1a9fe0907797ee7a409d0969fd35f7f02da13e79 Mon Sep 17 00:00:00 2001 From: Florian Schweikert Date: Sat, 26 Feb 2011 02:49:07 +0100 Subject: [PATCH] fixed parser crash when overview contains footpath added trip parser --- parseHtml.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/parseHtml.py b/parseHtml.py index aab72b5..94826ab 100644 --- a/parseHtml.py +++ b/parseHtml.py @@ -1,6 +1,7 @@ -from BeautifulSoup import BeautifulSoup +from BeautifulSoup import BeautifulSoup, NavigableString import urllib2 from datetime import time, datetime +from textwrap import wrap class Parser: @@ -16,7 +17,15 @@ class Parser: raise IndexError() def _parse_details(self): - return self.soup.findAll('div', {'class': 'data_table tourdetail'}) + trips = map(lambda x: map(lambda x: { + 'time': map(lambda x: (time(*map(lambda x: int(x), x.split(':')))), wrap(x.find('td', {'class': 'col_time'}).text, 5)), # black magic appears + 'station': map(lambda x: x[2:].strip(), + filter(lambda x: type(x) == NavigableString, x.find('td', {'class': 'col_station'}).contents)), # filter non NaviStrings + 'info': map(lambda x: x.strip(), + filter(lambda x: type(x) == NavigableString, x.find('td', {'class': 'col_info'}).contents)), + }, x.find('tbody').findAll('tr')), + self.soup.findAll('div', {'class': 'data_table tourdetail'})) # all routes + return trips @property def details(self): @@ -39,13 +48,17 @@ class Parser: # get rows rows = table.findAll('tr')[1:] overview = map(lambda x: { - 'date': datetime.strptime(x.find('td', {'class': 'col_date'}).text, '%d.%m.%Y'), # grab date - 'time': map(lambda x: time(*map(lambda x: int(x), x.split(':'))), x.find('td', {'class': 'col_time'}).text.split(' - ')), # extract times - 'duration': time(*map(lambda x: int(x), x.find('td', {'class': 'col_duration'}).text.split(':'))), # grab duration - 'change': int(x.find('td', {'class': 'col_change'}).text), # grab changes - 'price': float(x.find('td', {'class': 'col_price'}).text.replace(',', '.')) # grab price + 'date': datetime.strptime(x.find('td', {'class': 'col_date'}).text, '%d.%m.%Y') # grab date + if x.find('td', {'class': 'col_date'}).text else None, # if date is empty set to None + 'time': map(lambda x: time(*map(lambda x: int(x), x.split(':'))) if x else None, # extract times or set to None if empty + x.find('td', {'class': 'col_time'}).text.split(' - ')), + 'duration': time(*map(lambda x: int(x), x.find('td', {'class': 'col_duration'}).text.split(':'))), # grab duration + 'change': int(x.find('td', {'class': 'col_change'}).text) # grab changes + if x.find('td', {'class': 'col_change'}).text else 0, # if change is empty set to 0 + 'price': float(x.find('td', {'class': 'col_price'}).text.replace(',', '.')) # grab price + if x.find('td', {'class': 'col_price'}).text.find(',') >= 0 else 0.0, # if price is empty set to 0.0 }, - rows) + rows) return overview -- 1.7.9.5