Pulling in latest skeleton changes

[gonvert] / src / util / io.py
diff --git a/src/util/io.py b/src/util/io.py

index aece2dd..aac896d 100644 (file)
--- a/src/util/io.py
+++ b/src/util/io.py
@@ -7,7 +7,12 @@ import os
  import pickle
  import contextlib
  import itertools
-import functools
+import codecs
+import csv
+try:
+       import cStringIO as StringIO
+except ImportError:
+       import StringIO
  
  
  @contextlib.contextmanager
@@ -127,3 +132,78 @@ def relpath(p1, p2):
                 return os.path.join(*relParts)
         else:
                 return "."+os.sep
+
+
+class UTF8Recoder(object):
+       """
+       Iterator that reads an encoded stream and reencodes the input to UTF-8
+       """
+       def __init__(self, f, encoding):
+               self.reader = codecs.getreader(encoding)(f)  # wrap f in a reader that decodes `encoding`
+
+       def __iter__(self):
+               return self  # this object is its own iterator
+
+       def next(self):  # Python 2 iterator protocol method
+               return self.reader.next().encode("utf-8")  # next decoded item, re-encoded as UTF-8
+
+
+class UnicodeReader(object):
+       """
+       A CSV reader which will iterate over lines in the CSV file "f",
+       which is encoded in the given encoding.
+       """
+
+       def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+               f = UTF8Recoder(f, encoding)  # csv.reader is fed UTF-8 whatever the source encoding is
+               self.reader = csv.reader(f, dialect=dialect, **kwds)  # extra keywords go straight to csv.reader
+
+       def next(self):  # Python 2 iterator protocol method
+               row = self.reader.next()  # StopIteration is not caught here, so it ends iteration for the caller
+               return [unicode(s, "utf-8") for s in row]  # decode every UTF-8 cell back to unicode
+
+       def __iter__(self):
+               return self  # this object is its own iterator
+
+class UnicodeWriter(object):
+       """
+       A CSV writer which will write rows to CSV file "f",
+       which is encoded in the given encoding.
+       """
+
+       def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+               # Redirect csv output to an in-memory buffer (the "queue") instead of f
+               self.queue = StringIO.StringIO()
+               self.writer = csv.writer(self.queue, dialect=dialect, **kwds)  # extra keywords go straight to csv.writer
+               self.stream = f  # final destination; written to once per writerow() call
+               self.encoder = codecs.getincrementalencoder(encoding)()  # one encoder instance reused for all rows
+
+       def writerow(self, row):
+               self.writer.writerow([s.encode("utf-8") for s in row])  # NOTE(review): calls .encode on every cell, so non-string cells would fail - confirm callers
+               # Fetch UTF-8 output from the queue ...
+               data = self.queue.getvalue()
+               data = data.decode("utf-8")
+               # ... and reencode it into the target encoding
+               data = self.encoder.encode(data)
+               # write to the target stream
+               self.stream.write(data)
+               # empty the queue so the next row starts from a clean buffer
+               self.queue.truncate(0)
+
+       def writerows(self, rows):
+               for row in rows:
+                       self.writerow(row)  # rows are written (and the queue emptied) one at a time
+
+
+def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):  # generator: yields one list of unicode cells per CSV row
+       # csv.py doesn't do Unicode; encode temporarily as UTF-8:
+       csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
+                                                       dialect=dialect, **kwargs)
+       for row in csv_reader:
+               # decode UTF-8 back to Unicode, cell by cell:
+               yield [unicode(cell, 'utf-8') for cell in row]
+
+
+def utf_8_encoder(unicode_csv_data):  # generator: yields each item of unicode_csv_data encoded as UTF-8
+       for line in unicode_csv_data:
+               yield line.encode('utf-8')  # each item must provide .encode(); items are produced lazily, in order