Adding some unicode helpers I pulled from the python csv module's docs
authorEd Page <eopage@byu.net>
Tue, 30 Nov 2010 04:01:27 +0000 (22:01 -0600)
committerEd Page <eopage@byu.net>
Tue, 30 Nov 2010 04:01:27 +0000 (22:01 -0600)
src/util/io.py

index aece2dd..aac896d 100644 (file)
@@ -7,7 +7,12 @@ import os
 import pickle
 import contextlib
 import itertools
-import functools
+import codecs
+import csv
+try:
+       import cStringIO as StringIO
+except ImportError:
+       import StringIO
 
 
 @contextlib.contextmanager
@@ -127,3 +132,78 @@ def relpath(p1, p2):
                return os.path.join(*relParts)
        else:
                return "."+os.sep
+
+
+class UTF8Recoder(object):
+       """
+       Iterator that reads an encoded stream and reencodes the input to UTF-8
+       """
+       def __init__(self, f, encoding):
+               self.reader = codecs.getreader(encoding)(f)
+
+       def __iter__(self):
+               return self
+
+       def next(self):
+               return self.reader.next().encode("utf-8")
+
+
+class UnicodeReader(object):
+       """
+       A CSV reader which will iterate over lines in the CSV file "f",
+       which is encoded in the given encoding.
+       """
+
+       def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+               f = UTF8Recoder(f, encoding)
+               self.reader = csv.reader(f, dialect=dialect, **kwds)
+
+       def next(self):
+               row = self.reader.next()
+               return [unicode(s, "utf-8") for s in row]
+
+       def __iter__(self):
+               return self
+
+class UnicodeWriter(object):
+       """
+       A CSV writer which will write rows to CSV file "f",
+       which is encoded in the given encoding.
+       """
+
+       def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+               # Redirect output to a queue
+               self.queue = StringIO.StringIO()
+               self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
+               self.stream = f
+               self.encoder = codecs.getincrementalencoder(encoding)()
+
+       def writerow(self, row):
+               self.writer.writerow([s.encode("utf-8") for s in row])
+               # Fetch UTF-8 output from the queue ...
+               data = self.queue.getvalue()
+               data = data.decode("utf-8")
+               # ... and reencode it into the target encoding
+               data = self.encoder.encode(data)
+               # write to the target stream
+               self.stream.write(data)
+               # empty queue
+               self.queue.truncate(0)
+
+       def writerows(self, rows):
+               for row in rows:
+                       self.writerow(row)
+
+
+def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
+       # csv.py doesn't do Unicode; encode temporarily as UTF-8:
+       csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
+                                                       dialect=dialect, **kwargs)
+       for row in csv_reader:
+               # decode UTF-8 back to Unicode, cell by cell:
+               yield [unicode(cell, 'utf-8') for cell in row]
+
+
+def utf_8_encoder(unicode_csv_data):
+       for line in unicode_csv_data:
+               yield line.encode('utf-8')