git.maemo.org Git - meabook/blob - parsers/ldif.py

   1 """
   2 ldif - generate and parse LDIF data (see RFC 2849)
   3
   4 See http://www.python-ldap.org/ for details.
   5
   6 $Id: ldif.py,v 1.52 2009/12/03 22:11:26 stroeder Exp $
   7
   8 Python compatibility note:
   9 Tested with Python 2.0+, but should work with Python 1.5.2+.
  10 """
  11
  12 __version__ = '2.3.11'
  13
  14 __all__ = [
  15   # constants
  16   'ldif_pattern',
  17   # functions
  18   'AttrTypeandValueLDIF','CreateLDIF','ParseLDIF',
  19   # classes
  20   'LDIFWriter',
  21   'LDIFParser',
  22   'LDIFRecordList',
  23   'LDIFCopy',
  24 ]
  25
  26 import urlparse,urllib,base64,re,types
  27
  28 try:
  29   from cStringIO import StringIO
  30 except ImportError:
  31   from StringIO import StringIO
  32
  33 attrtype_pattern = r'[\w;.]+(;[\w_-]+)*'
  34 attrvalue_pattern = r'(([^,]|\\,)+|".*?")'
  35 rdn_pattern = attrtype_pattern + r'[ ]*=[ ]*' + attrvalue_pattern
  36 dn_pattern   = rdn_pattern + r'([ ]*,[ ]*' + rdn_pattern + r')*[ ]*'
  37 dn_regex   = re.compile('^%s$' % dn_pattern)
  38
  39 ldif_pattern = '^((dn(:|::) %(dn_pattern)s)|(%(attrtype_pattern)s(:|::) .*)$)+' % vars()
  40
  41 MOD_OP_INTEGER = {
  42   'add':0,'delete':1,'replace':2
  43 }
  44
  45 MOD_OP_STR = {
  46   0:'add',1:'delete',2:'replace'
  47 }
  48
  49 CHANGE_TYPES = ['add','delete','modify','modrdn']
  50 valid_changetype_dict = {}
  51 for c in CHANGE_TYPES:
  52   valid_changetype_dict[c]=None
  53
  54
  55 def is_dn(s):
  56   """
  57   returns 1 if s is a LDAP DN
  58   """
  59   if s=='':
  60     return 1
  61   rm = dn_regex.match(s)
  62   return rm!=None and rm.group(0)==s
  63
  64
  65 SAFE_STRING_PATTERN = '(^(\000|\n|\r| |:|<)|[\000\n\r\200-\377]+|[ ]+$)'
  66 safe_string_re = re.compile(SAFE_STRING_PATTERN)
  67
  68 def list_dict(l):
  69   """
  70   return a dictionary with all items of l being the keys of the dictionary
  71   """
  72   return dict([(i,None) for i in l])
  73
  74
  75 class LDIFWriter:
  76   """
  77   Write LDIF entry or change records to file object
  78   Copy LDIF input to a file output object containing all data retrieved
  79   via URLs
  80   """
  81
  82   def __init__(self,output_file,base64_attrs=None,cols=76,line_sep='\n'):
  83     """
  84     output_file
  85         file object for output
  86     base64_attrs
  87         list of attribute types to be base64-encoded in any case
  88     cols
  89         Specifies how many columns a line may have before it's
  90         folded into many lines.
  91     line_sep
  92         String used as line separator
  93     """
  94     self._output_file = output_file
  95     self._base64_attrs = list_dict([a.lower() for a in (base64_attrs or [])])
  96     self._cols = cols
  97     self._line_sep = line_sep
  98     self.records_written = 0
  99
 100   def _unfoldLDIFLine(self,line):
 101     """
 102     Write string line as one or more folded lines
 103     """
 104     # Check maximum line length
 105     line_len = len(line)
 106     if line_len<=self._cols:
 107       self._output_file.write(line)
 108       self._output_file.write(self._line_sep)
 109     else:
 110       # Fold line
 111       pos = self._cols
 112       self._output_file.write(line[0:min(line_len,self._cols)])
 113       self._output_file.write(self._line_sep)
 114       while pos<line_len:
 115         self._output_file.write(' ')
 116         self._output_file.write(line[pos:min(line_len,pos+self._cols-1)])
 117         self._output_file.write(self._line_sep)
 118         pos = pos+self._cols-1
 119     return # _unfoldLDIFLine()
 120
 121   def _needs_base64_encoding(self,attr_type,attr_value):
 122     """
 123     returns 1 if attr_value has to be base-64 encoded because
 124     of special chars or because attr_type is in self._base64_attrs
 125     """
 126     return self._base64_attrs.has_key(attr_type.lower()) or \
 127            not safe_string_re.search(attr_value) is None
 128
 129   def _unparseAttrTypeandValue(self,attr_type,attr_value):
 130     """
 131     Write a single attribute type/value pair
 132
 133     attr_type
 134           attribute type
 135     attr_value
 136           attribute value
 137     """
 138     if self._needs_base64_encoding(attr_type,attr_value):
 139       # Encode with base64
 140       self._unfoldLDIFLine(':: '.join([attr_type,base64.encodestring(attr_value).replace('\n','')]))
 141     else:
 142       self._unfoldLDIFLine(': '.join([attr_type,attr_value]))
 143     return # _unparseAttrTypeandValue()
 144
 145   def _unparseEntryRecord(self,entry):
 146     """
 147     entry
 148         dictionary holding an entry
 149     """
 150     attr_types = entry.keys()[:]
 151     attr_types.sort()
 152     for attr_type in attr_types:
 153       for attr_value in entry[attr_type]:
 154         self._unparseAttrTypeandValue(attr_type,attr_value)
 155
 156   def _unparseChangeRecord(self,modlist):
 157     """
 158     modlist
 159         list of additions (2-tuple) or modifications (3-tuple)
 160     """
 161     mod_len = len(modlist[0])
 162     if mod_len==2:
 163       changetype = 'add'
 164     elif mod_len==3:
 165       changetype = 'modify'
 166     else:
 167       raise ValueError,"modlist item of wrong length"
 168     self._unparseAttrTypeandValue('changetype',changetype)
 169     for mod in modlist:
 170       if mod_len==2:
 171         mod_type,mod_vals = mod
 172       elif mod_len==3:
 173         mod_op,mod_type,mod_vals = mod
 174         self._unparseAttrTypeandValue(MOD_OP_STR[mod_op],mod_type)
 175       else:
 176         raise ValueError,"Subsequent modlist item of wrong length"
 177       if mod_vals:
 178         for mod_val in mod_vals:
 179           self._unparseAttrTypeandValue(mod_type,mod_val)
 180       if mod_len==3:
 181         self._output_file.write('-'+self._line_sep)
 182
 183   def unparse(self,dn,record):
 184     """
 185     dn
 186           string-representation of distinguished name
 187     record
 188           Either a dictionary holding the LDAP entry {attrtype:record}
 189           or a list with a modify list like for LDAPObject.modify().
 190     """
 191     if not record:
 192       # Simply ignore empty records
 193       return
 194     # Start with line containing the distinguished name
 195     self._unparseAttrTypeandValue('dn',dn)
 196     # Dispatch to record type specific writers
 197     if isinstance(record,types.DictType):
 198       self._unparseEntryRecord(record)
 199     elif isinstance(record,types.ListType):
 200       self._unparseChangeRecord(record)
 201     else:
 202       raise ValueError, "Argument record must be dictionary or list"
 203     # Write empty line separating the records
 204     self._output_file.write(self._line_sep)
 205     # Count records written
 206     self.records_written = self.records_written+1
 207     return # unparse()
 208
 209
 210 def CreateLDIF(dn,record,base64_attrs=None,cols=76):
 211   """
 212   Create LDIF single formatted record including trailing empty line.
 213   This is a compability function. Use is deprecated!
 214
 215   dn
 216         string-representation of distinguished name
 217   record
 218         Either a dictionary holding the LDAP entry {attrtype:record}
 219         or a list with a modify list like for LDAPObject.modify().
 220   base64_attrs
 221         list of attribute types to be base64-encoded in any case
 222   cols
 223         Specifies how many columns a line may have before it's
 224         folded into many lines.
 225   """
 226   f = StringIO()
 227   ldif_writer = LDIFWriter(f,base64_attrs,cols,'\n')
 228   ldif_writer.unparse(dn,record)
 229   s = f.getvalue()
 230   f.close()
 231   return s
 232
 233
 234 class LDIFParser:
 235   """
 236   Base class for a LDIF parser. Applications should sub-class this
 237   class and override method handle() to implement something meaningful.
 238
 239   Public class attributes:
 240   records_read
 241         Counter for records processed so far
 242   """
 243
 244   def _stripLineSep(self,s):
 245     """
 246     Strip trailing line separators from s, but no other whitespaces
 247     """
 248     if s[-2:]=='\r\n':
 249       return s[:-2]
 250     elif s[-1:]=='\n':
 251       return s[:-1]
 252     else:
 253       return s
 254
 255   def __init__(
 256     self,
 257     input_file,
 258     ignored_attr_types=None,
 259     max_entries=0,
 260     process_url_schemes=None,
 261     line_sep='\n'
 262   ):
 263     """
 264     Parameters:
 265     input_file
 266         File-object to read the LDIF input from
 267     ignored_attr_types
 268         Attributes with these attribute type names will be ignored.
 269     max_entries
 270         If non-zero specifies the maximum number of entries to be
 271         read from f.
 272     process_url_schemes
 273         List containing strings with URLs schemes to process with urllib.
 274         An empty list turns off all URL processing and the attribute
 275         is ignored completely.
 276     line_sep
 277         String used as line separator
 278     """
 279     self._input_file = input_file
 280     self._max_entries = max_entries
 281     self._process_url_schemes = list_dict([s.lower() for s in (process_url_schemes or [])])
 282     self._ignored_attr_types = list_dict([a.lower() for a in (ignored_attr_types or [])])
 283     self._line_sep = line_sep
 284     self.records_read = 0
 285
 286   def handle(self,dn,entry):
 287     """
 288     Process a single content LDIF record. This method should be
 289     implemented by applications using LDIFParser.
 290     """
 291
 292   def _unfoldLDIFLine(self):
 293     """
 294     Unfold several folded lines with trailing space into one line
 295     """
 296     unfolded_lines = [ self._stripLineSep(self._line) ]
 297     self._line = self._input_file.readline()
 298     while self._line and self._line[0]==' ':
 299       unfolded_lines.append(self._stripLineSep(self._line[1:]))
 300       self._line = self._input_file.readline()
 301     return ''.join(unfolded_lines)
 302
 303   def _parseAttrTypeandValue(self):
 304     """
 305     Parse a single attribute type and value pair from one or
 306     more lines of LDIF data
 307     """
 308     # Reading new attribute line
 309     unfolded_line = self._unfoldLDIFLine()
 310     # Ignore comments which can also be folded
 311     while unfolded_line and unfolded_line[0]=='#':
 312       unfolded_line = self._unfoldLDIFLine()
 313     if not unfolded_line or unfolded_line=='\n' or unfolded_line=='\r\n':
 314       return None,None
 315     try:
 316       colon_pos = unfolded_line.index(':')
 317     except ValueError:
 318       # Treat malformed lines without colon as non-existent
 319       return None,None
 320     attr_type = unfolded_line[0:colon_pos]
 321     # if needed attribute value is BASE64 decoded
 322     value_spec = unfolded_line[colon_pos:colon_pos+2]
 323     if value_spec=='::':
 324       # attribute value needs base64-decoding
 325       attr_value = base64.decodestring(unfolded_line[colon_pos+2:])
 326     elif value_spec==':<':
 327       # fetch attribute value from URL
 328       url = unfolded_line[colon_pos+2:].strip()
 329       attr_value = None
 330       if self._process_url_schemes:
 331         u = urlparse.urlparse(url)
 332         if self._process_url_schemes.has_key(u[0]):
 333           attr_value = urllib.urlopen(url).read()
 334     elif value_spec==':\r\n' or value_spec=='\n':
 335       attr_value = ''
 336     else:
 337       attr_value = unfolded_line[colon_pos+2:].lstrip()
 338     return attr_type,attr_value
 339
 340   def parse(self):
 341     """
 342     Continously read and parse LDIF records
 343     """
 344     self._line = self._input_file.readline()
 345
 346     while self._line and \
 347           (not self._max_entries or self.records_read<self._max_entries):
 348
 349       # Reset record
 350       version = None; dn = None; changetype = None; modop = None; entry = {}
 351
 352       attr_type,attr_value = self._parseAttrTypeandValue()
 353
 354       while attr_type!=None and attr_value!=None:
 355         if attr_type=='dn':
 356           # attr type and value pair was DN of LDIF record
 357           if dn!=None:
 358             raise ValueError, 'Two lines starting with dn: in one record.'
 359           if not is_dn(attr_value):
 360             raise ValueError, 'No valid string-representation of distinguished name %s.' % (repr(attr_value))
 361           dn = attr_value
 362         elif attr_type=='version' and dn is None:
 363           version = 1
 364         elif attr_type=='changetype':
 365           # attr type and value pair was DN of LDIF record
 366           if dn is None:
 367             raise ValueError, 'Read changetype: before getting valid dn: line.'
 368           if changetype!=None:
 369             raise ValueError, 'Two lines starting with changetype: in one record.'
 370           if not valid_changetype_dict.has_key(attr_value):
 371             raise ValueError, 'changetype value %s is invalid.' % (repr(attr_value))
 372           changetype = attr_value
 373         elif attr_value!=None and \
 374              not self._ignored_attr_types.has_key(attr_type.lower()):
 375           # Add the attribute to the entry if not ignored attribute
 376           if entry.has_key(attr_type):
 377             entry[attr_type].append(attr_value)
 378           else:
 379             entry[attr_type]=[attr_value]
 380
 381         # Read the next line within an entry
 382         attr_type,attr_value = self._parseAttrTypeandValue()
 383
 384       if entry:
 385         # append entry to result list
 386         self.handle(dn,entry)
 387         self.records_read = self.records_read+1
 388
 389     return # parse()
 390
 391
 392 class LDIFRecordList(LDIFParser):
 393   """
 394   Collect all records of LDIF input into a single list.
 395   of 2-tuples (dn,entry). It can be a memory hog!
 396   """
 397
 398   def __init__(
 399     self,
 400     input_file,
 401     ignored_attr_types=None,max_entries=0,process_url_schemes=None
 402   ):
 403     """
 404     See LDIFParser.__init__()
 405
 406     Additional Parameters:
 407     all_records
 408         List instance for storing parsed records
 409     """
 410     LDIFParser.__init__(self,input_file,ignored_attr_types,max_entries,process_url_schemes)
 411     self.all_records = []
 412
 413   def handle(self,dn,entry):
 414     """
 415     Append single record to dictionary of all records.
 416     """
 417     self.all_records.append((dn,entry))
 418
 419
 420 class LDIFCopy(LDIFParser):
 421   """
 422   Copy LDIF input to LDIF output containing all data retrieved
 423   via URLs
 424   """
 425
 426   def __init__(
 427     self,
 428     input_file,output_file,
 429     ignored_attr_types=None,max_entries=0,process_url_schemes=None,
 430     base64_attrs=None,cols=76,line_sep='\n'
 431   ):
 432     """
 433     See LDIFParser.__init__() and LDIFWriter.__init__()
 434     """
 435     LDIFParser.__init__(self,input_file,ignored_attr_types,max_entries,process_url_schemes)
 436     self._output_ldif = LDIFWriter(output_file,base64_attrs,cols,line_sep)
 437
 438   def handle(self,dn,entry):
 439     """
 440     Write single LDIF record to output file.
 441     """
 442     self._output_ldif.unparse(dn,entry)
 443
 444
 445 def ParseLDIF(f,ignore_attrs=None,maxentries=0):
 446   """
 447   Parse LDIF records read from file.
 448   This is a compability function. Use is deprecated!
 449   """
 450   ldif_parser = LDIFRecordList(
 451     f,ignored_attr_types=ignore_attrs,max_entries=maxentries,process_url_schemes=0
 452   )
 453   ldif_parser.parse()
 454   return ldif_parser.all_records