""" Normalizes an XML DTD by parsing it and writing it back out. """ # Originally written by Lars. See @bd@ for Bob DuCharme 2003-02-13 #changes. No warrantee expressed or implied. import string, sys from xml.parsers.xmlproc import dtdparser, xmlapp # --- Utilities def content_particle_to_string(cp): if len(cp) == 2: return cp[0] + cp[1] else: return content_model_to_string(cp) def content_model_to_string(cm): if type(cm) == type(""): return cm (sep, cont, mod) = cm return "(%s)%s" % (string.join(map(content_particle_to_string, cont), sep), mod) # #

# The program starts off with a number of functions that are used by the
# normalizer implementation. The content_model_to_string function takes a
# tuple-encoded content model and produces a string representing the
# original content model.

def atype_to_string(atype):
    """Return the string form of an attribute type.

    A list is rendered as a parenthesised group with its items joined by
    " | "; any other value is returned unchanged.
    """
    if isinstance(atype, list):
        return "(%s)" % " | ".join(atype)
    return atype


def adecl_to_string(adecl):
    """Return the default declaration keyword, or "" for "#DEFAULT".

    "#DEFAULT" is mapped to the empty string; every other value is
    returned unchanged.
    """
    if adecl == "#DEFAULT":
        return ""
    return adecl


def adef_to_string(adef):
    """Return the attribute default value as a single-quoted literal.

    None yields the empty string.  Any other value is escaped with
    attr_escape and wrapped in single quotes.
    """
    if adef is None:
        return ""
    return "'%s'" % attr_escape(adef)

# These three functions create string representations of the attribute
# type, default declaration and default value respectively.

def entity_escape(value):
    """Escape *value* for use inside a single-quoted entity value literal.

    "&", "'" and "%" are written as character references.  Character
    references are expanded when the declaration is read back, so the
    replacement text of the written entity is again exactly *value*.
    "<" is left alone because entity values may contain markup.
    """
    # "&" must be handled first: the other replacements introduce "&".
    value = value.replace("&", "&#38;")
    value = value.replace("'", "&#39;")
    # A bare "%" would be read as the start of a parameter entity reference.
    return value.replace("%", "&#37;")


def attr_escape(value):
    """Escape *value* for use inside a single-quoted attribute default.

    "&", "'" and "<" are replaced by the predefined entity references
    "&amp;", "&apos;" and "&lt;".
    """
    # "&" must be handled first: the other replacements introduce "&".
    value = value.replace("&", "&amp;")
    value = value.replace("'", "&apos;")
    return value.replace("<", "&lt;")

# These two functions escape entity and attribute values respectively.
# Two functions are needed because entity values may contain markup, so
# the < character should not be escaped there.

# --- The DTD application

class DTDWriter(xmlapp.DTDConsumer):
    """Write the declarations reported by the DTD parser back out as DTD text.

    The DTDWriter class implements the DTDConsumer interface in order to
    write the normalized DTD back out.  The _attlist attribute contains a
    list of attribute declarations.  It is used because the new_attribute
    method only passes a single attribute per call.  In order to put all
    attributes that belong to a single ATTLIST declaration back into a
    single declaration this list is used as temporary storage.

    The new_attribute method merely accumulates declarations in the
    _attlist list.  For all other declaration types, the _empty_attlist
    method is called before anything else is done to write out the
    accumulated attribute list, if any.  This allows the writer to handle
    the attribute declarations together.

    NOTE(review): the declaration templates below were rebuilt from the
    XML 1.0 declaration syntax because the literal text had been lost from
    the listing; confirm keywords and quoting against a known-good copy.
    """

    def __init__(self, outf):
        """Store *outf*, the object whose write() method receives the DTD."""
        self._out = outf
        # Pending (elem, attr, a_type, a_decl, a_def) tuples, in call order.
        self._attlist = []

    def dtd_end(self):
        """Write out attributes that are still pending when the DTD ends."""
        # NOTE(review): assumes the parser calls dtd_end() on its consumer
        # once parsing is finished -- confirm against xmlproc.
        self._empty_attlist()

    def new_general_entity(self, name, val):
        """Write an internal general entity declaration."""
        self._empty_attlist()
        self._out.write("<!ENTITY %s '%s'>\n" % (name, entity_escape(val)))

    def new_external_entity(self, name, pubid, sysid, ndata):
        """Write an external general entity declaration.

        A PUBLIC identifier is written when *pubid* is true, otherwise a
        SYSTEM identifier; an NDATA clause is added when *ndata* is true.
        """
        self._empty_attlist()
        if pubid:
            self._out.write("<!ENTITY %s PUBLIC %s %s"
                            % (name, self._quote(pubid), self._quote(sysid)))
        else:
            self._out.write("<!ENTITY %s SYSTEM %s"
                            % (name, self._quote(sysid)))

        if ndata:
            self._out.write(" NDATA %s>\n" % ndata)
        else:
            self._out.write(">\n")

    # The handling of general entities is relatively simple.

    def new_parameter_entity(self, name, val):
        """Write an internal parameter entity declaration."""
        self._empty_attlist()
        # "%%" yields the literal "%" that marks a parameter entity.
        self._out.write("<!ENTITY %% %s '%s'>\n" % (name, entity_escape(val)))

    def new_external_pe(self, name, pubid, sysid):
        """Write an external parameter entity declaration."""
        self._empty_attlist()
        if pubid:
            self._out.write("<!ENTITY %% %s PUBLIC %s %s>\n"
                            % (name, self._quote(pubid), self._quote(sysid)))
        else:
            self._out.write("<!ENTITY %% %s SYSTEM %s>\n"
                            % (name, self._quote(sysid)))

    # It could be debated whether parameter entity declarations should be
    # retained or not, but in this case they have been included for
    # reference.

    # @bd@ replaced three lines in following
    def new_notation(self, name, pubid, sysid):
        """Write a notation declaration.

        Both identifiers are written when both are true; otherwise only
        the public identifier, or failing that only the system identifier.
        """
        self._empty_attlist()
        if pubid and sysid:
            self._out.write("<!NOTATION %s PUBLIC %s %s>\n"
                            % (name, self._quote(pubid), self._quote(sysid)))
        elif pubid:
            self._out.write("<!NOTATION %s PUBLIC %s>\n"
                            % (name, self._quote(pubid)))
        else:
            self._out.write("<!NOTATION %s SYSTEM %s>\n"
                            % (name, self._quote(sysid)))

    def new_element_type(self, name, cont):
        """Write an element type declaration for content model *cont*."""
        self._empty_attlist()
        self._out.write("<!ELEMENT %s %s>\n"
                        % (name, content_model_to_string(cont)))

    def new_attribute(self, elem, attr, a_type, a_decl, a_def):
        """Queue one attribute declaration; it is written by _empty_attlist."""
        self._attlist.append((elem, attr, a_type, a_decl, a_def))

    def handle_comment(self, contents):
        """Write a comment."""
        self._empty_attlist()
        self._out.write("<!--%s-->\n" % contents)

    def handle_pi(self, target, data):
        """Write a processing instruction."""
        self._empty_attlist()
        # Terminated with a newline like every other declaration.
        self._out.write("<?%s %s?>\n" % (target, data))

    # Notations, elements, attributes, comments and processing instructions
    # are all easy to handle.

    # internal methods

    def _quote(self, literal):
        """Return *literal* wrapped in a quote character it does not contain."""
        if '"' in literal:
            return "'%s'" % literal
        return '"%s"' % literal

    def _empty_attlist(self):
        """Write the queued attributes as ATTLIST declarations and clear them.

        Consecutive attributes of the same element type share one ATTLIST
        declaration; a change of element type closes the current
        declaration and opens a new one.
        """
        prev = None
        for (elem, attr, a_type, a_decl, a_def) in self._attlist:
            if elem != prev:
                if prev is not None:
                    self._out.write(">\n")
                self._out.write("<!ATTLIST %s" % elem)
                prev = elem

            self._out.write("\n  %s %s %s %s"
                            % (attr, atype_to_string(a_type),
                               adecl_to_string(a_decl),
                               adef_to_string(a_def)))

        # Close the declaration that is still open, if any was written.
        if prev is not None:
            self._out.write(">\n")
        self._attlist = []

# The _empty_attlist method ensures that attributes are written out
# correctly. Note that it also handles the case where the accumulated
# attributes belong to more than a single element type.

# --- Main functions

def normalize_dtd_file(dtdfile, outfile):
    """Parse the DTD *dtdfile* and write its normalized form to *outfile*.

    A DTDParser is set up with a DTDWriter as its DTD consumer and asked to
    parse *dtdfile*.  *outfile* is opened for writing and is closed again
    even when parsing raises.
    """
    outf = open(outfile, "w")
    try:
        writer = DTDWriter(outf)
        parser = dtdparser.DTDParser()
        parser.set_dtd_consumer(writer)
        # @bd@ there's probably a better place for this somewhere else...
        # NOTE(review): this literal looks like it lost its content in the
        # listing (presumably a declaration written at the top of the
        # output) -- confirm against a known-good copy before changing it.
        outf.write('\n')
        parser.parse_resource(dtdfile)
        # Attributes are only written when a later declaration arrives, so
        # flush the ones that were queued last.
        writer._empty_attlist()
    finally:
        outf.close()

# This function can normalize a DTD in one file to another by setting up
# a DTDParser with the DTDWriter as its application and parsing the DTD.

# --- Main program

def main(argv=None):
    """Run dtdnorm with the command line *argv* (default: sys.argv).

    Returns the process exit status: 2 after printing the usage text when
    the argument count is wrong, 0 after normalizing argv[1] into argv[2].
    """
    if argv is None:
        argv = sys.argv

    # @bd@ added following if statement
    if len(argv) != 3:
        # Usage errors go to stderr and yield a non-zero status so that
        # calling scripts can detect them.
        sys.stderr.write(
            "Enter\n\n python dtdnorm.py mainfile.dtd outfile.dtd\n\n"
            "to turn mainfile.dtd and all referenced external parameter\n"
            "entities into the single DTD file outfile.dtd. The xmlproc\n"
            "Python XML parser is required to run dtdnorm. \n")
        return 2

    normalize_dtd_file(argv[1], argv[2])
    return 0


if __name__ == "__main__":
    sys.exit(main())