"""Intelligently pretty-print HTML/XML with inline tags. prettify_xml() can be used for any XML text. prettify_html() is specifically for BeautifulSoup.prettify() output, as it does not add tag linebreaks. """ import re import xml.dom.minidom as xmldom class RegExSub: """Dict factory for regex and corresponding substitution expression. Attributes: regex (re.Pattern): Compiled regex to use in re.search()/match() replace_with (TYPE): Description """ def __init__(self, pattern, flags=0, replace_with=''): """Create RexExSub instance. Args: pattern (str): String to compile as regex. flags (re.RegexFlag, optional): Flags for re.compile(). replace_with (str): String to replace regex matches. Default removes match by replacing with empty string. """ self.regex = re.compile(pattern, flags) self.replace_with = replace_with def sub(self, string): """Perform regex substitution on given string. Args: string (str): String to be processed. Returns: str: String after replacements made. """ return self.regex.sub(self.replace_with, string) def apply_re_subs(string, RegExSub_list, debug=False): """Apply multiple regex substitutions to a string. Args: string (str): String to be processed. RegExSub_list (list): List of RegExSub objects to apply. debug (bool, optional): Show results of each regexp application. Returns: str: String after all regex substitutions have been applied. """ processed_string = string for regex_obj in RegExSub_list: processed_string = regex_obj.sub(processed_string) if debug: print('========================================================\n') print(regex_obj.regex) print('--------------------------------------------------------\n') print(processed_string) return processed_string def prettify_xml(xml_string, indent=2, debug=False): """Prettify XML with intelligent inline tags. Args: xml_string (str): XML text to prettify. indent (int, optional): Set size of XML tag indents. debug (bool, optional): Show results of each regexp application. Returns: str: Prettified XML. """ doc = xmldom.parseString(xml_string) indent_str = ' ' * indent ugly_xml = doc.toprettyxml(indent=indent_str) inline_all_tags = RegExSub(r'>\n\s*([^<>\s].*?)\n\s*\g<1>)(\n|\s)*(\g<3>') blankline_re = RegExSub(r'(>)\n$', re.M, r'\g<1>') regexps = [inline_all_tags, whitespace_re, blankline_re] pretty_xml = apply_re_subs(ugly_xml, regexps, debug) return pretty_xml