12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- """Intelligently pretty-print HTML/XML with inline tags.
- prettify_xml() can be used for any XML text.
- prettify_html() is specifically for BeautifulSoup.prettify() output,
- as it does not add tag linebreaks.
- """
- import re
- import xml.dom.minidom as xmldom
class RegExSub:
    """Pair a compiled regular expression with its replacement text.

    Attributes:
        regex (re.Pattern): Compiled pattern that sub() searches for.
        replace_with (str): Replacement handed to the pattern's sub() for
            every match; may contain group references.
    """

    def __init__(self, pattern, flags=0, replace_with=''):
        """Compile the pattern and remember the replacement.

        Args:
            pattern (str): Regular expression source to compile.
            flags (re.RegexFlag, optional): Flags forwarded to re.compile().
            replace_with (str): Text substituted for each match. The default
                empty string deletes every match.
        """
        compiled = re.compile(pattern, flags)
        self.regex = compiled
        self.replace_with = replace_with

    def sub(self, string):
        """Replace every match of the stored pattern in the given string.

        Args:
            string (str): Text to run the substitution on.

        Returns:
            str: The text with all matches replaced by replace_with.
        """
        matcher, replacement = self.regex, self.replace_with
        return matcher.sub(replacement, string)
def apply_re_subs(string, RegExSub_list, debug=False):
    """Run a string through a sequence of regex substitutions, in order.

    Args:
        string (str): Text to transform.
        RegExSub_list (list): RegExSub objects, applied first to last; each
            one receives the output of the previous one.
        debug (bool, optional): When true, print each pattern and the text
            it produced, separated by ruler lines.

    Returns:
        str: The text after every substitution has been applied.
    """
    heavy_rule = '========================================================\n'
    light_rule = '--------------------------------------------------------\n'

    result = string
    for step in RegExSub_list:
        result = step.sub(result)
        if not debug:
            continue
        # Trace output: ruler, the pattern just applied, ruler, resulting text.
        for chunk in (heavy_rule, step.regex, light_rule, result):
            print(chunk)
    return result
def prettify_xml(xml_string, indent=2, debug=False):
    """Prettify XML while keeping text-only elements on a single line.

    The document is parsed and re-serialised with xml.dom.minidom's
    toprettyxml(), then tidied with three regex passes applied in order:

    1. Text sitting on its own line(s) between a '>' and the next '</' is
       pulled back inline with the surrounding tags.
    2. Whitespace-only lines are emptied.
    3. The newline after a '>' is dropped when it is followed by an empty
       line or by the end of the text.

    Args:
        xml_string (str): XML text to prettify.
        indent (int, optional): Number of spaces per nesting level.
        debug (bool, optional): Show results of each regexp application.

    Returns:
        str: Prettified XML.

    Raises:
        xml.parsers.expat.ExpatError: If xml_string is not well-formed XML.
    """
    doc = xmldom.parseString(xml_string)
    ugly_xml = doc.toprettyxml(indent=' ' * indent)

    # Order matters: inlining must see the raw minidom layout, and the
    # final pass removes the empty lines left behind by the second one.
    cleanup_passes = [
        RegExSub(r'>\n\s*([^<>\s].*?)\n\s*</', re.S, r'>\g<1></'),
        RegExSub(r'^[\s\n]*$', re.M),
        RegExSub(r'(>)\n$', re.M, r'\g<1>'),
    ]
    return apply_re_subs(ugly_xml, cleanup_passes, debug)
|