"""Intelligently pretty-print HTML/XML with inline tags.
prettify_xml() can be used for any XML text.
prettify_html() is specifically for BeautifulSoup.prettify() output,
as it does not add tag linebreaks.
"""
import re
import xml.dom.minidom as xmldom
class RegExSub:
    """Pair a compiled regex with the replacement applied to its matches.

    Attributes:
        regex (re.Pattern): Compiled pattern that sub() applies.
        replace_with (str): Replacement handed to the pattern's sub()
            method; it may contain group references. An empty string
            deletes every match.
    """

    def __init__(self, pattern, flags=0, replace_with=''):
        """Create RegExSub instance.

        Args:
            pattern (str): String to compile as regex.
            flags (re.RegexFlag, optional): Flags for re.compile().
            replace_with (str): String to replace regex matches. Default
                removes match by replacing with empty string.
        """
        self.regex = re.compile(pattern, flags)
        self.replace_with = replace_with

    def __repr__(self):
        """Return a debugging representation showing pattern and replacement."""
        return '{}(pattern={!r}, flags={!r}, replace_with={!r})'.format(
            type(self).__name__,
            self.regex.pattern,
            # Flags as reported by the compiled pattern, which may include
            # defaults added by re.compile() on top of the ones passed in.
            self.regex.flags,
            self.replace_with,
        )

    def sub(self, string):
        """Perform regex substitution on given string.

        Args:
            string (str): String to be processed.

        Returns:
            str: String after every match has been replaced with
            replace_with.
        """
        return self.regex.sub(self.replace_with, string)
def apply_re_subs(string, RegExSub_list, debug=False):
    """Run a string through a sequence of regex substitutions, in order.

    Each substitution receives the output of the one before it, so the
    order of RegExSub_list matters.

    Args:
        string (str): Text to transform.
        RegExSub_list (list): RegExSub objects, applied first to last.
        debug (bool, optional): When true, print each compiled regex
            followed by the text as it stands after that substitution.

    Returns:
        str: The text after every substitution has been applied.
    """
    result = string
    for substitution in RegExSub_list:
        result = substitution.sub(result)
        if not debug:
            continue
        # Trace output: separator, the pattern just applied, separator,
        # then the intermediate text.
        print('========================================================\n')
        print(substitution.regex)
        print('--------------------------------------------------------\n')
        print(result)
    return result
def prettify_xml(xml_string, indent=2, debug=False):
    """Prettify XML with intelligent inline tags.

    The text is parsed with xml.dom.minidom and re-serialised with
    toprettyxml(); a series of regex passes is then applied to that output
    to pull text back onto the line of the tag preceding it and to remove
    the whitespace-only lines left behind.

    Args:
        xml_string (str): XML text to prettify.
        indent (int, optional): Number of spaces per indentation level.
        debug (bool, optional): Show results of each regexp application.

    Returns:
        str: Prettified XML.

    Note:
        Any exception raised by xml.dom.minidom.parseString() for input it
        cannot parse propagates to the caller.
    """
    doc = xmldom.parseString(xml_string)
    ugly_xml = doc.toprettyxml(indent=' ' * indent)

    # Join text that sits on its own line to the '>' above it, consuming
    # the newline and indentation on both sides of that line of text.
    inline_all_tags = RegExSub(r'>\n\s*([^<>\s].*?)\n\s*', re.S, r'>\g<1>')
    # Empty out lines that contain nothing but whitespace.
    whitespace_re = RegExSub(r'^[\s\n]*$', re.M)
    # Drop the newline after a '>' when only an empty line (or the end of
    # the text) follows it.
    blankline_re = RegExSub(r'(>)\n$', re.M, r'\g<1>')

    # Order matters: each pass works on the output of the previous one.
    regexps = [inline_all_tags, whitespace_re, blankline_re]
    return apply_re_subs(ugly_xml, regexps, debug)