# xml_prettify.py
"""Intelligently pretty-print HTML/XML with inline tags.

prettify_xml() can be used for any XML text.
prettify_html() is specifically for BeautifulSoup.prettify() output,
as it does not add tag linebreaks.
"""
import re
import xml.dom.minidom as xmldom
  8. class RegExSub:
  9. """Dict factory for regex and corresponding substitution expression.
  10. Attributes:
  11. regex (re.Pattern): Compiled regex to use in re.search()/match()
  12. replace_with (TYPE): Description
  13. """
  14. def __init__(self, pattern, flags=0, replace_with=''):
  15. """Create RexExSub instance.
  16. Args:
  17. pattern (str): String to compile as regex.
  18. flags (re.RegexFlag, optional): Flags for re.compile().
  19. replace_with (str): String to replace regex matches. Default
  20. removes match by replacing with empty string.
  21. """
  22. self.regex = re.compile(pattern, flags)
  23. self.replace_with = replace_with
  24. def sub(self, string):
  25. """Perform regex substitution on given string.
  26. Args:
  27. string (str): String to be processed.
  28. Returns:
  29. str: String after replacements made.
  30. """
  31. return self.regex.sub(self.replace_with, string)
  32. def apply_re_subs(string, RegExSub_list, debug=False):
  33. """Apply multiple regex substitutions to a string.
  34. Args:
  35. string (str): String to be processed.
  36. RegExSub_list (list): List of RegExSub objects to apply.
  37. debug (bool, optional): Show results of each regexp application.
  38. Returns:
  39. str: String after all regex substitutions have been applied.
  40. """
  41. processed_string = string
  42. for regex_obj in RegExSub_list:
  43. processed_string = regex_obj.sub(processed_string)
  44. if debug:
  45. print('========================================================\n')
  46. print(regex_obj.regex)
  47. print('--------------------------------------------------------\n')
  48. print(processed_string)
  49. return processed_string
  50. def prettify_xml(xml_string, indent=2, debug=False):
  51. """Prettify XML with intelligent inline tags.
  52. Args:
  53. xml_string (str): XML text to prettify.
  54. indent (int, optional): Set size of XML tag indents.
  55. debug (bool, optional): Show results of each regexp application.
  56. Returns:
  57. str: Prettified XML.
  58. """
  59. doc = xmldom.parseString(xml_string)
  60. indent_str = ' ' * indent
  61. ugly_xml = doc.toprettyxml(indent=indent_str)
  62. inline_all_tags = RegExSub(r'>\n\s*([^<>\s].*?)\n\s*</', re.S, r'>\g<1></')
  63. whitespace_re = RegExSub(r'^[\s\n]*$', re.M)
  64. empty_tags = RegExSub(r'(<[^/]*?>)(\n|\s)*(</)', re.M, r'\g<1>\g<3>')
  65. blankline_re = RegExSub(r'(>)\n$', re.M, r'\g<1>')
  66. regexps = [inline_all_tags, whitespace_re, blankline_re]
  67. pretty_xml = apply_re_subs(ugly_xml, regexps, debug)
  68. return pretty_xml