convert.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. from pathlib import Path
  2. import re
  3. import zipfile
  4. class Converter:
  5. pass
  6. class DadaConverter:
  7. current_file: str = ''
  8. def __init__(self, folder):
  9. base = Path(folder)
  10. self.input_dir = base.joinpath('input')
  11. self.output_dir = base.joinpath('output')
  12. self.temp_dir = base.joinpath('temp')
  13. self.logs_dir = base.joinpath('logs')
  14. self.re_comp = re.compile(r'>([^<]+)<')
  15. with open(base.joinpath('config/translation.csv'), 'r') as frh:
  16. self.translation = dict([row.split('\t') for row in frh.read().split('\n') if row.strip() != ''])
  17. self.translation['\n'] = '\n'
  18. self.translation[' '] = ' '
  19. self.missing = []
  20. def run(self):
  21. for filename in self.input_dir.glob('*.epub'):
  22. self.cleanup_temp(self.temp_dir)
  23. print(filename.name)
  24. self.epub_workflow(filename)
  25. self.cleanup_temp(self.temp_dir)
  26. with open(self.logs_dir.joinpath('missing.txt'), 'w') as fwh:
  27. fwh.write(','.join(set(self.missing)))
  28. def cleanup_temp(self, folder):
  29. for name in folder.glob('*'):
  30. if name.is_dir():
  31. self.cleanup_temp(name)
  32. name.rmdir()
  33. else:
  34. name.unlink()
  35. def epub_workflow(self, filename):
  36. self.unzip(filename)
  37. # self.convert_epub_htm_folder()
  38. self.rezip(self.output_dir.joinpath(filename.name))
  39. def unzip(self, filename):
  40. with zipfile.ZipFile(filename, 'r') as zfrh:
  41. zfrh.extractall(self.temp_dir)
  42. def convert_epub_htm_folder(self):
  43. for filename in self.temp_dir.joinpath('OEBPS/Text').glob('*.htm'):
  44. print(filename)
  45. self.convert_file(filename)
  46. def convert_file(self, filename):
  47. with open(filename, 'r') as frh:
  48. content = frh.read()
  49. content = self.re_comp.sub(self.convert_string, content)
  50. with open(filename, 'w') as frh:
  51. frh.write(content)
  52. def convert_string(self, match):
  53. text = match.group(1)
  54. try:
  55. res = [self.translation[c] for c in text]
  56. except KeyError as e:
  57. self.missing.append(e.args[0])
  58. res = list(text)
  59. return '>' + ''.join(res) + '<'
  60. def rezip(self, filename):
  61. file_list = self.file_list(self.temp_dir)
  62. with zipfile.ZipFile(filename, 'w', compression=0, compresslevel=None) as zfwh:
  63. for name in file_list:
  64. zfwh.write(name)
  65. def file_list(self, folder):
  66. res = []
  67. for name in folder.glob('*'):
  68. if name.is_dir():
  69. res.extend(self.file_list(name))
  70. else:
  71. res.append(name)
  72. return res
  73. if __name__ == '__main__':
  74. DadaConverter(str(Path(__file__).parent)).run()