"""Batch-process EPUB files through a character translation table.

Every ``*.epub`` found in ``<folder>/input`` is unpacked into
``<folder>/temp``, optionally has the text of its ``OEBPS/Text/*.htm`` files
translated character by character, and is repacked into ``<folder>/output``
under the same file name.
"""
from pathlib import Path
import re
import zipfile


class Converter:
    """Empty placeholder class; not referenced by the code visible here."""


class DadaConverter:
    """Unpack, translate and repack the EPUB files of a working folder.

    The working folder is expected to contain ``input/`` (source EPUBs) and
    ``config/translation.csv``; ``temp/``, ``output/`` and ``logs/`` are used
    for scratch files, results and the missing-character report.
    """

    # Not read or written by any method visible in this file.
    current_file: str = ''

    def __init__(self, folder):
        """Set up the directory layout and load the translation table.

        Args:
            folder: Path (str or path-like) of the working folder.

        Raises:
            OSError: If ``config/translation.csv`` cannot be opened.
            ValueError: If a non-blank row does not consist of exactly two
                tab-separated fields.
        """
        base = Path(folder)
        self.input_dir = base.joinpath('input')
        self.output_dir = base.joinpath('output')
        self.temp_dir = base.joinpath('temp')
        self.logs_dir = base.joinpath('logs')
        # Captures every run of text that sits between '>' and '<', i.e. the
        # character data between two tags.
        self.re_comp = re.compile(r'>([^<]+)<')
        # Despite the ".csv" name the table is tab-separated: one
        # "source<TAB>replacement" pair per line.  Read it as UTF-8 so the
        # result does not depend on the platform's default encoding.
        with open(base.joinpath('config/translation.csv'), 'r',
                  encoding='utf-8') as frh:
            rows = frh.read().split('\n')
        self.translation = dict(
            row.split('\t') for row in rows if row.strip() != ''
        )
        # Newlines and spaces always map to themselves.
        self.translation['\n'] = '\n'
        self.translation[' '] = ' '
        # Characters that had no entry in the table (see convert_string).
        self.missing = []

    def run(self) -> None:
        """Process every ``*.epub`` in the input directory.

        The temp directory is emptied before each file and once more at the
        end.  Afterwards the distinct characters collected in
        ``self.missing`` are written, comma-separated, to
        ``logs/missing.txt``.
        """
        for filename in self.input_dir.glob('*.epub'):
            self.cleanup_temp(self.temp_dir)
            print(filename.name)
            self.epub_workflow(filename)
        self.cleanup_temp(self.temp_dir)

        self.logs_dir.mkdir(parents=True, exist_ok=True)
        # Sorted so the report is reproducible; UTF-8 so any character can be
        # written regardless of the locale.
        with open(self.logs_dir.joinpath('missing.txt'), 'w',
                  encoding='utf-8') as fwh:
            fwh.write(','.join(sorted(set(self.missing))))

    def cleanup_temp(self, folder) -> None:
        """Recursively delete everything inside *folder*, keeping *folder*.

        Args:
            folder: ``Path`` of the directory to empty.
        """
        for name in folder.glob('*'):
            if name.is_dir():
                # A directory must be emptied before it can be removed.
                self.cleanup_temp(name)
                name.rmdir()
            else:
                name.unlink()

    def epub_workflow(self, filename) -> None:
        """Unpack one EPUB into the temp directory and repack it to output.

        Args:
            filename: ``Path`` of the source EPUB; its ``name`` is reused for
                the output file.
        """
        self.unzip(filename)
        # NOTE(review): the translation step is commented out in the original
        # source, so files are currently only unpacked and repacked.
        # self.convert_epub_htm_folder()
        self.rezip(self.output_dir.joinpath(filename.name))

    def unzip(self, filename) -> None:
        """Extract the archive *filename* into the temp directory."""
        with zipfile.ZipFile(filename, 'r') as zfrh:
            zfrh.extractall(self.temp_dir)

    def convert_epub_htm_folder(self) -> None:
        """Translate every ``*.htm`` file in ``temp/OEBPS/Text`` in place."""
        for filename in self.temp_dir.joinpath('OEBPS/Text').glob('*.htm'):
            print(filename)
            self.convert_file(filename)

    def convert_file(self, filename) -> None:
        """Rewrite *filename* with all text between tags translated.

        The file is read and written as UTF-8 rather than in the platform's
        default encoding.

        Args:
            filename: Path of the HTML file to convert in place.
        """
        with open(filename, 'r', encoding='utf-8') as frh:
            content = frh.read()
        content = self.re_comp.sub(self.convert_string, content)
        with open(filename, 'w', encoding='utf-8') as fwh:
            fwh.write(content)

    def convert_string(self, match) -> str:
        """Return the replacement for one ``>text<`` regex match.

        Each character of the captured text is looked up in
        ``self.translation``.  If any character has no entry, the first such
        character is appended to ``self.missing`` and the whole text is
        returned unchanged.

        Args:
            match: Match object produced by ``self.re_comp``.

        Returns:
            The (possibly translated) text wrapped in ``>`` and ``<`` again.
        """
        text = match.group(1)
        try:
            res = [self.translation[c] for c in text]
        except KeyError as e:
            # e.args[0] is the character whose lookup failed.
            self.missing.append(e.args[0])
            res = list(text)
        return '>' + ''.join(res) + '<'

    def rezip(self, filename) -> None:
        """Pack the content of the temp directory into the archive *filename*.

        Entries are stored uncompressed under paths relative to the temp
        directory.  A top-level ``mimetype`` file, if present, is written
        first, as the EPUB container format requires; the remaining entries
        follow in sorted order.

        Args:
            filename: Path of the archive to create; its parent directory is
                created if it does not exist.
        """
        root = self.temp_dir
        target = Path(filename)
        target.parent.mkdir(parents=True, exist_ok=True)

        # Pair every file with its archive-relative POSIX name.  Without an
        # explicit arcname, zipfile would store the full temp-directory path.
        members = sorted(
            (path.relative_to(root).as_posix(), path)
            for path in self.file_list(root)
        )
        # Stable sort: 'mimetype' moves to the front, the rest stay sorted.
        members.sort(key=lambda item: item[0] != 'mimetype')

        with zipfile.ZipFile(target, 'w',
                             compression=zipfile.ZIP_STORED) as zfwh:
            for arcname, path in members:
                zfwh.write(path, arcname=arcname)

    def file_list(self, folder) -> list:
        """Return the paths of all non-directory entries below *folder*.

        Args:
            folder: ``Path`` of the directory to walk recursively.

        Returns:
            A flat list of ``Path`` objects, in directory-listing order.
        """
        res = []
        for name in folder.glob('*'):
            if name.is_dir():
                res.extend(self.file_list(name))
            else:
                res.append(name)
        return res


if __name__ == '__main__':
    DadaConverter(str(Path(__file__).parent)).run()