123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- from pathlib import Path
- import re
- import zipfile
class Converter:
    """Empty placeholder class; it defines no attributes or methods."""
class DadaConverter:
    """Batch-process the EPUB files found under a working folder.

    The working folder is expected to contain:

    * ``input/``  - the ``*.epub`` files to process,
    * ``output/`` - where the repacked EPUBs are written (created on demand),
    * ``temp/``   - scratch space each archive is extracted into,
    * ``logs/``   - receives ``missing.txt`` (created on demand),
    * ``config/translation.csv`` - tab-separated ``source<TAB>target``
      character pairs, one pair per line.
    """

    current_file: str = ''

    def __init__(self, folder):
        """Set up the directory layout and load the translation table.

        Args:
            folder: Path (str or path-like) of the working folder.

        Raises:
            OSError: If ``config/translation.csv`` cannot be read.
            ValueError: If a non-blank row is not exactly two
                tab-separated fields.
        """
        base = Path(folder)
        self.input_dir = base.joinpath('input')
        self.output_dir = base.joinpath('output')
        self.temp_dir = base.joinpath('temp')
        self.logs_dir = base.joinpath('logs')
        # Captures the text between a closing '>' and the next opening '<',
        # i.e. the character data between two tags.
        self.re_comp = re.compile(r'>([^<]+)<')
        self.translation = self._load_translation(
            base.joinpath('config/translation.csv'))
        # Characters encountered during conversion that have no translation.
        self.missing = []

    @staticmethod
    def _load_translation(path):
        """Read the tab-separated character table at *path* into a dict."""
        # Explicit UTF-8 so the table does not depend on the platform's
        # default locale encoding.
        with open(path, 'r', encoding='utf-8') as frh:
            rows = [row for row in frh.read().split('\n') if row.strip() != '']
        table = {}
        for row in rows:
            fields = row.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f'expected "source<TAB>target" in {path}, got {row!r}')
            table[fields[0]] = fields[1]
        # Newlines and spaces always map to themselves.
        table['\n'] = '\n'
        table[' '] = ' '
        return table

    def run(self):
        """Process every ``*.epub`` in the input folder, then write the log.

        After all files are handled, the distinct untranslatable characters
        collected in ``self.missing`` are written comma-separated (sorted, so
        the output is reproducible) to ``logs/missing.txt``.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logs_dir.mkdir(parents=True, exist_ok=True)
        for filename in sorted(self.input_dir.glob('*.epub')):
            self.cleanup_temp(self.temp_dir)
            print(filename.name)
            try:
                self.epub_workflow(filename)
            finally:
                # Never leave one book's files behind for the next one,
                # even when the workflow fails part-way.
                self.cleanup_temp(self.temp_dir)
        missing_log = self.logs_dir.joinpath('missing.txt')
        with open(missing_log, 'w', encoding='utf-8') as fwh:
            fwh.write(','.join(sorted(set(self.missing))))

    def cleanup_temp(self, folder):
        """Recursively delete everything inside *folder* (not *folder* itself)."""
        for name in folder.glob('*'):
            # A symlink to a directory is removed as a link; recursing into it
            # would delete the link target's contents and rmdir() would fail.
            if name.is_dir() and not name.is_symlink():
                self.cleanup_temp(name)
                name.rmdir()
            else:
                name.unlink()

    def epub_workflow(self, filename):
        """Extract *filename* into the temp folder and repack it into output.

        Args:
            filename: ``Path`` of the source EPUB; its base name is reused
                for the archive written to the output folder.
        """
        self.unzip(filename)
        # NOTE(review): the conversion step (self.convert_epub_htm_folder())
        # is not invoked here, so archives are repacked with unchanged
        # content - confirm whether it should be enabled.
        self.rezip(self.output_dir.joinpath(filename.name))

    def unzip(self, filename):
        """Extract the whole archive *filename* into the temp folder."""
        with zipfile.ZipFile(filename, 'r') as zfrh:
            zfrh.extractall(self.temp_dir)

    def convert_epub_htm_folder(self):
        """Convert every ``*.htm`` file under ``temp/OEBPS/Text`` in place."""
        for filename in self.temp_dir.joinpath('OEBPS/Text').glob('*.htm'):
            print(filename)
            self.convert_file(filename)

    def convert_file(self, filename):
        """Rewrite *filename* with all between-tag text run through the table."""
        with open(filename, 'r', encoding='utf-8') as frh:
            content = frh.read()
        content = self.re_comp.sub(self.convert_string, content)
        with open(filename, 'w', encoding='utf-8') as fwh:
            fwh.write(content)

    def convert_string(self, match):
        """Regex substitution callback translating one text segment.

        Args:
            match: A match of ``self.re_comp``; group 1 is the text between
                ``>`` and ``<``.

        Returns:
            The segment, re-wrapped in ``>`` and ``<``, translated character
            by character. If any character has no translation the whole
            segment is returned unchanged and every unknown character in it
            is appended to ``self.missing``.
        """
        text = match.group(1)
        unknown = [char for char in text if char not in self.translation]
        if unknown:
            self.missing.extend(unknown)
            return '>' + text + '<'
        return '>' + ''.join(self.translation[char] for char in text) + '<'

    def rezip(self, filename):
        """Pack the temp folder's files into the archive *filename*.

        Members are stored uncompressed under paths relative to the temp
        folder, so the archive has the same layout as the one extracted.
        """
        entries = sorted(
            (path.relative_to(self.temp_dir).as_posix(), path)
            for path in self.file_list(self.temp_dir)
        )
        # EPUB containers require 'mimetype' to be the first entry; the
        # stable sort keeps the remaining members in name order.
        entries.sort(key=lambda entry: entry[0] != 'mimetype')
        with zipfile.ZipFile(filename, 'w',
                             compression=zipfile.ZIP_STORED) as zfwh:
            for arcname, path in entries:
                zfwh.write(path, arcname=arcname)

    def file_list(self, folder):
        """Return all non-directory paths below *folder*, recursively."""
        res = []
        for name in folder.glob('*'):
            if name.is_dir():
                res.extend(self.file_list(name))
            else:
                res.append(name)
        return res
if __name__ == '__main__':
    # Treat the directory containing this script as the working folder.
    script_dir = Path(__file__).parent
    converter = DadaConverter(str(script_dir))
    converter.run()
|