|
@@ -0,0 +1,90 @@
|
|
|
+from pathlib import Path
|
|
|
+import re
|
|
|
+import zipfile
|
|
|
+
|
|
|
+
|
|
|
class Converter:
    """Empty placeholder class; it defines no attributes or methods."""
|
|
|
+
|
|
|
+
|
|
|
class DadaConverter:
    """Unpack, (optionally) translate and repack the EPUB files of a work folder.

    Directory layout used below ``folder``:

    * ``config/translation.csv`` -- tab-separated ``source<TAB>target`` rows
    * ``input/``  -- the ``*.epub`` files to process
    * ``output/`` -- rebuilt archives, written under the input file's name
    * ``temp/``   -- scratch directory the archives are extracted into
    * ``logs/missing.txt`` -- characters that had no translation entry
    """

    current_file: str = ''
    # Class-level default so instances created without __init__ still work.
    encoding: str = 'utf-8'

    def __init__(self, folder, encoding='utf-8'):
        """Set up the working directories and load the translation table.

        Args:
            folder: Base directory containing ``config``, ``input``,
                ``output``, ``temp`` and ``logs``.
            encoding: Text encoding used for the translation table, the
                converted ``.htm`` files and the missing-character log.
                NOTE(review): UTF-8 is assumed as the default; confirm it
                matches the real ``translation.csv``.

        Raises:
            OSError: If ``config/translation.csv`` cannot be opened.
            ValueError: If a non-blank row does not have exactly two
                tab-separated fields.
        """
        base = Path(folder)
        self.input_dir = base.joinpath('input')
        self.output_dir = base.joinpath('output')
        self.temp_dir = base.joinpath('temp')
        self.logs_dir = base.joinpath('logs')
        self.encoding = encoding
        # Captures the text between a closing '>' and the next opening '<'.
        self.re_comp = re.compile(r'>([^<]+)<')
        self.translation = self._load_translation(
            base.joinpath('config/translation.csv')
        )
        # Newlines and spaces always pass through unchanged.
        self.translation['\n'] = '\n'
        self.translation[' '] = ' '
        self.missing = []

    def _load_translation(self, path):
        """Read the tab-separated translation table at ``path`` into a dict."""
        table = {}
        with open(path, 'r', encoding=self.encoding) as frh:
            rows = frh.read().split('\n')
        for line_no, row in enumerate(rows, start=1):
            if row.strip() == '':
                continue
            fields = row.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f'{path}: row {line_no} has {len(fields)} tab-separated '
                    f'field(s); exactly 2 are required'
                )
            table[fields[0]] = fields[1]
        return table

    def run(self):
        """Process every ``*.epub`` in the input directory, then write the log.

        The temp directory is emptied before each book and once more at the
        end. ``logs/missing.txt`` receives the distinct characters collected
        in ``self.missing``, comma-separated and sorted.
        """
        # Sorted so books are processed in a reproducible order.
        for filename in sorted(self.input_dir.glob('*.epub')):
            self.cleanup_temp(self.temp_dir)
            print(filename.name)
            self.epub_workflow(filename)
        self.cleanup_temp(self.temp_dir)
        self.logs_dir.mkdir(parents=True, exist_ok=True)
        log_path = self.logs_dir.joinpath('missing.txt')
        with open(log_path, 'w', encoding=self.encoding) as fwh:
            fwh.write(','.join(sorted(set(self.missing))))

    def cleanup_temp(self, folder):
        """Recursively delete everything inside ``folder`` (not ``folder`` itself)."""
        if not folder.is_dir():
            return
        for name in folder.glob('*'):
            # A symlink to a directory must be unlinked, not descended into:
            # rmdir() on a symlink fails.
            if name.is_dir() and not name.is_symlink():
                self.cleanup_temp(name)
                name.rmdir()
            else:
                name.unlink()

    def epub_workflow(self, filename):
        """Extract ``filename`` into the temp directory and rebuild it in output."""
        self.unzip(filename)
        # NOTE(review): the conversion step is disabled in this revision, so
        # books are currently only unpacked and repacked unchanged.
        # self.convert_epub_htm_folder()
        self.rezip(self.output_dir.joinpath(filename.name))

    def unzip(self, filename):
        """Extract the whole archive ``filename`` into the temp directory."""
        with zipfile.ZipFile(filename, 'r') as zfrh:
            zfrh.extractall(self.temp_dir)

    def convert_epub_htm_folder(self):
        """Convert every ``*.htm`` file directly inside ``temp/OEBPS/Text``."""
        for filename in self.temp_dir.joinpath('OEBPS/Text').glob('*.htm'):
            print(filename)
            self.convert_file(filename)

    def convert_file(self, filename):
        """Rewrite ``filename`` in place with its text segments translated."""
        with open(filename, 'r', encoding=self.encoding) as frh:
            content = frh.read()

        content = self.re_comp.sub(self.convert_string, content)
        with open(filename, 'w', encoding=self.encoding) as fwh:
            fwh.write(content)

    def convert_string(self, match):
        """Translate one ``>text<`` match character by character.

        If any character has no entry in ``self.translation`` the segment is
        returned untranslated, and every distinct unknown character in it is
        appended to ``self.missing``.

        Args:
            match: Regex match whose group 1 is the text to translate.

        Returns:
            The replacement string, including the surrounding ``>`` and ``<``.
        """
        text = match.group(1)
        unknown = [char for char in text if char not in self.translation]
        if unknown:
            # dict.fromkeys de-duplicates while keeping first-seen order.
            self.missing.extend(dict.fromkeys(unknown))
            return '>' + text + '<'
        return '>' + ''.join(self.translation[char] for char in text) + '<'

    def rezip(self, filename):
        """Pack the contents of the temp directory into the archive ``filename``.

        Members are stored uncompressed under paths relative to the temp
        directory, with ``mimetype`` (if present) written first, as the EPUB
        container format requires. Remaining members follow in sorted order.
        """
        Path(filename).parent.mkdir(parents=True, exist_ok=True)
        members = sorted(
            (
                (path.relative_to(self.temp_dir).as_posix(), path)
                for path in self.file_list(self.temp_dir)
            ),
            # False sorts before True, which puts 'mimetype' at the front.
            key=lambda member: (member[0] != 'mimetype', member[0]),
        )
        with zipfile.ZipFile(filename, 'w', compression=zipfile.ZIP_STORED) as zfwh:
            for arcname, path in members:
                zfwh.write(path, arcname)

    def file_list(self, folder):
        """Return all non-directory paths found recursively below ``folder``."""
        res = []
        for name in folder.glob('*'):
            if name.is_dir():
                res.extend(self.file_list(name))
            else:
                res.append(name)
        return res
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Use the directory containing this script as the converter's base folder.
    script_dir = Path(__file__).parent
    converter = DadaConverter(str(script_dir))
    converter.run()
|