Browse Source

Dada Converter fast fertig - zumindest für epub

Robert Bedner 3 years ago
parent
commit
b4926e2fc0

+ 0 - 0
dada/__init__.py


+ 97 - 0
dada/config/translation.csv

@@ -0,0 +1,97 @@
+e	 
+n	 
+i	 
+s	 
+r	 
+a	 
+t	 
+d	 
+h	 
+u	 
+l	 
+c	 
+g	 
+
+m	⁠ 
+o	⁠ 
+b	⁠ 
+w	⁠ 
+f	⁠ 
+k	⁠ 
+z	⁠ 
+p	⁠ 
+v	⁠ 
+j	⁠ 
+y	⁠ 
+x	⁠ 
+q	⁠ 
+
+E	 
+N	 
+I	 
+S	 
+R	 
+A	 
+T	 
+D	 
+H	 
+U	 
+L	 
+C	 
+G	 
+
+M	⁠ 
+O	⁠ 
+B	⁠ 
+W	⁠ 
+F	⁠ 
+K	⁠ 
+Z	⁠ 
+P	⁠ 
+V	⁠ 
+J	⁠ 
+Y	⁠ 
+X	⁠ 
+Q	⁠ 
+
+1	​ 
+3	​ 
+5	​ 
+7	​ 
+9	​ 
+Ä	​ 
+Ö	​ 
+Ü	​ 
+-	​ 
+:	​ 
+;	​ 
+!	​ 
+"	​ 
+
+0	​ 
+2	​ 
+4	​ 
+6	​ 
+8	​ 
+ä	​ 
+ö	​ 
+ü	​ 
+ß	​ 
+.	​ 
+,	​ 
+?	​ 
+'	​ 
+	
+»	​⁠ 
+«	​⁠ 
+(	​⁠ 
+)	​⁠ 
+=	​⁠ 
+&	​⁠ 
+*	​⁠ 
+/	​⁠ 
+‹	​⁠ 
+›	​⁠ 
+	​⁠ 
+	​⁠ 
+	​⁠ 

+ 90 - 0
dada/convert.py

@@ -0,0 +1,90 @@
+from pathlib import Path
+import re
+import zipfile
+
+
+class Converter:
+    """Empty placeholder class; it defines no attributes or methods."""
+
+
+class DadaConverter:
+    """Replace the visible text of EPUB files using a character table.
+
+    The converter works inside one base folder and uses this layout below
+    it: ``input`` (source ``*.epub`` files), ``output`` (rebuilt archives),
+    ``temp`` (scratch space for the unpacked archive), ``logs`` (receives
+    ``missing.txt``) and ``config/translation.csv`` (tab-separated table
+    mapping one source character to its replacement string).
+    """
+
+    current_file: str = ''
+
+    def __init__(self, folder):
+        """Set up the working folders and load the translation table.
+
+        Args:
+            folder: Base folder (string or path) containing the layout
+                described on the class.
+
+        Raises:
+            OSError: If ``config/translation.csv`` cannot be opened.
+            ValueError: If a non-blank table row contains no tab.
+        """
+        base = Path(folder)
+        self.input_dir = base.joinpath('input')
+        self.output_dir = base.joinpath('output')
+        self.temp_dir = base.joinpath('temp')
+        self.logs_dir = base.joinpath('logs')
+        # Matches every run of text between a closing '>' and the next '<'.
+        self.re_comp = re.compile(r'>([^<]+)<')
+        self.translation = self._load_translation(
+            base.joinpath('config/translation.csv'))
+        # Line breaks and plain spaces always map to themselves.
+        self.translation['\n'] = '\n'
+        self.translation[' '] = ' '
+        self.missing = []
+
+    @staticmethod
+    def _load_translation(path):
+        """Read the tab-separated table at *path* into a dict.
+
+        Rows that are blank after stripping whitespace are skipped. Only
+        the first tab separates key and value, so a value may itself
+        contain tabs.
+        """
+        # The table holds non-ASCII characters, so never rely on the
+        # platform default encoding.
+        with open(path, 'r', encoding='utf-8') as frh:
+            # split('\n') rather than splitlines(): splitlines() also
+            # breaks on several Unicode separators that could legitimately
+            # occur inside a replacement value.
+            rows = frh.read().split('\n')
+        table = {}
+        for row in rows:
+            if row.strip() == '':
+                continue
+            source, sep, target = row.partition('\t')
+            if not sep:
+                raise ValueError(
+                    f'translation row without tab separator: {row!r}')
+            table[source] = target
+        return table
+
+    def run(self):
+        """Process every ``*.epub`` in the input folder and log gaps.
+
+        After all files are handled, the distinct characters that had no
+        table entry are written comma-separated to ``logs/missing.txt``.
+        """
+        # Make sure the folders this run writes into exist.
+        for folder in (self.output_dir, self.temp_dir, self.logs_dir):
+            folder.mkdir(parents=True, exist_ok=True)
+        for filename in self.input_dir.glob('*.epub'):
+            self.cleanup_temp(self.temp_dir)
+            print(filename.name)
+            self.epub_workflow(filename)
+        self.cleanup_temp(self.temp_dir)
+        log_path = self.logs_dir.joinpath('missing.txt')
+        with open(log_path, 'w', encoding='utf-8') as fwh:
+            # Sorted so the log is stable from run to run.
+            fwh.write(','.join(sorted(set(self.missing))))
+
+    def cleanup_temp(self, folder):
+        """Recursively delete everything inside *folder*, keeping *folder*."""
+        for name in folder.glob('*'):
+            if name.is_dir():
+                self.cleanup_temp(name)
+                name.rmdir()
+            else:
+                name.unlink()
+
+    def epub_workflow(self, filename):
+        """Unpack *filename* into the temp folder and repack it to output.
+
+        The rebuilt archive keeps the name of the input file.
+        """
+        self.unzip(filename)
+        # NOTE(review): the text conversion step is disabled here, so the
+        # archive is currently only unpacked and repacked - confirm whether
+        # the call below should be re-enabled.
+        # self.convert_epub_htm_folder()
+        self.rezip(self.output_dir.joinpath(filename.name))
+
+    def unzip(self, filename):
+        """Extract the whole archive *filename* into the temp folder."""
+        with zipfile.ZipFile(filename, 'r') as zfrh:
+            zfrh.extractall(self.temp_dir)
+
+    def convert_epub_htm_folder(self):
+        """Convert every ``*.htm`` file in ``OEBPS/Text`` of the temp folder."""
+        for filename in self.temp_dir.joinpath('OEBPS/Text').glob('*.htm'):
+            print(filename)
+            self.convert_file(filename)
+
+    def convert_file(self, filename):
+        """Rewrite *filename* in place with all text segments converted."""
+        # assumes the content documents are UTF-8 encoded - TODO confirm
+        with open(filename, 'r', encoding='utf-8') as frh:
+            content = frh.read()
+
+        content = self.re_comp.sub(self.convert_string, content)
+        with open(filename, 'w', encoding='utf-8') as fwh:
+            fwh.write(content)
+
+    def convert_string(self, match):
+        """Return the replacement for one regex match of ``self.re_comp``.
+
+        If every character of the matched text has a table entry, the text
+        is replaced character by character. Otherwise the segment is
+        returned unchanged and every character without an entry is appended
+        to ``self.missing``.
+        """
+        text = match.group(1)
+        unknown = [c for c in text if c not in self.translation]
+        if unknown:
+            # Record all gaps of the segment, not just the first one, so a
+            # single run reveals everything the table still lacks.
+            self.missing.extend(unknown)
+            return '>' + text + '<'
+        return '>' + ''.join(self.translation[c] for c in text) + '<'
+
+    def rezip(self, filename):
+        """Pack the contents of the temp folder into the archive *filename*.
+
+        Entries are stored uncompressed under paths relative to the temp
+        folder, and a top-level ``mimetype`` file, if present, is written
+        first.
+        """
+        def arcname(path):
+            return path.relative_to(self.temp_dir).as_posix()
+
+        file_list = self.file_list(self.temp_dir)
+        # EPUB readers expect 'mimetype' to be the first archive entry; the
+        # stable sort leaves the order of all other files untouched.
+        file_list.sort(key=lambda path: arcname(path) != 'mimetype')
+        with zipfile.ZipFile(filename, 'w',
+                             compression=zipfile.ZIP_STORED) as zfwh:
+            for name in file_list:
+                # Without arcname the full temp-folder path would end up
+                # inside the archive.
+                zfwh.write(name, arcname=arcname(name))
+
+    def file_list(self, folder):
+        """Return the paths of all non-directory entries below *folder*."""
+        res = []
+        for name in folder.glob('*'):
+            if name.is_dir():
+                res.extend(self.file_list(name))
+            else:
+                res.append(name)
+        return res
+
+if __name__ == '__main__':
+    # When run as a script, use the folder containing this file as the
+    # converter's base folder.
+    package_dir = Path(__file__).parent
+    converter = DadaConverter(str(package_dir))
+    converter.run()

+ 0 - 0
dada/epub_converter.py


BIN
dada/input/Douglas Adams - Per Anhalter durch die Galaxis_V1.epub


BIN
dada/input/Douglas Adams - Per Anhalter durch die Galaxis_V2.epub


BIN
dada/input/Douglas Adams - Per Anhalter durch die Galaxis_V3.epub


+ 0 - 0
dada/logs/missing.txt


BIN
dada/output/Douglas Adams - Per Anhalter durch die Galaxis_V1.epub


BIN
dada/output/Douglas Adams - Per Anhalter durch die Galaxis_V2.epub


BIN
dada/output/Douglas Adams - Per Anhalter durch die Galaxis_V3.epub


+ 0 - 83
dada/translation.csv

@@ -1,83 +0,0 @@
-e	\u3000
-n	\u2003
-i	\u2007
-s	\u2002
-r	\u2004
-a	\u00A0
-t	\u2005
-d	\u205F
-h	\u202F
-u	\u2008
-l	\u2009
-c	\u2006
-g	\u200A
-	
-m	\u2060\u3000
-o	\u2060\u2003
-b	\u2060\u2007
-w	\u2060\u2002
-f	\u2060\u2004
-k	\u2060\u00A0
-z	\u2060\u2005
-p	\u2060\u205F
-v	\u2060\u202F
-j	\u2060\u2008
-y	\u2060\u2009
-x	\u2060\u2006
-q	\u2060\u200A
-	
-E	\uFEFF\u3000
-N	\uFEFF\u2003
-I	\uFEFF\u2007
-S	\uFEFF\u2002
-R	\uFEFF\u2004
-A	\uFEFF\u00A0
-T	\uFEFF\u2005
-D	\uFEFF\u205F
-H	\uFEFF\u202F
-U	\uFEFF\u2008
-L	\uFEFF\u2009
-C	\uFEFF\u2006
-G	\uFEFF\u200A
-	
-M	\uFEFF\u2060\u3000
-O	\uFEFF\u2060\u2003
-B	\uFEFF\u2060\u2007
-W	\uFEFF\u2060\u2002
-F	\uFEFF\u2060\u2004
-K	\uFEFF\u2060\u00A0
-Z	\uFEFF\u2060\u2005
-P	\uFEFF\u2060\u205F
-V	\uFEFF\u2060\u202F
-J	\uFEFF\u2060\u2008
-Y	\uFEFF\u2060\u2009
-X	\uFEFF\u2060\u2006
-Q	\uFEFF\u2060\u200A
-
-0	\u200B\u3000
-2	\u200B\u2003
-4	\u200B\u2007
-6	\u200B\u2002
-8	\u200B\u2004
-ä	\u200B\u00A0
-ö	\u200B\u2005
-ü	\u200B\u205F
-ß	\u200B\u202F
-.	\u200B\u2008
-,	\u200B\u2009
-?	\u200B\u2006
-'	\u200B\u200A
-	
-1	\uFEFF\u200B\u3000
-3	\uFEFF\u200B\u2003
-5	\uFEFF\u200B\u2007
-7	\uFEFF\u200B\u2002
-9	\uFEFF\u200B\u2004
-Ä	\uFEFF\u200B\u00A0
-Ö	\uFEFF\u200B\u2005
-Ü	\uFEFF\u200B\u205F
--	\uFEFF\u200B\u202F
-:	\uFEFF\u200B\u2008
-;	\uFEFF\u200B\u2009
-!	\uFEFF\u200B\u2006
-"	\uFEFF\u200B\u200A