convert pdf to pdf/a

2015-04-03 15:36:47 +02:00 · 2015-04-03 15:36:47 +02:00 · be71c85778
parent aabb0effc5
commit be71c85778
1 changed files with 16 additions and 10 deletions
--- a/fs/opt/groomer/functions.py
+++ b/fs/opt/groomer/functions.py
@@ -10,8 +10,8 @@ import subprocess
 import time


-
 LIBREOFFICE = '/usr/bin/unoconv'
+GS = '/usr/bin/gs'
 PDF2HTMLEX = '/usr/bin/pdf2htmlEX'
 SEVENZ = '/usr/bin/7z'

@@ -242,39 +242,45 @@ class KittenGroomer(object):
    def _office_related(self):
        self.cur_file.add_log_details('processing_type', 'office')
        dst_dir, filename = os.path.split(self.cur_file.dst_path)
-        name, ext = os.path.splitext(filename)
        tmpdir = os.path.join(dst_dir, 'temp')
+        name, ext = os.path.splitext(filename)
        tmppath = os.path.join(tmpdir, name + '.pdf')
        self._safe_mkdir(tmpdir)
        lo_command = '{} --format pdf -eSelectPdfVersion=1 --output {} {}'.format(
            LIBREOFFICE, tmppath, self.cur_file.src_path)
        self._run_process(lo_command)
-        self.__pdf(tmppath)
+        self._pdfa(tmppath)
        self._safe_rmtree(tmpdir)

-    def __pdf(self, tmpsrcpath):
+    def _pdfa(self, tmpsrcpath):
        pdf_command = '{} --dest-dir / {} {}'.format(PDF2HTMLEX, tmpsrcpath,
                                                     self.cur_file.dst_path + '.html')
        self._run_process(pdf_command)

    def _pdf(self):
        self.cur_file.add_log_details('processing_type', 'pdf')
-        # FIXME: convert pdf to pdf/a if needed prior to converting to html
-        # TODO: Convert to pdf/A
-        self.__pdf(self.cur_file.src_path)
+        dst_dir, filename = os.path.split(self.cur_file.dst_path)
+        tmpdir = os.path.join(dst_dir, 'temp')
+        tmppath = os.path.join(tmpdir, filename)
+        self._safe_mkdir(tmpdir)
+        gs_command = '{} -dPDFA -dBATCH -dNOPAUSE -sDEVICE=pdfwrite -sOutputFile={} {}'.format(
+            GS, tmppath, self.cur_file.src_path)
+        self._run_process(gs_command)
+        self._pdfa(tmppath)
+        self._safe_rmtree(tmpdir)

    def _archive(self):
        self.cur_file.add_log_details('processing_type', 'archive')
        self.cur_file.is_recursive = True
        self.cur_file.log_string += 'Archive extracted, processing content.'
-        self.recursive += 1
        tmpdir = self.cur_file.dst_path + '_temp'
        self._safe_mkdir(tmpdir)
        extract_command = '{} -p1 x {} -o{} -bd'.format(SEVENZ, self.cur_file.src_path, tmpdir)
        self._run_process(extract_command)
+        self.recursive += 1
        self.processdir(self.cur_file.dst_path, tmpdir)
-        self._safe_rmtree(tmpdir)
        self.recursive -= 1
+        self._safe_rmtree(tmpdir)

    def _unknown_app(self):
        self.cur_file.make_unknown()
@@ -302,7 +308,7 @@ class KittenGroomer(object):
    def _media_processing(self):
        self.cur_log.fields(processing_type='media')
        if not self.cur_file.verify_mime() or not self.cur_file.verify_extension():
-            # The extension is unknown or doesn't match the mime type, suspicious
+            # The extension is unknown or doesn't match the mime type => suspicious
            # TODO: write details in the logfile
            self.cur_file.make_dangerous()
        self._safe_copy()