convert pdf to pdf/a

pull/33/head
Raphaël Vinot 2015-04-03 15:36:47 +02:00
parent aabb0effc5
commit be71c85778
1 changed files with 16 additions and 10 deletions

View File

@ -10,8 +10,8 @@ import subprocess
import time
# Absolute paths of the external command-line tools whose command lines are
# built below and handed to ``_run_process``.
# unoconv front-end, used by ``_office_related`` to export office files to PDF.
LIBREOFFICE = '/usr/bin/unoconv'
# Ghostscript, used by ``_pdf`` to rewrite a PDF with the ``-dPDFA`` switch.
GS = '/usr/bin/gs'
# pdf2htmlEX, used by ``_pdfa`` to render a PDF as an HTML file.
PDF2HTMLEX = '/usr/bin/pdf2htmlEX'
# 7-Zip, used by ``_archive`` to extract archives.
SEVENZ = '/usr/bin/7z'
@ -242,39 +242,45 @@ class KittenGroomer(object):
def _office_related(self):
    """Convert the current office document to PDF, then pass it to ``_pdfa``.

    The intermediate PDF is written to a ``temp`` directory created next to
    the destination path; that directory is removed once ``_pdfa`` returns.
    """
    self.cur_file.add_log_details('processing_type', 'office')
    dst_dir, filename = os.path.split(self.cur_file.dst_path)
    tmpdir = os.path.join(dst_dir, 'temp')
    # Only the base name is needed: the intermediate file always ends in .pdf.
    name = os.path.splitext(filename)[0]
    tmppath = os.path.join(tmpdir, name + '.pdf')
    self._safe_mkdir(tmpdir)
    # NOTE(review): paths are interpolated unquoted; a path containing
    # whitespace may be split into several arguments depending on how
    # _run_process tokenises the command -- confirm against that helper.
    lo_command = '{} --format pdf -eSelectPdfVersion=1 --output {} {}'.format(
        LIBREOFFICE, tmppath, self.cur_file.src_path)
    self._run_process(lo_command)
    # Convert exactly once, through the current helper name.
    self._pdfa(tmppath)
    self._safe_rmtree(tmpdir)
def _pdfa(self, tmpsrcpath):
    """Render the PDF at ``tmpsrcpath`` to HTML with pdf2htmlEX.

    The output file name is the current file's destination path with
    ``.html`` appended.

    :param tmpsrcpath: path of the PDF file to convert.
    """
    html_path = self.cur_file.dst_path + '.html'
    # NOTE(review): ``--dest-dir /`` presumably makes pdf2htmlEX treat the
    # output path as rooted at '/', i.e. as an absolute path -- confirm.
    pdf_command = '{} --dest-dir / {} {}'.format(PDF2HTMLEX, tmpsrcpath, html_path)
    self._run_process(pdf_command)
def _pdf(self):
    """Rewrite the current PDF with Ghostscript, then pass it to ``_pdfa``.

    Ghostscript writes its output to a ``temp`` directory created next to
    the destination path; that directory is removed once ``_pdfa`` returns.
    """
    self.cur_file.add_log_details('processing_type', 'pdf')
    dst_dir, filename = os.path.split(self.cur_file.dst_path)
    tmpdir = os.path.join(dst_dir, 'temp')
    tmppath = os.path.join(tmpdir, filename)
    self._safe_mkdir(tmpdir)
    # NOTE(review): ``-dPDFA`` on its own may not yield a fully conformant
    # PDF/A file (Ghostscript documents additional settings such as a PDF/A
    # definition file) -- confirm the output meets the intended level.
    gs_command = '{} -dPDFA -dBATCH -dNOPAUSE -sDEVICE=pdfwrite -sOutputFile={} {}'.format(
        GS, tmppath, self.cur_file.src_path)
    self._run_process(gs_command)
    # Only the Ghostscript output is converted; the untouched source is not.
    self._pdfa(tmppath)
    self._safe_rmtree(tmpdir)
def _archive(self):
    """Extract the current archive and process its content recursively.

    The archive is unpacked with 7z into ``<dst_path>_temp``, the extracted
    tree is handed to ``processdir``, and the temporary directory is removed
    afterwards. ``self.recursive`` is raised by one for the duration of the
    nested ``processdir`` call and restored afterwards.
    """
    self.cur_file.add_log_details('processing_type', 'archive')
    self.cur_file.is_recursive = True
    self.cur_file.log_string += 'Archive extracted, processing content.'
    tmpdir = self.cur_file.dst_path + '_temp'
    self._safe_mkdir(tmpdir)
    extract_command = '{} -p1 x {} -o{} -bd'.format(SEVENZ, self.cur_file.src_path, tmpdir)
    self._run_process(extract_command)
    # One increment matched by one decrement, so the depth counter does not
    # drift upwards with every archive processed.
    self.recursive += 1
    # NOTE(review): argument order (dst_path first, tmpdir second) is kept
    # as found -- confirm it matches the processdir signature.
    self.processdir(self.cur_file.dst_path, tmpdir)
    self.recursive -= 1
    self._safe_rmtree(tmpdir)
def _unknown_app(self):
self.cur_file.make_unknown()
@ -302,7 +308,7 @@ class KittenGroomer(object):
def _media_processing(self):
self.cur_log.fields(processing_type='media')
if not self.cur_file.verify_mime() or not self.cur_file.verify_extension():
# The extension is unknown or doesn't match the mime type, suspicious
# The extension is unknown or doesn't match the mime type => suspicious
# TODO: write details in the logfile
self.cur_file.make_dangerous()
self._safe_copy()