diff --git a/TODO b/TODO index 51f1dfa..b238efe 100644 --- a/TODO +++ b/TODO @@ -28,3 +28,9 @@ TODO [Done] use /etc/mime.types and file -b --mime-type to find out the type of the file * Extract metadata from all the files => https://mat.boum.org/ + +HTML Files +========== + +- disable JS +- cleanup external imports (js/css/images) diff --git a/filesystem/opt/groomer/constraint_conv.sh b/filesystem/opt/groomer/constraint_conv.sh new file mode 100644 index 0000000..ca22c8b --- /dev/null +++ b/filesystem/opt/groomer/constraint_conv.sh @@ -0,0 +1,4 @@ +# Paths to the commands used to convert the files +PDF="/usr/local/bin/pdf2htmlEX" +LO="/usr/bin/libreoffice" + diff --git a/filesystem/opt/groomer/functions.sh b/filesystem/opt/groomer/functions.sh index 089d5c4..f29e14f 100755 --- a/filesystem/opt/groomer/functions.sh +++ b/filesystem/opt/groomer/functions.sh @@ -1,6 +1,11 @@ #!/bin/bash source ./constraint.sh +source ./constraint_conv.sh + +# https://blogs.msdn.com/b/vsofficedeveloper/archive/2008/05/08/office-2007-open-xml-mime-types.aspx +# http://plan-b-for-openoffice.org/glossary/term/mime-type +OFFICE_MIME="msword|vnd.openxmlformats-officedocument.*|vnd.ms-*|vnd.oasis.opendocument*" copy(){ src_file=${1} @@ -12,6 +17,7 @@ copy(){ # Plain text text(){ echo Text file ${1} + # XXX: append .txt ? copy ${1} ${2}${1##$SRC} } @@ -40,17 +46,33 @@ application(){ dst_file=${2}${1##$SRC} mime_details=${3} case ${mime_details} in - "pdf") + pdf) echo "Got a pdf" - # WARNING: This command randomly fails, and loop indefinitely... 
- pdf2ps -dSAFER -sOutputFile="%stdout" ${src_file} | ps2pdfwr - ${dst_file} + ${PDF} --dest-dir ${2} ${src_file} + ;; + ${OFFICE_MIME}) + echo "MS Office or ODF document" + temp=${2}/temp + mkdir ${temp} + ${LO} --convert-to pdf --outdir ${temp} ${src_file} + ${PDF} --dest-dir ${2} ${temp}/*.pdf + rm -rf ${temp} ;; *xml*) echo "Got an XML" - text ${1} ${2} + text ${src_file} ${2} + ;; + x-dosexec) + echo "Win executable" + copy ${src_file} ${dst_file}_DANGEROUS + ;; + octet-stream) + echo "Unknown type." + copy ${src_file} ${dst_file}.bin ;; *) - echo "Unknown type." + echo "Unhandled type" + copy ${src_file} ${dst_file} ;; esac