mirror of https://github.com/CIRCL/Circlean
NOT TESTED - convert most important formats.
parent
28c11ada8f
commit
f6aadbc246
6
TODO
6
TODO
|
@ -28,3 +28,9 @@ TODO
|
||||||
[Done] use /etc/mime.types and file -b --mime-type <filename> to find out the type of
|
[Done] use /etc/mime.types and file -b --mime-type <filename> to find out the type of
|
||||||
the file
|
the file
|
||||||
* Extract metadata from all the files => https://mat.boum.org/
|
* Extract metadata from all the files => https://mat.boum.org/
|
||||||
|
|
||||||
|
HTML Files
|
||||||
|
==========
|
||||||
|
|
||||||
|
- disable JS
|
||||||
|
- cleanup external imports (js/css/images)
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
# Paths to the commands used to convert the files
|
||||||
|
PDF="/usr/local/bin/pdf2htmlEX"
|
||||||
|
LO="/usr/bin/libreoffice"
|
||||||
|
|
|
@ -1,6 +1,11 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
source ./constraint.sh
|
source ./constraint.sh
|
||||||
|
source ./constraint_conv.sh
|
||||||
|
|
||||||
|
# https://blogs.msdn.com/b/vsofficedeveloper/archive/2008/05/08/office-2007-open-xml-mime-types.aspx
|
||||||
|
# http://plan-b-for-openoffice.org/glossary/term/mime-type
|
||||||
|
OFFICE_MIME="msword|vnd.openxmlformats-officedocument.*|vnd.ms-*|vnd.oasis.opendocument*"
|
||||||
|
|
||||||
copy(){
|
copy(){
|
||||||
src_file=${1}
|
src_file=${1}
|
||||||
|
@ -12,6 +17,7 @@ copy(){
|
||||||
# Plain text
|
# Plain text
|
||||||
text(){
|
text(){
|
||||||
echo Text file ${1}
|
echo Text file ${1}
|
||||||
|
# XXX: append .txt ?
|
||||||
copy ${1} ${2}${1##$SRC}
|
copy ${1} ${2}${1##$SRC}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -40,17 +46,33 @@ application(){
|
||||||
dst_file=${2}${1##$SRC}
|
dst_file=${2}${1##$SRC}
|
||||||
mime_details=${3}
|
mime_details=${3}
|
||||||
case ${mime_details} in
|
case ${mime_details} in
|
||||||
"pdf")
|
pdf)
|
||||||
echo "Got a pdf"
|
echo "Got a pdf"
|
||||||
# WARNING: This command randomly fails and loops indefinitely...
|
${PDF} --dest-dir ${2} ${src_file}
|
||||||
pdf2ps -dSAFER -sOutputFile="%stdout" ${src_file} | ps2pdfwr - ${dst_file}
|
;;
|
||||||
|
${OFFICE_MIME})
|
||||||
|
echo "MS Office or ODF document"
|
||||||
|
temp=${2}/temp
|
||||||
|
mkdir ${temp}
|
||||||
|
${LO} --convert-to pdf --outdir ${temp} ${src_file}
|
||||||
|
${PDF} --dest-dir ${2} ${temp}/*.pdf
|
||||||
|
rm -rf ${temp}
|
||||||
;;
|
;;
|
||||||
*xml*)
|
*xml*)
|
||||||
echo "Got an XML"
|
echo "Got an XML"
|
||||||
text ${1} ${2}
|
text ${src_file} ${2}
|
||||||
|
;;
|
||||||
|
x-dosexec)
|
||||||
|
echo "Win executable"
|
||||||
|
copy ${src_file} ${dst_file}_DANGEROUS
|
||||||
|
;;
|
||||||
|
octet-stream)
|
||||||
|
echo "Unknown type."
|
||||||
|
copy ${src_file} ${dst_file}.bin
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Unknown type."
|
echo "Unhandled type"
|
||||||
|
copy ${src_file} ${dst_file}
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue