mirror of https://github.com/CIRCL/Circlean
initial commit
commit
accc177cd6
|
@ -0,0 +1,2 @@
|
|||
filesystem/opt/groomer/pdfbox-app-*.jar
|
||||
filesystem/etc
|
|
@ -0,0 +1,75 @@
|
|||
How To
|
||||
======
|
||||
|
||||
0. power off device
|
||||
1. insert source USB in the TOP usb slot
|
||||
2. insert target USB in the BOTTOM usb slot
|
||||
3. wait. wait some more. it's slow and can take 30-60 minutes depending on how
|
||||
many document conversions take place
|
||||
4. when the only status LED left is the power indicator on the rPI, the process
|
||||
is finished
|
||||
5. power off the device and disconnect the drives
|
||||
|
||||
Notes
|
||||
=====
|
||||
|
||||
* don't plug in USB devices with a hub because there's no way to tell it which
|
||||
is source and target - it's the first drive enumerated (top port) that is the
|
||||
source and the second (bottom port) is the target
|
||||
* don't turn it off without shutting down the system, when grooming is done it
|
||||
shuts down automatically: losing power while it's running can trash the OS
|
||||
on the SD cards because SD cards don't always like dirty shutdowns (ie power loss)
|
||||
* Using a target usb stick that has a status light as long as the device has
|
||||
power is a really useful thing as the other status lights on the groomer
|
||||
are less than indicative at times: because the 'OK' led on the rPi toggles on activity
|
||||
it can be off for a long time while processing something and only comes back
|
||||
on when that process finishes - hence why a USB that has some sort of LED activity
|
||||
when just plugged in (even if not reading or writing but while the USB port is
|
||||
powered) is helpful in determining when the process is finished - when
|
||||
the rPi is shut down, the USB port power is shut off and that LED will also
|
||||
then be off on the USB device
|
||||
* Use a larger target device as all zip files get unpacked and processed onto
|
||||
the target
|
||||
* if you have an hdmi monitor plugged in you can watch what's happening for about
|
||||
30 minutes until the rPi's power saving kicks in and turns off the monitor
|
||||
* if only one usb stick is present at power up, it doesn't groom and looks like
|
||||
a normal rPi
|
||||
* if you want to ssh into the rPi username is 'pi' password 'raspberry' as per defaults
|
||||
|
||||
|
||||
Technical notes
|
||||
===============
|
||||
|
||||
* groomer script is in /opt/groomer/ with the other required files
|
||||
* dependencies are libre-office and OpenJRE
|
||||
* and the ip address is 192.168.1.89
|
||||
* the groomer process is kicked off in /etc/rc.local
|
||||
* the heavy lifting takes place or is dispatched from /opt/groomer/groomer.sh
|
||||
in that script file is what file types get processed (or if not listed there,
|
||||
get ignored)
|
||||
* there are two ways pdf's can get handled -right now they have their text extracted
|
||||
to the target device, the other way copies it and extracts the text
|
||||
* the pdf text extraction isn't perfect and is the slowest part of it, but should
|
||||
be able to handle unicode stuff and currently doesn't do image extraction from
|
||||
pdf's but could do that too
|
||||
|
||||
|
||||
Discussion
|
||||
==========
|
||||
|
||||
* however image exports of pdf pages only have the images and no text so it's not
|
||||
like saving each page to a jpg which would be a really handy and safe way of
|
||||
converting pdf's
|
||||
* spread sheets and presentations get converted to pdfs to kill off any embedded
|
||||
macros and it's assumed that it's not producing evil pdf's on export but does
|
||||
nothing to sanitize any embedded links within those documents
|
||||
* for spreadsheets, if they are longer than a page, only a page worth from that
|
||||
sheet is exported right from the middle of the sheet (ie the top and bottom of
|
||||
that sheet will get cut off and only the contents in the middle exported to pdf)
|
||||
dumb but i figure if you want to go back to the source because it's interesting
|
||||
enough on the groomed side of it, then you can take the extra precautions
|
||||
* the groomed target only copies "safe" files, and does its best to convert any
|
||||
potentially unsafe files to a safer format
|
||||
* safe files being one that i know of that can't contain malicious embedded macros
|
||||
or other crap like that, and those that can get converted to something that won't
|
||||
contain code after conversion
|
|
@ -0,0 +1,24 @@
|
|||
TODO
|
||||
====
|
||||
|
||||
* the script locations should be changed in the next version so they don't sit
|
||||
next to the rPi's example development code that ships with the stock rPi
|
||||
* the system isn't optimised and should be : cleanup and making it as close to
|
||||
stock as possible
|
||||
* Starting process should be more obfuscated
|
||||
* strip exif data and leave it in a .txt file next to the image it came from
|
||||
=> exiftool
|
||||
* set filesystem of OS in RO (physical switch and/or remount OS)
|
||||
* mount source key in RO and noexec
|
||||
* mount target key with noexec
|
||||
* convert spreadsheets in csv ?
|
||||
* convert documents (pdfs/*office/...) in images ?
|
||||
* Have a look at Ghostscript to work on PDFs (.pdf -> .eps -> .png?)
|
||||
* do not run the conversions as root
|
||||
* take eth0 down in /etc/network/interfaces or in the groomer script disable the
|
||||
interface before anything happens
|
||||
* hdmi should stay up: solveable by poking the power management timer
|
||||
(better not to disable the PM completely)
|
||||
* get rid of pdfbox ?
|
||||
* scripts to generate a SD card automatically (win/mac/linux)
|
||||
* move the scripts away from /opt/
|
|
@ -0,0 +1,29 @@
|
|||
#!/bin/sh -e
#
# rc.local
#
# Runs at the end of every multiuser runlevel. It has to finish with
# "exit 0" when everything went fine and with any other value on error.
#
# Toggle the execution bits of this file to enable or disable it.
#
# Here it prints the IP address and, when both /dev/sda and /dev/sdb exist,
# runs the groomer and then halts the machine.

# Show the IP address (nothing is printed when hostname reports none)
_IP=$(hostname -I) || true
if [ -n "$_IP" ]; then
  printf "My IP address is %s\n" "$_IP"
fi

PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

# Groom only when two drives are attached; with fewer the boot just continues.
# Because of "-e" a failing groomer.sh ends this script before the shutdown.
if [ -e /dev/sda ] && [ -e /dev/sdb ]; then
  /opt/groomer/groomer.sh
  /sbin/shutdown -h now
fi

exit 0
|
|
@ -0,0 +1,159 @@
|
|||
#!/bin/bash

# groom da kitteh!

# Directory the groomer's support files are loaded from (the sedKillSpace
# script and the pdfbox jar); the trailing slash is part of the value.
GH="/opt/groomer/"
# Java executable used to run the pdfbox jar.
JAVA="/usr/bin/java"
|
||||
|
||||
# Copy every PDF found below a source tree to the same relative location
# below a destination tree, then extract the text of every PDF found in the
# destination into "<pdf>-extracted.txt" next to it.
#
# Directory and file names are passed through the sedKillSpace sed script
# before they are used on the destination side.
#
# Globals:   GH (read)   - directory holding sedKillSpace and the pdfbox jar
#            JAVA (read) - java executable used to run pdfbox
# Arguments: $1 - source directory
#            $2 - destination directory
#
# SECURITY: the names come from an untrusted drive. They are read
# NUL-delimited and only ever used as quoted arguments; they must never be
# pasted into a command string and handed to eval.
pdfCopyDirty()
{
    local src=$1 dst=$2
    local pdf subDir name

    # copy all pdf's over to their relative same locations
    while IFS= read -r -d '' pdf; do
        subDir=${pdf%/*}
        subDir=$(printf '%s\n' "${subDir#"$src"}" | sed -f "$GH/sedKillSpace")
        name=$(printf '%s\n' "${pdf##*/}" | sed -f "$GH/sedKillSpace")
        mkdir -p "${dst}${subDir}"
        cp -fv "$pdf" "${dst}${subDir}/${name}"
    done < <(find "$src" -iname "*.pdf" -print0)

    # extract all the txt we can from potentially evil pdf's
    while IFS= read -r -d '' pdf; do
        echo "$pdf extracting text to $pdf-extracted.txt"
        # stdin comes from /dev/null so java cannot swallow the file list
        "$JAVA" -jar "$GH/pdfbox-app-1.7.1.jar" ExtractText "$pdf" "$pdf-extracted.txt" 2> /dev/null < /dev/null
    done < <(find "$dst" -iname "*.pdf" -print0)
}
|
||||
|
||||
# Extract the text of every PDF found below a source tree straight into
# "<name>-extracted.txt" at the same relative location below a destination
# tree. The PDF itself is not copied.
#
# Directory and file names are passed through the sedKillSpace sed script
# before they are used on the destination side.
#
# Globals:   GH (read)   - directory holding sedKillSpace and the pdfbox jar
#            JAVA (read) - java executable used to run pdfbox
# Arguments: $1 - source directory
#            $2 - destination directory
#
# SECURITY: the names come from an untrusted drive. They are read
# NUL-delimited and only ever used as quoted arguments; they must never be
# pasted into a command string and handed to eval.
pdfCopyClean()
{
    local src=$1 dst=$2
    local pdf subDir name out

    # convert pdf's on the fly from src to relative dst location
    while IFS= read -r -d '' pdf; do
        subDir=${pdf%/*}
        subDir=$(printf '%s\n' "${subDir#"$src"}" | sed -f "$GH/sedKillSpace")
        name=$(printf '%s\n' "${pdf##*/}" | sed -f "$GH/sedKillSpace")
        mkdir -p "${dst}${subDir}"
        out="${dst}${subDir}/${name}-extracted.txt"
        echo "$pdf extracting text to $out"
        # stdin comes from /dev/null so java cannot swallow the file list
        "$JAVA" -jar "$GH/pdfbox-app-1.7.1.jar" ExtractText "$pdf" "$out" 2> /dev/null < /dev/null
    done < <(find "$src" -iname "*.pdf" -print0)
}
|
||||
|
||||
# Copy every file whose extension is on the "safe" list from a source tree
# to the same relative location below a destination tree. Extensions are
# matched case-insensitively; anything not on the list is left behind.
#
# Directory and file names are passed through the sedKillSpace sed script
# before they are used on the destination side.
#
# Globals:   GH (read)       - directory holding sedKillSpace
#            TYPES (written) - set to the list of safe extensions
# Arguments: $1 - source directory
#            $2 - destination directory
#
# SECURITY: the names come from an untrusted drive. They are read
# NUL-delimited and only ever used as quoted arguments; they must never be
# pasted into a command string and handed to eval.
copySafeFiles()
{
    local src=$1 dst=$2
    local type path subDir name

    TYPES="\
        jpg jpeg gif png tif tga raw \
        mp4 avi mov \
        mp3 wav \
        txt xml csv tsv \
        "
    # $TYPES is left unquoted on purpose: it is a whitespace separated list
    for type in $TYPES
    do
        while IFS= read -r -d '' path; do
            subDir=${path%/*}
            subDir=$(printf '%s\n' "${subDir#"$src"}" | sed -f "$GH/sedKillSpace")
            name=$(printf '%s\n' "${path##*/}" | sed -f "$GH/sedKillSpace")
            mkdir -p "${dst}${subDir}"
            cp -fv "$path" "${dst}${subDir}/${name}"
        done < <(find "$src" -iname "*.$type" -print0)
    done
}
|
||||
|
||||
# Run convertCopyFilesHelper once per document family: text documents are
# exported as plain text, spreadsheets and presentations as PDF.
#
# The helper takes its extension list, output extension and export filter
# from the globals TYPES, OUT and FILTER, so they are set before each call
# (they are also passed as extra arguments, as before).
#
# Globals:   TYPES, FILTER, OUT (written)
# Arguments: $1 - source directory
#            $2 - destination directory
#            $3 - temporary work directory
convertCopyFiles()
{
    # wordy documents
    TYPES="doc docx odt sxw rtf wpd htm html"
    FILTER=Text; OUT=txt
    convertCopyFilesHelper "$1" "$2" "$3" $TYPES $OUT $FILTER

    # spreadsheets ("xlsx" was misspelled "xslx", so those were skipped)
    TYPES="xls xlsx ods sxc"
    FILTER=calc_pdf_Export; OUT=pdf
    convertCopyFilesHelper "$1" "$2" "$3" $TYPES $OUT $FILTER

    # presentation files
    TYPES="ppt pptx odp sxi"
    FILTER=impress_pdf_Export; OUT=pdf
    convertCopyFilesHelper "$1" "$2" "$3" $TYPES $OUT $FILTER
}
|
||||
# For every extension in $TYPES: stage the matching files from the source
# tree in the temporary tree (names passed through sedKillSpace), then let
# soffice convert every staged file into the matching directory below the
# destination tree.
#
# Globals:   GH (read)     - directory holding sedKillSpace
#            TYPES (read)  - whitespace separated list of extensions
#            OUT (read)    - extension of the converted file
#            FILTER (read) - soffice export filter name
# Arguments: $1 - source directory
#            $2 - destination directory
#            $3 - temporary work directory
#
# SECURITY: the names come from an untrusted drive. They are read
# NUL-delimited and only ever used as quoted arguments; they must never be
# pasted into a command string and handed to eval.
convertCopyFilesHelper()
{
    local src=$1 dst=$2 tmp=$3
    local type doc subDir name

    # $TYPES is left unquoted on purpose: it is a whitespace separated list
    for type in $TYPES
    do
        # stage the originals under space-free names
        while IFS= read -r -d '' doc; do
            subDir=${doc%/*}
            subDir=$(printf '%s\n' "${subDir#"$src"}" | sed -f "$GH/sedKillSpace")
            name=$(printf '%s\n' "${doc##*/}" | sed -f "$GH/sedKillSpace")
            mkdir -p "${tmp}${subDir}"
            cp -fv "$doc" "${tmp}${subDir}/${name}"
        done < <(find "$src" -iname "*.$type" -print0)

        # convert the staged copies
        while IFS= read -r -d '' doc; do
            subDir=${doc%/*}
            subDir=${subDir#"$tmp"}
            mkdir -p "${dst}${subDir}"
            # NOTE(review): "extraced" looks like a typo for "extracted"; it is
            # kept because it is part of the produced file names.
            # stdin comes from /dev/null so soffice cannot swallow the file list
            soffice --headless --convert-to "${type}-extraced.${OUT}:${FILTER}" "$doc" --outdir "${dst}${subDir}" < /dev/null
        done < <(find "$tmp" -iname "*.$type" -print0)
    done
}
|
||||
|
||||
# Stage every zip archive of the source tree in the temporary tree, unpack
# each one into $ZIPTEMP/<relative dir>/UNZIPPED_<archive name>/, delete the
# staged archives, then run the PDF / safe-file / conversion passes over the
# unpacked content and empty both work directories.
#
# Globals:   GH (read)           - directory holding sedKillSpace
#            ZIPTEMP (read)      - directory the archives are unpacked into
#            TEMP (read)         - work directory emptied at the end
#            COPYDIRTYPDF (read) - 1 selects pdfCopyDirty, else pdfCopyClean
# Arguments: $1 - source directory
#            $2 - destination directory
#            $3 - temporary work directory
#
# SECURITY: the names come from an untrusted drive. They are read
# NUL-delimited and only ever used as quoted arguments; they must never be
# pasted into a command string and handed to eval.
unpackZip()
{
    local src=$1 dst=$2 tmp=$3
    local zip subDir name unpackDir

    # stage the archives under space-free names
    while IFS= read -r -d '' zip; do
        subDir=${zip%/*}
        subDir=$(printf '%s\n' "${subDir#"$src"}" | sed -f "$GH/sedKillSpace")
        name=$(printf '%s\n' "${zip##*/}" | sed -f "$GH/sedKillSpace")
        mkdir -p "${tmp}${subDir}"
        cp -fv "$zip" "${tmp}${subDir}/${name}"
    done < <(find "$src" -iname "*.zip" -print0)

    # unpack every staged archive into its own directory
    while IFS= read -r -d '' zip; do
        subDir=${zip%/*}
        subDir=${subDir#"$tmp"}
        unpackDir="${ZIPTEMP}${subDir}/UNZIPPED_${zip##*/}/"
        mkdir -p "$unpackDir"
        # stdin comes from /dev/null so unzip cannot swallow the file list
        unzip "$zip" -d "$unpackDir" 2> /dev/null < /dev/null
    done < <(find "$tmp" -iname "*.zip" -print0)

    # the staged archives are no longer needed
    while IFS= read -r -d '' zip; do
        rm -rf -- "$zip"
    done < <(find "$tmp" -iname "*.zip" -print0)

    # quoted on purpose: an empty ZIPTEMP must not pass the -d test
    if [ -d "${ZIPTEMP}" ]; then
        # the PDF pass now gets $2 like the other passes (was the global
        # $targetDir, which the caller passes as $2 anyway)
        if [ "$COPYDIRTYPDF" -eq 1 ]; then
            pdfCopyDirty "$ZIPTEMP" "$dst"
        else
            pdfCopyClean "$ZIPTEMP" "$dst"
        fi
        copySafeFiles "$ZIPTEMP" "$dst" "$tmp"
        convertCopyFiles "$ZIPTEMP" "$dst" "$tmp"
        # never expand to "rm -rf /*" when TEMP is unset or empty
        if [ -n "$TEMP" ]; then
            rm -rf "${TEMP}"/*
        fi
        rm -rf "${ZIPTEMP}"/*
    fi
}
|
||||
|
||||
# Mount points: SRC for the partitions of the first drive, DST for the
# first partition of the second drive. Create them when missing.
SRC=/src
DST=/dst
[ -d "$SRC" ] || mkdir "$SRC"
[ -d "$DST" ] || mkdir "$DST"

# Work directories and the file list all live on the target drive.
TEMP=/dst/temp
ZIPTEMP=/dst/ziptemp
FL=${DST}/filelist.txt

# Release a stale mount (if any), then mount the target drive. Without a
# target there is nothing to do, so leave quietly with status 1.
umount "$DST" 2> /dev/null
if ! mount /dev/sdb1 "$DST"; then
    # echo Could not mount target USB stick!
    exit 1
fi

echo Target USB device mounted at $DST

# drop the results of an earlier run
rm -rf "$DST"/FROM_PARTITION_*

# create the work directories and make sure they are empty
mkdir -p "$TEMP"
mkdir -p "$ZIPTEMP"
rm -rf "${TEMP}"/*
rm -rf "${ZIPTEMP}"/*

# start a fresh file list
echo Full file list from source USB > "$FL"
|
||||
|
||||
# 1: copy the PDFs as well as extracting their text (pdfCopyDirty)
# anything else: only extract the text (pdfCopyClean)
COPYDIRTYPDF=0
PARTCOUNT=1

# Groom every partition of the source drive in turn: mount it read-only on
# $SRC, append its file names to $FL, then copy/convert its content into
# $DST/FROM_PARTITION_<n>. A partition that cannot be mounted is reported
# and skipped, but still counted.
for partition in /dev/sda[1-9]*
do
    # an unmatched glob stays literal: no partitions, nothing to do
    [ -e "$partition" ] || continue

    echo Processing partition: ${PARTCOUNT} $partition
    umount "$SRC" 2> /dev/null
    if ! mount -r "$partition" "$SRC"; then
        echo could not mount $partition at $SRC
    else
        echo $partition mounted at $SRC

        echo PARTITION $PARTCOUNT >> "$FL"
        # List every entry below $SRC relative to it, with a leading slash.
        # find writes the names itself, so no untrusted file name is ever
        # evaluated by the shell. Hidden top-level entries are listed too.
        find "$SRC" -mindepth 1 -printf '/%P\n' >> "$FL"

        # create a directory on the target named FROM_PARTITION_n
        targetDir=${DST}/FROM_PARTITION_${PARTCOUNT}
        echo copying to: $targetDir
        mkdir -p "$targetDir"

        if [ "$COPYDIRTYPDF" -eq 1 ]; then
            pdfCopyDirty "$SRC" "$targetDir"
        else
            pdfCopyClean "$SRC" "$targetDir"
        fi

        # copy stuff
        copySafeFiles "$SRC" "$targetDir"
        convertCopyFiles "$SRC" "$targetDir" "$TEMP"
        # never expand to "rm -rf /*" when TEMP is unset or empty
        if [ -n "$TEMP" ]; then
            rm -rf "${TEMP}"/*
        fi

        # unpack and process archives
        unpackZip "$SRC" "$targetDir" "$TEMP"
    fi
    PARTCOUNT=$((PARTCOUNT + 1))
done
|
||||
|
||||
# cleanup: remove exactly the two work directories. The former "${TEMP}*"
# glob also deleted any other top-level entry on the target drive whose name
# merely starts with "temp" or "ziptemp". Empty variables are skipped so
# the command can never degrade to "rm -rf *".
if [ -n "$TEMP" ]; then
    rm -rf -- "$TEMP"
fi
if [ -n "$ZIPTEMP" ]; then
    rm -rf -- "$ZIPTEMP"
fi

# flush pending writes before the drives are released
sync
umount "$SRC"
umount "$DST"

/sbin/shutdown -h now
|
||||
|
|
@ -0,0 +1 @@
|
|||
# Replace every space on the line with an underscore.
s/ /_/g
|
Loading…
Reference in New Issue